def __init__(self, seed, neighbourhood_size, stopping_criterion):
    self.seed = check_random_state(seed)
    self.elite = None
    self.random_state = check_random_state(seed)
    self.neighborhood = list()
    self.neighborhood_size = neighbourhood_size
    self.stopping_criterion = stopping_criterion
    self.current_generation = 0
def __init__(self, n_topics, alpha=0.1, beta=0.01, random_state=0):
    self.n_topics = n_topics
    self.alpha = alpha
    self.beta = beta
    self.random_state = random_state
    rng = utils.check_random_state(random_state)
    self._rands = rng.rand(1024**2 // 8)
def optimal_transport_two_gaussians(mu_1, cov_1, mu_2, cov_2, nsamples=1, random_state=None):
    """
    .. seealso::

        Remark 2.31 of Peyré and Cuturi (2020)
        http://arxiv.org/pdf/1803.00567.pdf
    """
    rng = check_random_state(random_state)
    dim = len(cov_1)

    sqrt_cov_1 = la.sqrtm(cov_1)
    inv_sqrt_cov_1 = la.inv(sqrt_cov_1)

    scaling_matrix = la.sqrtm(sqrt_cov_1.dot(cov_2).dot(sqrt_cov_1))
    scaling_matrix = inv_sqrt_cov_1.dot(scaling_matrix).dot(inv_sqrt_cov_1)

    X = rng.randn(nsamples, dim)
    X = X.dot(sqrt_cov_1) + mu_1
    Y = mu_2 + (X - mu_1).dot(scaling_matrix)

    return X, Y
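# Illustrative usage (an added sketch, not from the original source; it assumes
# the module-level imports the function relies on, i.e. numpy as np,
# scipy.linalg as la and the check_random_state helper, are available):
example_mu_1, example_cov_1 = np.zeros(2), np.eye(2)
example_mu_2, example_cov_2 = np.ones(2), np.array([[2.0, 0.3], [0.3, 1.0]])
X_src, Y_img = optimal_transport_two_gaussians(example_mu_1, example_cov_1,
                                               example_mu_2, example_cov_2,
                                               nsamples=100, random_state=0)
# Each Y_img[i] is the image of X_src[i] under the affine Monge map
# T(x) = mu_2 + A (x - mu_1), so (X_src, Y_img) is an optimal coupling.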
def make_user_item_regression(random_state=123, n_user=20, n_item=20, label_stdev=0.4,
                              rank=2, bias=True, first_order=True,
                              stdev_w0=.2, stdev_w=0.3, stdev_V=0.4,
                              mean_w0=2, mean_w=5, mean_V=10):
    n_features = n_user + n_item
    n_samples = n_user * n_item

    # create design matrix
    user_cols = np.repeat(range(n_user), n_item)
    # `np.array(range(n_item) * n_user)` only works on Python 2; tiling the
    # item indices gives the same column layout and also runs on Python 3.
    item_cols = np.tile(np.arange(n_item), n_user) + n_user
    cols = np.hstack((user_cols, item_cols))
    rows = np.hstack((np.arange(n_item * n_user), np.arange(n_item * n_user)))

    X = sp.coo_matrix((np.ones_like(cols, dtype=np.float64), (rows, cols)))
    X = sp.csc_matrix(X)
    assert X.shape[0] == n_samples
    assert X.shape[1] == n_features

    # sample the model parameters
    random_state = check_random_state(random_state)
    w0 = random_state.normal(mean_w0, stdev_w0)
    w = random_state.normal(mean_w, stdev_w, n_features)
    V = random_state.normal(mean_V, stdev_V, (rank, n_features))

    y = ffm_predict(w0, w, V, X)
    if label_stdev > 0:
        y = random_state.normal(y, label_stdev)

    return X, y, (w0, w, V)
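# Illustrative call (an added sketch, not from the original source; it assumes
# numpy as np, scipy.sparse as sp, check_random_state and the ffm_predict
# helper used above are importable from the surrounding module):
X_demo, y_demo, (w0_demo, w_demo, V_demo) = make_user_item_regression(
    random_state=0, n_user=5, n_item=4, rank=2, label_stdev=0.1)
# X_demo is a sparse (20, 9) design matrix with one user indicator and one
# item indicator per (user, item) pair; y_demo holds the noisy FM responses.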
def __init__(
    self,
    params: Dict,
    batch_size: int = 1,
    n_sims: int = 1000,
    distortion: Tuple[float, float] = None,
    seed: int = None,
    min_bin_length: int = 4,
    normalize_samples: bool = False,
    train: bool = True,
    n_workers: int = 1,
) -> None:
    self.params = params
    self.batch_size = batch_size
    self.rng = check_random_state(seed)
    self.seed = seed
    self.distortion = distortion
    self.normalize_samples = normalize_samples
    self.n_sims = n_sims
    self.train = train
    self.n_workers = n_workers
    self.batch_seed_sequence = np.random.SeedSequence(seed)
def __iter__(self):
    if not self.train:
        self.batch_seed_sequence = np.random.SeedSequence(self.seed)
        self.rng = check_random_state(self.seed)
    with concurrent.futures.ProcessPoolExecutor(self.n_workers) as executor:
        futures = [
            executor.submit(
                SimulationBatch(
                    distortion=self.distortion,
                    variable_binning=self.params["variable_binning"],
                    start=self.params["start"],
                    varying_start_value=self.params["varying_start_value"],
                    n_sims=self.batch_size,
                    n_agents=self.params["n_agents"],
                    timesteps=self.params["timesteps"],
                    seed=np.random.default_rng(rng),
                    compute_fiv=self.params["compute_fiv"],
                    normalize_samples=self.normalize_samples,
                ).next
            )
            for rng in self.batch_seed_sequence.spawn(self.n_sims // self.batch_size)
        ]
        for future in concurrent.futures.as_completed(futures):
            yield future.result()
def __init__(
    self,
    distortion: Tuple[float, float] = None,
    variable_binning=False,
    start: float = 0.5,
    varying_start_value: bool = False,
    n_sims: int = 1000,
    n_agents: int = 1000,
    timesteps: int = 200,
    seed: int = None,
    compute_fiv: bool = False,
    normalize_samples: bool = False,
) -> None:
    self.rng = check_random_state(seed)
    self.distortion = distortion
    if distortion is not None:
        self.distortion = Distorter(*distortion, seed=self.rng)
    if variable_binning and self.distortion is None:
        raise ValueError("Variable binning requires distortion prior")
    self.variable = variable_binning
    self.n_sims = n_sims
    self.start = start
    self.varying_start_value = varying_start_value
    self.n_agents = n_agents
    self.timesteps = timesteps
    self.compute_fiv = compute_fiv
    self.normalize_samples = normalize_samples
    self.seed = seed
    self.n_samples = 0

    self.set_priors()
def maximal_coupling_rejection(p, q, nsamples=1, random_state=None):
    """Rejection sampler to obtain pairs (X, Y) such that :math:`X \\sim p`,
    :math:`Y \\sim q` and the event :math:`\\{X=Y\\}` occurs with maximal probability.

    :param p: distribution of X
    :param q: distribution of Y

    Both parameter distributions ``p, q`` must have the methods

    - ``.rvs()`` to generate samples
    - ``.logpdf()`` to evaluate the log probability density function

    .. seealso::

        `scipy.stats module <https://docs.scipy.org/doc/scipy/reference/stats.html>`_
    """
    rng = check_random_state(random_state)

    for _ in range(nsamples):
        X = p.rvs(random_state=rng)
        if np.log(rng.rand()) < (q.logpdf(X) - p.logpdf(X)):
            return X, X
        Y = q.rvs(random_state=rng)
        while np.log(rng.rand()) < (p.logpdf(Y) - q.logpdf(Y)):
            Y = q.rvs(random_state=rng)
        return X, Y
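# Illustrative usage with scipy.stats distributions (an added sketch, not from
# the original source); frozen distributions expose .rvs and .logpdf as required.
from scipy import stats

X_cpl, Y_cpl = maximal_coupling_rejection(stats.norm(0.0, 1.0),
                                          stats.norm(1.0, 1.0),
                                          random_state=0)
# With probability equal to the overlap of the two densities the call returns
# X_cpl == Y_cpl; otherwise Y_cpl is drawn from the residual part of q.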
def __init__(self, n, sig_init, T_nov, v_min=5.0, sp_min=2.5, random_state=0):
    """Initialize igmm

    Parameters:
        n        - dimension of data
        sig_init - scale for initial covariance matrix
        T_nov    - novelty constant, 0 < T_nov <= 1; defines the distance that a new
                   data point must be from any existing component in order to create
                   a new component. Bigger means that more components will be created,
                   and T_nov = 1 means that every point gets a new component.
                   The authors used 0.01.
        v_min    - how many updates must pass before checking if a component should be removed
        sp_min   - minimum cumulative probability required to keep a component
    """
    self.n_dim = n
    self.n_components = 0
    self.sig_init = sig_init
    self.T_nov = T_nov
    self.v_min = v_min
    self.sp_min = sp_min
    # Resolve the seed here rather than in the default argument, so that two
    # instances never share the RandomState object created at import time.
    self.random_state = check_random_state(random_state)
def maximal_coupling_two_bernoullis(p, q, nsamples=1, random_state=None):
    """Sampler to obtain pairs (X, Y) such that :math:`X \\sim p`, :math:`Y \\sim q`
    and the event :math:`\\{X=Y\\}` occurs with maximal probability.

    .. seealso::

        `scipy.stats module <https://docs.scipy.org/doc/scipy/reference/stats.html>`_
    """
    rng = check_random_state(random_state)

    if p == q:
        X = rng.rand(nsamples) < p
        return X, X

    ber_p, ber_q = np.array((1 - p, p)), np.array((1 - q, q))
    min_pq = np.minimum(ber_p, ber_q)
    int_min_pq = np.sum(min_pq)

    ber_p_tld = (ber_p - min_pq) / (1.0 - int_min_pq)
    ber_q_tld = (ber_q - min_pq) / (1.0 - int_min_pq)

    gam = np.diag(min_pq) + (1.0 - int_min_pq) * np.outer(ber_p_tld, ber_q_tld)

    sample = rng.choice(4, size=nsamples, p=gam.ravel())
    X = np.array([0, 0, 1, 1])
    Y = np.array([0, 1, 0, 1])

    return X[sample], Y[sample], gam
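# Illustrative usage (an added sketch, not from the original source; assumes numpy as np):
X_b, Y_b, gam_b = maximal_coupling_two_bernoullis(0.3, 0.6, nsamples=10000, random_state=0)
# gam_b is the 2x2 coupling table; the empirical agreement rate
# np.mean(X_b == Y_b) is close to 1 - |p - q| = 0.7, the maximal value.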
def simulate_xy(path: str, T: int, sigma2_v: float, sigma2_w: float, sigma2_x1: float,
                random_state=None):
    if os.path.exists(path):
        with open(path, mode='rb') as f:
            return pickle.load(f)
    else:
        random_state = check_random_state(random_state)

        x = np.empty(shape=T, dtype=float)
        y = np.empty(shape=T, dtype=float)

        x_0 = random_state.normal(loc=0.0, scale=np.sqrt(sigma2_x1))
        x_prev = x_0

        for n in range(T):
            v = random_state.normal(loc=0.0, scale=np.sqrt(sigma2_v))
            x[n] = x_prev / 2 + 25 * (x_prev / (1 + np.power(x_prev, 2))) \
                + 8 * np.cos(1.2 * (n + 1)) + v
            x_prev = x[n]

            w = random_state.normal(loc=0.0, scale=np.sqrt(sigma2_w))
            y[n] = np.power(x[n], 2) / 20 + w

        x = np.append(x_0, x)
        # y = y[:, np.newaxis]

        with open(path, mode='wb') as f:
            pickle.dump((x, y), f)

        return x, y
def __init__(self, n_topic: int, n_iter=2000, alpha=0.1, beta=0.01, random_state=None, refresh=10):
    """
    :param n_topic: number of topics
    :param n_iter: number of iterations
    :param alpha: hyperparameter of the document-topic distribution
    :param beta: hyperparameter of the topic-word distribution
    :param random_state: random seed
    :param refresh: log progress every `refresh` iterations
    """
    self.n_topic = n_topic
    self.n_iter = n_iter
    self.alpha = alpha
    self.beta = beta
    # if random_state is None, check_random_state(None) does nothing other
    # than return the current numpy RandomState
    self.random_state = random_state
    self.refresh = refresh
    self.topic_word_ = None
    self.doc_topic_ = None
    self.nzt_ = None

    if alpha <= 0 or beta <= 0:
        raise ValueError('alpha and beta must be greater than zero')

    # random numbers that are reused
    rng = utils.check_random_state(random_state)
    self._rands = rng.rand(1024**2 // 8)  # 1MiB of random variates
def make_ldpc(n_code, d_v, d_c, systematic=False, sparse=True, seed=None):
    """Create LDPC coding and decoding matrices H and G.

    Parameters
    ----------
    n_code: int,
        Length of the codewords.
    d_v: int,
        Number of parity-check equations including a certain bit.
    d_c: int,
        Number of bits in the same parity-check equation. d_c must be greater
        than or equal to d_v and must divide n_code.
    seed: int,
        Seed of the random generator.
    systematic: boolean, default False.
        If True, constructs a systematic coding matrix G.

    Returns
    -------
    H: array (n_equations, n_code).
        Parity-check matrix of an LDPC code with code length `n_code` and
        `n_equations` parity-check equations.
    G: array (n_code, n_bits).
        Coding matrix.
    """
    seed = utils.check_random_state(seed)
    H = parity_check_matrix(n_code, d_v, d_c, seed=seed)
    if systematic:
        H, G = coding_matrix_systematic(H, sparse=sparse)
    else:
        G = coding_matrix(H, sparse=sparse)
    return H, G
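# Illustrative call (an added sketch, not from the original source; it relies
# on parity_check_matrix, coding_matrix and coding_matrix_systematic from the
# same module, plus the `utils` import used above):
H_demo, G_demo = make_ldpc(n_code=15, d_v=4, d_c=5, systematic=True, sparse=True, seed=42)
# H_demo has shape (12, 15) since n_equations = 15 * 4 // 5 = 12.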
def fit(self, corpus):
    """
    :param corpus: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features. Sparse matrices are allowed.
    :return: self
    """
    random_state = utils.check_random_state(self.random_state)
    rands = self._rands.copy()
    self._initialize(corpus)  # initialize all count matrices and bookkeeping
    for n_iter in range(self.n_iter):
        random_state.shuffle(rands)
        if n_iter % self.refresh == 0:
            pp = self.perplexity(corpus)
            logger.info('<{}> perplexity: {:.0f}'.format(n_iter, pp))
        self._sample_topics(rands)
    pp = self.perplexity(corpus)
    logger.info('<{}> perplexity: {:.0f}'.format(self.n_iter, pp))
    # compute the document-topic and topic-word distributions
    self._count_distribution()
    # delete intermediate values to save memory
    del self.TS
    del self.DS
    del self.ZS
    del self.ndz_
    return self
def fit(self):
    """Fit the model with X for one iteration."""
    random_state = utils.check_random_state(self.random_state)
    rands = self._rands.copy()
    self.update_potential()
    _lda._sample_topics(self.WS, self.DS, self.ZS, self.nzw, self.ndz, self.nz,
                        self.alpha, self.beta, self.beta_sum, rands, self.potential)
def wright_fisher(N, T, selection_strength, start=0.5, random_state=None):
    rng = check_random_state(random_state)
    series = np.zeros(T)
    series[0] = int(start * N)
    for i in range(1, T):
        # Expected frequency of the focal variant under selection:
        # p* = i(1 + s) / (i(1 + s) + (N - i)) = i(1 + s) / (i * s + N)
        p_star = (
            series[i - 1] * (1 + selection_strength) /
            (series[i - 1] * selection_strength + N)
        )
        series[i] = rng.binomial(N, min(p_star, 1))
    return series
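# Illustrative run (an added sketch, not from the original source; assumes numpy as np):
traj = wright_fisher(N=1000, T=200, selection_strength=0.01, start=0.5, random_state=42)
freq = traj / 1000.0  # convert absolute counts of the focal variant to frequencies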
def __init__(self, N, mu=0.001, b=0, burn_in=1000, timesteps=10, seed=None, verbose=False):
    self.N = N
    self.mu = mu
    self.b = b
    self.burn_in = burn_in
    self.timesteps = timesteps
    self.rng = check_random_state(seed)
    self.verbose = verbose

    self.n_traits = N
    self.population = np.arange(self.N)
    self.trait_fd = []
def __init__(self, n_samples, prior, proposal, supports, theta_init=None, random_state=None):
    self.n_samples = n_samples
    self.prior = prior
    self.proposal = proposal
    self.supports = supports
    self.theta_init = theta_init
    self.random_state = check_random_state(random_state)
def sample_gibbs_sweep(self, nb_sweeps=1, random_state=None):
    rng = check_random_state(random_state)
    n_rows, n_cols = self.shape
    for _ in range(nb_sweeps):
        for i in range(n_rows):
            for j in range(n_cols):
                sum_ngbr_01 = sum(self.state_matrix[self.neighbors(i, j)])
                cond_proba = self.cond_proba[sum_ngbr_01]
                self.state_matrix[i, j] = rng.rand() < cond_proba
    return self.state_matrix, rng
def parity_check_matrix(n_code, d_v, d_c, seed=None):
    """Build a regular parity-check matrix H following Gallager's algorithm.

    Parameters
    ----------
    n_code: int,
        Length of the codewords.
    d_v: int,
        Number of parity-check equations including a certain bit.
        Must be greater than or equal to 2.
    d_c: int,
        Number of bits in the same parity-check equation. d_c must be greater
        than or equal to d_v and must divide n_code.
    seed: int,
        Seed of the random generator.

    Returns
    -------
    H: array (n_equations, n_code).
        LDPC regular matrix H, where n_equations = d_v * n_code / d_c is the
        total number of parity-check equations.
    """
    rng = utils.check_random_state(seed)

    if d_v <= 1:
        raise ValueError("""d_v must be at least 2.""")

    if d_c <= d_v:
        raise ValueError("""d_c must be greater than d_v.""")

    if n_code % d_c:
        raise ValueError("""d_c must divide n for a regular LDPC matrix H.""")

    n_equations = (n_code * d_v) // d_c

    block = np.zeros((n_equations // d_v, n_code), dtype=int)
    H = np.empty((n_equations, n_code))
    block_size = n_equations // d_v

    # Fill the first block with consecutive ones in each row of the block
    for i in range(block_size):
        for j in range(i * d_c, (i + 1) * d_c):
            block[i, j] = 1
    H[:block_size] = block

    # Create remaining blocks by permutations of the first block's columns
    for i in range(1, d_v):
        H[i * block_size:(i + 1) * block_size] = rng.permutation(block.T).T
    H = H.astype(int)

    return H
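# Illustrative call (an added sketch, not from the original source; assumes the
# `utils` import used above): n_code=12, d_v=3, d_c=4 satisfies d_v >= 2,
# d_c > d_v and d_c dividing n_code.
H_reg = parity_check_matrix(n_code=12, d_v=3, d_c=4, seed=0)
# H_reg.shape == (9, 12) because n_equations = 12 * 3 // 4 = 9; every column
# has weight d_v = 3 and every row has weight d_c = 4.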
def __init__(self, n_topics, n_iter, alpha=0.1, eta=0.01, random_state=None, refresh=10):
    self.n_topics = n_topics
    self.n_iter = n_iter
    self.alpha = alpha
    self.eta = eta
    # if random_state is None, check_random_state(None) does nothing
    # other than return the current numpy RandomState
    self.random_state = random_state
    self.refresh = refresh

    if alpha <= 0 or eta <= 0:
        raise ValueError("alpha and eta must be greater than zero")

    # random numbers that are reused
    rng = utils.check_random_state(random_state)
    self._rands = rng.rand(1024**2 // 8)  # 1MiB of random variates
def __init__(self, maze, num_advs=3, rewards={'*': 10}, terminal_markers='*',
             action_error_prob=0, random_state=None, directions="NSEWK"):
    self.maze = Maze(maze) if not isinstance(maze, Maze) else maze
    self.rewards = rewards
    self.terminal_markers = terminal_markers
    self.action_error_prob = action_error_prob
    self.random_state = check_random_state(random_state)
    self.num_advs = num_advs
    self.area = np.prod(self.maze.shape[-2:])
    self.vol = np.prod(self.maze.shape)

    # find all available positions
    self.available_state = self.maze.flat_positions_containing('.')
    # find all food-court positions
    self.food_court = self.maze.flat_positions_containing('&')
    # randomly choose the adversaries' positions; use the seeded random_state
    # rather than the global np.random so runs are reproducible
    self.adversaries_position = self.random_state.choice(
        self.available_state, replace=False, size=self.num_advs)
    # randomly choose food-court people
    self.fooder = []
    if self.food_court:
        self.fooder = self.random_state.choice(self.food_court, replace=False, size=3)
    # find stairs
    self.stairs = self.maze.flat_positions_containing('%')
    # dict to go upstairs or downstairs
    self.climb = {}
    for st in self.stairs:
        if st - self.area in self.stairs:
            self.climb[st] = st - self.area
        if st + self.area in self.stairs:
            self.climb[st] = st + self.area

    self.actions = [maze_actions[direction] for direction in directions]
    self.num_actions = len(self.actions)
    self.state = None
    self.reset()
    self.num_states = self.maze.shape[0] * self.maze.shape[1] * self.maze.shape[2]
    self.current_situation = None
    self.adversary_actions = None
    self.total_hit = 0
def _fit(self, X):
    random_state = utils.check_random_state(self.random_state)
    rands = self._rands.copy()
    self._initialize(X)
    for it in range(self.n_iter):
        random_state.shuffle(rands)
        if it % self.refresh == 0:
            ll = self.loglikelihood()
            logger.info("<{}> log likelihood: {:.0f}".format(it, ll))
            self.loglikelihoods_.append(ll)
        self._sample_topics(rands)
    ll = self.loglikelihood()

    self.components_ = (self.nzw_ + self.eta).astype(float)
    self.components_ /= np.sum(self.components_, axis=1)[:, np.newaxis]
    self.topic_word_ = self.components_
    self.doc_topic_ = (self.ndz_ + self.alpha).astype(float)
    self.doc_topic_ /= np.sum(self.doc_topic_, axis=1)[:, np.newaxis]

    # delete attributes no longer needed after fitting to save memory and reduce clutter
    del self.WS
    del self.DS
    del self.ZS
    return self, self.components_, self.doc_topic_, self.topic_word_
def __init__(self, mutation_level, is_input_node=False, _computational_layer=None, cl=True,
             seed=None, input_shape=None, semantics=None, input_data=None,
             semantic_input=False, semantic_input_node=None, computational_layer=None,
             semantics_computational_layer=None, input_node=None, depth=1):
    self.mutation_level = mutation_level
    self.input_node = input_node
    self._computational_layer = _computational_layer
    self.computational_layer = computational_layer
    self.is_input_node = is_input_node
    self.cl = cl
    self.input_shape = input_shape
    self.output_shape = None
    self.out_connections = []
    # depending on computational_layer, equal to the previous node's depth or that depth plus one
    self.depth = depth
    self.random_state = check_random_state(seed)
    self.semantics = semantics
    self.semantic_input = semantic_input
    self.semantic_input_node = semantic_input_node
    self.input_data = input_data
    self.semantics_computational_layer = semantics_computational_layer

    if self._computational_layer is None:
        self._get_random_layer()
    if semantics is None:
        self.set_computational_layer()
        self._input_node_output_shape()
def maximal_coupling_reflexion_two_gaussians(mu_1, mu_2, cov, nsamples=1, random_state=None):
    rng = check_random_state(random_state)

    L = la.cholesky(cov, lower=True)
    z = la.solve_triangular(L, mu_1 - mu_2, lower=True, unit_diagonal=False)
    e = z / la.norm(z)

    dim = len(cov)
    X = rng.randn(nsamples, dim)
    Y = X.copy()

    gauss = multivariate_normal(mean=np.zeros(dim))  # standard Gaussian N(0, I)
    accept = np.log(rng.rand(nsamples)) < gauss.logpdf(X + z) - gauss.logpdf(X)

    Y[accept] += z
    Y[~accept] -= np.outer(X[~accept].dot(e), 2.0 * e)  # Householder reflexion

    return X.dot(L.T) + mu_1, Y.dot(L.T) + mu_2
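# Illustrative usage of the reflection coupling (an added sketch, not from the
# original source; assumes numpy as np, scipy.linalg as la and
# scipy.stats.multivariate_normal as imported by the surrounding module):
mu_a, mu_b, cov_ab = np.zeros(3), np.ones(3), np.eye(3)
X_ref, Y_ref = maximal_coupling_reflexion_two_gaussians(mu_a, mu_b, cov_ab,
                                                        nsamples=500, random_state=1)
# Rows where the acceptance test fired come out with X_ref equal to Y_ref;
# the remaining rows are reflections across the hyperplane orthogonal to e.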
def common_random_number_coupling(p, q, nsamples=1, random_state=None):
    """
    :param p: distribution of X with methods

        - ``.rvs()`` to generate samples from distribution :math:`p`
        - ``.cdf()`` to evaluate the cumulative distribution function

    :param q: distribution of Y with method

        - ``.ppf()`` to evaluate the inverse cumulative distribution function

    .. seealso::

        `scipy.stats module <https://docs.scipy.org/doc/scipy/reference/stats.html>`_
    """
    rng = check_random_state(random_state)
    X = p.rvs(size=nsamples, random_state=rng)
    Y = q.ppf(p.cdf(X))
    return X, Y
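# Illustrative usage (an added sketch, not from the original source): couple a
# standard normal and an exponential through a common uniform variate.
from scipy import stats

X_crn, Y_crn = common_random_number_coupling(stats.norm(0.0, 1.0), stats.expon(),
                                             nsamples=5, random_state=0)
# Y_crn = F_q^{-1}(F_p(X_crn)), so X_crn and Y_crn are comonotone.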
def fit(self, X):
    n_samples = X.shape[0]
    random_state = utils.check_random_state(self.random_state)
    centroids = self.init_centroids(X, 'random', random_state)
    new_centroids = centroids.copy()
    clusters = np.zeros(n_samples)

    for _ in range(self.max_ite):
        # assignment step: attach each sample to its closest centroid
        for i in range(n_samples):
            distances = np.sum((centroids - X[i]) ** 2, axis=1)
            clusters[i] = np.argmin(distances)
        # update step: recompute each centroid as the mean of its cluster
        for i in range(self.n_clusters):
            new_centroids[i] = X[clusters == i].mean(axis=0)
        # stop when the centroids no longer move; comparing all entries is more
        # robust than the original hard-coded element count (`== 4`)
        if np.allclose(new_centroids, centroids):
            break
        # copy rather than alias, so the convergence check compares distinct arrays
        centroids = new_centroids.copy()

    self.centroids = centroids
    self.clusters = clusters
    return self
def rvs(self, size=1, random_state=None):
    rng = check_random_state(random_state)
    unif = uniform.rvs(size=size, random_state=rng)
    return self.ppf(unif)
try:
    # assumption: previously generated samples were pickled to `file_name`
    with open(file_name, mode='rb') as f:
        data = pickle.load(f)
    nsamples = data.get('nsamples')
    print('nsamples = {}'.format(nsamples))
    p = data.get('p_distribution').get('class')
    p_samples = data.get('p_distribution').get('samples')
    q = data.get('q_distribution').get('class')
    q_samples = data.get('q_distribution').get('samples')
except FileNotFoundError:
    print('No previous samples were saved under {}'.format(file_name))
    print('Generating samples may take a while')

    p, q = p_distribution(), q_distribution()

    nsamples = int(1e4)
    seed = 0
    rng = check_random_state(seed)
    unif_01 = rng.rand(nsamples)
    p_samples, q_samples = p.ppf(unif_01), q.ppf(unif_01)

    data = {
        'seed': seed,
        'nsamples': nsamples,
        'p_distribution': {
            'class': p,
            'samples': p_samples
        },
        'q_distribution': {
            'class': q,
            'samples': q_samples
        }
    }
def fit(self, X, y, sample_weight=None, check_input=True):
    """Build a decision tree from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        The training input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csc_matrix``.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (class labels in classification, real numbers in
        regression). In the regression case, use ``dtype=np.float64`` and
        ``order='C'`` for maximum efficiency.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    check_input : boolean, (default=True)
        Allow to bypass several input checking.
        Don't use this parameter unless you know what you do.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    if check_input:
        X = check_array(X, dtype=DTYPE)

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = isinstance(self, ClassifierMixin)

    y = np.atleast_1d(y)
    expanded_class_weight = None

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        if self.class_weight is not None:
            y_original = np.copy(y)

        for k in range(self.n_outputs_):
            classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])

        if self.class_weight is not None:
            expanded_class_weight = compute_sample_weight(
                self.class_weight, y_original)
    else:
        self.classes_ = [None] * self.n_outputs_
        self.n_classes_ = [1] * self.n_outputs_

    self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = ((2 ** 31) - 1 if self.max_depth is None
                 else self.max_depth)
    max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes)

    if isinstance(self.max_features, string_types):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1, int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if self.min_samples_split <= 0:
        raise ValueError("min_samples_split must be greater than zero.")
    if self.min_samples_leaf <= 0:
        raise ValueError("min_samples_leaf must be greater than zero.")
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero. ")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")
    if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
        raise ValueError("max_leaf_nodes must be integral number but was "
                         "%r" % max_leaf_nodes)
    if -1 < max_leaf_nodes < 2:
        raise ValueError(("max_leaf_nodes {0} must be either smaller than "
                          "0 or larger than 1").format(max_leaf_nodes))

    if sample_weight is not None:
        if (getattr(sample_weight, "dtype", None) != DOUBLE or
                not sample_weight.flags.contiguous):
            sample_weight = np.ascontiguousarray(
                sample_weight, dtype=DOUBLE)
        if len(sample_weight.shape) > 1:
            raise ValueError("Sample weights array has more "
                             "than one dimension: %d" %
                             len(sample_weight.shape))
        if len(sample_weight) != n_samples:
            raise ValueError("Number of weights=%d does not match "
                             "number of samples=%d" %
                             (len(sample_weight), n_samples))

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Set min_weight_leaf from min_weight_fraction_leaf
    if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))
    else:
        min_weight_leaf = 0.

    # Set min_samples_split sensibly
    min_samples_split = max(self.min_samples_split,
                            2 * self.min_samples_leaf)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

    SPLITTERS = DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            self.max_features_,
                                            self.min_samples_leaf,
                                            min_weight_leaf,
                                            random_state)

    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

    # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
    if max_leaf_nodes < 0:
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        self.min_samples_leaf,
                                        min_weight_leaf,
                                        max_depth)
    else:
        builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                       self.min_samples_leaf,
                                       min_weight_leaf,
                                       max_depth,
                                       max_leaf_nodes)

    builder.build(self.tree_, X, y, sample_weight)

    if self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def _random_init(X, n_components, random_state=None):
    rs = check_random_state(random_state)
    return [rs.rand(X.shape[i], n_components) for i in range(len(X.shape))]
def smacof(similarities, metric=True, n_components=2, init=None, n_init=8, n_jobs=1,
           max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF (Scaling by Majorizing
    a Complicated Function) algorithm.

    The SMACOF algorithm is a multidimensional scaling algorithm: it minimizes
    an objective function, the *stress*, using a majorization technique. The
    Stress Majorization, also known as the Guttman Transform, guarantees a
    monotone convergence of Stress, and is more powerful than traditional
    techniques such as gradient descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress.
    3. Compute the Guttman Transform.
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    similarities : symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric : boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities,
        overridden if an initial array is provided.

    init : {None or ndarray of shape (n_samples, n_components)}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    n_init : int, optional, default: 8
        Number of times the SMACOF algorithm will be run with different
        initialisations. The final result will be the best output of the
        n_init consecutive runs in terms of stress.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose : int, optional, default: 0
        level of verbosity

    eps : float, optional, default: 1e-3
        relative tolerance w.r.t stress to declare convergence

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number
        generator.

    Returns
    -------
    X : ndarray (n_samples, n_components)
        Coordinates of the n_samples points in an n_components-space

    stress : float
        The final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
    """
    similarities, = check_arrays(similarities, sparse_format='dense')
    random_state = check_random_state(random_state)

    if hasattr(init, '__array__'):
        init = np.asarray(init).copy()
        if not n_init == 1:
            warnings.warn(
                'Explicit initial positions passed: '
                'performing only one init of the MDS instead of %d' % n_init)
            n_init = 1

    best_pos, best_stress = None, None
    if n_jobs == 1:
        for it in range(n_init):
            pos, stress = _smacof_single(similarities, metric=metric,
                                         n_components=n_components,
                                         init=init, max_iter=max_iter,
                                         verbose=verbose, eps=eps,
                                         random_state=random_state)
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
    else:
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                similarities, metric=metric, n_components=n_components,
                init=init, max_iter=max_iter, verbose=verbose, eps=eps,
                random_state=seed)
            for seed in seeds)
        positions, stress = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
    return best_pos, best_stress
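# Illustrative usage (an added sketch, not from the original source; assumes
# numpy as np and the imports that smacof itself needs are available):
rng_demo = np.random.RandomState(0)
points_demo = rng_demo.rand(10, 2)
diff_demo = points_demo[:, None, :] - points_demo[None, :, :]
dissim_demo = np.sqrt((diff_demo ** 2).sum(axis=-1))  # symmetric distance matrix
pos_demo, stress_demo = smacof(dissim_demo, n_components=2, n_init=4, random_state=0)
# pos_demo is a 2-D configuration whose pairwise distances approximate dissim_demo.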
def _smacof_single(similarities, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF algorithm.

    Parameters
    ----------
    similarities: symmetric ndarray, shape [n * n]
        similarities between the points

    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimensions in which to immerse the similarities,
        overwritten if an initial array is provided.

    init: {None or ndarray}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-3
        relative tolerance w.r.t stress to declare convergence

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number
        generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
        coordinates of the n_samples points in an n_components-space

    stress_: float
        The final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)
    """
    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    if similarities.shape[0] != similarities.shape[1]:
        raise ValueError("similarities must be a square array (shape=%d)" %
                         n_samples)

    if not np.allclose(similarities, similarities.T):
        raise ValueError("similarities must be symmetric")

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Update X using the Guttman transform
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = - ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1. / n_samples * np.dot(B, X)

        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis

    return X, stress
def main():
    algorithm = args.algorithm
    path = './auto_regulation_{}'.format(algorithm)
    random_state = check_random_state(1)

    if not os.path.exists(path):
        os.makedirs(path)

    t, y = load_data('./data/auto_regulation.pickle')

    n_samples = args.n_samples
    n_particles = args.n_particles
    burn_in = args.burn_in
    thinning = args.thinning

    state_init = np.array([8, 8, 8, 5])

    const = {
        'k': 10,
        'c5': 0.1,
        'c6': 0.9,
        'observation_std': 2.0,
        'times': np.concatenate(([0.0], t))
    }

    prior = Prior([
        # U(-7, 2) as in the paper. However, in SciPy, the distribution is
        # given as U(loc, loc + scale).
        stats.uniform(-7, 9),
        stats.uniform(-7, 9),
        stats.uniform(-7, 9),
        stats.uniform(-7, 9),
        stats.uniform(-7, 9),
        stats.uniform(-7, 9)
    ])

    proposal = Proposal([
        Distribution(stats.norm, scale=0.08),
        Distribution(stats.norm, scale=0.08),
        Distribution(stats.norm, scale=0.08),
        Distribution(stats.norm, scale=0.08),
        Distribution(stats.norm, scale=0.08),
        Distribution(stats.norm, scale=0.08)
    ])

    theta_init = stats.uniform.rvs(loc=-7, scale=9, size=6)
    random_state = check_random_state(1)

    if algorithm == 'abcmh':
        alpha = args.alpha
        hpr_p = args.hpr_p
        kernel = args.kernel

        mcmc = ABCAutoRegulation(n_samples=n_samples,
                                 n_particles=n_particles,
                                 alpha=alpha,
                                 hpr_p=hpr_p,
                                 state_init=state_init,
                                 const=const,
                                 kernel=kernel,
                                 prior=prior,
                                 proposal=proposal,
                                 theta_init=theta_init,
                                 random_state=random_state)
    else:
        mcmc = ParticleAutoRegulation(n_samples=n_samples,
                                      n_particles=n_particles,
                                      state_init=state_init,
                                      const=const,
                                      prior=prior,
                                      proposal=proposal,
                                      theta_init=theta_init,
                                      random_state=random_state)

    sampled_theta_path = os.path.join(path, 'sampled_theta.pickle')
    if os.path.exists(sampled_theta_path):
        with open(sampled_theta_path, 'rb') as f:
            theta = pickle.load(f)
    else:
        theta = mcmc.do_inference(y)
        with open(sampled_theta_path, 'wb') as f:
            pickle.dump(theta, f)

    theta_true = np.log(np.array([0.1, 0.7, 0.35, 0.2, 0.3, 0.1]))

    theta = np.exp(theta)
    theta = theta[burn_in::thinning]
    truth = np.exp(theta_true)

    pretty_names = [r'$c_1$', r'$c_2$', r'$c_3$', r'$c_4$', r'$c_7$', r'$c_8$']

    for i in range(theta.shape[1]):
        param_name = pretty_names[i]
        param_values = theta[:, i]

        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, dpi=300)
        plt.suptitle(param_name)

        ax1.set_title('Trace plot')
        ax1.plot(param_values, color='dimgrey')
        ax1.axhline(truth[i], color='crimson', lw=2)

        plot_acf(param_values, lags=100, ax=ax2, color='dimgrey')

        ax3.set_title('Histogram')
        ax3.hist(param_values, density=True, bins=30, color='dimgrey')
        ax3.axvline(truth[i], color='crimson', lw=2)

        plt.show()