def test_scalar(self):
    """Check scalar seeding for every listed basic generator.

    For the seeds 0 and 4294967295 the generator name reported by
    ``get_state()[0]`` must match the requested ``brng`` and the first
    ``randint(1000)`` draw must equal the recorded value.
    """
    recorded_first_draws = (
        (0, {
            'MT19937': 844,
            'SFMT19937': 857,
            'WH': 0,
            'MT2203': 890,
            'MCG31': 0,
            'R250': 229,
            'MRG32K3A': 0,
            'MCG59': 0,
        }),
        (4294967295, {
            'MT19937': 635,
            'SFMT19937': 25,
            'WH': 100,
            'MT2203': 527,
            'MCG31': 0,
            'R250': 229,
            'MRG32K3A': 961,
            'MCG59': 0,
        }),
    )
    for seed_value, expected_by_brng in recorded_first_draws:
        for brng_algo, first_draw in expected_by_brng.items():
            state = rnd.RandomState(seed_value, brng=brng_algo)
            assert_equal(state.get_state()[0], brng_algo)
            assert_equal(state.randint(1000), first_draw)
def test_array(self):
    """Check array-like seeding of the 'MT19937' generator.

    Each seed (a ``range``, a NumPy array and two one-element lists) must
    produce the recorded first ``randint(1000)`` draw.
    """
    seed_cases = (
        (range(10), 410),
        (np.arange(10), 410),
        ([0], 844),
        ([4294967295], 635),
    )
    for seed_value, first_draw in seed_cases:
        state = rnd.RandomState(seed_value, brng='MT19937')
        assert_equal(state.randint(1000), first_draw)
def split_vertex_labels(num_vertices, proportion_censored, rng=None):
    """
    Build a function that adds a per-sample 'split' entry to a data dictionary.

    A 0/1 indicator is drawn once per vertex from a Bernoulli distribution
    with success probability ``1 - proportion_censored``; the returned
    function looks the indicator up by vertex index.

    Parameters
    ----------
    num_vertices: The number of vertices in the graph.
    proportion_censored: The proportion of graph to censor
    rng: An instance of np.random.RandomState used to split the data. If None,
        a deterministic split will be chosen (corresponding to seeding with 42).

    Returns
    -------
    fn: A function mapping a dictionary with a 'vertex_index' entry to a copy
        of it extended with a 'split' entry gathered from the drawn indicators.
    """
    generator = random.RandomState(42) if rng is None else rng

    keep_probability = 1 - proportion_censored
    drawn = generator.binomial(1, keep_probability, size=num_vertices)
    vertex_split = drawn.astype(np.int32)

    def fn(data):
        sample_split = tf.gather(vertex_split, data['vertex_index'])
        augmented = {**data}
        augmented['split'] = sample_split
        return augmented

    return fn
def __init__(self):
    """Create a non-deterministic mkl_random generator and verify its kind.

    Raises
    ------
    RuntimeError
        If the generator reports a basic generator name other than
        'NON_DETERMINISTIC' in its state.
    """
    self.rng = mkl_random.RandomState(brng='nondeterm')

    reported_brng = self.rng.get_state()[0]
    if reported_brng == 'NON_DETERMINISTIC':
        return

    raise RuntimeError(
        'RNG returned by mkl_random is not non-deterministic RDRAND (most likely, substituted by default Mersenne Twister, '
        'since mkl_random installation on one of the nodes is of the version which does not yet support RDRAND)'
    )
def test_call_within_randomstate(self):
    """Check that a custom RandomState does not call into global state."""
    local_state = rnd.RandomState()
    expected = np.array([5, 7, 5, 4, 5, 5, 6, 9, 6, 1])
    for global_seed in range(3):
        # Reseed the module-level generator differently on every pass ...
        rnd.seed(global_seed)
        # ... while the local generator is always reset to the same state.
        local_state.seed(4321, brng='SFMT19937')
        # If the local state is not honored, the result will change
        drawn = local_state.choice(10, size=10, p=np.ones(10) / 10.)
        assert_array_equal(drawn, expected)
def test_multinomial(self):
    """Check multinomial draws against recorded counts.

    Every draw of 20 trials over six equally likely outcomes must sum to 20,
    and the full (3, 2, 6) result must equal the recorded reference.
    """
    n_trials = 20
    out_shape = (3, 2)
    state = rnd.RandomState(self.seed, brng=self.brng)
    actual = state.multinomial(n_trials, [1 / 6.] * 6, size=out_shape)

    # Counts along the last axis must add up to the number of trials.
    totals = actual.sum(axis=-1)
    np.testing.assert_array_equal(
        totals, np.full(out_shape, n_trials, dtype=actual.dtype))

    reference = [
        [[6, 2, 1, 3, 2, 6], [7, 5, 1, 2, 3, 2]],
        [[5, 1, 8, 3, 2, 1], [4, 6, 0, 4, 4, 2]],
        [[6, 3, 1, 4, 4, 2], [3, 2, 4, 2, 1, 8]],
    ]
    np.testing.assert_array_equal(actual, np.array(reference, actual.dtype))
def check_function(self, function, sz):
    """Compare threaded and serial runs of ``function`` for every seed.

    ``function`` is called as ``function(random_state, out_row)`` once per
    seed in ``self.seeds``: first with each call running in its own thread,
    then sequentially. The two sets of outputs are then compared.

    Parameters
    ----------
    function: Callable taking a RandomState and an output array.
    sz: Tuple appended to the number of seeds to form the output shape.
    """
    from threading import Thread

    full_shape = (len(self.seeds),) + sz
    threaded_out = np.empty(full_shape)
    serial_out = np.empty(full_shape)

    # threaded generation
    workers = [
        Thread(target=function, args=(rnd.RandomState(seed), row))
        for seed, row in zip(self.seeds, threaded_out)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # the same serial
    for seed, row in zip(self.seeds, serial_out):
        function(rnd.RandomState(seed), row)

    # these platforms change x87 fpu precision mode in threads
    is_win32_32bit = np.intp().dtype.itemsize == 4 and sys.platform == "win32"
    if is_win32_32bit:
        np.testing.assert_array_almost_equal(threaded_out, serial_out)
    else:
        np.testing.assert_array_equal(threaded_out, serial_out)
def main():
    """Time every sampler with every basic generator and print CSV rows.

    For each (generator, distribution) pair the sampler is run
    ``INNER_REPS`` times inside a timed section, the section is repeated
    ``OUTER_REPS`` times, and the minimum time is printed as
    ``IntelPython,<generator>,<distribution>,<seconds>``.

    When the module-level ``mkl`` flag is truthy, generators are selected by
    name through ``rnd.RandomState(..., brng=name)``; otherwise NumPy bit
    generator classes are wrapped with ``rnd.Generator``.
    """
    import itertools

    if mkl:
        brngs = [
            'WH', 'PHILOX4X32X10', 'MT2203', 'MCG59', 'MCG31', 'MT19937',
            'MRG32K3A', 'SFMT19937', 'R250'
        ]
    else:
        brngs = [np.random.MT19937, np.random.Philox]

    samplers = {
        'uniform': sample_uniform,
        'normal': sample_normal,
        'gamma': sample_gamma,
        'beta': sample_beta,
        'randint': sample_randint,
        'poisson': sample_poisson,
        'hypergeom': sample_hypergeom,
    }
    # Per-distribution scale factor for the number of rows sampled per call.
    multipliers = {
        'uniform': 10,
        'normal': 2,
        'gamma': 1,
        'beta': 1,
        'randint': 10,
        'poisson': 5,
        'hypergeom': 1,
    }

    for brng_name, sfn in itertools.product(brngs, samplers):
        func = samplers[sfn]
        m = multipliers[sfn]
        times_list = []
        for _ in range(OUTER_REPS):
            # A fresh, identically seeded generator for every timed repetition.
            if mkl:
                rs = rnd.RandomState(123, brng=brng_name)
            else:
                rs = rnd.Generator(brng_name(seed=123))
            t0 = timeit.default_timer()
            for _ in range(INNER_REPS):
                func(rs, (m * 100, 1000))
            t1 = timeit.default_timer()
            times_list.append(t1 - t0)

        # On the non-mkl path brng_name is a class; report its bare name
        # rather than its repr so the CSV field stays a plain identifier.
        brng_label = getattr(brng_name, '__name__', brng_name)
        print("IntelPython,{brng_name},{dist_name},{time:.5f}".format(
            brng_name=brng_label, dist_name=sfn, time=min(times_list)))
def make_split_vertex_labels(num_vertices, proportion_censored, rng=None):
    """
    Build a function that tags each sample with train/dev/test membership.

    Two independent Bernoulli(``proportion_censored``) indicators are drawn
    per vertex, one for the test set and one for the dev set. A vertex drawn
    into both is kept in the test set only, so the three masks are mutually
    exclusive and every vertex belongs to exactly one of them. As a
    consequence the expected dev proportion is
    ``proportion_censored * (1 - proportion_censored)``.

    Parameters
    ----------
    num_vertices: The number of vertices in the graph.
    proportion_censored: The per-vertex probability of being drawn into the
        test set, and (independently) into the dev set.
    rng: An instance of np.random.RandomState used to split the data. If None,
        a deterministic split will be chosen (corresponding to seeding with 42).

    Returns
    -------
    fn: A function mapping a dictionary with a 'vertex_index' entry to a copy
        of it extended with float32 'in_test', 'in_dev' and 'in_train' entries
        gathered from the per-vertex masks.
    """
    if rng is None:
        rng = random.RandomState(42)

    in_test = rng.binomial(1, proportion_censored, size=num_vertices).astype(np.float32)
    in_dev = rng.binomial(1, proportion_censored, size=num_vertices).astype(np.float32)
    # Resolve overlap: a vertex drawn into both sets stays in test only.
    # in_test and in_train are unaffected by this step.
    in_dev = in_dev * (1 - in_test)
    in_train = ((in_test + in_dev) == 0).astype(np.float32)

    def fn(data):
        vertex_id = data['vertex_index']
        sample_in_test = tf.gather(in_test, vertex_id)
        sample_in_dev = tf.gather(in_dev, vertex_id)
        sample_in_train = tf.gather(in_train, vertex_id)

        return {
            **data,
            'in_test': sample_in_test,
            'in_dev': sample_in_dev,
            'in_train': sample_in_train,
        }

    return fn
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets, using daal4py where possible.

    Parameters
    ----------
    *arrays: Indexable inputs that are split along their first axis with the
        same train/test indices. At least one is required.
    **options: Keyword options. Recognised keys are 'test_size',
        'train_size', 'random_state', 'stratify', 'shuffle' (default True)
        and 'rng' (default 'OPTIMIZED_MT19937', must be one of the names in
        ``available_rngs``).

    Returns
    -------
    res: A list holding, for each input in order, its train part followed by
        its test part.

    Raises
    ------
    ValueError
        If no arrays are given, if ``rng`` is not a known generator name, if
        ``stratify`` is combined with ``shuffle=False``, or if the data
        returned by ``d4p.get_data`` is neither an ndarray nor a list.
    TypeError
        If unrecognised keyword options are passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" % str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            if mkl_random_is_imported and rng not in [
                'default', 'OPTIMIZED_MT19937'
            ] and (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                # Shuffle ALL sample indices, then take the requested counts;
                # permuting only n_train + n_test indices would never select
                # the trailing samples when the sizes do not sum to n_samples.
                indexes = random_state.permutation(n_samples)
                test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
            elif rng == 'OPTIMIZED_MT19937' and daal_check_version(((2020, 'P', 3), (2021, 'B', 9))) \
                    and (isinstance(random_state, int) or random_state is None) \
                    and platform.system() != 'Windows':
                # The buffer holds one index per sample, so both its length
                # and its integer width are derived from n_samples.
                indexes = np.empty(shape=(n_samples, ),
                                   dtype=np.int64 if n_samples > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                # Element 1 of the legacy state tuple is passed to daal4py.
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
            else:
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr, pd.core.frame.DataFrame) and not isinstance(
                        arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            for dtype in dtypes:
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                # 1-D input is handled as a single column and flattened back.
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols), order='A')
            if isinstance(arr_copy, np.ndarray):
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(shape=(n_train, n_cols),
                                     dtype=arr_copy.dtype,
                                     order=order)
                test_arr = np.empty(shape=(n_test, n_cols),
                                    dtype=arr_copy.dtype,
                                    order=order)
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # One output buffer per column array in the list.
                train_arr = [
                    np.empty(shape=(n_train, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    # Carry the input's column labels over to both parts.
                    train_arr, test_arr = pd.DataFrame(train_arr, columns=arr.columns), \
                        pd.DataFrame(test_arr, columns=arr.columns)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = train_arr.reshape(
                        n_train), test_arr.reshape(n_test)
                    # Carry the input's name over to both parts.
                    train_arr, test_arr = pd.Series(train_arr, name=arr.name), \
                        pd.Series(test_arr, name=arr.name)

            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
def setUp(self):
    """Create a seeded RandomState and record its initial state."""
    seed_value = 1234567890
    generator = rnd.RandomState(seed_value)

    self.seed = seed_value
    self.prng = generator
    self.state = generator.get_state()
def test_non_deterministic(self):
    """Smoke-test sampling with the 'nondeterministic' generator.

    No values are asserted; the test only requires that construction and
    the two sampling calls complete without raising.
    """
    generator = rnd.RandomState(brng='nondeterministic')
    generator.rand(10)
    generator.randint(0, 10)
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets, using daal4py where possible.

    Parameters
    ----------
    *arrays: Indexable inputs that are split along their first axis with the
        same train/test indices. At least one is required.
    **options: Keyword options. Recognised keys are 'test_size',
        'train_size', 'random_state', 'stratify', 'shuffle' (default True)
        and 'rng' (default 'OPTIMIZED_MT19937', must be one of the names in
        ``available_rngs``).

    Returns
    -------
    res: A list holding, for each input in order, its train part followed by
        its test part.

    Raises
    ------
    ValueError
        If no arrays are given, if ``rng`` is not a known generator name, if
        ``stratify`` is combined with ``shuffle=False``, or if the data
        returned by ``d4p.get_data`` is neither an ndarray nor a list.
    TypeError
        If unrecognised keyword options are passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" % str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for shuffle=False"
            )

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            if mkl_random_is_imported and \
                    rng not in ['default', 'OPTIMIZED_MT19937'] and \
                    (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                # Shuffle all sample indices, then take the requested counts.
                indexes = random_state.permutation(n_samples)
                test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
            elif rng == 'OPTIMIZED_MT19937' and \
                    (isinstance(random_state, int) or random_state is None) and \
                    platform.system() != 'Windows':
                # The buffer stores indices in [0, n_samples), so its integer
                # width must be chosen from n_samples, not n_train + n_test.
                indexes = np.empty(shape=(n_samples, ),
                                   dtype=np.int64 if n_samples > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                # Element 1 of the legacy state tuple is passed to daal4py.
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
            else:
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        _patching_status = PatchingConditionsChain(
            "sklearn.model_selection.train_test_split")

        # input format check
        _patching_status.and_conditions([
            (isinstance(arr, np.ndarray), "The input is not a np.ndarray object.")
        ])
        if pandas_is_imported:
            _patching_status.or_conditions(
                [(isinstance(arr, pd.core.frame.DataFrame),
                  "The input is not a pd.DataFrame object."),
                 (isinstance(arr, pd.core.series.Series),
                  "The input is not a pd.Series object.")],
                conditions_merging=any)

        # dimensions check
        _dal_ready = _patching_status.and_conditions([
            (hasattr(arr, 'ndim'), "The input does not have 'ndim' attribute.")
        ])
        if hasattr(arr, 'ndim'):
            _patching_status.and_conditions([
                (arr.ndim <= 2, "The input has more than 2 dimensions.")
            ])

        # data types check
        dtypes = get_dtypes(arr)
        _dal_ready = _patching_status.and_conditions([
            (dtypes is not None, "Unable to parse input data types.")
        ])
        if dtypes is not None:
            # Find the first dtype that is neither integer nor floating point.
            incorrect_dtype = None
            for dtype in dtypes:
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    incorrect_dtype = str(dtype)
                    break
            _dal_ready = _patching_status.and_conditions([
                (incorrect_dtype is None,
                 f"Input has incorrect data type '{incorrect_dtype}'. "
                 "Only integer and floating point types are supported.")
            ])

        _patching_status.write_log()
        if not _dal_ready:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                # 1-D input is handled as a single column and flattened back.
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape(
                    (arr_copy.shape[0], n_cols),
                    order='A',
                )
            if isinstance(arr_copy, np.ndarray):
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(
                    shape=(n_train, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                test_arr = np.empty(
                    shape=(n_test, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # One output buffer per column array in the list.
                train_arr = [
                    np.empty(
                        shape=(n_train, ),
                        dtype=el.dtype,
                        order='C' if el.flags['C_CONTIGUOUS'] else 'F',
                    ) for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(train_arr, columns=arr.columns), \
                        pd.DataFrame(test_arr, columns=arr.columns)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = \
                        train_arr.reshape(n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr, name=arr.name), \
                        pd.Series(test_arr, name=arr.name)

            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res