Exemple #1
0
 def test_scalar(self):
     """Check the first ``randint(1000)`` draw of every BRNG for scalar seeds.

     Two seeds are exercised, 0 and 4294967295. For each generator name the
     tag reported by ``get_state()[0]`` must equal the requested name, and
     the first draw must equal the value recorded in the table below.
     """
     cases = (
         (0, {
             'MT19937': 844,
             'SFMT19937': 857,
             'WH': 0,
             'MT2203': 890,
             'MCG31': 0,
             'R250': 229,
             'MRG32K3A': 0,
             'MCG59': 0,
         }),
         (4294967295, {
             'MT19937': 635,
             'SFMT19937': 25,
             'WH': 100,
             'MT2203': 527,
             'MCG31': 0,
             'R250': 229,
             'MRG32K3A': 961,
             'MCG59': 0,
         }),
     )
     for seed, first_draws in cases:
         for brng_name, first_draw in first_draws.items():
             state = rnd.RandomState(seed, brng=brng_name)
             assert_equal(state.get_state()[0], brng_name)
             assert_equal(state.randint(1000), first_draw)
Exemple #2
0
 def test_array(self):
     """Check the first ``randint(1000)`` draw of MT19937 for array-like seeds."""
     cases = (
         (range(10), 410),
         (np.arange(10), 410),
         ([0], 844),
         ([4294967295], 635),
     )
     for seed, first_draw in cases:
         state = rnd.RandomState(seed, brng='MT19937')
         assert_equal(state.randint(1000), first_draw)
Exemple #3
0
def split_vertex_labels(num_vertices, proportion_censored, rng=None):
    """Build a map function that attaches a per-sample ``'split'`` flag.

    A 0/1 flag is drawn once per vertex (1 with probability
    ``1 - proportion_censored``) and stored as an int32 array; the returned
    function looks the flag up by vertex index.

    Parameters
    ----------
    num_vertices: The number of vertices in the graph.
    proportion_censored: The proportion of the graph to censor.
    rng: A RandomState-like object providing ``binomial``. If None, a
        ``random.RandomState(42)`` is created, giving a deterministic split.

    Returns
    -------
    fn: A function taking a mapping with a ``'vertex_index'`` entry and
        returning a new dict with the same entries plus ``'split'``.
    """
    generator = random.RandomState(42) if rng is None else rng
    keep_probability = 1 - proportion_censored
    split = generator.binomial(1, keep_probability, size=num_vertices)
    split = split.astype(np.int32)

    def fn(data):
        # Gather the precomputed flag of each vertex referenced by the sample.
        flags = tf.gather(split, data['vertex_index'])
        return {**data, 'split': flags}

    return fn
 def __init__(self):
     """Create a 'nondeterm' mkl_random stream and verify what was obtained.

     Raises
     ------
     RuntimeError: If the BRNG tag reported by ``get_state()[0]`` is not
         'NON_DETERMINISTIC'.
     """
     self.rng = mkl_random.RandomState(brng='nondeterm')
     brng_tag = self.rng.get_state()[0]
     if brng_tag == 'NON_DETERMINISTIC':
         return
     raise RuntimeError(
         'RNG returned by mkl_random is not non-deterministic RDRAND (most likely, substituted by default Mersenne Twister,  '
         'since mkl_random installation on one of the nodes is of the version which does not yet support RDRAND)'
     )
Exemple #5
0
 def test_call_within_randomstate(self):
     """A private RandomState must not be affected by reseeding the global one."""
     local_state = rnd.RandomState()
     expected = np.array([5, 7, 5, 4, 5, 5, 6, 9, 6, 1])
     for global_seed in range(3):
         rnd.seed(global_seed)
         local_state.seed(4321, brng='SFMT19937')
         # If the private state were not honored, the draw would vary
         # with the global seed.
         drawn = local_state.choice(10, size=10, p=np.ones(10) / 10.)
         assert_array_equal(drawn, expected)
Exemple #6
0
 def test_multinomial(self):
     """Multinomial draws sum to the trial count and match the recorded values."""
     stream = rnd.RandomState(self.seed, brng=self.brng)
     counts = stream.multinomial(20, [1 / 6.] * 6, size=(3, 2))

     # Every experiment distributes exactly 20 trials over the six outcomes.
     totals = np.full((3, 2), 20, dtype=counts.dtype)
     np.testing.assert_array_equal(counts.sum(axis=-1), totals)

     reference = np.array(
         [[[6, 2, 1, 3, 2, 6], [7, 5, 1, 2, 3, 2]],
          [[5, 1, 8, 3, 2, 1], [4, 6, 0, 4, 4, 2]],
          [[6, 3, 1, 4, 4, 2], [3, 2, 4, 2, 1, 8]]],
         counts.dtype)
     np.testing.assert_array_equal(counts, reference)
    def check_function(self, function, sz):
        """Run ``function`` per seed in threads and serially; outputs must agree.

        Parameters
        ----------
        function: Callable invoked as ``function(random_state, out_row)``; it
            is expected to fill ``out_row`` in place.
        sz: Tuple giving the shape of one output row.
        """
        from threading import Thread

        n_seeds = len(self.seeds)
        threaded_out = np.empty((n_seeds,) + sz)
        serial_out = np.empty((n_seeds,) + sz)

        # threaded generation: one worker per seed, each writing its own row
        workers = []
        for seed, row in zip(self.seeds, threaded_out):
            stream = rnd.RandomState(seed)
            workers.append(Thread(target=function, args=(stream, row)))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # the same serial
        for seed, row in zip(self.seeds, serial_out):
            function(rnd.RandomState(seed), row)

        # these platforms change x87 fpu precision mode in threads
        is_32bit_windows = (np.intp().dtype.itemsize == 4
                            and sys.platform == "win32")
        if is_32bit_windows:
            compare = np.testing.assert_array_almost_equal
        else:
            compare = np.testing.assert_array_equal
        compare(threaded_out, serial_out)
Exemple #8
0
def main():
    """Time every (BRNG, distribution) pair and print one CSV line per pair.

    With ``mkl`` truthy the generators are named by string and built through
    ``rnd.RandomState``; otherwise NumPy bit-generator classes are wrapped in
    ``rnd.Generator``. For each pair the sampler is run ``INNER_REPS`` times
    inside each of ``OUTER_REPS`` timed repetitions, and the minimum elapsed
    time is reported as ``IntelPython,<brng>,<distribution>,<seconds>``.
    """
    import itertools
    if mkl:
        brngs = [
            'WH', 'PHILOX4X32X10', 'MT2203', 'MCG59', 'MCG31', 'MT19937',
            'MRG32K3A', 'SFMT19937', 'R250'
        ]
    else:
        brngs = [np.random.MT19937, np.random.Philox]
    samplers = {
        'uniform': sample_uniform,
        'normal': sample_normal,
        'gamma': sample_gamma,
        'beta': sample_beta,
        'randint': sample_randint,
        'poisson': sample_poisson,
        'hypergeom': sample_hypergeom
    }
    # Per-distribution scale factor for the number of rows sampled per call.
    multipliers = {
        'uniform': 10,
        'normal': 2,
        'gamma': 1,
        'beta': 1,
        'randint': 10,
        'poisson': 5,
        'hypergeom': 1
    }
    for brng, dist_name in itertools.product(brngs, samplers):
        func = samplers[dist_name]
        shape = (multipliers[dist_name] * 100, 1000)
        # In the non-MKL branch ``brng`` is a class; report its name rather
        # than its repr ("<class '...'>") so the CSV field stays clean.
        brng_label = brng if isinstance(brng, str) else brng.__name__
        times_list = []
        for _ in range(OUTER_REPS):
            # A fresh, identically seeded generator for every repetition.
            if mkl:
                rs = rnd.RandomState(123, brng=brng)
            else:
                rs = rnd.Generator(brng(seed=123))
            t0 = timeit.default_timer()
            for _ in range(INNER_REPS):
                func(rs, shape)
            t1 = timeit.default_timer()
            times_list.append(t1 - t0)
        print("IntelPython,{brng_name},{dist_name},{time:.5f}".format(
            brng_name=brng_label, dist_name=dist_name, time=min(times_list)))
def make_split_vertex_labels(num_vertices, proportion_censored, rng=None):
    """Build a map function that tags each sample as train, dev or test.

    Three float32 0/1 indicator arrays are drawn once per vertex. Test and
    dev membership are each drawn with probability ``proportion_censored``;
    a vertex drawn for both is kept in test only, so the three sets are
    disjoint and every vertex belongs to exactly one of them.

    Parameters
    ----------
    num_vertices: The number of vertices in the graph.
    proportion_censored: The proportion of graph to censor
    rng: An instance of np.random.RandomState used to split the data. If None,
        a deterministic split will be chosen (corresponding to seeding with 42).

    Returns
    -------
    fn: A function taking a mapping with a ``'vertex_index'`` entry and
        returning a new dict with the same entries plus ``'in_test'``,
        ``'in_dev'`` and ``'in_train'``.
    """
    if rng is None:
        rng = random.RandomState(42)

    in_test = rng.binomial(1, proportion_censored,
                           size=num_vertices).astype(np.float32)
    in_dev = rng.binomial(1, proportion_censored,
                          size=num_vertices).astype(np.float32)
    # The two draws are independent, so a vertex can be picked by both.
    # Test takes precedence: remove such vertices from dev. This leaves
    # in_test and in_train exactly as they were before the overlap fix.
    in_dev = in_dev * (1.0 - in_test)
    in_train = ((in_test + in_dev) == 0).astype(np.float32)

    def fn(data):
        vertex_id = data['vertex_index']
        sample_in_test = tf.gather(in_test, vertex_id)
        sample_in_dev = tf.gather(in_dev, vertex_id)
        sample_in_train = tf.gather(in_train, vertex_id)

        return {
            **data, 'in_test': sample_in_test,
            'in_dev': sample_in_dev,
            'in_train': sample_in_train
        }

    return fn
Exemple #10
0
def _daal_train_test_split(*arrays, **options):
    """Split indexables into train and test subsets, using daal4py when possible.

    Parameters
    ----------
    *arrays: One or more indexables; the sample count is taken from the
        first one.
    **options: Keyword options. Recognised keys are ``test_size``,
        ``train_size``, ``random_state``, ``stratify``, ``shuffle``
        (default True) and ``rng`` (default 'OPTIMIZED_MT19937'; must be one
        of the names in ``available_rngs`` below).

    Returns
    -------
    res: A list ``[train_0, test_0, train_1, test_1, ...]`` holding one
        train/test pair per input array.

    Raises
    ------
    ValueError: If no array is given, ``rng`` is not a known generator name,
        ``stratify`` is combined with ``shuffle=False``, or an input cannot
        be converted to the format the daal4py kernel needs.
    TypeError: If an unrecognised keyword option is passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    # Anything left in ``options`` was not popped above and is unsupported.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    elif stratify is not None:
        cv = StratifiedShuffleSplit(test_size=n_test,
                                    train_size=n_train,
                                    random_state=random_state)
        train, test = next(cv.split(X=arrays[0], y=stratify))
    else:
        seed_is_int_or_none = (isinstance(random_state, int)
                               or random_state is None)
        if mkl_random_is_imported and rng not in [
                'default', 'OPTIMIZED_MT19937'
        ] and seed_is_int_or_none:
            random_state = mkl_random.RandomState(random_state, rng)
            # Permute ALL samples and then slice. Permuting only
            # n_train + n_test indices would never select rows beyond that
            # prefix when the requested sizes do not cover the whole input.
            indexes = random_state.permutation(n_samples)
            test = indexes[:n_test]
            train = indexes[n_test:n_test + n_train]
        elif rng == 'OPTIMIZED_MT19937' \
                and daal_check_version(((2020, 'P', 3), (2021, 'B', 9))) \
                and seed_is_int_or_none \
                and platform.system() != 'Windows':
            # Same reasoning as above: shuffle the full index range.
            # The index dtype must be able to hold the value n_samples - 1.
            indexes = np.empty(
                shape=(n_samples, ),
                dtype=np.int64 if n_samples > 2**31 - 1 else np.int32)
            random_state = np.random.RandomState(random_state)
            # Element 1 of the legacy state tuple is handed to the kernel.
            random_state = random_state.get_state()[1]
            d4p.daal_generate_shuffled_indices([indexes], [random_state])
            test = indexes[:n_test]
            train = indexes[n_test:n_test + n_train]
        else:
            cv = ShuffleSplit(test_size=n_test,
                              train_size=n_train,
                              random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        fallback = False

        # input format check: ndarray, or pandas DataFrame/Series when
        # pandas is available
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr, (pd.core.frame.DataFrame,
                                        pd.core.series.Series)):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check: only dtypes whose name contains 'float' or 'int'
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        elif any('float' not in str(dtype) and 'int' not in str(dtype)
                 for dtype in dtypes):
            fallback = True

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
            continue

        if len(arr.shape) == 2:
            n_cols = arr.shape[1]
            reshape_later = False
        else:
            # 1-D input is processed as a single column and flattened again
            # afterwards.
            n_cols = 1
            reshape_later = True

        arr_copy = d4p.get_data(arr)
        if not isinstance(arr_copy, list):
            arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols),
                                        order='A')
        if isinstance(arr_copy, np.ndarray):
            order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
            train_arr = np.empty(shape=(n_train, n_cols),
                                 dtype=arr_copy.dtype,
                                 order=order)
            test_arr = np.empty(shape=(n_test, n_cols),
                                dtype=arr_copy.dtype,
                                order=order)
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            if reshape_later:
                train_arr, test_arr = train_arr.reshape(
                    (n_train, )), test_arr.reshape((n_test, ))
        elif isinstance(arr_copy, list):
            # One array per column; ``arr.columns`` is used below, so this
            # path presumably corresponds to DataFrame input.
            train_arr = [
                np.empty(shape=(n_train, ),
                         dtype=el.dtype,
                         order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                for el in arr_copy
            ]
            test_arr = [
                np.empty(shape=(n_test, ),
                         dtype=el.dtype,
                         order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                for el in arr_copy
            ]
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            train_arr = {
                col: train_arr[i]
                for i, col in enumerate(arr.columns)
            }
            test_arr = {
                col: test_arr[i]
                for i, col in enumerate(arr.columns)
            }
        else:
            raise ValueError('Array can\'t be converted to needed format')

        if pandas_is_imported:
            if isinstance(arr, pd.core.frame.DataFrame):
                # Pass the original labels explicitly: on the ndarray path
                # the result would otherwise get default integer columns.
                train_arr = pd.DataFrame(train_arr, columns=arr.columns)
                test_arr = pd.DataFrame(test_arr, columns=arr.columns)
            if isinstance(arr, pd.core.series.Series):
                train_arr, test_arr = train_arr.reshape(
                    n_train), test_arr.reshape(n_test)
                # Keep the Series name on both halves.
                train_arr = pd.Series(train_arr, name=arr.name)
                test_arr = pd.Series(test_arr, name=arr.name)

        # Label the rows of pandas outputs with the selected positions.
        if hasattr(arr, 'index'):
            train_arr.index = train
            test_arr.index = test

        res.append(train_arr)
        res.append(test_arr)

    return res
Exemple #11
0
 def setUp(self):
     """Create a seeded RandomState and remember its initial state."""
     seed = 1234567890
     self.seed = seed
     self.prng = rnd.RandomState(seed)
     # Snapshot taken before any numbers are drawn from the stream.
     self.state = self.prng.get_state()
Exemple #12
0
 def test_non_deterministic(self):
     """Smoke test: a 'nondeterministic' BRNG can be built and sampled."""
     stream = rnd.RandomState(brng='nondeterministic')
     # Outputs are not checked; the calls only have to succeed.
     stream.rand(10)
     stream.randint(0, 10)
Exemple #13
0
def _daal_train_test_split(*arrays, **options):
    """Split indexables into train and test subsets, using daal4py when possible.

    Parameters
    ----------
    *arrays: One or more indexables; the sample count is taken from the
        first one.
    **options: Keyword options. Recognised keys are ``test_size``,
        ``train_size``, ``random_state``, ``stratify``, ``shuffle``
        (default True) and ``rng`` (default 'OPTIMIZED_MT19937'; must be one
        of the names in ``available_rngs`` below).

    Returns
    -------
    res: A list ``[train_0, test_0, train_1, test_1, ...]`` holding one
        train/test pair per input array.

    Raises
    ------
    ValueError: If no array is given, ``rng`` is not a known generator name,
        ``stratify`` is combined with ``shuffle=False``, or an input cannot
        be converted to the format the daal4py kernel needs.
    TypeError: If an unrecognised keyword option is passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    # Anything left in ``options`` was not popped above and is unsupported.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for shuffle=False"
            )

        # Without shuffling: the first n_train rows, then the next n_test rows.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            # mkl_random path: an explicitly chosen MKL generator with an
            # int or None seed. All n_samples indices are permuted and the
            # test/train index sets are sliced from the front.
            if mkl_random_is_imported and \
               rng not in ['default', 'OPTIMIZED_MT19937'] and \
               (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_samples)
                test, train = indexes[:n_test], indexes[n_test:(n_test +
                                                                n_train)]
            # daal4py path: taken for the default generator with an int or
            # None seed on any platform other than Windows.
            elif rng == 'OPTIMIZED_MT19937' and \
                (isinstance(random_state, int) or random_state is None) and \
                    platform.system() != 'Windows':
                # NOTE(review): the index dtype is chosen from
                # n_train + n_test although the buffer has n_samples
                # entries - confirm this is intended.
                indexes = np.empty(shape=(n_samples, ),
                                   dtype=np.int64 if
                                   n_train + n_test > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                # Element 1 of the legacy state tuple is handed to the kernel.
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:(n_test +
                                                                n_train)]
            else:
                # Every other combination is delegated to ShuffleSplit.
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        # A fresh condition chain per array records why the daal4py kernel
        # is or is not used for it.
        # NOTE(review): the code below relies on and_conditions() returning
        # the chain's cumulative status (not just the status of the conditions
        # passed in that call) - confirm against PatchingConditionsChain.
        _patching_status = PatchingConditionsChain(
            "sklearn.model_selection.train_test_split")

        # input format check
        _patching_status.and_conditions([
            (isinstance(arr,
                        np.ndarray), "The input is not a np.ndarray object.")
        ])
        if pandas_is_imported:
            _patching_status.or_conditions(
                [(isinstance(arr, pd.core.frame.DataFrame),
                  "The input is not a pd.DataFrame object."),
                 (isinstance(arr, pd.core.series.Series),
                  "The input is not a pd.Series object.")],
                conditions_merging=any)

        # dimensions check
        _dal_ready = _patching_status.and_conditions([
            (hasattr(arr, 'ndim'), "The input does not have 'ndim' attribute.")
        ])
        if hasattr(arr, 'ndim'):
            # The return value is not captured here; _dal_ready is
            # reassigned by the data type checks that follow.
            _patching_status.and_conditions([
                (arr.ndim <= 2, "The input has more than 2 dimensions.")
            ])

        # data types check
        dtypes = get_dtypes(arr)
        _dal_ready = _patching_status.and_conditions([
            (dtypes is not None, "Unable to parse input data types.")
        ])
        if dtypes is not None:
            # Remember the first dtype whose name contains neither 'float'
            # nor 'int', for use in the log message.
            incorrect_dtype = None
            for i, dtype in enumerate(dtypes):
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    incorrect_dtype = str(dtype)
                    break
            _dal_ready = _patching_status.and_conditions([
                (incorrect_dtype is None,
                 f"Input has incorrect data type '{incorrect_dtype}'. "
                 "Only integer and floating point types are supported.")
            ])

        _patching_status.write_log()
        if not _dal_ready:
            # Fallback: plain index-based selection.
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                # 1-D input is processed as a single column and flattened
                # again afterwards.
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape(
                    (arr_copy.shape[0], n_cols),
                    order='A',
                )
            if isinstance(arr_copy, np.ndarray):
                # Output buffers use the same memory order as the input.
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(
                    shape=(n_train, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                test_arr = np.empty(
                    shape=(n_test, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # One array per column; ``arr.columns`` is used below, so
                # this path presumably corresponds to DataFrame input.
                train_arr = [
                    np.empty(
                        shape=(n_train, ),
                        dtype=el.dtype,
                        order='C' if el.flags['C_CONTIGUOUS'] else 'F',
                    ) for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                # Re-key the per-column outputs by the original column labels.
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            # Wrap the results back into the input's pandas type, keeping
            # the column labels (DataFrame) or the name (Series).
            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(train_arr, columns=arr.columns), \
                        pd.DataFrame(test_arr, columns=arr.columns)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = \
                        train_arr.reshape(n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr, name=arr.name), \
                        pd.Series(test_arr, name=arr.name)

            # Label the rows of pandas outputs with the selected positions.
            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res