def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Find the nearest neighbors of each query point.

    The search goes through the ``d4p``-backed training/prediction
    algorithms when all of the following hold: the library version check
    passes, ``self._fit_X`` is an ``np.ndarray``, the fit method is one of
    'brute', 'kd_tree' or 'auto', the effective metric is 'euclidean' (or
    'minkowski' with ``p == 2``), ``getFPType`` accepts the query data and
    the query is not sparse. In every other case the call is delegated to
    the parent class implementation.

    Parameters
    ----------
    X : array-like or None, default=None
        Query points. When None, the fitted data ``self._fit_X`` is queried
        and each sample is dropped from its own neighbor list.
    n_neighbors : int or None, default=None
        Number of neighbors per query point; ``self.n_neighbors`` is used
        when None.
    return_distance : bool, default=True
        Whether distances are returned together with the indices.

    Returns
    -------
    neigh_dist : ndarray
        Distances to the neighbors; only returned if ``return_distance``.
    neigh_ind : ndarray
        Indices of the neighbors in the fitted data.

    Raises
    ------
    ValueError
        If the number of features of ``X`` differs from the fitted one, if
        ``n_neighbors`` is not positive, or if more neighbors are requested
        than there are fitted samples.
    TypeError
        If ``n_neighbors`` is not an integer.
    """
    n_features = getattr(self, 'n_features_in_', None)
    shape = getattr(X, 'shape', None)
    if n_features and shape and len(shape) > 1 and shape[1] != n_features:
        raise ValueError(
            'Input data shape {} is inconsistent with the trained model'.
            format(X.shape))

    check_is_fitted(self)

    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    elif n_neighbors <= 0:
        raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
    elif not isinstance(n_neighbors, numbers.Integral):
        raise TypeError("n_neighbors does not take %s value, "
                        "enter integer value" % type(n_neighbors))

    query_is_train = X is None
    if query_is_train:
        X = self._fit_X
        # Include an extra neighbor to account for the sample itself being
        # returned, which is removed later
        n_neighbors += 1
    elif self.effective_metric_ == 'precomputed':
        X = _check_precomputed(X)
    else:
        X = check_array(X, accept_sparse='csr')

    n_samples_fit = self.n_samples_fit_
    if n_neighbors > n_samples_fit:
        raise ValueError("Expected n_neighbors <= n_samples, "
                         " but n_samples = %d, n_neighbors = %d" %
                         (n_samples_fit, n_neighbors))

    try:
        fptype = getFPType(X)
    except ValueError:
        # Unsupported data type: forces the fallback branch below.
        fptype = None

    fit_X_correct_type = isinstance(self._fit_X, np.ndarray)
    use_daal = (
        daal_check_version((2020, 3)) and fit_X_correct_type
        and self._fit_method in ['brute', 'kd_tree', 'auto']
        and (self.effective_metric_ == 'minkowski' and self.p == 2
             or self.effective_metric_ == 'euclidean')
        and fptype is not None and not sp.issparse(X)
    )

    if not use_daal:
        logging.info("sklearn.neighbors.KNeighborsMixin.kneighbors: " +
                     method_uses_sklearn)
        if query_is_train:
            # Forward the caller's original request (no query data, count
            # without the extra neighbor). Passing the substituted
            # ``self._fit_X`` would make the parent treat it as an external
            # query and return each sample as its own neighbor.
            return super(KNeighborsMixin, self).kneighbors(
                None, n_neighbors - 1, return_distance)
        return super(KNeighborsMixin, self).kneighbors(
            X, n_neighbors, return_distance)

    logging.info("sklearn.neighbors.KNeighborsMixin.kneighbors: " +
                 method_uses_daal)

    params = {
        'method': 'defaultDense',
        'k': n_neighbors,
        'resultsToCompute': 'computeIndicesOfNeighbors',
        'resultsToEvaluate': 'none'
    }
    if return_distance:
        params['resultsToCompute'] += '|computeDistances'

    method = parse_auto_method(self, self._fit_method,
                               self.n_samples_fit_, n_features)

    fit_X = d4p.get_data(self._fit_X)
    train_alg = training_algorithm(method, fptype, params)
    training_result = train_alg.compute(fit_X)

    X = d4p.get_data(X)
    predict_alg = prediction_algorithm(method, fptype, params)
    prediction_result = predict_alg.compute(X, training_result.model)

    neigh_ind = prediction_result.indices.astype(int)
    neigh_dist = None
    if return_distance:
        neigh_dist = prediction_result.distances.astype(fptype)

    if not query_is_train:
        if return_distance:
            return neigh_dist, neigh_ind
        return neigh_ind

    # If the query data is the same as the indexed data, we would like
    # to ignore the first nearest neighbor of every sample, i.e
    # the sample itself.
    n_queries, _ = X.shape
    sample_range = np.arange(n_queries)[:, None]
    sample_mask = neigh_ind != sample_range

    # Corner case: When the number of duplicates are more
    # than the number of neighbors, the first NN will not
    # be the sample, but a duplicate.
    # In that case mask the first duplicate.
    dup_gr_nbrs = np.all(sample_mask, axis=1)
    sample_mask[:, 0][dup_gr_nbrs] = False

    neigh_ind = np.reshape(neigh_ind[sample_mask],
                           (n_queries, n_neighbors - 1))

    if return_distance:
        neigh_dist = np.reshape(neigh_dist[sample_mask],
                                (n_queries, n_neighbors - 1))
        return neigh_dist, neigh_ind
    return neigh_ind
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets.

    Index selection follows the ``shuffle``/``stratify``/``rng`` options.
    Each array is then split either through ``d4p.daal_train_test_split``
    (NumPy arrays and pandas DataFrame/Series with at most two dimensions
    and only integer/floating point dtypes) or through ``safe_indexing``
    for everything else.

    Parameters
    ----------
    *arrays : sequence of indexables
        Arrays to split; at least one is required.
    **options
        test_size, train_size : forwarded to ``_validate_shuffle_split``
            (the default test size is 0.25).
        random_state : seed or random state used for shuffling.
        stratify : labels for a stratified split, or None.
        shuffle : bool, default True.
        rng : name of the random number generator, default
            'OPTIMIZED_MT19937'.

    Returns
    -------
    list
        ``[train_0, test_0, train_1, test_1, ...]`` - two entries per
        input array, in input order.

    Raises
    ------
    ValueError
        If no arrays are given, ``rng`` is unknown, stratification is
        requested with ``shuffle=False``, or an array cannot be converted.
    TypeError
        If unknown options are passed.
    """
    if not arrays:
        raise ValueError("At least one array required as input")

    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM',
        'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    elif stratify is not None:
        cv = StratifiedShuffleSplit(test_size=n_test,
                                    train_size=n_train,
                                    random_state=random_state)
        train, test = next(cv.split(X=arrays[0], y=stratify))
    else:
        seed_like = isinstance(random_state, int) or random_state is None
        if mkl_random_is_imported and \
                rng not in ['default', 'OPTIMIZED_MT19937'] and seed_like:
            random_state = mkl_random.RandomState(random_state, rng)
            # Permute ALL samples and then slice: permuting only
            # n_train + n_test elements would never select the trailing
            # samples when the split does not cover the whole dataset.
            indexes = random_state.permutation(n_samples)
            test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
        elif rng == 'OPTIMIZED_MT19937' and \
                daal_check_version(((2020, 'P', 3), (2021, 'B', 9))) and \
                seed_like and platform.system() != 'Windows':
            # The buffer holds sample positions up to n_samples - 1, so the
            # integer width is chosen from n_samples.
            indexes = np.empty(
                shape=(n_samples, ),
                dtype=np.int64 if n_samples > 2**31 - 1 else np.int32)
            random_state = np.random.RandomState(random_state)
            random_state = random_state.get_state()[1]
            # NOTE(review): presumably fills `indexes` in place with a
            # shuffled permutation - the d4p helper is not visible here.
            d4p.daal_generate_shuffled_indices([indexes], [random_state])
            test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
        else:
            cv = ShuffleSplit(test_size=n_test,
                              train_size=n_train,
                              random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr, (pd.core.frame.DataFrame,
                                        pd.core.series.Series)):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        elif any('float' not in str(dtype) and 'int' not in str(dtype)
                 for dtype in dtypes):
            fallback = True

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
            continue

        if len(arr.shape) == 2:
            n_cols = arr.shape[1]
            reshape_later = False
        else:
            n_cols = 1
            reshape_later = True

        arr_copy = d4p.get_data(arr)
        if not isinstance(arr_copy, list):
            arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols),
                                        order='A')

        if isinstance(arr_copy, np.ndarray):
            order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
            train_arr = np.empty(shape=(n_train, n_cols),
                                 dtype=arr_copy.dtype,
                                 order=order)
            test_arr = np.empty(shape=(n_test, n_cols),
                                dtype=arr_copy.dtype,
                                order=order)
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            if reshape_later:
                train_arr = train_arr.reshape((n_train, ))
                test_arr = test_arr.reshape((n_test, ))
        elif isinstance(arr_copy, list):
            # One output buffer per column, matching each column's dtype
            # and memory layout.
            train_arr = [
                np.empty(shape=(n_train, ),
                         dtype=el.dtype,
                         order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                for el in arr_copy
            ]
            test_arr = [
                np.empty(shape=(n_test, ),
                         dtype=el.dtype,
                         order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                for el in arr_copy
            ]
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            train_arr = {
                col: train_arr[i]
                for i, col in enumerate(arr.columns)
            }
            test_arr = {
                col: test_arr[i]
                for i, col in enumerate(arr.columns)
            }
        else:
            raise ValueError('Array can\'t be converted to needed format')

        if pandas_is_imported:
            if isinstance(arr, pd.core.frame.DataFrame):
                # Pass the column labels explicitly: a plain ndarray
                # carries none, so they would otherwise be lost.
                train_arr = pd.DataFrame(train_arr, columns=arr.columns)
                test_arr = pd.DataFrame(test_arr, columns=arr.columns)
            if isinstance(arr, pd.core.series.Series):
                train_arr = train_arr.reshape(n_train)
                test_arr = test_arr.reshape(n_test)
                train_arr = pd.Series(train_arr, name=arr.name)
                test_arr = pd.Series(test_arr, name=arr.name)

        if hasattr(arr, 'index'):
            # Carry over the original row labels of the selected rows
            # instead of their positions, so custom indexes survive.
            train_arr.index = arr.index[train]
            test_arr.index = arr.index[test]

        res.append(train_arr)
        res.append(test_arr)

    return res
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets.

    Index selection follows the ``shuffle``/``stratify``/``rng`` options.
    Each array is then checked with a ``PatchingConditionsChain``: inputs
    that pass are split through ``d4p.daal_train_test_split``, all others
    through ``safe_indexing``.

    Parameters
    ----------
    *arrays : sequence of indexables
        Arrays to split; at least one is required.
    **options
        test_size, train_size : forwarded to ``_validate_shuffle_split``
            (the default test size is 0.25).
        random_state : seed or random state used for shuffling.
        stratify : labels for a stratified split, or None.
        shuffle : bool, default True.
        rng : name of the random number generator, default
            'OPTIMIZED_MT19937'.

    Returns
    -------
    list
        ``[train_0, test_0, train_1, test_1, ...]`` - two entries per
        input array, in input order.

    Raises
    ------
    ValueError
        If no arrays are given, ``rng`` is unknown, stratification is
        requested with ``shuffle=False``, or an array cannot be converted.
    TypeError
        If unknown options are passed.
    """
    if not arrays:
        raise ValueError("At least one array required as input")

    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM',
        'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for shuffle=False"
            )
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    elif stratify is not None:
        cv = StratifiedShuffleSplit(test_size=n_test,
                                    train_size=n_train,
                                    random_state=random_state)
        train, test = next(cv.split(X=arrays[0], y=stratify))
    else:
        seed_like = isinstance(random_state, int) or random_state is None
        if mkl_random_is_imported and \
                rng not in ['default', 'OPTIMIZED_MT19937'] and seed_like:
            random_state = mkl_random.RandomState(random_state, rng)
            indexes = random_state.permutation(n_samples)
            test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
        elif rng == 'OPTIMIZED_MT19937' and seed_like and \
                platform.system() != 'Windows':
            # The buffer has n_samples entries holding positions up to
            # n_samples - 1, so the integer width must be chosen from
            # n_samples (not n_train + n_test, which may be smaller).
            indexes = np.empty(
                shape=(n_samples, ),
                dtype=np.int64 if n_samples > 2**31 - 1 else np.int32)
            random_state = np.random.RandomState(random_state)
            random_state = random_state.get_state()[1]
            # NOTE(review): presumably fills `indexes` in place with a
            # shuffled permutation - the d4p helper is not visible here.
            d4p.daal_generate_shuffled_indices([indexes], [random_state])
            test, train = indexes[:n_test], indexes[n_test:(n_test + n_train)]
        else:
            cv = ShuffleSplit(test_size=n_test,
                              train_size=n_train,
                              random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        # NOTE(review): the code relies on the chain accumulating every
        # condition, so that the value returned by the last
        # `and_conditions` call reflects all checks - confirm against
        # PatchingConditionsChain.
        _patching_status = PatchingConditionsChain(
            "sklearn.model_selection.train_test_split")

        # input format check
        _patching_status.and_conditions([
            (isinstance(arr, np.ndarray),
             "The input is not a np.ndarray object.")
        ])
        if pandas_is_imported:
            _patching_status.or_conditions(
                [(isinstance(arr, pd.core.frame.DataFrame),
                  "The input is not a pd.DataFrame object."),
                 (isinstance(arr, pd.core.series.Series),
                  "The input is not a pd.Series object.")],
                conditions_merging=any)

        # dimensions check
        _dal_ready = _patching_status.and_conditions([
            (hasattr(arr, 'ndim'),
             "The input does not have 'ndim' attribute.")
        ])
        if hasattr(arr, 'ndim'):
            _patching_status.and_conditions([
                (arr.ndim <= 2, "The input has more than 2 dimensions.")
            ])

        # data types check
        dtypes = get_dtypes(arr)
        _dal_ready = _patching_status.and_conditions([
            (dtypes is not None, "Unable to parse input data types.")
        ])
        if dtypes is not None:
            # First dtype that is neither integer nor floating point.
            incorrect_dtype = None
            for dtype in dtypes:
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    incorrect_dtype = str(dtype)
                    break
            _dal_ready = _patching_status.and_conditions([
                (incorrect_dtype is None,
                 f"Input has incorrect data type '{incorrect_dtype}'. "
                 "Only integer and floating point types are supported.")
            ])

        _patching_status.write_log()

        if not _dal_ready:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
            continue

        if len(arr.shape) == 2:
            n_cols = arr.shape[1]
            reshape_later = False
        else:
            n_cols = 1
            reshape_later = True

        arr_copy = d4p.get_data(arr)
        if not isinstance(arr_copy, list):
            arr_copy = arr_copy.reshape(
                (arr_copy.shape[0], n_cols),
                order='A',
            )

        if isinstance(arr_copy, np.ndarray):
            order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
            train_arr = np.empty(
                shape=(n_train, n_cols),
                dtype=arr_copy.dtype,
                order=order,
            )
            test_arr = np.empty(
                shape=(n_test, n_cols),
                dtype=arr_copy.dtype,
                order=order,
            )
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            if reshape_later:
                train_arr = train_arr.reshape((n_train, ))
                test_arr = test_arr.reshape((n_test, ))
        elif isinstance(arr_copy, list):
            # One output buffer per column, matching each column's dtype
            # and memory layout.
            train_arr = [
                np.empty(
                    shape=(n_train, ),
                    dtype=el.dtype,
                    order='C' if el.flags['C_CONTIGUOUS'] else 'F',
                ) for el in arr_copy
            ]
            test_arr = [
                np.empty(
                    shape=(n_test, ),
                    dtype=el.dtype,
                    order='C' if el.flags['C_CONTIGUOUS'] else 'F',
                ) for el in arr_copy
            ]
            d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                      [train], [test])
            train_arr = {
                col: train_arr[i]
                for i, col in enumerate(arr.columns)
            }
            test_arr = {
                col: test_arr[i]
                for i, col in enumerate(arr.columns)
            }
        else:
            raise ValueError('Array can\'t be converted to needed format')

        if pandas_is_imported:
            if isinstance(arr, pd.core.frame.DataFrame):
                train_arr = pd.DataFrame(train_arr, columns=arr.columns)
                test_arr = pd.DataFrame(test_arr, columns=arr.columns)
            if isinstance(arr, pd.core.series.Series):
                train_arr = train_arr.reshape(n_train)
                test_arr = test_arr.reshape(n_test)
                train_arr = pd.Series(train_arr, name=arr.name)
                test_arr = pd.Series(test_arr, name=arr.name)

        if hasattr(arr, 'index'):
            # Carry over the original row labels of the selected rows
            # instead of their positions, so custom indexes survive.
            train_arr.index = arr.index[train]
            test_arr.index = arr.index[test]

        res.append(train_arr)
        res.append(test_arr)

    return res