    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):

        self.log_to_file('\n\t splitting train and test data')

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                # drop the label column from the features so the target is not
                # leaked into X; sklearn expects X and y passed separately
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
            elif 'GPU' in self.compute_type:
                # cuml accepts the label column name and splits it out internally
                X_train, X_test, y_train, y_test = cuml_train_test_split(
                    X=dataset,
                    y=y_label,
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
        self.log_to_file(f'\t split completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
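PerfTimer is referenced but not defined in these snippets. It is presumably a small context manager built on time.perf_counter() that exposes the elapsed time as a duration attribute; a minimal sketch consistent with how it is used above (the implementation details are an assumption, not the project's actual class):

import time

class PerfTimer:
    """Hypothetical sketch of the timing context manager used above."""

    def __enter__(self):
        # record the starting wall-clock time
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # elapsed seconds, read afterwards as split_timer.duration
        self.duration = time.perf_counter() - self.start
        return False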
Example #2
    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):
        """
        split dataset into train and test subset 
        NOTE: assumes the first column of the dataset is the classification labels
            ! in the case of sklearn, we manually filter this column in the split call
            ! in the case of cuml, the filtering happens internally 
        """
        self.log_to_file('\tsplitting train and test data')

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
            elif 'GPU' in self.compute_type:
                X_train, X_test, y_train, y_test = cuml_train_test_split(
                    X=dataset,
                    y=y_label,
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
        self.log_to_file(f'\t> split completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
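The CPU path above amounts to passing the label-filtered features and the label column separately to sklearn's splitter. A standalone usage sketch (the toy DataFrame, column names, and the import alias are assumptions for illustration):

import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_train_test_split

df = pd.DataFrame({'feat_a': [1, 2, 3, 4, 5],
                   'feat_b': [5, 4, 3, 2, 1],
                   'label':  [0, 1, 0, 1, 0]})

# drop the label column from X and pass it separately as y
X_train, X_test, y_train, y_test = sklearn_train_test_split(
    df.loc[:, df.columns != 'label'],
    df['label'],
    train_size=0.8,
    shuffle=True,
    random_state=0)

print(X_train.shape, y_train.shape)  # (4, 2) (4,)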
Example #3
    def split_data(self, dataset, y_label, train_size = .8, random_state = 0, shuffle = True):
        """
        Split the data into train and test sets, using the appropriate library for the active compute mode.
        CPU compute - uses sklearn; we manually filter the y_label column out of X in the split call
        GPU compute - single GPU uses cuml and multi-GPU uses dask; both split out y_label internally

        Parameters
        ----------
        dataset : dataframe
                  The dataframe on which we wish to perform the split
        y_label : string
                  The name of the label column (not the series itself).
        train_size : float
                     The fraction of the data used for training. Takes values between 0 and 1.
        random_state : int
                       Seed used to make the split reproducible.
        shuffle : bool
                  Specifies whether the data must be shuffled before splitting.

        Returns
        ----------
        X_train : dataframe
                  The data to be used for training. Has same type as input dataset.
        X_test : dataframe
                  The data to be used for testing. Has same type as input dataset.
        y_train : dataframe
                  The label to be used for training. Has same type as input dataset.
        y_test : dataframe
                  The label to be used for testing. Has same type as input dataset.
        duration : float
                   The time it took to perform the split
        """
        self.log_to_file('\n> Splitting train and test data')

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(dataset.loc[:, dataset.columns != y_label],
                                                                            dataset[y_label],
                                                                            train_size = train_size,
                                                                            shuffle = shuffle,
                                                                            random_state = random_state)

            elif 'GPU' in self.compute_type:
                if 'single' in self.compute_type:
                    X_train, X_test, y_train, y_test = cuml_train_test_split(X = dataset,
                                                                             y = y_label,
                                                                             train_size = train_size,
                                                                             shuffle = shuffle,
                                                                             random_state = random_state) 
                elif 'multi' in self.compute_type:
                    X_train, X_test, y_train, y_test = dask_train_test_split(dataset,
                                                                             y_label,
                                                                             train_size = train_size,
                                                                             shuffle = False, # shuffle not available for dask_cudf yet
                                                                             random_state = random_state)
        
        self.log_to_file(f'\n\tX_train shape and type: {X_train.shape} {type(X_train)}')
        self.log_to_file(f'\n\tSplit completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
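dask_train_test_split is not defined in these snippets either; it may simply be an aliased import (e.g. dask_ml's train_test_split). As a hedged sketch of what the multi-GPU path could look like using plain dask.dataframe primitives, which also explains why shuffle is unavailable, one could write the following (the helper name and signature follow the call above; the body is an assumption, not the actual implementation):

def dask_train_test_split(dataset, y_label, train_size=0.8,
                          shuffle=False, random_state=0):
    """Hypothetical sketch: split a dask_cudf / dask DataFrame by row fractions."""
    if shuffle:
        # shuffle not available for dask_cudf yet (see the comment above)
        raise NotImplementedError('shuffle not available for dask_cudf yet')

    # split whole rows first so that features and labels stay aligned
    train_df, test_df = dataset.random_split([train_size, 1.0 - train_size],
                                             random_state=random_state)

    X_train = train_df.drop(columns=[y_label])
    y_train = train_df[y_label]
    X_test = test_df.drop(columns=[y_label])
    y_test = test_df[y_label]
    return X_train, X_test, y_train, y_test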