def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path: Path of the training data channel (may be empty/None).
    :param validate_path: Path of the validation data channel (may be empty/None).
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data
        files; otherwise, 0
    :return: Tuple of parsed xgb.DMatrix objects (train, validation); an entry is
        None when its channel is missing or empty.
    """
    train_size = get_size(train_path) if train_path else 0
    validation_size = get_size(validate_path) if validate_path else 0

    total_mb = round((train_size + validation_size) / (1024 * 1024), 2)
    logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))

    # Only validate file layout for channels that actually contain data.
    for channel_path, channel_size in ((train_path, train_size),
                                       (validate_path, validation_size)):
        if channel_size > 0:
            validate_data_file_path(channel_path, content_type)

    train_dmatrix = None
    val_dmatrix = None
    if train_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights)
    if validation_size > 0:
        # NOTE(review): csv_weights is intentionally not forwarded for the
        # validation channel here (matches original behavior).
        val_dmatrix = get_dmatrix(validate_path, content_type)

    return train_dmatrix, val_dmatrix
def test_validate_csv_files(self):
    """CSV fixtures (plain file, weighted file, directory) must pass path validation."""
    for name in ('train.csv', 'train.csv.weights', 'csv_files'):
        # subTest keeps the loop going and labels any failure with the fixture name.
        with self.subTest(file_path=name):
            csv_path = os.path.join(self.data_path, 'csv', name)
            data_utils.validate_data_file_path(csv_path, 'csv')
def test_validate_libsvm_files(self):
    """Libsvm fixtures (plain file, weighted file, directory) must pass path validation.

    Each fixture path is exercised in its own subTest so one failure does not
    mask the others.
    """
    libsvm_file_paths = ['train.libsvm', 'train.libsvm.weights', 'libsvm_files']
    for file_path in libsvm_file_paths:
        with self.subTest(file_path=file_path):
            # Fixed: local was previously named `csv_path` (copy-paste from the
            # CSV test) even though it points at libsvm fixtures.
            libsvm_path = os.path.join(self.data_path, 'libsvm', file_path)
            data_utils.validate_data_file_path(libsvm_path, 'libsvm')
def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0,
                            is_pipe=False, combine_train_val=False):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path: Path of the training data channel (may be empty/None).
    :param validate_path: Path of the validation data channel (may be empty/None).
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data
        files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param combine_train_val: Boolean to indicate if returns a DMatrix combining
        train and validation data
    :return: Tuple (train DMatrix or None, validation DMatrix or None,
        combined-or-train DMatrix or None)
    """
    train_size = get_size(train_path, is_pipe) if train_path else 0
    validation_size = get_size(validate_path, is_pipe) if validate_path else 0

    if not is_pipe:
        # Size is only meaningful for file mode; pipe mode streams the data.
        total_mb = round((train_size + validation_size) / (1024 * 1024), 2)
        logging.debug(
            "File size need to be processed in the node: {}mb.".format(total_mb))

    if train_size > 0:
        validate_data_file_path(train_path, content_type)
    if validation_size > 0:
        validate_data_file_path(validate_path, content_type)

    train_dmatrix = None
    if train_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type,
                                    csv_weights=csv_weights, is_pipe=is_pipe)
    val_dmatrix = None
    if validation_size > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type,
                                  csv_weights=csv_weights, is_pipe=is_pipe)

    # Default to the plain training matrix; build a combined matrix only when
    # requested AND both channels produced data.
    train_val_dmatrix = train_dmatrix
    if combine_train_val and train_dmatrix is not None and val_dmatrix is not None:
        logging.info("Read both train and validation data into one DMatrix")
        train_val_dmatrix = get_dmatrix([train_path, validate_path], content_type,
                                        csv_weights=csv_weights, is_pipe=is_pipe)

    return train_dmatrix, val_dmatrix, train_val_dmatrix
def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0,
                            is_pipe=False, subsample_ratio_on_read=None):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path: Path of the training data channel (may be empty/None).
    :param validate_path: Path of the validation data channel (may be empty/None).
    :param content_type: Content type of data. Supports 'libsvm', 'csv',
        'parquet', and 'recordio-protobuf'.
    :param csv_weights: 1 if instance weights are in the second column of csv data
        files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much
        of the dataset should be read into memory.
    :return: Tuple of parsed xgb.DMatrix objects (train, validation); an entry is
        None when its channel is missing or empty.
    """
    train_size = get_size(train_path, is_pipe) if train_path else 0
    validation_size = get_size(validate_path, is_pipe) if validate_path else 0

    if not is_pipe:
        # Size is only meaningful for file mode; pipe mode streams the data.
        total_mb = round((train_size + validation_size) / (1024 * 1024), 2)
        logging.debug(
            "File size need to be processed in the node: {}mb.".format(total_mb))

    if train_size > 0:
        validate_data_file_path(train_path, content_type)
    if validation_size > 0:
        validate_data_file_path(validate_path, content_type)

    train_dmatrix = None
    if train_size > 0:
        # Subsampling applies only to the training channel.
        train_dmatrix = get_dmatrix(train_path, content_type,
                                    csv_weights=csv_weights, is_pipe=is_pipe,
                                    subsample_ratio_on_read=subsample_ratio_on_read)
    val_dmatrix = None
    if validation_size > 0:
        # NOTE(review): csv_weights and subsample_ratio_on_read are not forwarded
        # for the validation channel (matches original behavior) — confirm this
        # asymmetry is intentional.
        val_dmatrix = get_dmatrix(validate_path, content_type, is_pipe=is_pipe)

    return train_dmatrix, val_dmatrix