def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0):
    """Validate the train/validation channels and load each into a Data Matrix.

    A channel is considered present only when its path is truthy and ``get_size``
    reports a positive size for it. Present channels are first checked with
    ``validate_data_file_path`` and then parsed with ``get_dmatrix``.

    :param train_path: Location of the training data; may be empty/None
    :param validate_path: Location of the validation data; may be empty/None
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0.
            Forwarded only when parsing the training channel.
    :return: Tuple ``(train_dmatrix, val_dmatrix)``; an entry is None when that channel is absent
    """
    train_bytes = 0
    if train_path:
        train_bytes = get_size(train_path)

    val_bytes = 0
    if validate_path:
        val_bytes = get_size(validate_path)

    # Sizes are reported in bytes-per-MiB units for the debug message.
    total_mb = round((train_bytes + val_bytes) / (1024 * 1024), 2)
    logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))

    # Both channels are validated before either one is parsed.
    if train_bytes > 0:
        validate_data_file_path(train_path, content_type)
    if val_bytes > 0:
        validate_data_file_path(validate_path, content_type)

    train_dmatrix = None
    if train_bytes > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights)

    val_dmatrix = None
    if val_bytes > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type)

    return train_dmatrix, val_dmatrix
Exemple #2
0
    def test_validate_csv_files(self):
        # Every listed entry under <data_path>/csv is validated as 'csv' content.
        # Each entry runs in its own subTest, keyed by file_path, so the entries
        # are reported independently of one another.
        for file_path in ('train.csv', 'train.csv.weights', 'csv_files'):
            with self.subTest(file_path=file_path):
                data_utils.validate_data_file_path(
                    os.path.join(self.data_path, 'csv', file_path), 'csv')
Exemple #3
0
    def test_validate_libsvm_files(self):
        # Every listed entry under <data_path>/libsvm is validated as 'libsvm'
        # content. Each entry runs in its own subTest, keyed by file_path, so the
        # entries are reported independently of one another.
        for file_path in ('train.libsvm', 'train.libsvm.weights', 'libsvm_files'):
            with self.subTest(file_path=file_path):
                data_utils.validate_data_file_path(
                    os.path.join(self.data_path, 'libsvm', file_path), 'libsvm')
def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0,
                            is_pipe=False,
                            combine_train_val=False):
    """Validate the train/validation channels and load them into Data Matrices.

    A channel is considered present only when its path is truthy and ``get_size``
    reports a positive size for it. Outside of pipe mode, present channels are
    checked with ``validate_data_file_path`` before being parsed with ``get_dmatrix``.

    :param train_path: Location of the training data; may be empty/None
    :param validate_path: Location of the validation data; may be empty/None
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param combine_train_val: Boolean to indicate whether a DMatrix combining train and validation
            data should also be built
    :return: Tuple ``(train_dmatrix, val_dmatrix, train_val_dmatrix)``. The first two entries are
            None when that channel is absent. The third is the combined matrix when
            ``combine_train_val`` is set and both matrices were loaded; otherwise it is
            the same object as ``train_dmatrix``.
    """
    train_bytes = 0
    if train_path:
        train_bytes = get_size(train_path, is_pipe)

    val_bytes = 0
    if validate_path:
        val_bytes = get_size(validate_path, is_pipe)

    # Size logging and file validation are skipped entirely in pipe mode.
    if not is_pipe:
        total_mb = round((train_bytes + val_bytes) / (1024 * 1024), 2)
        logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))

        if train_bytes > 0:
            validate_data_file_path(train_path, content_type)
        if val_bytes > 0:
            validate_data_file_path(validate_path, content_type)

    train_dmatrix = None
    if train_bytes > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    val_dmatrix = None
    if val_bytes > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    both_loaded = train_dmatrix is not None and val_dmatrix is not None
    if not (combine_train_val and both_loaded):
        # No combined matrix requested (or possible): fall back to the training matrix.
        return train_dmatrix, val_dmatrix, train_dmatrix

    logging.info("Read both train and validation data into one DMatrix")
    # The combined matrix is parsed afresh from both paths.
    combined_dmatrix = get_dmatrix([train_path, validate_path],
                                   content_type,
                                   csv_weights=csv_weights,
                                   is_pipe=is_pipe)
    return train_dmatrix, val_dmatrix, combined_dmatrix
Exemple #5
0
def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0,
                            is_pipe=False,
                            subsample_ratio_on_read=None):
    """Validate the train/validation channels and load each into a Data Matrix.

    A channel is considered present only when its path is truthy and ``get_size``
    reports a positive size for it. Outside of pipe mode, present channels are
    checked with ``validate_data_file_path`` before being parsed with ``get_dmatrix``.

    :param train_path: Location of the training data; may be empty/None
    :param validate_path: Location of the validation data; may be empty/None
    :param content_type: Content type of data. Supports 'libsvm', 'csv', 'parquet', and 'recordio-protobuf'.
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0.
            Forwarded only when parsing the training channel.
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory. Forwarded only when parsing the training channel.
    :return: Tuple ``(train_dmatrix, val_dmatrix)``; an entry is None when that channel is absent
    """
    train_bytes = 0
    if train_path:
        train_bytes = get_size(train_path, is_pipe)

    val_bytes = 0
    if validate_path:
        val_bytes = get_size(validate_path, is_pipe)

    # Size logging and file validation are skipped entirely in pipe mode.
    if not is_pipe:
        total_mb = round((train_bytes + val_bytes) / (1024 * 1024), 2)
        logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))

        if train_bytes > 0:
            validate_data_file_path(train_path, content_type)
        if val_bytes > 0:
            validate_data_file_path(validate_path, content_type)

    train_dmatrix = None
    if train_bytes > 0:
        train_dmatrix = get_dmatrix(train_path,
                                    content_type,
                                    csv_weights=csv_weights,
                                    is_pipe=is_pipe,
                                    subsample_ratio_on_read=subsample_ratio_on_read)

    val_dmatrix = None
    if val_bytes > 0:
        # Validation data is read without weights or subsampling options.
        val_dmatrix = get_dmatrix(validate_path, content_type, is_pipe=is_pipe)

    return train_dmatrix, val_dmatrix