def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path:
    :param validate_path:
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :return: Parsed xgb.DMatrix
    """
    train_size = get_size(train_path) if train_path else 0
    val_size = get_size(validate_path) if validate_path else 0

    total_mb = round((train_size + val_size) / (1024 * 1024), 2)
    logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))

    # Only validate channels that actually contain data.
    for path, size in ((train_path, train_size), (validate_path, val_size)):
        if size > 0:
            validate_data_file_path(path, content_type)

    train_dmatrix = None
    if train_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights)

    val_dmatrix = None
    if val_size > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type)

    return train_dmatrix, val_dmatrix
def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0, is_pipe=False,
                            combine_train_val=False):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path:
    :param validate_path:
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param combine_train_val: Boolean to indicate if returns a DMatrix combining train and validation data
    :return: Parsed xgb.DMatrix
    """
    train_size = get_size(train_path, is_pipe) if train_path else 0
    val_size = get_size(validate_path, is_pipe) if validate_path else 0

    if not is_pipe:
        # On-disk size is only meaningful in file mode; pipe mode streams the data.
        logging.debug(
            "File size need to be processed in the node: {}mb.".format(
                round((train_size + val_size) / (1024 * 1024), 2)))

    # Only validate channels that actually contain data.
    for path, size in ((train_path, train_size), (validate_path, val_size)):
        if size > 0:
            validate_data_file_path(path, content_type)

    train_dmatrix = None
    if train_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    val_dmatrix = None
    if val_size > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    # By default the combined matrix is just the training matrix; only build a
    # merged DMatrix when requested and both channels parsed successfully.
    train_val_dmatrix = train_dmatrix
    if combine_train_val and train_dmatrix is not None and val_dmatrix is not None:
        logging.info("Read both train and validation data into one DMatrix")
        train_val_dmatrix = get_dmatrix([train_path, validate_path], content_type,
                                        csv_weights=csv_weights, is_pipe=is_pipe)

    return train_dmatrix, val_dmatrix, train_val_dmatrix
def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0, is_pipe=False,
                            subsample_ratio_on_read=None):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels,
    and return parsed Data Matrices.

    :param train_path:
    :param validate_path:
    :param content_type: Content type of data. Supports 'libsvm', 'csv', 'parquet',
        and 'recordio-protobuf'.
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset
        should be read into memory.
    :return: Parsed xgb.DMatrix
    """
    train_files_size = get_size(train_path, is_pipe) if train_path else 0
    val_files_size = get_size(validate_path, is_pipe) if validate_path else 0

    if not is_pipe:
        # On-disk size is only meaningful in file mode; pipe mode streams the data.
        logging.debug("File size need to be processed in the node: {}mb.".format(
            round((train_files_size + val_files_size) / (1024 * 1024), 2)))

    if train_files_size > 0:
        validate_data_file_path(train_path, content_type)
    if val_files_size > 0:
        validate_data_file_path(validate_path, content_type)

    # Subsampling applies only to the training channel; validation data is read in full.
    train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe,
                                subsample_ratio_on_read=subsample_ratio_on_read) \
        if train_files_size > 0 else None
    # Fix: forward csv_weights to the validation channel as well, matching the
    # training channel — previously validation weights were silently dropped,
    # which skews weighted evaluation metrics.
    val_dmatrix = get_dmatrix(validate_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe) \
        if val_files_size > 0 else None

    return train_dmatrix, val_dmatrix
def test_get_dmatrix(self):
    """get_dmatrix parses a list of libsvm channel paths into a single DMatrix."""
    resources_root = Path(os.path.abspath(__file__)).parent.parent
    data_path = os.path.join(str(resources_root), 'resources', 'abalone', 'data')
    file_path = [os.path.join(data_path, channel) for channel in ('train', 'validation')]

    dmatrix = data_utils.get_dmatrix(file_path, 'libsvm', 0, False)

    # Combined abalone train+validation fixture: 9 feature columns, 3548 rows.
    self.assertEqual(9, dmatrix.num_col())
    self.assertEqual(3548, dmatrix.num_row())
default=os.environ.get('SM_HOSTS')) parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST')) parser.add_argument('--GetFIFlg', type=str, default='N') parser.add_argument('--GetTestScoreFlg', type=str, default='N') parser.add_argument('--GetTestPredFlg', type=str, default='N') args, _ = parser.parse_known_args() # Get SageMaker host information from runtime environment variables sm_hosts = json.loads(args.sm_hosts) sm_current_host = args.sm_current_host dtrain = get_dmatrix(args.train, 'csv') dtest = get_dmatrix(args.test, 'csv') if not (dtest): if ((args.GetTestScoreFlg == 'Y') | (args.GetTestPredFlg == 'Y')): raise Exception( 'Please provide test data in a test channel for prediction and scores or set GetTestScoreFlg and GetTestPredFlg to N' ) train_hp = { 'max_depth': args.max_depth, 'eta': args.eta, 'objective': args.objective, 'booster': args.booster, 'seed': args.seed,
# Sagemaker specific arguments. Defaults are set in the environment variables. parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR')) parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR')) parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION')) parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS')) parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST')) args, _ = parser.parse_known_args() # Get SageMaker host information from runtime environment variables sm_hosts = json.loads(args.sm_hosts) sm_current_host = args.sm_current_host dtrain = get_dmatrix(args.train, 'libsvm') dval = get_dmatrix(args.validation, 'libsvm') watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')] train_hp = { 'max_depth': args.max_depth, 'eta': args.eta, 'gamma': args.gamma, 'min_child_weight': args.min_child_weight, 'subsample': args.subsample, 'objective': args.objective } xgb_train_args = dict( params=train_hp, dtrain=dtrain,
# Sagemaker specific arguments. Defaults are set in the environment variables.
parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
parser.add_argument(
    "--train",
    type=str,
    default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/abalone"),
)

args, _ = parser.parse_known_args()

# Parse the training channel (libsvm format) into an XGBoost DMatrix.
dtrain = get_dmatrix(args.train, "libsvm")

# Fixed hyperparameters for the abalone regression example.
params = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.7,
    "verbosity": 2,
    "objective": "reg:squarederror",
    "tree_method": "auto",
    "predictor": "auto",
}

booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=50)
# Fix: build the output path with os.path.join rather than manual '/' concatenation.
booster.save_model(os.path.join(args.model_dir, model_filename))
parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR')) parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR')) parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION')) parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST')) parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS')) parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST')) args, _ = parser.parse_known_args() # Get SageMaker host information from runtime environment variables sm_hosts = json.loads(args.sm_hosts) sm_current_host = args.sm_current_host dtrain = get_dmatrix(args.train, 'csv') dval = get_dmatrix(args.validation, 'csv') watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')] dtest = get_dmatrix(args.test, 'csv') if not(dtest): if ((args.GetTestScoreFlg=='Y') | (args.GetTestPredFlg=='Y')): raise Exception('Please provide test data in a test channel for prediction and scores or set GetTestScoreFlg and GetTestPredFlg to N') train_hp = { 'max_depth': args.max_depth, 'eta': args.eta, 'objective': args.objective, 'booster': args.booster, 'seed': args.seed,
type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))  # continuation of an add_argument call started before this chunk
parser.add_argument("--sm_hosts", type=str, default=os.environ.get("SM_HOSTS"))
parser.add_argument("--sm_current_host", type=str, default=os.environ.get("SM_CURRENT_HOST"))

args, _ = parser.parse_known_args()

# Get SageMaker host information from runtime environment variables
sm_hosts = json.loads(args.sm_hosts)  # SM_HOSTS is a JSON list of cluster hosts
sm_current_host = args.sm_current_host

# Parse the train/validation channels (libsvm format) into DMatrices.
dtrain = get_dmatrix(args.train, "libsvm")
dval = get_dmatrix(args.validation, "libsvm")
# Track validation metrics only when the validation channel yielded data.
watchlist = ([(dtrain, "train"), (dval, "validation")]
             if dval is not None else [(dtrain, "train")])

# Hyperparameters forwarded to xgb.train; dict continues past this chunk.
train_hp = {
    "max_depth": args.max_depth,
    "eta": args.eta,
    "gamma": args.gamma,
    "min_child_weight": args.min_child_weight,
    "subsample": args.subsample,
    "verbosity": args.verbosity,
    "objective": args.objective,
    "tree_method": args.tree_method,
    "predictor": args.predictor,
parser.add_argument('--sm_hosts', type=str, default=os.environ['SM_HOSTS'])
parser.add_argument('--sm_current_host', type=str, default=os.environ['SM_CURRENT_HOST'])

args, _ = parser.parse_known_args()

# Get SageMaker host information from runtime environment variables.
# Fix: parse args.sm_hosts (which defaults to SM_HOSTS) instead of re-reading
# the environment, so a command-line override is respected. Also removed the
# leftover debug print and the commented-out validation-channel code.
sm_hosts = json.loads(args.sm_hosts)
sm_current_host = args.sm_current_host

dtrain = get_dmatrix(args.train, 'csv')
# No validation channel is configured for this job; train-only watchlist.
watchlist = [(dtrain, 'train')]

train_hp = {
    'max_depth': args.max_depth,
    'eta': args.eta,
    'gamma': args.gamma,
    'min_child_weight': args.min_child_weight,
    'subsample': args.subsample,
    'verbose': args.verbose,  # NOTE(review): XGBoost's training param is 'verbosity' — confirm intent
    'objective': args.objective,
    'eval_metric': args.eval_metric
}