def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides,
                 either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            pipes_path = path if isinstance(path, list) else [path]
            dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
        else:
            dataset = mlio.list_files(path)

        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
        reader = mlio.RecordIOProtobufReader(reader_params)

        if reader.peek_example() is not None:
            # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
            is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor

            all_features = []
            all_labels = []
            for example in reader:
                features = as_numpy(example['values']) if is_dense_tensor \
                    else to_coo_matrix(example['values'])
                all_features.append(features)

                labels = as_numpy(example['label_values'])
                all_labels.append(labels)

            all_features = np.vstack(all_features) if is_dense_tensor \
                else scipy_vstack(all_features).tocsr()
            all_labels = np.concatenate(all_labels, axis=None)

            dmatrix = xgb.DMatrix(all_features, label=all_labels)
            return dmatrix
        else:
            return None
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(e))

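# Usage sketch for get_recordio_protobuf_dmatrix above. A minimal, hedged
# example: the paths are hypothetical and follow the usual SageMaker channel
# layout; recordio-protobuf data is assumed to already be staged there.
def _example_load_recordio_protobuf():
    # File mode: point at a directory (or single file) of .pbr data.
    dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train")

    # Pipe mode: pass the SageMaker pipe path and set is_pipe=True.
    pipe_dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train_0",
                                                 is_pipe=True)
    return dmatrix, pipe_dmatrix
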
async def __preprocess_chunk(self, chunk):
    t0 = time.time()
    preproc_fn = self.params.preproc_fn
    Xy = np.column_stack([as_numpy(feature) for feature in chunk])
    X, y = preproc_fn(Xy, self.params.label_col_idx)
    logging.debug('t_preproc_chunk={:.2f}'.format(time.time() - t0))
    return X, y

def _initialize_state(self, first_batch):
    super()._initialize_state(first_batch)
    # Estimate the size of items in each column using the first batch.
    for i in range(self._n_columns):
        column = as_numpy(first_batch[i]).flatten()
        self._row_nbytes += _get_size_total(column) / column.shape[0]

def _construct_features_array_data(self, batch):
    """Stacks numpy columns created from an incoming data batch into a numpy array."""
    return np.column_stack([
        as_numpy(batch[column_index]).flatten()
        for column_index in range(self._n_columns)
        if column_index != self.target_column_index
    ])

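# A minimal, self-contained sketch of the column-stacking pattern used by
# _construct_features_array_data above, with plain numpy arrays standing in
# for the mlio batch; the column values and target index are made up.
import numpy as np

def _example_column_stack():
    batch = [np.array([1, 2, 3]),        # feature column 0
             np.array([10, 20, 30]),     # target column (index 1), skipped below
             np.array([0.1, 0.2, 0.3])]  # feature column 2
    target_column_index = 1
    features = np.column_stack([
        batch[i].flatten()
        for i in range(len(batch))
        if i != target_column_index
    ])
    return features  # shape (3, 2): one row per example, target excluded
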
def _test_dedupe_column_names(tmpdir,
                              input_column_names: List[str],
                              input_data: List[int],
                              expected_column_names: List[str],
                              expected_data: List[int],
                              dedupe_column_names: bool = True,
                              **kwargs) -> None:
    header_str = ','.join(input_column_names)
    data_str = ','.join(str(x) for x in input_data)
    csv_file = tmpdir.join("test.csv")
    csv_file.write(header_str + '\n' + data_str)

    dataset = [mlio.File(str(csv_file))]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_params = mlio.CsvParams(dedupe_column_names=dedupe_column_names, **kwargs)
    reader = mlio.CsvReader(reader_params, csv_params)

    example = reader.read_example()
    names = [attr.name for attr in example.schema.attributes]
    assert names == expected_column_names

    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array(expected_data))

def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a RecordIO-Protobuf byte representation to a DMatrix object.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.

    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    buf = bytes(string_like)
    dataset = [mlio.InMemoryStore(buf)]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
    reader = mlio.RecordIOProtobufReader(reader_params)

    is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor

    examples = []
    for example in reader:
        # Ignore labels if present
        values = as_numpy(example['values']) if is_dense_tensor \
            else to_coo_matrix(example['values'])
        examples.append(values)

    data = np.vstack(examples) if is_dense_tensor else scipy_vstack(examples).tocsr()
    dmatrix = xgb.DMatrix(data)
    return dmatrix

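# Usage sketch for recordio_protobuf_to_dmatrix above, e.g. when handling the
# body of an inference request. The file name is hypothetical; any bytes-like
# recordio-protobuf payload works.
def _example_bytes_to_dmatrix():
    with open("payload.pbr", "rb") as f:
        payload = f.read()
    return recordio_protobuf_to_dmatrix(payload)
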
def to_pandas(example):
    """Converts the specified ``Example`` to a pandas DataFrame."""
    data = {attr.name: as_numpy(ftr).flatten()
            for attr, ftr in zip(example.schema.attributes, example)}
    return pd.DataFrame(data)

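# Usage sketch for to_pandas above. The CSV file name is hypothetical; reader
# construction follows the same mlio API used elsewhere in these snippets.
def _example_to_pandas():
    dataset = [mlio.File("train.csv")]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=64)
    reader = mlio.CsvReader(reader_params)
    example = reader.read_example()
    return to_pandas(example)
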
def _construct_features_array_data(self, batch):
    """Creates a list of `self._n_features` arrays containing data from each
    column in the batch.

    Note that the arrays are interpreted as strings here, in order to easily
    extract itemsize and estimate size.
    """
    return [
        as_numpy(batch[i]).flatten().astype(str)
        for i in range(self._n_columns)
        if i != self.target_column_index
    ]

def get_recordio_protobuf_dmatrix(path, is_pipe=False, subsample_ratio_on_read=None):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides,
                 either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much
                                    of the dataset should be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)

        reader = mlio.RecordIOProtobufReader(
            dataset=dataset,
            batch_size=BATCH_SIZE,
            subsample_ratio=subsample_ratio_on_read)

        exm = reader.peek_example()
        if exm is None:
            return None

        # Recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(exm['values'], mlio.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_values = []
        all_labels = []
        for example in reader:
            values = to_matrix(example['values'])
            all_values.append(values)

            labels = as_numpy(example['label_values']).squeeze()
            all_labels.append(labels)

        all_values = vstack(all_values)
        all_labels = np.concatenate(all_labels)

        return xgb.DMatrix(all_values, label=all_labels)
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(e))

def to_tf(tensor):
    # Dense mlio tensors map directly onto a tf.Tensor.
    if isinstance(tensor, DenseTensor):
        return tf.convert_to_tensor(as_numpy(tensor))

    # Sparse tensors: collect the (row, col) indices of the non-zero entries
    # and build a tf.SparseTensor from them.
    mtx = to_coo_matrix(tensor).tocsr()
    non_zero_row_col = mtx.nonzero()
    indices = np.asmatrix([non_zero_row_col[0], non_zero_row_col[1]])
    indices = indices.transpose()
    return tf.SparseTensor(indices, mtx.data, mtx.shape)

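# Usage sketch for to_tf above: convert the 'values' tensor of a
# recordio-protobuf example into a TensorFlow tensor. The file name is
# hypothetical; dense inputs yield a tf.Tensor, sparse ones a tf.SparseTensor.
def _example_to_tf():
    dataset = [mlio.File("test.pbr")]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=32)
    reader = mlio.RecordIOProtobufReader(reader_params)
    example = reader.read_example()
    return to_tf(example['values'])
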
def test_csv_params():
    filename = os.path.join(resources_dir, 'test.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_prm = mlio.CsvParams(header_row_index=None)
    reader = mlio.CsvReader(rdr_prm, csv_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array([1, 0, 0, 0]))

    # Parameters should be reusable
    reader2 = mlio.CsvReader(rdr_prm, csv_prm)
    assert reader2.peek_example()

async def __train_old(self):
    chunk_size = self.params.chunk_size
    dataset = mlio.list_files(getattr(self.params, "dataset_path"), pattern='*.csv')
    logging.debug('mlio dataset={}'.format(dataset))

    preproc_fn = self.params.preproc_fn

    reader_params = mlio.DataReaderParams(
        dataset=dataset,
        batch_size=chunk_size,
        num_prefetched_batches=self.params.num_prefetched_chunks)
    reader = mlio.CsvReader(reader_params)
    logging.debug('mlio reader={}'.format(reader))

    num_epochs = self.params.num_epochs  # Used below as the number of boosting rounds per chunk.

    # Use the eta (learning-rate) parameter.
    eta = 0.01
    if self.params.ml_lib == 'snap':
        eta = 0.1
        from pai4sk import BoostingMachine as Booster
    else:
        from sklearn.tree import DecisionTreeRegressor

    logging.debug('starting training')
    models = []

    # CsvReader is simply an iterator over mini-batches of data.
    for chunk_idx, chunk in enumerate(reader):
        rand_state = self.params.rand_state

        # Transform the mini-batch into a NumPy array.
        chunk_train_Xy = np.column_stack([as_numpy(feature) for feature in chunk])
        chunk_train_X, chunk_train_y = preproc_fn(chunk_train_Xy,
                                                  self.params.label_col_idx)

        if self.params.ml_lib == 'snap':
            bl = Booster(**self.params.ml_opts_dict)
            bl.fit(chunk_train_X, chunk_train_y)
            models.append(bl)
        else:
            # Gradient boosting with decision-tree base learners: each tree
            # fits the residual of the running prediction z_train.
            z_train = np.zeros(chunk_train_X.shape[0])
            for epoch in range(num_epochs):
                target = chunk_train_y - z_train
                bl = DecisionTreeRegressor(max_depth=3,
                                           max_features='sqrt',
                                           random_state=rand_state)
                bl.fit(chunk_train_X, target)
                u_train = bl.predict(chunk_train_X)
                z_train = z_train + eta * u_train
                models.append(bl)

    return models

def test_recordio_protobuf_reader_params():
    filename = os.path.join(resources_dir, 'test.pbr')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    reader = mlio.RecordIOProtobufReader(rdr_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert record[0].squeeze() == np.array(1)
    assert np.all(record[1].squeeze() == np.array([0, 0, 0]))

    # Parameters should be reusable
    reader2 = mlio.RecordIOProtobufReader(rdr_prm)
    assert reader2.peek_example()

def test_csv_nonutf_encoding_with_encoding_param():
    filename = os.path.join(resources_dir, 'test_iso8859_5.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=2)
    csv_params = mlio.CsvParams(encoding='ISO-8859-5')
    reader = mlio.CsvReader(rdr_prm, csv_params)

    example = reader.read_example()
    nonutf_feature = example['col_3']

    try:
        feature_np = as_numpy(nonutf_feature)
    except SystemError:
        pytest.fail("Unexpected exception thrown")

def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    """
    try:
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Check if data is present in reader
        if reader.peek_example() is not None:
            examples = []
            for example in reader:
                # Write each feature (column) of example into a single numpy array
                tmp = [as_numpy(feature).squeeze() for feature in example]
                tmp = np.array(tmp)
                if len(tmp.shape) > 1:
                    # Columns are written as rows, needs to be transposed
                    tmp = tmp.T
                else:
                    # If tmp is a 1-D array, it needs to be reshaped as a matrix
                    tmp = np.reshape(tmp, (1, tmp.shape[0]))
                examples.append(tmp)

            data = np.vstack(examples)
            del examples

            if csv_weights == 1:
                dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
            else:
                dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
            return dmatrix
        else:
            return None
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))

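# Usage sketch for _get_csv_dmatrix_pipe_mode above. The pipe path follows the
# usual (hypothetical here) SageMaker convention; csv_weights=1 would expect
# labels in the first CSV column and instance weights in the second.
def _example_csv_pipe_to_dmatrix():
    dmatrix = _get_csv_dmatrix_pipe_mode("/opt/ml/input/data/train", csv_weights=0)
    if dmatrix is None:
        raise RuntimeError("No data received on the pipe")
    return dmatrix
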
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights, subsample_ratio_on_read):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much
                                    of the dataset should be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        dataset = [mlio.SageMakerPipe(pipe_path, fifo_id=0)]
        reader = mlio.CsvReader(dataset=dataset,
                                batch_size=BATCH_SIZE,
                                header_row_index=None,
                                subsample_ratio=subsample_ratio_on_read)

        # Check if data is present in reader
        if reader.peek_example() is None:
            return None

        batches = []
        for example in reader:
            batch = np.column_stack([as_numpy(f) for f in example])
            batches.append(batch)

        data = np.vstack(batches)
        del batches

        if csv_weights == 1:
            dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
        else:
            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        return dmatrix
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))

def _construct_target_array_data(self, batch):
    if self._split_target:
        return as_numpy(batch[self.target_column_index]).flatten().astype(str)
    return None