def test_csv_params():
    filename = os.path.join(resources_dir, 'test.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_prm = mlio.CsvParams(header_row_index=None)
    reader = mlio.CsvReader(rdr_prm, csv_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array([1, 0, 0, 0]))

    reader2 = mlio.CsvReader(rdr_prm, csv_prm)
    assert reader2.peek_example()
def _test_dedupe_column_names(tmpdir,
                              input_column_names: List[str],
                              input_data: List[int],
                              expected_column_names: List[str],
                              expected_data: List[int],
                              dedupe_column_names: bool = True,
                              **kwargs) -> None:
    header_str = ','.join(input_column_names)
    data_str = ','.join(str(x) for x in input_data)
    csv_file = tmpdir.join("test.csv")
    csv_file.write(header_str + '\n' + data_str)

    dataset = [mlio.File(str(csv_file))]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_params = mlio.CsvParams(dedupe_column_names=dedupe_column_names, **kwargs)
    reader = mlio.CsvReader(reader_params, csv_params)

    example = reader.read_example()
    names = [desc.name for desc in example.schema.descriptors]
    assert names == expected_column_names

    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array(expected_data))
def _get_reader(source, batch_size):
    """Returns 'CsvReader' for the given source.

    Parameters
    ----------
    source : str or bytes
        Name of the SageMaker Channel, File, or directory from which the data
        is being read, or the Python buffer object from which the data is
        being read.
    batch_size : int
        The batch size in rows to read from the source.

    Returns
    -------
    mlio.CsvReader
        CsvReader configured with a SageMaker Pipe, File or InMemory buffer.
    """
    data_reader_params = mlio.DataReaderParams(dataset=_get_data(source),
                                               batch_size=batch_size,
                                               warn_bad_instances=False)
    csv_params = mlio.CsvParams(default_data_type=mlio.DataType.STRING,
                                header_row_index=None,
                                allow_quoted_new_lines=True)
    return mlio.CsvReader(data_reader_params=data_reader_params,
                          csv_params=csv_params)
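# A minimal usage sketch, not part of the original source: the channel path,
# batch size, and printing are hypothetical. It assumes `as_numpy` is imported
# from mlio.integ.numpy and that `_get_data` resolves the source as above.
# Since the reader uses default_data_type=STRING, each column comes back as
# string data.
reader = _get_reader("/opt/ml/input/data/train", batch_size=1000)
for example in reader:
    # Stack the per-column features of each mini-batch into a 2-D array.
    batch = np.column_stack([as_numpy(feature) for feature in example])
    print(batch.shape, batch.dtype)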
async def __predict(self):
    if self.params.ml_lib == 'snap':
        from pai4sk import BoostingMachine as Booster
    else:
        from sklearn.tree import DecisionTreeRegressor

    chunk_size = self.params.chunk_size
    dataset = mlio.list_files(getattr(self.params, "dataset_test_path"),
                              pattern='*.csv')
    logging.debug('mlio dataset={}'.format(dataset))

    reader_params = mlio.DataReaderParams(
        dataset=dataset,
        batch_size=chunk_size,
        num_prefetched_batches=self.params.num_prefetched_chunks)
    reader = mlio.CsvReader(reader_params)
    logging.debug('mlio reader={}'.format(reader))

    logging.debug('starting inference')
    score_norm = 0.0
    score = 0.0

    # Preamble: read and preprocess the first chunk before entering the loop.
    chunkim1 = reader.read_example()
    if chunkim1 is not None:
        X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)

    chunki = reader.read_example()
    i = 1
    logging.debug('chunk{}={}'.format(0, chunkim1))
    logging.debug('chunk{}={}'.format(i, chunki))

    # Overlap prediction on the previous chunk with preprocessing of the
    # current chunk.
    while chunki is not None:
        logging.debug('chunk{}={}'.format(i, chunki))

        task_predict = asyncio.create_task(
            self.__predict_chunk(X_im1, y_im1))
        task_preprocess = asyncio.create_task(
            self.__preprocess_chunk(chunki))

        X_i, y_i = await task_preprocess
        s, n = await task_predict
        score += s
        score_norm += n

        X_im1 = X_i
        y_im1 = y_i
        chunkim1 = chunki
        chunki = reader.read_example()
        i += 1

    # Postamble: score the last preprocessed chunk.
    if chunkim1 is not None:
        logging.debug('y{}m1={}'.format(i, y_im1))
        s, n = await self.__predict_chunk(X_im1, y_im1)
        score += s
        score_norm += n

    score /= score_norm
    return score
async def __train_old(self):
    chunk_size = self.params.chunk_size
    dataset = mlio.list_files(getattr(self.params, "dataset_path"),
                              pattern='*.csv')
    logging.debug('mlio dataset={}'.format(dataset))

    preproc_fn = self.params.preproc_fn

    reader_params = mlio.DataReaderParams(
        dataset=dataset,
        batch_size=chunk_size,
        num_prefetched_batches=self.params.num_prefetched_chunks)
    reader = mlio.CsvReader(reader_params)
    logging.debug('mlio reader={}'.format(reader))

    num_epochs = self.params.num_epochs  # Number of times to read the full dataset.

    # Learning rate (eta) depends on the ML library in use.
    eta = 0.01
    if self.params.ml_lib == 'snap':
        eta = 0.1
        from pai4sk import BoostingMachine as Booster
    else:
        from sklearn.tree import DecisionTreeRegressor

    logging.debug('starting training')
    models = []

    # CsvReader is simply an iterator over mini-batches of data.
    for chunk_idx, chunk in enumerate(reader):
        rand_state = self.params.rand_state

        # Transform the mini-batch into a NumPy array and split features/labels.
        chunk_train_Xy = np.column_stack(
            [as_numpy(feature) for feature in chunk])
        chunk_train_X, chunk_train_y = preproc_fn(
            chunk_train_Xy, self.params.label_col_idx)

        if self.params.ml_lib == 'snap':
            bl = Booster(**self.params.ml_opts_dict)
            bl.fit(chunk_train_X, chunk_train_y)
            models.append(bl)
        else:
            # Gradient boosting on this chunk: each tree fits the residual of
            # the running prediction z_train.
            z_train = np.zeros(chunk_train_X.shape[0])
            for epoch in range(num_epochs):
                target = chunk_train_y - z_train
                bl = DecisionTreeRegressor(max_depth=3,
                                           max_features='sqrt',
                                           random_state=rand_state)
                bl.fit(chunk_train_X, target)
                u_train = bl.predict(chunk_train_X)
                z_train = z_train + eta * u_train
                models.append(bl)

    return models
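# A hedged sketch, not part of the original source, of how the list returned by
# __train_old could be applied at inference time for the scikit-learn branch.
# It assumes every entry in `models` is a DecisionTreeRegressor trained with the
# same eta used above; the helper name and default eta are hypothetical.
def predict_boosted(models, X, eta=0.01):
    # Aggregate the boosted trees' predictions with the training learning rate.
    z = np.zeros(X.shape[0])
    for tree in models:
        z += eta * tree.predict(X)
    return z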
def test_csv_nonutf_encoding_with_encoding_param():
    filename = os.path.join(resources_dir, 'test_iso8859_5.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=2)
    csv_params = mlio.CsvParams(encoding='ISO-8859-5')
    reader = mlio.CsvReader(rdr_prm, csv_params)

    example = reader.read_example()
    nonutf_feature = example['col_3']

    try:
        feature_np = as_numpy(nonutf_feature)
    except SystemError:
        pytest.fail("Unexpected exception thrown")
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    """
    try:
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Check if data is present in reader
        if reader.peek_example() is not None:
            examples = []
            for example in reader:
                # Write each feature (column) of example into a single numpy array
                tmp = [as_numpy(feature).squeeze() for feature in example]
                tmp = np.array(tmp)
                if len(tmp.shape) > 1:
                    # Columns are written as rows, needs to be transposed
                    tmp = tmp.T
                else:
                    # If tmp is a 1-D array, it needs to be reshaped as a matrix
                    tmp = np.reshape(tmp, (1, tmp.shape[0]))
                examples.append(tmp)

            data = np.vstack(examples)
            del examples

            if csv_weights == 1:
                dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
            else:
                dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

            return dmatrix
        else:
            return None
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
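# A minimal sketch, not part of the original source, of how the returned DMatrix
# could feed XGBoost training. The pipe path, objective, and hyperparameters are
# hypothetical; it assumes `xgb` is the imported xgboost module as in the code above.
dtrain = _get_csv_dmatrix_pipe_mode("/opt/ml/input/data/train", csv_weights=0)
if dtrain is not None:
    # Train a small regression booster on the pipe-mode data.
    booster = xgb.train({"objective": "reg:squarederror", "max_depth": 3},
                        dtrain, num_boost_round=10)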
async def __train(self):
    chunk_size = self.params.chunk_size
    dataset = mlio.list_files(getattr(self.params, "dataset_path"),
                              pattern='*.csv')
    logging.debug('mlio dataset={}'.format(dataset))

    reader_params = mlio.DataReaderParams(
        dataset=dataset,
        batch_size=chunk_size,
        num_prefetched_batches=self.params.num_prefetched_chunks)
    reader = mlio.CsvReader(reader_params)
    logging.debug('mlio reader={}'.format(reader))

    num_epochs = self.params.num_epochs  # Number of times to read the full dataset.

    # Learning rate (eta) depends on the ML library in use.
    eta = 0.01
    if self.params.ml_lib == 'snap':
        eta = 0.1
        from pai4sk import BoostingMachine as Booster
    else:
        from sklearn.tree import DecisionTreeRegressor

    logging.debug('starting training')
    models = []

    # Preamble: read and preprocess the first chunk before entering the loop.
    chunkim1 = reader.read_example()
    if chunkim1 is not None:
        X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)

    chunki = reader.read_example()
    i = 1
    logging.debug('chunk{}={}'.format(0, chunkim1))
    logging.debug('chunk{}={}'.format(i, chunki))

    # Overlap training on the previous chunk with preprocessing of the
    # current chunk.
    while chunki is not None:
        logging.debug('chunk{}={}'.format(i, chunki))

        task_preprocess = asyncio.create_task(
            self.__preprocess_chunk(chunki))
        task_train = asyncio.create_task(self.__train_chunk(X_im1, y_im1))

        X_i, y_i = await task_preprocess
        models.extend(await task_train)

        X_im1 = X_i
        y_im1 = y_i
        chunkim1 = chunki
        chunki = reader.read_example()
        i += 1

    # Postamble: train on the last preprocessed chunk.
    if chunkim1 is not None:
        logging.debug('y{}m1={}'.format(i, y_im1))
        models.extend(await self.__train_chunk(X_im1, y_im1))

    return models
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights, subsample_ratio_on_read):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset
                                    should be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        dataset = [mlio.SageMakerPipe(pipe_path, fifo_id=0)]
        reader = mlio.CsvReader(dataset=dataset,
                                batch_size=BATCH_SIZE,
                                header_row_index=None,
                                subsample_ratio=subsample_ratio_on_read)

        # Check if data is present in reader
        if reader.peek_example() is None:
            return None

        batches = []
        for example in reader:
            # Stack the columns of the mini-batch into a 2-D array.
            batch = np.column_stack([as_numpy(f) for f in example])
            batches.append(batch)

        data = np.vstack(batches)
        del batches

        if csv_weights == 1:
            dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
        else:
            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

        return dmatrix
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))