Code Example #1
def test_list_alphabetical():
    """Test for checking 'list_files' returns alphabetically"""
    path = "test/data/csv/mock_datasplitter_output"
    mlio_list_files = list_files(path, pattern="*")
    alphabetical_files = []
    for file in ["excel.csv", "manual.csv", "newline.csv", "oneline.csv"]:
        alphabetical_files.extend(list_files(path + "/" + file, pattern="*"))
    assert mlio_list_files == alphabetical_files
Code Example #2
def _get_reader(source, batch_size):
    """Returns 'CsvReader' for the given source

       Parameters
       ----------
       source: str or bytes
           Name of the SageMaker Channel, File, or directory from which the data is being read or
           the Python buffer object from which the data is being read.

       batch_size : int
           The batch size in rows to read from the source.

       Returns
       -------
       mlio.CsvReader
           CsvReader configured with a SageMaker Pipe, File or InMemory buffer
       """
    dataset = _get_data(source)
    data_reader_params = mlio.DataReaderParams(dataset=dataset,
                                               batch_size=batch_size)
    csv_params = mlio.CsvParams(default_data_type=mlio.DataType.STRING,
                                header_row_index=None,
                                allow_quoted_new_lines=True)
    return mlio.CsvReader(data_reader_params=data_reader_params,
                          csv_params=csv_params)
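
A minimal usage sketch for `_get_reader` (assumed, not part of the original source): the directory path and batch size are placeholders, and the reader is consumed as an iterator over mini-batches whose columns are converted with mlio's `as_numpy` helper.

import numpy as np
from mlio.integ.numpy import as_numpy

# Hypothetical call site; the directory and batch size are illustrative only.
reader = _get_reader("test/data/csv/mock_datasplitter_output", batch_size=100)
for example in reader:
    # Each example is one mini-batch; each entry in it is one CSV column.
    batch = np.column_stack([as_numpy(feature) for feature in example])
    print(batch.shape)
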
Code Example #3
def get_recordio_protobuf_dmatrix(path,
                                  is_pipe=False,
                                  subsample_ratio_on_read=None):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)

        reader = mlio.RecordIOProtobufReader(
            dataset=dataset,
            batch_size=BATCH_SIZE,
            subsample_ratio=subsample_ratio_on_read)

        exm = reader.peek_example()
        if exm is None:
            return None

        # Recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(exm['values'], mlio.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_values = []
        all_labels = []
        for example in reader:
            values = to_matrix(example['values'])
            all_values.append(values)

            labels = as_numpy(example['label_values']).squeeze()
            all_labels.append(labels)

        all_values = vstack(all_values)
        all_labels = np.concatenate(all_labels)

        return xgb.DMatrix(all_values, label=all_labels)
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
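
A hedged usage sketch (the paths and the 0.5 ratio are placeholders): the function accepts a directory, a single file, or a SageMaker pipe, and `subsample_ratio_on_read` lets only part of the dataset be materialized in memory.

# Hypothetical call sites; paths and the subsample ratio are illustrative only.
dtrain = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train")
dsample = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train",
                                        subsample_ratio_on_read=0.5)
if dtrain is not None:
    print(dtrain.num_row(), dtrain.num_col())
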
Code Example #4
    async def __predict(self):
        if self.params.ml_lib == 'snap':
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor
        chunk_size = self.params.chunk_size
        dataset = mlio.list_files(self.params.dataset_test_path,
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))

        logging.debug('starting inference')
        score_norm = 0.0
        score = 0.0
        # preamble: read and preprocess the first chunk
        chunkim1 = reader.read_example()
        if chunkim1 is not None:
            X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)
        chunki = reader.read_example()
        i = 1
        logging.debug('chunk{}={}'.format(0, chunkim1))
        logging.debug('chunk{}={}'.format(i, chunki))
        while chunki is not None:
            logging.debug('chunk{}={}'.format(i, chunki))
            task_predict = asyncio.create_task(
                self.__predict_chunk(X_im1, y_im1))
            task_preprocess = asyncio.create_task(
                self.__preprocess_chunk(chunki))
            X_i, y_i = await task_preprocess
            s, n = await task_predict
            score += s
            score_norm += n
            X_im1 = X_i
            y_im1 = y_i
            chunkim1 = chunki
            chunki = reader.read_example()
            i += 1
        # postamble: score the final chunk
        if chunkim1 is not None:
            logging.debug('y{}m1={}'.format(i, y_im1))
            s, n = await self.__predict_chunk(X_im1, y_im1)
            score += s
            score_norm += n
        score /= score_norm
        return score
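
The method above overlaps preprocessing of chunk i with prediction on chunk i-1: read the next chunk, launch both tasks, await both, then shift. A stripped-down sketch of that pipelining pattern with plain asyncio; the stand-in coroutines are assumptions, not the original `__preprocess_chunk`/`__predict_chunk`.

import asyncio

async def preprocess(chunk):
    # Stand-in for __preprocess_chunk: split a raw chunk into (X, y).
    return chunk, chunk

async def predict(X, y):
    # Stand-in for __predict_chunk: return (score contribution, weight).
    return float(sum(X)), len(X)

async def pipelined_predict(chunks):
    it = iter(chunks)
    prev = next(it, None)
    if prev is None:
        return 0.0
    score = score_norm = 0.0
    X_prev, y_prev = await preprocess(prev)
    for chunk in it:
        # Predict on the previous chunk while the next one is being preprocessed.
        predict_task = asyncio.create_task(predict(X_prev, y_prev))
        preprocess_task = asyncio.create_task(preprocess(chunk))
        X_prev, y_prev = await preprocess_task
        s, n = await predict_task
        score += s
        score_norm += n
    # Postamble: the last chunk still needs to be scored.
    s, n = await predict(X_prev, y_prev)
    score += s
    score_norm += n
    return score / score_norm

print(asyncio.run(pipelined_predict([[1, 2], [3, 4], [5, 6]])))
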
Code Example #5
    async def __train_old(self):
        chunk_size = self.params.chunk_size
        dataset = mlio.list_files(self.params.dataset_path,
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        preproc_fn = self.params.preproc_fn
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))
        num_epochs = self.params.num_epochs  # Number of boosting rounds per chunk.
        # Learning rate (eta); the snap backend uses a larger step.
        eta = 0.01
        if self.params.ml_lib == 'snap':
            eta = 0.1
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor

        logging.debug('starting training')
        models = []
        # CsvReader is simply an iterator over mini-batches of data.
        for chunk_idx, chunk in enumerate(reader):
            rand_state = self.params.rand_state
            # Transform the mini-batch into a NumPy array.
            chunk_train_Xy = np.column_stack(
                [as_numpy(feature) for feature in chunk])
            chunk_train_X, chunk_train_y = preproc_fn(
                chunk_train_Xy, self.params.label_col_idx)
            if self.params.ml_lib == 'snap':
                bl = Booster(**self.params.ml_opts_dict)
                bl.fit(chunk_train_X, chunk_train_y)
                models.append(bl)
            else:
                z_train = np.zeros(chunk_train_X.shape[0])
                for epoch in range(num_epochs):
                    target = chunk_train_y - z_train
                    bl = DecisionTreeRegressor(max_depth=3,
                                               max_features='sqrt',
                                               random_state=rand_state)
                    bl.fit(chunk_train_X, target)
                    u_train = bl.predict(chunk_train_X)
                    z_train = z_train + eta * u_train
                    models.append(bl)
        return models
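
In the non-snap branch each tree is fit to the residual `chunk_train_y - z_train` and the running prediction advances by `eta` times the tree output, so inference has to sum `eta * tree.predict(X)` over every stored tree. A self-contained sketch of that stagewise scheme; the helper names and data below are illustrative, not from the original class.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_stagewise(X, y, num_epochs=10, eta=0.01, rand_state=0):
    models = []
    z = np.zeros(X.shape[0])
    for _ in range(num_epochs):
        tree = DecisionTreeRegressor(max_depth=3, max_features='sqrt',
                                     random_state=rand_state)
        tree.fit(X, y - z)             # fit the current residual
        z = z + eta * tree.predict(X)  # advance the running prediction
        models.append(tree)
    return models

def predict_stagewise(models, X, eta=0.01):
    # Predictions accumulate the same way the ensemble was fit.
    return eta * np.sum([m.predict(X) for m in models], axis=0)

X = np.random.rand(200, 5)
y = X @ np.arange(5)
models = fit_stagewise(X, y)
print(predict_stagewise(models, X[:3]))
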
Code Example #6
def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            pipes_path = path if isinstance(path, list) else [path]
            dataset = [
                mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path
            ]
        else:
            dataset = mlio.list_files(path)

        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        reader = mlio.RecordIOProtobufReader(reader_params)

        first_example = reader.peek_example()
        if first_example is not None:
            # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
            is_dense_tensor = isinstance(first_example['values'], mlio.DenseTensor)

            all_features = []
            all_labels = []
            for example in reader:
                if is_dense_tensor:
                    features = as_numpy(example['values'])
                else:
                    features = to_coo_matrix(example['values'])
                all_features.append(features)

                labels = as_numpy(example['label_values'])
                all_labels.append(labels)

            if is_dense_tensor:
                all_features = np.vstack(all_features)
            else:
                all_features = scipy_vstack(all_features).tocsr()
            all_labels = np.concatenate(all_labels, axis=None)
            dmatrix = xgb.DMatrix(all_features, label=all_labels)
            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
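
Unlike Code Example #3, this version accepts either a single pipe path or a list of pipe paths when `is_pipe=True`; the returned DMatrix then feeds directly into `xgb.train`. A hedged sketch with placeholder paths and hyperparameters:

import xgboost as xgb

# File mode: path is a file or directory (placeholder path).
dtrain = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train")

# Pipe mode: a single pipe or a list of pipes (placeholder paths).
dpipes = get_recordio_protobuf_dmatrix(
    ["/opt/ml/input/data/train_0", "/opt/ml/input/data/train_1"], is_pipe=True)

if dtrain is not None:
    booster = xgb.train({"objective": "reg:squarederror", "max_depth": 3},
                        dtrain, num_boost_round=10)
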
Code Example #7
    async def __train(self):
        chunk_size = self.params.chunk_size
        dataset = mlio.list_files(self.params.dataset_path,
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))
        num_epochs = self.params.num_epochs  # Number of times to read the full dataset.
        # Learning rate (eta); the snap backend uses a larger step.
        eta = 0.01
        if self.params.ml_lib == 'snap':
            eta = 0.1
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor

        logging.debug('starting training')
        models = []
        # preamble: read and preprocess the first chunk
        chunkim1 = reader.read_example()
        if chunkim1 is not None:
            X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)
        chunki = reader.read_example()
        i = 1
        logging.debug('chunk{}={}'.format(0, chunkim1))
        logging.debug('chunk{}={}'.format(i, chunki))
        while chunki is not None:
            logging.debug('chunk{}={}'.format(i, chunki))
            task_preprocess = asyncio.create_task(
                self.__preprocess_chunk(chunki))
            task_train = asyncio.create_task(self.__train_chunk(X_im1, y_im1))
            X_i, y_i = await task_preprocess
            models.extend(await task_train)
            X_im1 = X_i
            y_im1 = y_i
            chunkim1 = chunki
            chunki = reader.read_example()
            i += 1
        # postamble: train on the final chunk
        if chunkim1 is not None:
            logging.debug('y{}m1={}'.format(i, y_im1))
            models.extend(await self.__train_chunk(X_im1, y_im1))
        return models
Code Example #8
def test_list_recursive():
    """Test for checking 'list_files' lists recursively"""
    assert len(list_files("test/data/csv", pattern="*")) == 10
Code Example #9
def test_get_reader_mlio_file_object():
    """Test for getting a 'CsvReader' with a mlio.core.File object source"""
    source = "test/data/csv/mock_datasplitter_output"
    files = list_files(source, pattern="*")
    data = _get_data(source=files[0])
    assert isinstance(data[0], mlio_file)
Code Example #10
def _get_data(source):
    """Determines the input mode of the source and returns a InMemoryStore, SageMakerPipe, or File object
    based on the input mode.

    If source is a Python buffer, an mlio.core.InMemoryStore will be returned.

    If the SM_INPUT_DATA_CONFIG environment variable is not defined, source is assumed to be a file or directory
    and a list of mlio.core.File objects will be returned.

    If the SM_INPUT_DATA_CONFIG environment variable is defined, source can be the name of a channel in
    SM_INPUT_DATA_CONFIG. If the source is a path, it is assumed that the basename of the path is the name of the
    channel. The type of mlio.core object to be returned will be based on the "TrainingInputMode" of the channel.

    Here is an example of SM_INPUT_DATA_CONFIG with two channels ("code" and "train").
    SM_INPUT_DATA_CONFIG=
    {
        "code": {
            "ContentType": "application/x-code",
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        },
        "train": {
            "ContentType": "text/csv",
            "RecordWrapperType": "None",
            "S3DistributionType": "ShardedByS3Key",
            "TrainingInputMode": "File"
        }
    }

    Parameters
    ----------
    source: str or bytes
        Name of the SageMaker Channel, File, or directory from which the data is being read or
        the Python buffer object from which the data is being read.

    Returns
    -------
    list of mlio.core.File:
        A list of mlio.core.File objects is returned for the file or directory described by `source`.

    mlio.core.SageMakerPipe:
        In SageMaker framework containers, the inputdataconfig.json is made available via the environment
        variable 'SM_INPUT_DATA_CONFIG'. When the given source corresponds to a 'Pipe' channel, the value of
        that environment variable is used to read the 'TrainingInputMode' and confirm that the source is a
        'Pipe'. Then an `mlio.SageMakerPipe` object is created from `source` and returned in a
        single-element list.

    mlio.core.InMemoryStore:
        Given that `source` is a Python buffer, an mlio.InMemoryStore object is created and returned in a
        single-element list.
    """
    if isinstance(source, bytes):
        return [mlio.InMemoryStore(source)]

    if isinstance(source, mlio.core.File):
        source = source.id

    config = os.environ.get("SM_INPUT_DATA_CONFIG")

    if config is None:
        return mlio.list_files(source, pattern="*")

    channels = json.loads(config)

    source_channel_name = os.path.basename(source)
    try:
        channel_config = channels[source_channel_name]
    except KeyError:
        raise KeyError(
            "Configuration for channel name {} is not provided in SM_INPUT_DATA_CONFIG."
            .format(source_channel_name))

    try:
        data_config_input_mode = channel_config["TrainingInputMode"]
    except KeyError:
        raise KeyError(
            "SM_INPUT_DATA_CONFIG is malformed. TrainingInputMode is "
            "not found for channel name {}".format(source_channel_name))

    if data_config_input_mode == "Pipe":
        return [mlio.SageMakerPipe(source)]

    return mlio.list_files(source, pattern="*")  # 'File' mode
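
A short sketch of how the channel lookup behaves (assumed paths and channel names; the environment variable is set inline only for illustration):

import json
import os

# Without SM_INPUT_DATA_CONFIG, the source is treated as a file or directory.
files = _get_data("test/data/csv/mock_datasplitter_output")

# With SM_INPUT_DATA_CONFIG, the basename of the path selects the channel and its
# "TrainingInputMode" decides between File and Pipe handling.
os.environ["SM_INPUT_DATA_CONFIG"] = json.dumps(
    {"train": {"TrainingInputMode": "Pipe"}})
pipes = _get_data("/opt/ml/input/data/train")  # -> [mlio.SageMakerPipe(...)]
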