Example #1
0
def sanitize_batch(batch, data_type=None, dev=None):
    """
    Convert `batch` to a Value. If the samples in `batch` have different
    sequence lengths, pad them to max sequence length and create a mask.

    Args:
        batch (list of NumPy arrays, NumPy array or Value): input. A Value
         is returned unchanged.
        data_type: NumPy dtype handed to `np.asarray`; only used if `batch`
         still is not a NumPy array after the optional padding step
        dev: device that is passed on to the mask and to the NDArrayView

    Returns:
        converted batch

    Raises:
        ValueError: if `batch` is empty, or if one of its elements has no
         length (e.g. a batch of plain scalars)
    """
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    num_seq = len(batch)

    # Reject empty input up front. Otherwise max() over the (empty) sequence
    # lengths below would fail first with an unrelated error message.
    if num_seq == 0:
        raise ValueError('batch is empty')

    try:
        seq_lens = [len(seq) for seq in batch]
    except TypeError:
        # Elements without a length cannot be interpreted as sequences.
        raise ValueError('every element of batch has to be a sequence, '
                         'got "%s"' % type(batch))

    use_mask = len(set(seq_lens)) != 1
    if use_mask:
        # If not all sequences are of the same length, we have to pad them to
        # the same length and create a mask over the original data.
        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), dev)
        for idx, seq_len in enumerate(seq_lens):
            mask.mask_section((seq_len, idx), (cntk_py.InferredDimension, 1))

        # Then we pad the batch to rectangular shape
        if isinstance(batch, list):
            batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        batch = np.asarray(batch, dtype=data_type)

    ndav = create_NDArrayView_from_NumPy(batch, dev)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
Example #2
0
def sanitize_batch(batch, data_type=None, dev=None):
    """
    Convert `batch` to a Value. If the samples in `batch` have different
    sequence lengths, pad them to max sequence length and create a mask.

    Args:
        batch (list of NumPy arrays, NumPy array or Value): input. A Value
         is returned unchanged.
        data_type: NumPy dtype handed to `np.asarray`; only used if `batch`
         still is not a NumPy array after the optional padding step
        dev: device that is passed on to the mask and to the NDArrayView

    Returns:
        converted batch

    Raises:
        ValueError: if `batch` is empty, or if one of its elements has no
         length (e.g. a batch of plain scalars)
    """
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    num_seq = len(batch)

    # An empty batch has no sequence lengths to take the maximum of, so
    # report it here with a clear message instead of failing inside max().
    if num_seq == 0:
        raise ValueError('batch is empty')

    try:
        seq_lens = [len(seq) for seq in batch]
    except TypeError:
        # Library code must not drop into a debugger; signal bad input.
        raise ValueError('every element of batch has to be a sequence, '
                         'got "%s"' % type(batch))

    use_mask = len(set(seq_lens)) != 1
    if use_mask:
        # If not all sequences are of the same length, we have to pad them to
        # the same length and create a mask over the original data.
        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), dev)
        for idx, seq_len in enumerate(seq_lens):
            mask.mask_section((seq_len, idx), (cntk_py.InferredDimension, 1))

        # Then we pad the batch to rectangular shape
        if isinstance(batch, list):
            batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        batch = np.asarray(batch, dtype=data_type)

    ndav = create_NDArrayView_from_NumPy(batch, dev)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
Example #3
0
def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
    '''
    Convert to :class:`cntk.cntk_py.Value` with ``data_type``. If ``batch`` is
    a tensor list and ``var`` has dynamic axes besides the batch axis, the
    samples are padded to the max sequence length and a mask is created.

    Args:
        var (:class:`cntk.ops.variables.Variable`): variable node for which
         the ``batch`` is meant
        batch (`list` of NumPy arrays): input. A
         :class:`cntk.cntk_py.Value` is returned unchanged.
        seq_starts (`list` of `bool` or `None`): if `None`, every sequence is
         treated as a new sequence. Otherwise, it is interpreted as a list of
         Booleans that tell whether a sequence is a new sequence (`True`) or a
         continuation of the previous one (`False`)
        data_type: NumPy dtype used when ``batch`` has to be converted with
         ``np.asarray``; if `None`, it is derived from ``var``
        device: device for the mask and the array view; if `None`, the
         default device is used

    Returns:
        :class:`cntk.cntk_py.Value`: converted batch

    Raises:
        ValueError: if a NumPy array is neither of integer nor of
         float32/float64 type, if ``seq_starts`` is given although no mask
         is used, or if the batch to be masked is empty
    '''
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    use_mask = False

    if isinstance(batch, np.ndarray):
        # Integer input of any width is promoted to float32. (The former
        # `np.int` alias no longer exists in current NumPy releases.)
        if np.issubdtype(batch.dtype, np.integer):
            batch = batch.astype(np.float32)
        elif batch.dtype not in (np.float32, np.float64):
            raise ValueError('only float32 and float64 are supported')
    elif isinstance(batch, list):
        if is_tensor_list(batch):
            # Use the mask, if we have additional dynamic axes besides the
            # batch axis
            use_mask = len(var.dynamic_axes) > 1

    if device is None:
        device = cntk_py.DeviceDescriptor.use_default_device()

    if not use_mask and seq_starts is not None:
        raise ValueError('specification of individual sequence begins does not'
                         ' make sense when not using the sequence axis')

    if use_mask:
        # use_mask is only ever set for a list, so len() is safe here. Check
        # for emptiness before max() would fail with an unrelated message.
        num_seq = len(batch)
        if num_seq == 0:
            raise ValueError('batch is empty')

        seq_lens = [len(seq) for seq in batch]

        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), device)
        for idx, seq_len in enumerate(seq_lens):
            if seq_starts is None or seq_starts[idx]:
                mask.mark_sequence_begin((0, idx))
            mask.invalidate_section((seq_len, idx),
                                    (cntk_py.InferredDimension, 1))

        # Then we pad the batch to rectangular shape
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        if data_type is None:
            data_type = get_data_type(var)
        batch = np.asarray(batch, dtype=data_type)

    # Maybe an integer NumPy dtype was given (or inferred); then convert it
    # to float32
    if np.issubdtype(batch.dtype, np.integer):
        batch = batch.astype(np.float32)

    ndav = create_NDArrayView_from_NumPy(batch, device)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
Example #4
0
def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
    '''
    Convert to :class:`cntk.cntk_py.Value` with ``data_type``. If ``batch`` is
    a tensor list and ``var`` has dynamic axes besides the batch axis, the
    samples are padded to the max sequence length and a mask is created.

    Args:
        var (:class:`cntk.ops.variables.Variable`): variable node for which
         the ``batch`` is meant
        batch (`list` of NumPy arrays): input. A
         :class:`cntk.cntk_py.Value` is returned unchanged.
        seq_starts (`list` of `bool` or `None`): if `None`, every sequence is
         treated as a new sequence. Otherwise, it is interpreted as a list of
         Booleans that tell whether a sequence is a new sequence (`True`) or a
         continuation of the previous one (`False`)
        data_type: NumPy dtype used when ``batch`` has to be converted with
         ``np.asarray``; if `None`, it is derived from ``var``
        device: device for the mask and the array view; if `None`, the
         default device is used

    Returns:
        :class:`cntk.cntk_py.Value`: converted batch

    Raises:
        ValueError: if a NumPy array is neither of integer nor of
         float32/float64 type, if ``seq_starts`` is given although no mask
         is used, or if the batch to be masked is empty
    '''
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    use_mask = False

    if isinstance(batch, np.ndarray):
        # Any integer dtype is promoted to float32; `np.int` is gone from
        # current NumPy releases, so test against the abstract integer type.
        if np.issubdtype(batch.dtype, np.integer):
            batch = batch.astype(np.float32)
        elif batch.dtype not in (np.float32, np.float64):
            raise ValueError('only float32 and float64 are supported')
    elif isinstance(batch, list):
        if is_tensor_list(batch):
            # Use the mask, if we have additional dynamic axes besides the
            # batch axis
            use_mask = len(var.dynamic_axes) > 1

    if device is None:
        device = use_default_device()

    if not use_mask and seq_starts is not None:
        raise ValueError('specification of individual sequence begins does not'
                ' make sense when not using the sequence axis')

    if use_mask:
        # Only lists reach this branch, so len() cannot fail. The emptiness
        # check has to precede max(), which would raise on no lengths.
        num_seq = len(batch)
        if num_seq == 0:
            raise ValueError('batch is empty')

        seq_lens = [len(seq) for seq in batch]

        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), device)
        for idx, seq_len in enumerate(seq_lens):
            if seq_starts is None or seq_starts[idx]:
                mask.mark_sequence_begin((0, idx))
            mask.invalidate_section((seq_len, idx),
                                    (cntk_py.InferredDimension, 1))

        # Then we pad the batch to rectangular shape
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        if data_type is None:
            data_type = get_data_type(var)
        batch = np.asarray(batch, dtype=data_type)

    # Maybe an integer NumPy dtype was given (or inferred); then convert it
    # to float32
    if np.issubdtype(batch.dtype, np.integer):
        batch = batch.astype(np.float32)

    ndav = create_NDArrayView_from_NumPy(batch, device)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)