def sanitize_batch(batch, data_type=None, dev=None):
    """
    Convert to Value with `data_type`. If the samples in `batch` have
    different sequence lengths, pad them to max sequence length and create a
    mask.

    Args:
        batch (list of NumPy arrays): input. A `Value` instance is returned
            unchanged.
        data_type: NumPy dtype handed to `np.asarray`. It is only used when
            `batch` is still not a NumPy array after padding.
        dev: device that is passed on to `NDMask` and
            `create_NDArrayView_from_NumPy`.

    Returns:
        converted batch

    Raises:
        ValueError: if `batch` is empty, or if it is not a sized collection
            of sized sequences.
    """
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    # Both `len` calls fail with TypeError on unsized input. Report that as a
    # regular error instead of dropping into an interactive debugger.
    try:
        num_seq = len(batch)
        seq_lens = [len(seq) for seq in batch]
    except TypeError:
        raise ValueError('expected an object of type Value or a sequence of '
                         'sequences and not "%s"' % type(batch))

    # Reject empty input before `max(seq_lens)` below, which would otherwise
    # fail with an unrelated "max() arg is an empty sequence" message.
    if num_seq == 0:
        raise ValueError('batch is empty')

    use_mask = len(set(seq_lens)) != 1

    if use_mask:
        # If not all sequences are of the same length, we have to pad them to
        # the same length and create a mask over the original data.
        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), dev)
        for idx, seq_len in enumerate(seq_lens):
            mask.mask_section((seq_len, idx),
                              (cntk_py.InferredDimension, 1))

    # Then we pad the batch to rectangular shape
    if isinstance(batch, list):
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        batch = np.asarray(batch, dtype=data_type)

    ndav = create_NDArrayView_from_NumPy(batch, dev)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
def sanitize_batch(batch, data_type=None, dev=None):
    """
    Convert to Value with `data_type`. If the samples in `batch` have
    different sequence lengths, pad them to max sequence length and create a
    mask.

    Args:
        batch (list of NumPy arrays): input. If it already is a `Value`, it
            is returned as is.
        data_type: NumPy dtype used for the fallback `np.asarray` conversion,
            i.e. only when `batch` is not a NumPy array after padding.
        dev: device forwarded to `NDMask` and
            `create_NDArrayView_from_NumPy`.

    Returns:
        converted batch

    Raises:
        ValueError: if `batch` is empty, or if `batch` or one of its elements
            has no length.
    """
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    # `len` raises TypeError for unsized objects; turn that into a proper
    # error for the caller rather than opening a debugger session.
    try:
        num_seq = len(batch)
        seq_lens = [len(seq) for seq in batch]
    except TypeError:
        raise ValueError('expected an object of type Value or a sequence of '
                         'sequences and not "%s"' % type(batch))

    # The emptiness check has to come before `max(seq_lens)`; an empty batch
    # would otherwise surface as "max() arg is an empty sequence".
    if num_seq == 0:
        raise ValueError('batch is empty')

    use_mask = len(set(seq_lens)) != 1

    if use_mask:
        # If not all sequences are of the same length, we have to pad them to
        # the same length and create a mask over the original data.
        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), num_seq), dev)
        for idx, seq_len in enumerate(seq_lens):
            mask.mask_section((seq_len, idx),
                              (cntk_py.InferredDimension, 1))

    # Then we pad the batch to rectangular shape
    if isinstance(batch, list):
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        batch = np.asarray(batch, dtype=data_type)

    ndav = create_NDArrayView_from_NumPy(batch, dev)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
    '''
    Convert to :class:`cntk.cntk_py.Value` with ``data_type``. If the samples
    in ``batch`` have different sequence lengths, pad them to max sequence
    length and create a mask.

    Args:
        var (:class:`cntk.ops.variables.Variable`): variable node for which
         the ``batch`` is meant
        batch (`list` of NumPy arrays): input. A
         :class:`cntk.cntk_py.Value` is returned unchanged.
        seq_starts (`list` of `bool` or `None`): if `None`, every sequence is
         treated as a new sequence. Otherwise, it is interpreted as a list of
         Booleans that tell whether a sequence is a new sequence (`True`) or a
         continuation of the previous one (`False`)
        data_type: NumPy dtype used when ``batch`` has to be converted with
         ``np.asarray``; if `None`, it is obtained from ``get_data_type(var)``
        device: device passed on to ``NDMask`` and
         ``create_NDArrayView_from_NumPy``; if `None`, the default device is
         used

    Returns:
        :class:`cntk.cntk_py.Value`: converted batch

    Raises:
        ValueError: if a NumPy ``batch`` is neither integer, float32 nor
         float64, if ``seq_starts`` is given although no mask is used, if a
         list ``batch`` is empty, or if the converted batch is a 0-d array.
    '''
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    use_mask = False

    if isinstance(batch, np.ndarray):
        # `np.integer` covers every integer width. The former `np.int` alias
        # only matched the platform default and no longer exists in current
        # NumPy releases.
        if np.issubdtype(batch.dtype, np.integer):
            batch = batch.astype(np.float32)
        elif batch.dtype not in (np.float32, np.float64):
            raise ValueError('only float32 and float64 are supported')
    elif isinstance(batch, list):
        if is_tensor_list(batch):
            use_mask = len(var.dynamic_axes) > 1

    if device is None:
        device = cntk_py.DeviceDescriptor.use_default_device()

    if not use_mask and seq_starts is not None:
        raise ValueError('specification of individual sequence begins does not'
                         ' make sense when not using the sequence axis')

    # Reject an empty list before `max(seq_lens)` below can fail on it with
    # an unrelated message.
    if isinstance(batch, list) and not batch:
        raise ValueError('batch is empty')

    # Use the mask, if we have additional dynamic axes besides the batch axis
    if use_mask:
        # `use_mask` is only ever set for lists, so `len` is safe here.
        seq_lens = [len(seq) for seq in batch]

        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), len(batch)), device)
        for idx, seq_len in enumerate(seq_lens):
            if seq_starts is None or seq_starts[idx]:
                mask.mark_sequence_begin((0, idx))
            mask.invalidate_section((seq_len, idx),
                                    (cntk_py.InferredDimension, 1))

    # Then we pad the batch to rectangular shape
    if isinstance(batch, list):
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        if data_type is None:
            data_type = get_data_type(var)
        batch = np.asarray(batch, dtype=data_type)

    # Integer data (e.g. produced by padding or by the conversion above) is
    # converted to float32.
    if np.issubdtype(batch.dtype, np.integer):
        batch = batch.astype(np.float32)

    # NOTE(review): the previous check tested an undefined `cntk_shape` and
    # raised NameError. Rejecting 0-d input is the closest check that can be
    # derived from `batch` alone - confirm against the intended shape rule.
    if batch.ndim == 0:
        raise ValueError('values should be an array of input samples')

    ndav = create_NDArrayView_from_NumPy(batch, device)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)
def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
    '''
    Convert to :class:`cntk.cntk_py.Value` with ``data_type``. If the samples
    in ``batch`` have different sequence lengths, pad them to max sequence
    length and create a mask.

    Args:
        var (:class:`cntk.ops.variables.Variable`): variable node for which
         the ``batch`` is meant
        batch (`list` of NumPy arrays): input. A
         :class:`cntk.cntk_py.Value` is returned unchanged.
        seq_starts (`list` of `bool` or `None`): if `None`, every sequence is
         treated as a new sequence. Otherwise, it is interpreted as a list of
         Booleans that tell whether a sequence is a new sequence (`True`) or a
         continuation of the previous one (`False`)
        data_type: NumPy dtype used when ``batch`` has to be converted with
         ``np.asarray``; if `None`, it is obtained from ``get_data_type(var)``
        device: device passed on to ``NDMask`` and
         ``create_NDArrayView_from_NumPy``; if `None`, the result of
         ``use_default_device()`` is used

    Returns:
        :class:`cntk.cntk_py.Value`: converted batch

    Raises:
        ValueError: if a NumPy ``batch`` is neither integer, float32 nor
         float64, if ``seq_starts`` is given although no mask is used, if a
         list ``batch`` is empty, or if the converted batch is a 0-d array.
    '''
    from ..cntk_py import Value

    if isinstance(batch, Value):
        return batch

    use_mask = False

    if isinstance(batch, np.ndarray):
        # Match every integer dtype via `np.integer`. The `np.int` alias used
        # before matched only the platform default and has been removed from
        # current NumPy releases.
        if np.issubdtype(batch.dtype, np.integer):
            batch = batch.astype(np.float32)
        elif batch.dtype not in (np.float32, np.float64):
            raise ValueError('only float32 and float64 are supported')
    elif isinstance(batch, list):
        if is_tensor_list(batch):
            use_mask = len(var.dynamic_axes) > 1

    if device is None:
        device = use_default_device()

    if not use_mask and seq_starts is not None:
        raise ValueError('specification of individual sequence begins does not'
                         ' make sense when not using the sequence axis')

    # An empty list must be rejected here, ahead of `max(seq_lens)`, so the
    # caller gets the intended message.
    if isinstance(batch, list) and not batch:
        raise ValueError('batch is empty')

    # Use the mask, if we have additional dynamic axes besides the batch axis
    if use_mask:
        # `use_mask` can only be True for list input, so `len` cannot fail.
        seq_lens = [len(seq) for seq in batch]

        from cntk.cntk_py import NDMask
        mask = NDMask((max(seq_lens), len(batch)), device)
        for idx, seq_len in enumerate(seq_lens):
            if seq_starts is None or seq_starts[idx]:
                mask.mark_sequence_begin((0, idx))
            mask.invalidate_section((seq_len, idx),
                                    (cntk_py.InferredDimension, 1))

    # Then we pad the batch to rectangular shape
    if isinstance(batch, list):
        batch = pad_to_dense(batch)

    # If it still is not an NumPy array, try brute force...
    if not isinstance(batch, np.ndarray):
        if data_type is None:
            data_type = get_data_type(var)
        batch = np.asarray(batch, dtype=data_type)

    # Integer data (e.g. produced by padding or by the conversion above) is
    # converted to float32.
    if np.issubdtype(batch.dtype, np.integer):
        batch = batch.astype(np.float32)

    # NOTE(review): this replaces a test on the undefined name `cntk_shape`,
    # which raised NameError. A 0-d array cannot hold samples, but the
    # originally intended shape rule is not visible here - please confirm.
    if batch.ndim == 0:
        raise ValueError('values should be an array of input samples')

    ndav = create_NDArrayView_from_NumPy(batch, device)

    if use_mask:
        return Value(ndav, mask)
    return Value(ndav)