Example #1
File: __init__.py  Project: ssokhey/CNTK
def nce_loss(weights, biases, inputs, labels, noise_distribution, num_samples=32, allow_duplicates=True, seed=auto_select, name=''):
    '''nce_loss(weights, biases, inputs, labels, noise_distribution, num_samples=32, allow_duplicates=True, seed=auto_select, name='')
    Computes the noise contrastive estimation loss. This implementation mostly
    follows Chris Dyer's notes [1]. At a high level, this layer draws
    `num_samples` random labels from `noise_distribution` and then forms
    `num_samples`+1 binary classification problems where the true label is
    considered a positive example and the random labels are considered negative
    examples. The negatives are shared among all the examples in the
    minibatch. This operation only computes the logits for the labels in the
    minibatch and the random labels drawn from `noise_distribution`. The
    gradients will be sparse if the labels are sparse.

    The `noise_distribution` is read once and certain quantities are
    precomputed based on it. This operation will need to be reinstantiated if
    the `noise_distribution` changes.

    Shape inference for the weights is currently not supported when inputs are
    placeholders. Either a concrete input must be used or the weights must be
    provided without any inferred dimensions.

    Example:
        >>> import scipy
        >>> # dimensions of input, number of noise labels, batch size, number of classes
        >>> xdim = 10
        >>> samples = 32
        >>> batch = 4
        >>> classes = 100
        >>> # some variables; typically x will be the output of a layer
        >>> x = C.input_variable(xdim)
        >>> y = C.input_variable(classes, is_sparse=True)
        >>> # dummy data
        >>> x0 = np.arange(batch * xdim, dtype=np.float32).reshape((batch, xdim))/(batch * xdim)
        >>> data = np.ones(batch, dtype=np.float32)
        >>> indices = list(range(10, 10*batch+1, 10))
        >>> indptr = list(range(batch + 1))
        >>> y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))
        >>> # a dummy noise distribution
        >>> q = np.arange(classes, dtype=np.float32) + 1 # normalization not necessary
        >>> # the parameters
        >>> b = C.parameter((classes, 1), init=-np.log(classes))
        >>> W = C.parameter((classes, C.InferredDimension), init=C.glorot_uniform(seed=98052))
        >>> # the loss
        >>> loss = C.nce_loss(W, b, x, y, q, seed=98052)
        >>> # evaluate the loss at our dummy data
        >>> np.round(loss.eval({x:x0, y:y0}), decimals=3)
        array([ 2.385,  3.035,  3.886,  3.868], dtype=float32)
        >>> # after training, use the logits for predictions
        >>> logits = C.times_transpose(x, W) + C.reshape(b, -1)

    Args:
        weights: parameter (or variable in general) containing the weights with
         which inputs will be multiplied. Its shape must be
         (number of classes, dimension of input)
        biases: parameter (or variable in general) containing the biases that
         will be added to the product of weights and inputs. Its shape must be
         (number of classes, 1)
        inputs: vector of inputs to this layer. Multiplying by the weights and
         adding the biases gives the logits.
        labels: a one-hot vector with the ground-truth labels.
        noise_distribution: a constant vector with dimension equal to the number
         of classes. The entries must be positive numbers but do not have to
         sum to 1. Random labels will be drawn according to the normalized
         distribution.
        num_samples: number of random labels that will be drawn from the
         `noise_distribution`.
        allow_duplicates: boolean. If True (default), the random labels can
         contain duplicates. Compared to `allow_duplicates=False` it is faster
         but the quality of the approximations is slightly worse for the same
         number of samples.
        seed: random seed. The default value selects a unique random seed.
        name (str, optional): the name of the Function instance in the network
    Returns:
        :class:`~cntk.ops.functions.Function`

    See also:
        [1] C. Dyer. `Notes on Noise Contrastive Estimation and Negative Sampling [pdf] <http://demo.clab.cs.cmu.edu/cdyer/nce_notes.pdf>`_.
    '''
    from cntk.cntk_py import nce_loss  # low-level binding to the C++ implementation
    # coerce all operands to a common dtype; plain numpy data is wrapped as CNTK values
    dtype = get_data_type(inputs, labels, noise_distribution)
    inputs = sanitize_input(inputs, dtype)
    labels = sanitize_input(labels, dtype)
    noise_distribution = sanitize_input(noise_distribution, dtype)
    return nce_loss(weights, biases, inputs, labels, noise_distribution,
                    num_samples, allow_duplicates, seed, name)
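
As a worked illustration of the objective described in the docstring (one positive binary problem for the true label plus `num_samples` shared negative problems for the noise labels, each scored by the logit shifted by log(num_samples * q(label)), following Dyer's notes [1]), here is a minimal numpy sketch. It is not CNTK's actual implementation, and the helper names (nce_loss_reference, W_np, b_np, x_np, q_np) are hypothetical.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def nce_loss_reference(W_np, b_np, x_np, true_label, q_np, noise_labels):
    # W_np: (classes, xdim), b_np: (classes,), x_np: (xdim,)
    # q_np: positive (not necessarily normalized) noise distribution over classes
    k = len(noise_labels)
    q = q_np / q_np.sum()                          # normalize the noise distribution
    logits = W_np @ x_np + b_np                    # unnormalized class scores
    shifted = lambda c: logits[c] - np.log(k * q[c])
    loss = -np.log(sigmoid(shifted(true_label)))   # the one positive example
    for c in noise_labels:                         # the shared negative examples
        loss -= np.log(sigmoid(-shifted(c)))
    return loss
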
Example #2
def nce_loss(weights,
             biases,
             inputs,
             labels,
             noise_distribution,
             num_samples=32,
             allow_duplicates=True,
             seed=auto_select,
             name=''):
    '''nce_loss(weights, biases, inputs, labels, noise_distribution, num_samples=32, allow_duplicates=True, seed=auto_select, name='')
    Computes the noise contrastive estimation loss. This implementation mostly
    follows Chris Dyer's notes [1]. At a high level, this layer draws
    `num_samples` random labels from `noise_distribution` and then forms
    `num_samples`+1 binary classification problems where the true label is
    considered a positive example and the random labels are considered negative
    examples. The negatives are shared among all the examples in the
    minibatch. This operation only computes the logits for the labels in the
    minibatch and the random labels drawn from `noise_distribution`. The
    gradients will be sparse if the labels are sparse.

    The `noise_distribution` is read once and certain quantities are
    precomputed based on it. This operation will need to be reinstantiated if
    the `noise_distribution` changes.

    Shape inference for the weights is currently not supported when inputs are
    placeholders. Either a concrete input must be used or the weights must be
    provided without any inferred dimensions.

    Example:
        >>> import scipy
        >>> # dimensions of input, number of noise labels, batch size, number of classes
        >>> xdim = 10
        >>> samples = 32
        >>> batch = 4
        >>> classes = 100
        >>> # some variables; typically x will be the output of a layer
        >>> x = C.input_variable(xdim)
        >>> y = C.input_variable(classes, is_sparse=True)
        >>> # dummy data
        >>> x0 = np.arange(batch * xdim, dtype=np.float32).reshape((batch, xdim))/(batch * xdim)
        >>> data = np.ones(batch, dtype=np.float32)
        >>> indices = list(range(10, 10*batch+1, 10))
        >>> indptr = list(range(batch + 1))
        >>> y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))
        >>> # a dummy noise distribution
        >>> q = np.arange(classes, dtype=np.float32) + 1 # normalization not necessary
        >>> # the parameters
        >>> b = C.parameter((classes, 1), init=-np.log(classes))
        >>> W = C.parameter((classes, C.InferredDimension), init=C.glorot_uniform(seed=98052))
        >>> # the loss
        >>> loss = C.nce_loss(W, b, x, y, q, seed=98052)
        >>> # evaluate the loss at our dummy data
        >>> np.round(loss.eval({x:x0, y:y0}), decimals=3)
        array([ 2.385,  3.035,  3.886,  3.868], dtype=float32)
        >>> # after training, use the logits for predictions
        >>> logits = C.times(W, C.reshape(x, (xdim, 1))) + b

    Args:
        weights: parameter (or variable in general) containing the weights with
         which inputs will be multiplied. Its shape must be
         (number of classes, dimension of input)
        biases: parameter (or variable in general) containing the biases that
         will be added to the product of weights and inputs. Its shape must be
         (number of classes, 1)
        inputs: vector of inputs to this layer. Multiplying by the weights and
         adding the biases gives the logits.
        labels: a one-hot vector with the ground-truth labels.
        noise_distribution: a constant vector with dimension equal to the number
         of classes. The entries must be positive numbers but do not have to
         sum to 1. Random labels will be drawn according to the normalized
         distribution.
        num_samples: number of random labels that will be drawn from the
         `noise_distribution`.
        allow_duplicates: boolean. If True (default), the random labels can
         contain duplicates. Compared to `allow_duplicates=False` it is faster
         but the quality of the approximations is slightly worse for the same
         number of samples.
        seed: random seed. The default value selects a unique random seed.
        name (str, optional): the name of the Function instance in the network
    Returns:
        :class:`~cntk.ops.functions.Function`

    See also:
        [1] C. Dyer. `Notes on Noise Contrastive Estimation and Negative Sampling [pdf] <http://demo.clab.cs.cmu.edu/cdyer/nce_notes.pdf>`_.
    '''
    from cntk.cntk_py import nce_loss  # low-level binding to the C++ implementation
    # coerce all operands to a common dtype; plain numpy data is wrapped as CNTK values
    dtype = get_data_type(inputs, labels, noise_distribution)
    inputs = sanitize_input(inputs, dtype)
    labels = sanitize_input(labels, dtype)
    noise_distribution = sanitize_input(noise_distribution, dtype)
    return nce_loss(weights, biases, inputs, labels, noise_distribution,
                    num_samples, allow_duplicates, seed, name)
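
Both examples end by noting that, after training, the full logits W x + b are used for prediction. As a hedged sketch of that step outside the CNTK graph (W_np, b_np, x_np are hypothetical numpy copies of the trained W, b and one input sample), prediction is just a softmax over all classes:

import numpy as np

def predict(W_np, b_np, x_np):
    logits = W_np @ x_np + b_np.reshape(-1)        # shape: (classes,)
    z = np.exp(logits - logits.max())              # numerically stable softmax
    probs = z / z.sum()
    return int(probs.argmax()), probs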