def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a character-level recurrent language model graph.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent state.
    vocab_dim : int
        Size of the character vocabulary.
    mode : str
        "lstm" for an LSTM transition; anything else builds a SimpleRecurrent.

    Returns
    -------
    (cg, layers, y_hat, cost)
        ``cg`` is the ComputationGraph of the mean cross-entropy cost,
        ``layers`` is the tuple (x, W, H, S, A, y).
    """
    # Symbolic inputs: integer matrices of (time, batch) character ids.
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')
    # Blocks' LSTM consumes inputs of width 4 * dim (the four gate
    # pre-activations are stacked), so the lookup table must be wider in
    # lstm mode.  This was the bug hinted at by the old commented-out
    # "dim = hidden_dim*4" line.
    lookup_dim = hidden_dim * 4 if mode == "lstm" else hidden_dim
    W = LookupTable(
        name="W1",
        dim=lookup_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")
    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)
    # LSTM.apply returns (states, cells); only the states feed the output
    # projection (this was the bug hinted at by the old "#[0]" comment).
    if mode == "lstm":
        hiddens = hiddens[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()
    cg = ComputationGraph(cost)
    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
def softmax_layer(h, y, frame_length, hidden_size):
    """Project hidden states onto frame logits and attach a softmax cost.

    Returns
    -------
    (y_hat, cost)
        The softmax output and the mean categorical cross-entropy
        against ``y``.
    """
    projection = Linear(name="hidden_to_output",
                        input_dim=hidden_size,
                        output_dim=frame_length)
    initialize([projection])
    logits = projection.apply(h)
    logits.name = "linear_output"
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(logits, extra_ndim=1)
    y_hat.name = "y_hat"
    cost = softmax.categorical_cross_entropy(y, logits, extra_ndim=1).mean()
    cost.name = "cost"
    return y_hat, cost
class NewSoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.
    """
    def __init__(self, initial_output=0, **kwargs):
        super(NewSoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
        self.name = 'newbidirectional'

    @application
    def probs(self, readouts):
        # Softmax over the last axis; any leading axes beyond the first two
        # are flattened away by NDimensionalSoftmax.
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emitProbs(self, readouts):
        # NOTE(review): the multinomial sample is computed but discarded and
        # only the flattened probabilities are returned — confirm intended.
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        return self.pvals_flat

    @application
    def emit(self, readouts):
        # Sample an index from the softmax distribution; also return the
        # probability the first batch element assigned to that index.
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        winning_index = generated.reshape(probs.shape).argmax(axis=-1)
        return winning_index, self.pvals_flat[0][winning_index]

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        # Fixed: previously called super(SoftmaxEmitter, self), which raises
        # a NameError (or binds the wrong class) — this class is
        # NewSoftmaxEmitter.
        return super(NewSoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    """Output layer: linear projection of ``h`` followed by a softmax.

    Returns the prediction ``y_hat`` and the mean cross-entropy ``cost``.
    """
    output_brick = Linear(name='hidden_to_output',
                          input_dim=hidden_size,
                          output_dim=vocab_size)
    initialize([output_brick])
    scores = output_brick.apply(h)
    scores.name = 'linear_output'
    softmax_brick = NDimensionalSoftmax()
    y_hat = softmax_brick.apply(scores, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = softmax_brick.categorical_cross_entropy(
        y, scores, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
class SoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Readout entries are treated as unnormalized energies; the index of a
    sampled entry is the emitted integer.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.
    """
    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emit(self, readouts):
        # Draw one multinomial sample per batch row and return its index.
        distribution = self.probs(readouts)
        flattened = distribution.reshape((distribution.shape[0], -1))
        sample = self.theano_rng.multinomial(pvals=flattened)
        return sample.reshape(distribution.shape).argmax(axis=-1)

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def costs(self, readouts):
        return -self.softmax.log_probabilities(
            readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return tensor.ones((batch_size,), dtype='int64') * self.initial_output

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        return super(SoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    """Attach a linear + softmax readout on top of hidden states ``h``.

    Returns
    -------
    y_hat
        Softmax probabilities, named 'y_hat'.
    cost
        Mean categorical cross-entropy versus ``y``, named 'cost'.
    """
    to_output = Linear(name='hidden_to_output',
                       input_dim=hidden_size,
                       output_dim=vocab_size)
    initialize([to_output])
    pre_activation = to_output.apply(h)
    pre_activation.name = 'linear_output'
    nd_softmax = NDimensionalSoftmax()
    y_hat = nd_softmax.apply(pre_activation, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = nd_softmax.categorical_cross_entropy(
        y, pre_activation, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a character-level recurrent language model graph.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent state.
    vocab_dim : int
        Size of the character vocabulary.
    mode : str
        "lstm" for an LSTM transition; anything else builds a SimpleRecurrent.

    Returns
    -------
    (cg, layers, y_hat, cost) where ``layers`` is (x, W, H, S, A, y).
    """
    # input: (time, batch) integer character ids
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')
    # Blocks' LSTM consumes inputs of width 4 * dim (stacked gate
    # pre-activations), so the lookup must be wider in lstm mode — this is
    # the fix hinted at by the old commented-out "dim = hidden_dim*4".
    lookup_dim = hidden_dim * 4 if mode == "lstm" else hidden_dim
    W = LookupTable(name="W1",
                    dim=lookup_dim,
                    length=vocab_dim,
                    weights_init=initialization.IsotropicGaussian(0.01),
                    biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(hidden_dim,
                 name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    S = Linear(name="W2",
               input_dim=hidden_dim,
               output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")
    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)
    # LSTM.apply returns (states, cells); only the states go forward — the
    # old "#[0]" comment marked exactly this problem.
    if mode == "lstm":
        hiddens = hiddens[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()
    cg = ComputationGraph(cost)
    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
class GMMMLP(Initializable):
    """An MLP brick that branches out to the sigma, mu and mixture weights
    of a GMM.

    Parameters
    ----------
    mlp : MLP brick
        The main MLP to wrap around.
    dim : int
        Output dimension of the mu/sigma heads.
    k : int
        Number of mixture components.
    const : float
        Small constant added to the mixture weights for stability.
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [self.mlp, self.mu, self.sigma,
                         self.coeff, self.coeff2]

    @application
    def apply(self, inputs):
        # One shared forward pass, then three independent heads.
        state = self.mlp.apply(inputs)
        raw_coeff = self.coeff.apply(state)
        mixture = self.coeff2.apply(
            raw_coeff, extra_ndim=state.ndim - 2) + self.const
        return self.mu.apply(state), self.sigma.apply(state), mixture

    @property
    def output_dim(self):
        return self.dim
class GMMMLP(Initializable):
    """Wraps an MLP with three heads producing the mu, sigma and mixture
    coefficients of a Gaussian mixture model.

    Parameters
    ----------
    mlp : MLP brick
        The main MLP to wrap around.
    dim : int
        Output dimension of the mu/sigma heads.
    k : int
        Number of mixture components.
    const : float
        Small constant added to the (softmax-normalized) mixture weights.
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        children = [self.mlp, self.mu, self.sigma]
        children.append(self.coeff)
        children.append(self.coeff2)
        self.children = children

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mean = self.mu.apply(state)
        scale = self.sigma.apply(state)
        # Normalize the mixture logits across components, then add `const`
        # so no component weight is exactly zero.
        weights = self.coeff2.apply(self.coeff.apply(state),
                                    extra_ndim=state.ndim - 2) + self.const
        return mean, scale, weights

    @property
    def output_dim(self):
        return self.dim
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the duration+syllable -> pitch sequence model.

    Parameters
    ----------
    input1_size : int
        Vocabulary size of the duration input (also the output size).
    input2_size : int
        Vocabulary size of the syllable input.
    lookup1_dim, lookup2_dim : int
        Embedding dimensions of the two lookup tables.
    hidden_size : int
        LSTM state dimension.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    # Fixed: Blocks' LSTM consumes inputs of width 4 * dim (the four gate
    # pre-activations are stacked), so the merge must emit that width —
    # previously it emitted hidden_size and the graph failed at runtime.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size * 4,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                           weights_init=initialization.Uniform(width=0.01))
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    # Fixed: LSTM.apply returns (states, cells); only the hidden states are
    # projected to the output vocabulary. Previously the whole pair was
    # handed to linear.apply.
    h, _ = recurrent_block.apply(m)
    a = linear.apply(h)

    y_hat = softmax.apply(a, extra_ndim=1)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
def softmax_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    """Build the softmax readout for prediction head ``pred``.

    Depending on the module-level ``connect_h_to_o`` flag, either every
    recurrent layer's hidden states (concatenated) or only the top layer
    feeds the output projection.  ``single_dim_out`` (module-level) selects
    the softmax's ``extra_ndim``.  Returns (y_hat, cost).
    """
    layer_name = 'hidden_to_output' + str(pred)
    if connect_h_to_o:
        # All recurrent layers feed the output projection.
        hiddens = T.concatenate(list(h), axis=2)
        hidden_to_output = Linear(name=layer_name,
                                  input_dim=hidden_size * len(h),
                                  output_dim=out_size)
    else:
        # Only the last recurrent layer reaches the output.
        hiddens = h[-1]
        hidden_to_output = Linear(name=layer_name,
                                  input_dim=hidden_size,
                                  output_dim=out_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(hiddens)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    extra_ndim = 1 if single_dim_out else 2
    y_hat = softmax.apply(linear_output, extra_ndim=extra_ndim)
    cost = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=extra_ndim).mean()
    return y_hat, cost
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if
        standalone)
    dim : int
        The dimension of the RNNs states (including for def model if
        standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader: either 'LSTM' or 'mean'
    standalone_def_rnn : bool
        If `True`, a standalone RNN with separate word embeddings is used
        to embed definition. If `False` the language model is reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged
        If 'fully_connected_linear', a learned perceptron compose the 2
        embeddings linearly
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...
    """
    def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
                 def_num_input_words, num_output_words, vocab,
                 retrieval=None, def_reader='LSTM',
                 standalone_def_lookup=True, standalone_def_rnn=True,
                 disregard_word_embeddings=False, compose_type='sum',
                 very_rare_threshold=[10], cache_size=0, **kwargs):
        # NOTE(review): `very_rare_threshold=[10]` is a mutable default
        # argument — shared across instances; consider a None sentinel.
        # TODO(tombosc): document
        # Zero-valued dims/vocab sizes fall back to sensible defaults.
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        # Sharing the main lookup requires identical vocabulary sizes.
        if (num_input_words != def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            #TODO(tombosc) do we implement cache as LookupTable or theano matrix?
            #self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size, emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        # The fork projects embeddings to the 4 * dim gate inputs of the LSTM.
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])
        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError("emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup
            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    # Reuse the LM's own fork/RNN to read definitions.
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim, dim,
                                                       vocab, lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)
            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words, emb_def_dim, dim, vocab, lookup,
                    translate=(emb_def_dim != dim), normalize=False)
            else:
                raise Exception("def reader not understood")
            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        # Cache embeddings start at zero; they are filled during training.
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        # Overwrite the definition-reader lookup weights in place.
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        # Per-sequence negative log-likelihood, plus a monitored perplexity
        # aggregated over the masked positions.
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(
            costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of
            time steps, B is the batch size. Note that this order of the
            axes is different from what all RNN bricks consume, hence the
            axes are transposed below.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)
            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(
                def_embeddings.shape[0], name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting: ids past the vocabulary limits are mapped to UNK
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(unk_ratio(
            input_word_ids, mask, self._vocab.unk), name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask), name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            # NOTE(review): `positions`/`updated` only exist when
            # self._retrieval is set — the cache appears to presuppose
            # retrieval; confirm against callers.
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(
                     self._cache.W[flat_word_ids_to_update], updated))]

        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask), name='main_rnn_in_RMS')

        # Transpose to (T, B, dim) as the RNN brick expects.
        main_rnn_states = self._main_rnn.apply(tensor.transpose(
            self._main_fork.apply(rnn_inputs), (1, 0, 2)), mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        # Perplexities restricted to positions with/without a word embedding.
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            # Positions whose word occurs fewer than `threshold` times.
            very_rare_mask = tensor.lt(word_counts,
                                       threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            # Positions for which at least one definition was retrieved.
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(
                mask_targets_has_def.T, name='mask_def_emb')

        return costs, updates
# NOTE(review): script fragment — `rnn`, `lookup_input`, `linear_input`,
# `x`, `y`, `hidden_layer_dim` and `charset_size` are defined earlier,
# outside this excerpt.
rnn.initialize()
linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()
softmax = NDimensionalSoftmax(name='ndim_softmax')

# Wire the graph: lookup -> linear -> RNN -> output projection -> softmax.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)
cost = softmax.categorical_cross_entropy(
    y, activation_output, extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
# NOTE(review): the statement below is truncated in the original source.
algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
class ActorCriticReadout(SoftmaxReadout):
    """Actor-critic readout: combines an actor's softmax policy with a
    critic that predicts Q-values, and produces the joint training cost.

    Params
    ------
    bos_token : int
        The token used to pad critic input. Critic needs to do at least
        one extra step compared to the actor in order to get the first
        glimpse of the ground-truth sequence before predicting the actual
        values.
    """
    def __init__(self, reward_brick, compute_targets, compute_policy,
                 solve_bellman, freeze_actor, freeze_critic,
                 critic_uses_actor_states, critic_uses_groundtruth,
                 critic=None, critic_burnin_steps=None,
                 critic_policy_t=None, entropy_reward_coof=None,
                 cross_entropy_reward_coof=None, discount=None,
                 value_penalty=None, value_softmax=False,
                 same_value_for_wrong=False, accumulate_outputs=False,
                 use_value_biases=None, actor_grad_estimate=None,
                 bos_token=None, **kwargs):
        super(ActorCriticReadout, self).__init__(**kwargs)
        self.reward_brick = reward_brick
        self.critic = critic
        self.freeze_actor = freeze_actor
        self.freeze_critic = freeze_critic
        self.critic_uses_actor_states = critic_uses_actor_states
        # None-valued flags fall back to documented defaults below.
        self.critic_uses_groundtruth = (
            critic_uses_groundtruth
            if critic_uses_groundtruth is not None else True)
        self.critic_burnin_steps = (
            critic_burnin_steps if critic_burnin_steps is not None else 0)
        # Learns a per-example additive bias for all Q-values from the
        # attended sequence.
        self.value_summand = Linear(output_dim=1, name='summand')
        self.softmax_t = 1.
        self.critic_policy_t = (
            critic_policy_t if critic_policy_t is not None else 1.0)
        self.epsilon = 0.
        self.discount = (discount if discount is not None else 1.)
        self.entropy_reward_coof = (
            entropy_reward_coof if entropy_reward_coof is not None else 0.)
        self.cross_entropy_reward_coof = (
            cross_entropy_reward_coof
            if cross_entropy_reward_coof is not None else 0.)
        self.value_penalty = value_penalty
        self.value_softmax = value_softmax
        self.same_value_for_wrong = same_value_for_wrong
        self.compute_targets = compute_targets
        self.compute_policy = compute_policy
        self.solve_bellman = solve_bellman
        self.accumulate_outputs = accumulate_outputs
        self.use_value_biases = (
            use_value_biases if use_value_biases is not None else True)
        self.actor_grad_estimate = (
            actor_grad_estimate if actor_grad_estimate else 'all_actions')
        self.bos_token = bos_token
        self.softmax = NDimensionalSoftmax()
        self.children += [reward_brick, self.value_summand, self.softmax]
        if self.critic:
            self.children.append(self.critic)
        self.costs.inputs += ['attended', 'attended_mask']

    def _push_allocation_config(self):
        super(ActorCriticReadout, self)._push_allocation_config()
        self.value_summand.input_dim = self.get_dim('attended')

    @application
    def scores(self, **inputs):
        # Temperature-scaled log-probabilities of the actor policy.
        merged = self.merge(**dict_subset(inputs, self.merge_names))
        return self.softmax.log_probabilities(
            merged * self.softmax_t, extra_ndim=merged.ndim - 2)

    @application
    def costs(self, application_call, prediction, prediction_mask,
              groundtruth, groundtruth_mask, **inputs):
        """Build the combined actor + critic training cost.

        Also registers a large number of auxiliary variables for
        monitoring and debugging.
        """
        def _prediction_subtensor(data):
            # Select, at each (time, batch) position, the component of
            # `data` indexed by the predicted token.
            if data.ndim != 3:
                raise ValueError
            flat_data = data.reshape(
                (data.shape[0] * data.shape[1], data.shape[2]))
            flat_data = flat_data[tensor.arange(flat_data.shape[0]),
                                  prediction.flatten()]
            return flat_data.reshape(
                (prediction.shape[0], prediction.shape[1]))

        attended = disconnected_grad(inputs.pop('attended'))
        attended_mask = disconnected_grad(inputs.pop('attended_mask'))

        # Compute the rewards
        rewards = self.reward_brick.apply(prediction, prediction_mask,
                                          groundtruth,
                                          groundtruth_mask)[:, :, 0]
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

        # Compute the critic outputs
        if self.critic:
            # Pad the prediction with one BOS step so the critic sees the
            # ground truth one step ahead of the actor.
            padding = tensor.repeat(
                tensor.fill(prediction[0:1], self.bos_token), 1, axis=0)
            mask_padding = tensor.repeat(
                tensor.fill(prediction_mask[0:1], 1.), 1, axis=0)
            padded_prediction = tensor.concatenate([padding, prediction])
            padded_prediction_mask = tensor.concatenate(
                [mask_padding, prediction_mask])
            if self.critic_uses_groundtruth:
                critic_context = groundtruth
                critic_context_mask = groundtruth_mask
            else:
                critic_context = tensor.zeros_like(groundtruth[0:1])
                critic_context_mask = tensor.zeros_like(
                    groundtruth_mask[0:1])
            critic_kwargs = dict(prediction=padded_prediction,
                                 prediction_mask=padded_prediction_mask,
                                 groundtruth=critic_context,
                                 groundtruth_mask=critic_context_mask,
                                 inputs=critic_context,
                                 inputs_mask=critic_context_mask)
            if self.critic_uses_actor_states:
                extra_inputs = disconnected_grad(inputs['states'])
                # We don't have the very last hidden state of the actor in
                # extra_inputs. We have to add something instead for the
                # shapes to match. It doesn't matter at all what exactly
                # we add.
                critic_kwargs['extra_inputs'] = tensor.concatenate(
                    [extra_inputs, tensor.zeros_like(extra_inputs[0:1])])
            critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
            outputs, = VariableFilter(
                applications=[self.critic.generator.readout.all_outputs],
                roles=[OUTPUT])(critic_cg)
            # The first subtensor should be discarded, because it was
            # outputted for the padding. In addition to that Q-values from
            # the first 'critic_burnin_steps' will be ignored, see later
            # in the code.
            outputs = outputs[1:]
        else:
            outputs = self.merge(**dict_subset(inputs, self.merge_names))
        prediction_outputs = _prediction_subtensor(outputs)

        # Compute Q adjustments
        adjustments = outputs
        prediction_adjustments = prediction_outputs
        if self.accumulate_outputs:
            prediction_adjustments = prediction_outputs.cumsum(axis=0)
            adjustments = tensor.inc_subtensor(
                adjustments[1:], prediction_adjustments[:-1][:, :, None])

        # Compute shared additive biases for all Q values
        if self.use_value_biases:
            value_biases = (
                self.value_summand.apply(attended)[:, :, 0] *
                attended_mask).sum(axis=0)
        else:
            value_biases = tensor.zeros_like(adjustments[0, :, 0])
        values = adjustments + value_biases[None, :, None]
        prediction_values = prediction_adjustments + value_biases[None, :]

        rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
        rolled_prediction_mask = tensor.set_subtensor(
            rolled_prediction_mask[-1], 0)

        # Compute probabilities
        logs = self.scores(use_epsilon=False, **inputs)
        probs = tensor.exp(logs)
        if not self.compute_policy:
            raise NotImplementedError("Not supported any more")
        prediction_logs = _prediction_subtensor(logs)

        # Compute value targets (one-step Bellman backup of the expected
        # next-step value plus the immediate reward).
        value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
        value_targets = tensor.roll(value_targets, -1, axis=0)
        value_targets = (self.discount * value_targets *
                         rolled_prediction_mask + rewards)
        value_targets = value_targets.astype(theano.config.floatX)

        total_costs = 0

        # Compute critic cost
        if not self.compute_targets:
            logger.debug("Using given targets")
            value_targets = tensor.matrix('value_targets')
        if self.solve_bellman == 'no':
            logger.debug("Not solving Bellman, just predicting the rewards")
            value_targets = rewards.copy(name='value_targets')
        elif self.solve_bellman == 'without_dp':
            future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
            logger.debug("Solving Bellman, but without DP")
            value_targets = future_rewards
        elif self.solve_bellman is not True:
            raise ValueError()
        critic_costs_per_char = (
            (prediction_values - value_targets) ** 2) * prediction_mask
        critic_costs = critic_costs_per_char[
            self.critic_burnin_steps:].sum(axis=0)
        if not self.freeze_critic:
            total_costs += critic_costs

        # Compute critic Monte-Carlo cost
        critic_monte_carlo_costs = (
            (((prediction_values - future_rewards) ** 2) *
             prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Value penalty
        if self.value_penalty:
            logger.debug("Use value penalty")
            value_deviations = (
                values - values.mean(axis=-1, keepdims=True)) ** 2
            if not self.freeze_critic:
                total_costs += (
                    self.value_penalty *
                    (value_deviations.sum(axis=-1) *
                     prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Compute actor cost
        if self.critic:
            # The actor cost will be minimized, that's why values
            # must be negated.
            est_name = self.actor_grad_estimate
            if est_name == 'all_actions':
                disadvantages = disconnected_grad(
                    values.max(axis=-1)[:, :, None] - values)
                actor_costs = ((probs * disadvantages).sum(axis=-1) *
                               prediction_mask)
                actor_costs = actor_costs[self.critic_burnin_steps:]
            elif est_name.startswith('1_action'):
                # Here we do not provide a target for the first step for
                # the reason we lack an estimate of the value of the
                # initial state. This is how our critic works.
                # Hopefully the network won't unlearn
                # to produce a BOS first.
                future_reward_estimate = (
                    future_rewards if est_name.endswith('unbiased')
                    else prediction_values)
                weights = -disconnected_grad(
                    future_reward_estimate[1:] + rewards[:-1] -
                    prediction_values[:-1])
                actor_costs = ((prediction_logs[1:] * weights) *
                               prediction_mask[1:])
                actor_costs = actor_costs[self.critic_burnin_steps + 1:]
            else:
                raise ValueError
            actor_costs = actor_costs.sum(axis=0)

            actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
            actor_entropies = actor_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            # Cross-entropy between the actor policy and the softmax of
            # the (temperature-scaled) critic values.
            critic_policy = disconnected_grad(
                self.softmax.apply(self.critic_policy_t * values,
                                   extra_ndim=1))
            critic_cross_entropies = (
                (critic_policy * -logs).sum(axis=-1) * prediction_mask)
            critic_cross_entropies = critic_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            actor_costs_with_penalties = (
                actor_costs -
                self.entropy_reward_coof * actor_entropies -
                self.cross_entropy_reward_coof * critic_cross_entropies)
            if not self.freeze_actor:
                total_costs += actor_costs_with_penalties
            else:
                total_costs += disconnected_grad(actor_costs_with_penalties)

        # Add auxiliary variables for intermediate steps of the computation
        application_call.add_auxiliary_variable(rewards, name='rewards')
        application_call.add_auxiliary_variable(value_biases,
                                                name='value_biases')
        application_call.add_auxiliary_variable(values.copy(), name='values')
        application_call.add_auxiliary_variable(outputs.copy(),
                                                name='outputs')
        application_call.add_auxiliary_variable(prediction_values,
                                                name='prediction_values')
        application_call.add_auxiliary_variable(prediction_outputs,
                                                name='prediction_outputs')
        application_call.add_auxiliary_variable(value_targets.copy(),
                                                name='value_targets')
        application_call.add_auxiliary_variable(probs.copy(), name='probs')
        application_call.add_auxiliary_variable(prediction_logs,
                                                name='prediction_log_probs')

        # Compute some statistics for debugging
        last_character_mask = prediction_mask - rolled_prediction_mask
        last_character_costs = (critic_costs_per_char *
                                last_character_mask).sum(axis=0)
        mean2_output = (
            ((prediction_outputs ** 2) * prediction_mask).sum() /
            prediction_mask.sum()) ** 0.5
        max_output = abs(prediction_outputs * prediction_mask).max()
        expected_reward = (probs[0] * values[0]).sum(axis=-1)
        application_call.add_auxiliary_variable(last_character_costs,
                                                name='last_character_costs')
        application_call.add_auxiliary_variable(critic_costs.mean(),
                                                name='mean_critic_cost')
        application_call.add_auxiliary_variable(
            critic_monte_carlo_costs.mean(),
            name='mean_critic_monte_carlo_cost')
        if self.critic:
            application_call.add_auxiliary_variable(actor_costs.mean(),
                                                    name='mean_actor_cost')
            application_call.add_auxiliary_variable(
                actor_entropies.mean(), name='mean_actor_entropy')
        application_call.add_auxiliary_variable(expected_reward.mean(),
                                                name='mean_expected_reward')
        application_call.add_auxiliary_variable(mean2_output,
                                                name='mean2_output')
        application_call.add_auxiliary_variable(max_output,
                                                name='max_output')
        return total_costs
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the duration+syllable -> pitch sequence model.

    Parameters
    ----------
    input1_size : int
        Vocabulary size of the duration input (also the output size).
    input2_size : int
        Vocabulary size of the syllable input.
    lookup1_dim, lookup2_dim : int
        Embedding dimensions of the two lookup tables.
    hidden_size : int
        LSTM state dimension.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    # Fixed: Blocks' LSTM expects inputs of width 4 * dim (stacked gate
    # pre-activations); the merge previously emitted hidden_size, which
    # fails with a dimension mismatch at graph evaluation.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size * 4,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(
        dim=self.hidden_size, activation=Tanh(),
        weights_init=initialization.Uniform(width=0.01))
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    # Fixed: LSTM.apply returns (states, cells); previously the whole
    # pair was handed to linear.apply.
    h, _ = recurrent_block.apply(m)
    a = linear.apply(h)

    y_hat = softmax.apply(a, extra_ndim=1)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Emits a real-valued frame of ``frame_size`` values per readout. The
    frame is produced in chunks of ``frnn_step_size`` by a small recurrent
    net (the "FRNN"): at each step, MLPs map the FRNN state to the
    parameters of a ``k``-component Gaussian mixture from which the chunk
    is sampled (``emit``) or scored (``cost``); the sampled chunk (or, in
    ``cost``, the ground-truth chunk) is fed through a linear transition to
    produce the next FRNN state.

    Parameters
    ----------
    mlp : brick
        Maps readouts into the FRNN input space; its ``output_dim``
        determines this emitter's input dimension.
    target_size : int
        Flattened target size used when reshaping for the NLL cost.
    frame_size : int
        Number of real values emitted per readout.
    k : int
        Number of Gaussian mixture components.
    frnn_hidden_size : int
        Dimension of the FRNN hidden state.
    frnn_step_size : int
        Number of values generated per FRNN step.
    const : float
        Small constant added to sigma and to the mixture weights for
        numerical stability.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size

        if self.last_steps != 0:
            self.number_of_steps += 1

        # Per-step mixture-parameter heads: means, scales (SoftPlus keeps
        # them positive) and mixture logits (softmax applied via coeff2).
        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        # Projects the (MLP-transformed) readout to the first FRNN state.
        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size,
            name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #     input_dim=frnn_hidden_size,
        #     output_dim=frnn_hidden_size,
        #     activation=Tanh(),
        #     name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        # State transition: next_state = tanh(W_s state + W_i chunk).
        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #     input_dim = frnn_hidden_size,
        #     output_dim = self.rnn_hidden_dim,
        #     name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """Sample one frame per readout.

        For each FRNN step: predict mixture parameters from the state,
        sample a component index from the mixture weights, sample the
        chunk from the chosen Gaussian, and feed the chunk back through
        the transition to get the next state. Returns the concatenated
        chunks truncated to ``frame_size``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),
                                      extra_ndim=state.ndim - 2) + self.const

            # Remember the incoming shape so the sampled chunk can be
            # restored to it (with the last axis set to frnn_step_size).
            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],
                                                self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            # Pick one mixture component per row.
            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            # Reparameterized Gaussian sample for the selected component.
            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0,
                                             dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1)
                                    + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Negative log-likelihood of ``outputs`` under the step-wise GMMs.

        Runs the FRNN with teacher forcing: the ground-truth chunk of
        ``outputs`` for each step drives the state transition. Collects
        per-step mixture parameters, aligns them with the frame and
        delegates to ``FRNN_NLL``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),
                                           extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # FIX: the teacher-forcing offset must advance with the step;
            # it was previously the constant `self.frnn_step_size`, so every
            # step consumed the same slice of the targets.
            index = i * self.frnn_step_size

            # Ground-truth chunk for this step drives the transition.
            # NOTE(review): on the final partial step (last_steps != 0) this
            # slice is shorter than frnn_step_size; it is only consumed when
            # not last_iteration, so the transition always sees full chunks.
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1)
                      + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps,
                           self.k))
        sigmas = sigmas.reshape((-1,
                                 self.frnn_step_size * self.number_of_steps,
                                 self.k))
        # One mixture-weight vector per step, repeated over the step's values.
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        # Drop the padding introduced by the rounded-up step count.
        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2)
                        + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2)
                              + [slice(0, self.frame_size)]
                              + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2)
                              + [slice(0, self.frame_size)]
                              + [slice(0, None)])]

        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """All-zero initial frame, shape (batch_size, frame_size)."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        """Report frame_size for 'outputs'; defer everything else."""
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size

        return super(FRNNEmitter, self).get_dim(name)
def __init__(self, input_sources_list, input_sources_vocab_size_list,
             output_source, output_source_vocab_size,
             lookup_dim=200, hidden_size=256, recurrent_stack_size=1):
    """Build a multi-source recurrent language model.

    Each input source gets its own lookup table; the embeddings are merged
    into a single hidden representation, passed through a linear layer and
    a stack of SimpleRecurrent blocks, then projected to the output
    vocabulary with a softmax cross-entropy cost.

    Parameters
    ----------
    input_sources_list : list of str
        Names of the symbolic input sources (one lmatrix each).
    input_sources_vocab_size_list : list of int
        Vocabulary size per input source.
    output_source : str
        Name of the symbolic target source.
    output_source_vocab_size : int
        Vocabulary size of the target.
    lookup_dim : int
        Embedding dimension for every lookup table.
    hidden_size : int
        Dimension of the merged/hidden representation.
    recurrent_stack_size : int
        Number of stacked SimpleRecurrent layers.
    """
    self.InputSources = input_sources_list
    self.InputSourcesVocab = input_sources_vocab_size_list
    self.OutputSource = output_source
    self.OutputSourceVocab = output_source_vocab_size

    # Symbolic (presumably batch x time) integer matrices, one per source.
    inputs = [tensor.lmatrix(source) for source in input_sources_list]
    output = tensor.lmatrix(output_source)

    # get_lookups is defined elsewhere on this class; it builds one
    # LookupTable per input source.
    lookups = self.get_lookups(lookup_dim, input_sources_vocab_size_list)
    for lookup in lookups:
        lookup.initialize()

    # Merge all embeddings into one hidden_size vector per position.
    merge = Merge([lookup.name for lookup in lookups],
                  [lookup.dim for lookup in lookups],
                  hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    linear0 = Linear(input_dim=hidden_size, output_dim=hidden_size,
                     weights_init=initialization.Uniform(width=0.01),
                     biases_init=Constant(0), name='linear0')
    linear0.initialize()

    recurrent_blocks = []

    for i in range(recurrent_stack_size):
        recurrent_blocks.append(SimpleRecurrent(
            dim=hidden_size, activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01),
            use_bias=False))

    # Name each layer before initializing so parameter names are unique.
    for i, recurrent_block in enumerate(recurrent_blocks):
        recurrent_block.name = 'recurrent' + str(i + 1)
        recurrent_block.initialize()

    # Output projection onto the target vocabulary.
    linear_out = Linear(input_dim=hidden_size,
                        output_dim=output_source_vocab_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0), name='linear_out')
    linear_out.initialize()
    softmax = NDimensionalSoftmax(name='softmax')

    # Wire the graph: embed each source, merge, project, run the stack.
    lookup_outputs = [lookup.apply(input)
                      for lookup, input in zip(lookups, inputs)]

    m = merge.apply(*lookup_outputs)
    r = linear0.apply(m)
    for block in recurrent_blocks:
        r = block.apply(r)
    a = linear_out.apply(r)

    self.Cost = softmax.categorical_cross_entropy(output, a,
                                                  extra_ndim=1).mean()
    self.Cost.name = 'cost'

    y_hat = softmax.apply(a, extra_ndim=1)
    y_hat.name = 'y_hat'

    self.ComputationGraph = ComputationGraph(self.Cost)

    # Filled in later by the training/prediction setup.
    self.Function = None
    self.MainLoop = None
    self.Model = Model(y_hat)
weights_init=initialization.Uniform(width=0.01))  # (closes an RNN brick constructor begun above this chunk)
rnn.initialize()

# Output projection: RNN state -> durations-vocabulary logits.
linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()
softmax = NDimensionalSoftmax(name='ndim_softmax')

# Wire the graph: embed -> linear -> RNN -> output projection -> softmax.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])

# NOTE(review): Adam is imported above but RMSProp + StepClipping are what
# is actually used (presumably imported earlier in the file) -- confirm
# which optimizer is intended.
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Emits a real-valued frame of ``frame_size`` values per readout, in
    chunks of ``frnn_step_size``: at each step, MLPs map a small recurrent
    ("FRNN") state to the parameters of a ``k``-component Gaussian mixture
    from which the chunk is sampled (``emit``) or scored (``cost``); the
    chunk then drives a linear state transition.

    Parameters
    ----------
    mlp : brick mapping readouts into the FRNN input space.
    target_size : int, flattened target size used by the NLL cost.
    frame_size : int, number of real values emitted per readout.
    k : int, number of Gaussian mixture components.
    frnn_hidden_size : int, dimension of the FRNN hidden state.
    frnn_step_size : int, number of values generated per FRNN step.
    const : float, stability constant added to sigma / mixture weights.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, \
            frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size

        if self.last_steps != 0:
            self.number_of_steps += 1

        # Mixture-parameter heads: means, positive scales, mixture logits.
        self.mu = MLP(activations=[Identity()],
            dims=[frnn_hidden_size, k*frnn_step_size],
            name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
            dims=[frnn_hidden_size, k*frnn_step_size],
            name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
            dims=[frnn_hidden_size, k],
            name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        # Projects the (MLP-transformed) readout to the first FRNN state.
        self.frnn_initial_state = Linear(
            input_dim = self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        #self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(
            name="frnn_activation")

        # State transition: next_state = tanh(W_s state + W_i chunk).
        self.frnn_linear_transition_state = Linear (
            input_dim = frnn_hidden_size,
            output_dim= frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear (
            input_dim = self.frnn_step_size,
            output_dim = frnn_hidden_size,
            name="frnn_linear_transition_input")

        #self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [self.mlp,self.mu,self.sigma,self.coeff,
            self.coeff2,self.frnn_initial_state,self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input]

    @application
    def emit(self,readouts):
        """Sample one frame per readout from the step-wise GMMs.

        (Docstring note: despite the text below, there is no
        keep_parameters argument -- only the sampled frame is returned.)

        keep_parameters is True if mu,sigma,coeffs must be stacked and
        returned if false, only the result is given, the others will be
        empty list.
        """
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            # Remember the incoming shape so the sampled chunk can be
            # restored to it (last axis set to frnn_step_size).
            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size,self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size,self.k))
            coeff = coeff.reshape((-1, self.k))

            # Pick one mixture component per row.
            sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis = -1)
            #idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            #shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :,idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :,idx]

            # Reparameterized Gaussian sample for the chosen component.
            epsilon = self.theano_rng.normal(
                size=mu.shape, avg=0., std=1., dtype=mu.dtype)

            result = mu + sigma*epsilon#*0.6 #reduce variance.
            result = result.reshape(shape_result, ndim = ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            #this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results,axis=-1)
        results = tensor.flatten(results,outdim=results.ndim-1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0,None)] * \
                (results.ndim-1) +[slice(0,self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Negative log-likelihood of ``outputs`` under the step-wise GMMs,
        with teacher forcing (ground-truth chunks drive the transition)."""
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1,self.frnn_step_size,self.k))
            freq_sigma = freq_sigma.reshape((-1,self.frnn_step_size,self.k))
            freq_coeff = freq_coeff.reshape((-1,self.k))
            #mu,sigma: shape (-1,fs,k)
            #coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # NOTE(review): this offset never advances with i, so every
            # step reads the same slice of the targets; it looks like it
            # should be i * self.frnn_step_size -- confirm.
            index = self.frnn_step_size
            freq_inputs = inputs[tuple([slice(0,None)] * \
                (inputs.ndim-1) +[slice(index,index+self.frnn_step_size)])]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus,axis=-2)
        sigmas = tensor.stack(sigmas,axis=-2)
        coeffs = tensor.stack(coeffs,axis=-2)

        mus = mus.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        sigmas = sigmas.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        # One weight vector per step, repeated over that step's values.
        coeffs = coeffs.repeat(self.frnn_step_size,axis=-2)

        # Drop padding introduced by the rounded-up step count.
        mus = mus[tuple([slice(0,None)] * \
            (mus.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        sigmas = sigmas[tuple([slice(0,None)] * \
            (sigmas.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        coeffs = coeffs[tuple([slice(0,None)] * \
            (coeffs.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]

        # actually prob not necessary
        mu = mus.reshape((-1,self.target_size))
        sigma = sigmas.reshape((-1,self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL (y=outputs, mu=mu, sig=sigma, coeff=coeff,\
            frame_size=self.frame_size,k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """All-zero initial frame, shape (batch_size, frame_size)."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        """Report frame_size for 'outputs'; defer everything else."""
        # modification here to ensure the right dim.
        if name == 'outputs':
            return self.frame_size

        return super(FRNNEmitter, self).get_dim(name)
class Seq2Seq(Initializable):
    """ seq2seq model

    Builds an encoder/decoder LSTM pair over a shared lookup table. The
    ``apply`` method currently runs only the encoder and scores next-token
    prediction with a softmax; the decoder bricks are constructed (and
    trained through any graph that uses them) but not used in ``apply``.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    """

    def __init__(self, emb_dim, dim, num_input_words,
                 num_output_words, vocab, **kwargs):
        # 0 means "use the default": dim for emb_dim, full vocab for counts.
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab

        self._word_to_id = WordToIdOp(self._vocab)

        children = []

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        # Forks project embeddings to the 4*dim gate inputs an LSTM expects.
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._decoder_fork = Linear(emb_dim, 4 * dim, name='decoder_fork')
        self._decoder_rnn = LSTM(dim, name='decoder_rnn')
        children.extend([self._main_lookup,
                         self._encoder_fork, self._encoder_rnn,
                         self._decoder_fork, self._decoder_rnn])
        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        # FIX: was super(LanguageModel, self) -- a copy-paste from another
        # model class; it must name this class for the MRO walk.
        super(Seq2Seq, self).__init__(children=children, **kwargs)

    def set_def_embeddings(self, embeddings):
        """Overwrite the lookup-table weights with `embeddings`."""
        self._main_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        """Return the shared variable holding the embedding matrix."""
        return self._main_lookup.parameters[0]

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        """Attach exp(masked mean NLL) as an auxiliary variable; return the
        per-sequence summed costs."""
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(
            costs.sum(), mask.sum())
        application_call.add_auxiliary_variable(perplexity, name=name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            step, B is the batch size. Note that this order of the axis is
            different from what all RNN bricks consume, hence and the axis
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        """
        word_ids = self._word_to_id(words)

        # shortlisting: map out-of-shortlist ids to UNK for input/output.
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids
            + tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids
            + tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs; transpose to the
        # (T, B, ...) layout the RNN bricks consume.
        rnn_inputs = self._main_lookup.apply(input_word_ids)
        encoder_rnn_states = self._encoder_rnn.apply(
            tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted.
        # FIX: was `main_rnn_states`, a name never defined in this method;
        # the encoder states are the only RNN states computed here.
        logits = self._pre_softmax.apply(encoder_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(
            out_softmax.copy(), name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask,
                                            "perplexity")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "perplexity_after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "perplexity_after_word_embs")

        # FIX: removed a copy-pasted tail that referenced attributes this
        # class never defines (`_word_to_count`, `_very_rare_threshold`,
        # `_retrieval`, `def_map`) and an undefined `updates` in the return
        # -- it raised NameError/AttributeError on every call, so no caller
        # can depend on it. The method now returns the per-sequence costs.
        return costs
parser.add_argument('-temperature', type=float, default=1.0,
                    help='temperature of sampling')
args = parser.parse_args()

# Define primetext
ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file)
if args.primetext and len(args.primetext) > 0:
    # Keep only characters that exist in the model's vocabulary, then turn
    # the primetext into a (len, 1) column of character ids.
    primetext = ''.join(
        [ch for ch in args.primetext if ch in char_to_ix.keys()])
    x_curr = numpy.expand_dims(
        numpy.array([char_to_ix[ch] for ch in primetext], dtype='uint8'),
        axis=1)
else:
    # No primetext given: seed sampling from a batch of the dev stream.
    dev_stream = get_stream(hdf5_file, 'dev', batch_size)
    x_curr, y_curr = dev_stream.get_epoch_iterator().next()  # Python 2 iterator API
    x_curr = x_curr[:, -1].reshape(seq_length, 1)

print 'Loading model from {0}...'.format(args.model)
main_loop = load(args.model)
print 'Model loaded. Building prediction function...'
model = main_loop.model
# NOTE(review): relies on the saved model exposing exactly (y, x) in this
# order -- confirm against the training script that built it.
y, x = model.inputs
softmax = NDimensionalSoftmax()
# Fish the pre-softmax activations out of the saved graph by name.
linear_output = [
    v for v in model.variables if v.name == 'linear_output'][0]
y_hat = softmax.apply(linear_output, extra_ndim=1)
predict = theano.function([x], y_hat)
print 'Starting sampling'
sample_string = sample(args.length, x_curr, predict, ix_to_char,
                       seed=args.seed, temperature=args.temperature)
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Encodes context and question with a shared LSTM encoder, aligns them
    with a (co)attention softmax over an affinity matrix, runs a
    bidirectional LSTM over the combined representation and reads out
    answer-begin / answer-end logits.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str
    """

    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        # 0 means "use the default" for these sizes.
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        # Fork projects embeddings to the 4*dim gate inputs the LSTM expects.
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        # With coattention the bidir input carries one extra dim-block.
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        # Begin/end readouts: MLPs ending in a single logit per position.
        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the definitions,
            # we can be screwed.
            # NOTE(review): eval() on a config string -- acceptable only for
            # trusted configuration; never feed it untrusted input.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True,
                                              translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        """Overwrite the lookup-table weights with `embeddings`."""
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        """Return the shared variable holding the embedding matrix."""
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        """Parameters of the definition-reading path (reader + combiner),
        excluding shared lookup parameters when embeddings are reused."""
        # NOTE(review): .values() followed by list.extend implies Python 2
        # (dict.values() returning a list) -- needs list() wrappers on py3.
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters
                          if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask, def_embs=None,
                def_map=None, text_name=None):
        """Embed `text`, optionally mix in definition embeddings, and run
        the shared LSTM encoder. Returns batch-major encoded states."""
        if not self._random_unk:
            # Map out-of-shortlist word ids to UNK.
            text = (tensor.lt(text, self._num_input_words) * text
                    + tensor.ge(text, self._num_input_words)
                    * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            # Keep OOV embeddings in the graph but stop their gradients.
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs
                    + tensor.ge(text, self._num_input_words)[:, :, None]
                    * disconnected_grad(embs))
        if def_embs:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        # flip01 swaps batch/time so the RNN sees time-major input.
        encoded = flip01(
            self._encoder_rnn.apply(
                self._encoder_fork.apply(flip01(embs)),
                mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call, contexts, contexts_mask, questions,
              questions_mask, answer_begins, answer_ends, defs=None,
              def_mask=None, contexts_def_map=None, questions_def_map=None):
        """Return per-example begin + end cross-entropy costs; attaches
        attention maps, predictions and exact-match as auxiliaries."""
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        # Push padded positions to a large negative value before softmax.
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)

        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                               name='d2q_att_weights')

        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                               name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)

        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            # Fallback: append the final question state to every position.
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins)
                       * tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                               name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                               name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                               name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        """Call `apply` on the default symbolic inputs built in __init__."""
        return self.apply(*self.input_vars.values())
class ESIM(Initializable):
    """
    ESIM model based on https://github.com/NYU-MLL/multiNLI/blob/master/python/models/esim.py

    Enhanced Sequential Inference Model for NLI: embeds premise (s1) and
    hypothesis (s2), encodes each with a BiLSTM, soft-aligns them via an
    attention matrix E, composes the aligned representations, re-encodes
    with a second BiLSTM, pools, and classifies into 3 classes.
    Optionally augments embeddings with dictionary-definition readers.
    """

    # seq_length, emb_dim, hidden_dim
    def __init__(
            self,
            dim,                       # LSTM hidden dim (per direction)
            emb_dim,                   # word-embedding dimensionality
            vocab,                     # vocabulary object with .size(), .unk
            def_emb_translate_dim=-1,  # <0 means: use emb_dim
            def_dim=-1,                # <0 means: use emb_dim
            encoder='bilstm',          # only 'bilstm' is implemented
            bn=True,                   # batch-normalized MLP heads
            def_reader=None,           # optional definition reader brick
            def_combiner=None,         # optional definition combiner brick
            dropout=0.5,
            num_input_words=-1,        # shortlist size; <=0 means full vocab
            # Others
            **kwargs):
        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        # Optional projection of embeddings before the definition combiner.
        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(
                input_dim=emb_dim, output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words, emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        # Forks map the embedding dim to 4*dim (LSTM gate inputs).
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])

        ## BiLSTM no. 2 (encoded attentioned embeddings)
        # Input is the 8*dim composed vector [a; a~; a-a~; a*a~].
        self._hyp_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])

        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False, name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")
        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)

    def get_embeddings_lookups(self):
        """Return the lookup brick(s) holding the word embeddings."""
        return [self._lookup]

    def set_embeddings(self, embeddings):
        """Overwrite the word-embedding matrix with a numpy array."""
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_lookups(self):
        """Return the lookup brick(s) used by the definition reader."""
        return [self._def_reader._def_lookup]

    def set_def_embeddings(self, embeddings):
        """Overwrite the definition-reader embedding matrix."""
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    @application
    def apply(self, application_call, s1_preunk, s1_mask, s2_preunk, s2_mask,
              def_mask=None, defs=None, s1_def_map=None, s2_def_map=None,
              train_phase=True):
        """Build the ESIM graph and return (batch, 3) class probabilities.

        s1_preunk / s2_preunk are assumed (batch, seq_len) word-id matrices
        before UNK-mapping; s*_mask are matching 0/1 float masks -- TODO
        confirm shapes against caller.
        """
        # Shortlist words (sometimes we want smaller vocab, especially when dict is small)
        s1 = (tensor.lt(s1_preunk, self._num_input_words) * s1_preunk +
              tensor.ge(s1_preunk, self._num_input_words) * self._vocab.unk)
        s2 = (tensor.lt(s2_preunk, self._num_input_words) * s2_preunk +
              tensor.ge(s2_preunk, self._num_input_words) * self._vocab.unk)

        ### Embed ###
        s1_emb = self._lookup.apply(s1)
        s2_emb = self._lookup.apply(s2)
        # 1 * s1_emb makes a copy so the auxiliary variable has its own node.
        application_call.add_auxiliary_variable(1 * s1_emb,
                                                name='s1_word_embeddings')

        if self._def_reader:
            assert defs is not None
            def_embs = self._def_reader.apply(defs, def_mask)
            if self._translate_pre_def:
                logger.info("Translate pre def")
                # Flatten (batch, seq, emb) -> (batch*seq, emb) for Linear,
                # then restore the sequence layout.
                s1_emb = s1_emb.reshape(
                    (s1_emb.shape[0] * s1_emb.shape[1], s1_emb.shape[2]))
                s2_emb = s2_emb.reshape(
                    (s2_emb.shape[0] * s2_emb.shape[1], s2_emb.shape[2]))
                s1_emb = self._translate_pre_def.apply(s1_emb)
                s2_emb = self._translate_pre_def.apply(s2_emb)
                s1_emb = s1_emb.reshape(
                    (s1_preunk.shape[0], s1_preunk.shape[1], -1))
                s2_emb = s2_emb.reshape(
                    (s2_preunk.shape[0], s2_preunk.shape[1], -1))
            s1_emb = self._def_combiner.apply(s1_emb, s1_mask, def_embs,
                                              s1_def_map, word_ids=s1,
                                              train_phase=train_phase,
                                              call_name="s1")
            s2_emb = self._def_combiner.apply(s2_emb, s2_mask, def_embs,
                                              s2_def_map, word_ids=s2,
                                              train_phase=train_phase,
                                              call_name="s2")
        else:
            if train_phase and self._dropout > 0:
                s1_emb = apply_dropout(s1_emb, drop_prob=self._dropout)
                s2_emb = apply_dropout(s2_emb, drop_prob=self._dropout)

        ### Encode ###
        # TODO: Share this bilstm?
        s1_bilstm, _ = self._prem_bidir.apply(
            flip01(self._prem_bidir_fork.apply(s1_emb)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_bilstm, _ = self._hyp_bidir.apply(
            flip01(self._hyp_bidir_fork.apply(s2_emb)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_bilstm = flip01(s1_bilstm)
        s2_bilstm = flip01(s2_bilstm)

        ### Attention ###
        # Compute E matrix (eq. 11)
        # E_ij = <s1[i], s2[j]>
        # each call computes E[i, :]
        def compute_e_row(s2_i, s1_bilstm, s1_mask):
            b_size = s1_bilstm.shape[0]
            # s2_i is (batch_size, emb_dim)
            # s1_bilstm is (batch_size, seq_len, emb_dim)
            # s1_mask is (batch_size, seq_len)
            # s2_i = s2_i.reshape((s2_i.shape[0], s2_i.shape[1], 1))
            s2_i = s2_i.reshape((b_size, s2_i.shape[1], 1))
            s2_i = T.repeat(s2_i, 2, axis=2)
            # s2_i is (batch_size, emb_dim, 2)
            assert s1_bilstm.ndim == 3
            assert s2_i.ndim == 3
            # NOTE(review): the repeat to width 2 followed by taking
            # column 0 looks redundant -- only the first column is used.
            score = T.batched_dot(s1_bilstm, s2_i)  # (batch_size, seq_len, 1)
            score = score[:, :, 0].reshape(
                (b_size, -1))  # (batch_size, seq_len)
            return score  # E[i, :]

        # NOTE: No point in masking here
        E, _ = theano.scan(compute_e_row,
                           sequences=[s1_bilstm.transpose(1, 0, 2)],
                           non_sequences=[s2_bilstm, s2_mask])
        # (seq_len, batch_size, seq_len)
        E = E.dimshuffle(1, 0, 2)
        assert E.ndim == 3

        s2s_att_weights = self._ndim_softmax.apply(E, extra_ndim=1)
        application_call.add_auxiliary_variable(s2s_att_weights.copy(),
                                                name='s2s_att_weights')
        # (batch_size, seq_len, seq_len)

        ### Compute tilde vectors (eq. 12 and 13) ###
        def compute_tilde_vector(e_i, s, s_mask):
            # e_i is (batch_size, seq_len)
            # s_mask is (batch_size, seq_len)
            # s_tilde_i = \sum e_ij b_j, (batch_size, seq_len)
            score = masked_softmax(e_i, s_mask, axis=1)
            score = score.dimshuffle(0, 1, "x")
            s_tilde_i = (score *
                         (s * s_mask.dimshuffle(0, 1, "x"))).sum(axis=1)
            return s_tilde_i

        # (batch_size, seq_len, def_dim)
        s1_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(1, 0, 2)],
                                  non_sequences=[s2_bilstm, s2_mask])
        s1_tilde = s1_tilde.dimshuffle(1, 0, 2)
        s2_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(2, 0, 1)],
                                  non_sequences=[s1_bilstm, s1_mask])
        s2_tilde = s2_tilde.dimshuffle(1, 0, 2)

        ### Compose (eq. 14 and 15) ###
        # (batch_size, seq_len, 8 * dim)
        s1_comp = T.concatenate(
            [s1_bilstm, s1_tilde, s1_bilstm - s1_tilde,
             s1_bilstm * s1_tilde], axis=2)
        s2_comp = T.concatenate(
            [s2_bilstm, s2_tilde, s2_bilstm - s2_tilde,
             s2_bilstm * s2_tilde], axis=2)

        ### Encode (eq. 16 and 17) ###
        # (batch_size, seq_len, 8 * dim)
        # TODO: Share this bilstm?
        s1_comp_bilstm, _ = self._prem_bidir2.apply(
            self._prem_bidir_fork2.apply(flip01(s1_comp)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_comp_bilstm, _ = self._hyp_bidir2.apply(
            self._hyp_bidir_fork2.apply(flip01(s2_comp)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_comp_bilstm = flip01(s1_comp_bilstm)
        s2_comp_bilstm = flip01(s2_comp_bilstm)

        ### Pooling Layer ###
        # Masked mean and masked max over the time axis.
        s1_comp_bilstm_ave = (s1_mask.dimshuffle(0, 1, "x") * s1_comp_bilstm).sum(axis=1) \
            / s1_mask.sum(axis=1).dimshuffle(0, "x")
        s1_comp_bilstm_max = T.max(
            ((1 - s1_mask.dimshuffle(0, 1, "x")) * -10000) + \
            (s1_mask.dimshuffle(0, 1, "x")) * s1_comp_bilstm,
            axis=1)
        s2_comp_bilstm_ave = (s2_mask.dimshuffle(0, 1, "x") * s2_comp_bilstm).sum(axis=1) \
            / s2_mask.sum(axis=1).dimshuffle(0, "x")
        # (batch_size, dim)
        s2_comp_bilstm_max = T.max(
            ((1 - s2_mask.dimshuffle(0, 1, "x")) * -10000) + \
            (s2_mask.dimshuffle(0, 1, "x")) * s2_comp_bilstm,
            axis=1)

        ### Final classifier ###
        # MLP layer
        # (batch_size, 8 * dim)
        m = T.concatenate([
            s1_comp_bilstm_ave, s1_comp_bilstm_max, s2_comp_bilstm_ave,
            s2_comp_bilstm_max
        ], axis=1)

        pre_logits = self._mlp.apply(m)
        if train_phase:
            pre_logits = apply_dropout(pre_logits, drop_prob=self._dropout)

        # Get prediction
        self.logits = self._pred.apply(pre_logits)
        return self.logits
class MinRiskInitialContextSequenceGenerator(InitialContextSequenceGenerator):
    """Sequence generator with a minimum-risk (expected-cost) objective.

    Extends ``InitialContextSequenceGenerator`` with an ``expected_cost``
    application that, given sampled target sequences and their external
    scores (e.g. sentence-level metrics), computes the model's smoothed
    expected score over the samples.
    """

    def __init__(self, *args, **kwargs):
        self.softmax = NDimensionalSoftmax()
        super(MinRiskInitialContextSequenceGenerator,
              self).__init__(*args, **kwargs)
        # Register the softmax brick as a child so it participates in the
        # brick hierarchy (it has no parameters of its own).
        self.children.append(self.softmax)

    @application
    def probs(self, readouts):
        """Normalize readouts into word probabilities over the last axis."""
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    # TODO: check where 'target_samples_mask' is used -- do we need a mask for context features (probably not)
    # Note: the @application decorator inspects the arguments, and transparently adds args ('application_call')
    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_samples_mask',
        'target_samples', 'scores'
    ],
                 outputs=['cost'])
    def expected_cost(self, application_call, representation,
                      source_sentence_mask, target_samples,
                      target_samples_mask, scores,
                      smoothing_constant=0.005, **kwargs):
        """
        emulate the process in sequence_generator.cost_matrix, but compute
        log probabilities instead of costs

        for each sample, we need its probability according to the model
        (these could actually be passed from the sampling model, which
        could be more efficient)

        Parameters
        ----------
        representation : attended source representation
        source_sentence_mask : (batch, source_time) mask -- transposed below
        target_samples : (batch, time) sampled target sequences
        target_samples_mask : matching 0/1 mask
        scores : per-sample scores; `sequence_probs` is reshaped to
            ``scores.shape``, so samples are assumed grouped per source
            sentence -- TODO confirm
        smoothing_constant : float
            Temperature-like sharpening constant for the sample distribution.

        Returns
        -------
        cost : scalar, sum over the batch of expected scores
        """
        # Transpose everything (note we can use transpose here only if it's
        # 2d, otherwise we need dimshuffle)
        source_sentence_mask = source_sentence_mask.T

        # make samples (time, batch)
        samples = target_samples.T
        samples_mask = target_samples_mask.T

        # we need this to set the 'attended' kwarg
        keywords = {
            'mask': target_samples_mask,
            'outputs': target_samples,
            'attended': representation,
            'attended_mask': source_sentence_mask
        }

        batch_size = samples.shape[1]

        # Prepare input for the iterative part
        states = dict_subset(keywords, self._state_names, must_have=False)

        # masks in context are optional (e.g. `attended_mask`)
        # add the initial state context features
        contexts = dict_subset(keywords, self._context_names, must_have=False)
        contexts['initial_state_context'] = kwargs['initial_state_context']

        feedback = self.readout.feedback(samples)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        results = self.transition.apply(mask=samples_mask,
                                        return_initial_states=True,
                                        as_dict=True,
                                        **dict_union(inputs, states,
                                                     contexts))

        # Separate the deliverables. The last states are discarded: they
        # are not used to predict any output symbol. The initial glimpses
        # are discarded because they are not used for prediction.
        # Remember, glimpses are computed _before_ output stage, states are
        # computed after.
        states = {name: results[name][:-1] for name in self._state_names}
        glimpses = {name: results[name][1:] for name in self._glimpse_names}

        # Compute the cost
        feedback = tensor.roll(feedback, 1, 0)
        feedback = tensor.set_subtensor(
            feedback[0],
            self.readout.feedback(self.readout.initial_outputs(batch_size)))

        readouts = self.readout.readout(feedback=feedback,
                                        **dict_union(states, glimpses,
                                                     contexts))

        word_probs = self.probs(readouts)
        word_probs = tensor.log(word_probs)

        # Note: converting the samples to one-hot wastes space, but it gets
        # the job done
        # TODO: this may be the op that sometimes causes out-of-memory
        one_hot_samples = tensor.eye(word_probs.shape[-1])[samples]
        # BUG FIX: `astype` returns a new variable rather than casting in
        # place; the previous code discarded the result, leaving the
        # one-hot tensor at float64 under the default Theano config.
        one_hot_samples = one_hot_samples.astype('float32')
        actual_probs = word_probs * one_hot_samples

        # reshape to (batch, time, prob), then sum over the batch dimension
        # to get sequence-level probability
        actual_probs = actual_probs.dimshuffle(1, 0, 2)
        # we are first summing over vocabulary (only one non-zero cell per
        # row)
        sequence_probs = actual_probs.sum(axis=2)
        sequence_probs = sequence_probs * target_samples_mask
        # now sum over time dimension
        sequence_probs = sequence_probs.sum(axis=1)

        # reshape and do exp() to get the true probs back
        # sequence_probs = tensor.exp(sequence_probs.reshape(scores.shape))
        sequence_probs = sequence_probs.reshape(scores.shape)

        # Note that the smoothing constant can be set by user
        sequence_distributions = (
            tensor.exp(sequence_probs * smoothing_constant) /
            tensor.exp(sequence_probs * smoothing_constant).sum(
                axis=1, keepdims=True))

        # the following lines are done explicitly for code clarity
        # -- first get sequence expectation, then sum up the expectations
        # for every seq in the minibatch
        expected_scores = (sequence_distributions * scores).sum(axis=1)
        expected_scores = expected_scores.sum(axis=0)

        return expected_scores
ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file) if args.primetext and len(args.primetext) > 0: primetext = ''.join( [ch for ch in args.primetext if ch in char_to_ix.keys()]) x_curr = numpy.expand_dims(numpy.array( [char_to_ix[ch] for ch in primetext], dtype='uint8'), axis=1) else: dev_stream = get_stream(hdf5_file, 'dev', batch_size) x_curr, y_curr = dev_stream.get_epoch_iterator().next() x_curr = x_curr[:, -1].reshape(seq_length, 1) print 'Loading model from {0}...'.format(args.model) main_loop = load(args.model) print 'Model loaded. Building prediction function...' model = main_loop.model y, x = model.inputs softmax = NDimensionalSoftmax() linear_output = [v for v in model.variables if v.name == 'linear_output'][0] y_hat = softmax.apply(linear_output, extra_ndim=1) predict = theano.function([x], y_hat) print 'Starting sampling' sample_string = sample(args.length, x_curr, predict, ix_to_char, seed=args.seed, temperature=args.temperature)