Ejemplo n.º 1
0
    def selection_by_tree(self, tree, mode, idx=0):
        """Score the candidate actions for ``tree`` and return a distribution.

        Candidates come from ``self._select_by_tree``. When the first call
        (third argument ``True``) yields no pairs, the method either retries
        with ``False`` (``opt['allow_partial']`` falsy) or stops early.

        :param tree: object exposing ``V`` and ``terms``; forwarded to
            ``self._select_by_tree``.
        :param mode: forwarded unchanged to ``self._select_by_tree``.
        :param idx: index into ``self.history``; only read when
            ``opt['use_history']`` is truthy.
        :return: ``(softmax expression over the candidates, pairs)``, or
            ``(None, None)`` on early stop.
        """
        input_layers, pairs = self._select_by_tree(tree, mode, True)
        if len(pairs) == 0:
            if self.opt['allow_partial']:
                # Single-argument call form: prints the same text under
                # Python 2 and is valid syntax under Python 3.
                print('early stop! discard {} / {}.'.format(
                    len(tree.V), len(tree.terms)))
                return None, None
            input_layers, pairs = self._select_by_tree(tree, mode, False)

        W1_rl = dy.parameter(self.model_parameters['W1_rl'])
        b1_rl = dy.parameter(self.model_parameters['b1_rl'])
        if not self.opt['one_layer']:
            # The second layer's parameters are only loaded when it is used.
            W2_rl = dy.parameter(self.model_parameters['W2_rl'])
            b2_rl = dy.parameter(self.model_parameters['b2_rl'])

        # Stack the per-candidate vectors as columns, then transpose so each
        # candidate becomes one row.
        # Original dimension note: (V x N)x160 160x50 50x60 60x1 -- TODO confirm.
        input_layers = dy.transpose(dy.concatenate_cols(input_layers))

        if not self.opt['one_layer']:
            if self.opt['use_history']:
                pr = input_layers * dy.rectify(W2_rl * dy.rectify(
                    W1_rl * self.history[idx].output() + b1_rl) + b2_rl)
            else:
                pr = dy.rectify(input_layers * W2_rl + b2_rl) * W1_rl + b1_rl
        else:
            if self.opt['use_history']:
                pr = input_layers * dy.rectify(
                    W1_rl * self.history[idx].output() + b1_rl)
            else:
                pr = input_layers * W1_rl + b1_rl

        # Flatten to a vector with one score per candidate pair.
        pr = dy.reshape(pr, (len(pairs), ))
        return dy.softmax(pr), pairs
Ejemplo n.º 2
0
    def associate_parameters(self):
        """Wrap the stored parameters as expressions on ``self``.

        ``_U``, ``_V`` and ``_W`` are always passed through ``dy.parameter``;
        ``_P`` is handled only when ``encoder_type`` equals ``'attention'``.
        """
        for name in ('U', 'V', 'W'):
            setattr(self, name, dy.parameter(getattr(self, '_' + name)))

        if self.encoder_type != 'attention':
            return
        self.P = dy.parameter(self._P)
Ejemplo n.º 3
0
    def predict(self,
                feature_vector,
                task_ids,
                train=False,
                soft_labels=False,
                temperature=None,
                dropout_rate=0.0,
                orthogonality_weight=0.0,
                domain_id=None):
        """Run the feed-forward network and return the per-task outputs.

        :param feature_vector: object with ``toarray()``; the result is
            squeezed along axis 0 before being fed to the network.
        :param task_ids: keys into ``self.output_layers_dict``.
        :param train: when truthy, noise and dropout are applied before
            every hidden layer.
        :param soft_labels: forwarded to each output layer.
        :param temperature: forwarded to each output layer.
        :param dropout_rate: rate passed to ``dynet.dropout`` during training.
        :param orthogonality_weight: a non-zero value enables the
            orthogonality constraint between the "F0" and "F1" layers.
        :param domain_id: when not ``None``, an adversarial loss is computed.
        :return: ``(outputs, constraint, adv_loss)``; the last two are ``0``
            when their branch is not taken.
        """
        dynet.renew_cg()  # start from a fresh computation graph

        dense_features = np.squeeze(feature_vector.toarray(), axis=0)

        # TODO this takes too long; can we speed this up somehow?
        hidden = dynet.inputVector(dense_features)
        for layer_index in range(self.h_layers):
            if train:
                # Regularise the layer input with noise followed by dropout.
                hidden = dynet.noise(hidden, self.noise_sigma)
                hidden = dynet.dropout(hidden, dropout_rate)
            hidden = self.layers[layer_index](hidden)

        outputs = [
            self.output_layers_dict[task_id](hidden,
                                             soft_labels=soft_labels,
                                             temperature=temperature)
            for task_id in task_ids
        ]

        constraint = 0
        adv_loss = 0

        if orthogonality_weight != 0:
            # The constraint is placed on the hidden (MLP) weights when
            # ``add_hidden`` is set, otherwise on the output weights.
            first_layer = self.output_layers_dict["F0"]
            second_layer = self.output_layers_dict["F1"]
            first_param = (first_layer.W_mlp
                           if self.add_hidden else first_layer.W)
            second_param = (second_layer.W_mlp
                            if self.add_hidden else second_layer.W)
            first_W = dynet.parameter(first_param)
            second_W = dynet.parameter(second_param)

            # Squared Frobenius norm of F0^T * F1: square every element of
            # the product and sum them.
            product = dynet.transpose(first_W) * second_W
            constraint += dynet.sum_elems(dynet.square(product))

        if domain_id is not None:
            # Gradients are flipped when back-propagating through this node.
            flipped = dynet.flip_gradient(hidden)
            adv_loss = self.pick_neg_log(self.adv_layer(flipped), domain_id)

        return outputs, constraint, adv_loss
Ejemplo n.º 4
0
 def compute_output_layer(self, input):
     """Feed ``input`` through every layer and collect all activations.

     Each entry of ``self.parameters`` is indexed as ``[0]`` (weights) and
     ``[1]`` (bias). The last layer is squashed with ``dy.logistic``; all
     earlier layers use ``self.activation``.

     :return: list whose first element is ``input`` followed by the output
         of each layer in order.
     """
     activations = [input]
     last_index = len(self.parameters) - 1
     for index, layer in enumerate(self.parameters):
         weights = dy.parameter(layer[0])
         bias = dy.parameter(layer[1])
         pre_activation = weights * activations[-1] + bias
         if index == last_index:
             activations.append(dy.logistic(pre_activation))
         else:
             activations.append(self.activation(pre_activation))
     return activations
Ejemplo n.º 5
0
    def generate(self, num, limit=40, beam=3):
        """Generate ``num`` strings with a sampled beam search.

        :param num: number of strings to generate.
        :param limit: a hypothesis is finalised once its length reaches this.
        :param beam: number of samples drawn per hypothesis, number of
            hypotheses kept per step, and number of finished hypotheses
            after which the search stops.
        :return: list with the lowest-cost finished hypothesis of each run.
        """
        dy.renew_cg()

        W = dy.parameter(self.W)
        b = dy.parameter(self.b)

        words = []
        for _word in range(num):
            # Every search starts from the state reached after the EOW token.
            root = self.lstm.initial_state()
            root = root.add_input(self.lookup[self.c2i[EOW]])
            frontier = [('', root, 0)]
            finished = []

            while len(finished) < beam and len(frontier) > 0:
                candidates = []

                for prefix, state, cost in frontier:
                    if len(prefix) >= limit:
                        # Length cap reached: finalise without expanding.
                        finished.append((prefix, cost))
                        continue

                    logits = dy.affine_transform([b, W, state.output()])
                    probs = dy.softmax(logits).npvalue()

                    # Draw ``beam`` samples from the next-character
                    # distribution of this hypothesis.
                    for _sample in range(beam):
                        char_id = sample_softmax(probs)
                        char = self.i2c[char_id]
                        # Costs are accumulated negative log-probabilities.
                        new_cost = cost - np.log(probs[char_id])

                        if char == EOW:
                            finished.append((prefix, new_cost))
                            continue
                        candidates.append((prefix + char,
                                           state.add_input(
                                               self.lookup[char_id]),
                                           new_cost))

                # Keep only the ``beam`` cheapest partial hypotheses.
                candidates.sort(key=lambda entry: entry[-1])
                frontier = candidates[:beam]

            finished.sort(key=lambda entry: entry[-1])
            words.append(finished[0][0])

        return words
Ejemplo n.º 6
0
    def get_top_k_paths(self, all_paths, relation_index, threshold):
        """
        Rank the paths for every relation by their softmax score.

        :param all_paths: sequence of paths to score.
        :param relation_index: sized collection of relations; only its length
            is used, and result entry ``i`` corresponds to score column ``i``.
        :param threshold: minimum score for a path to be kept, or ``None`` to
            keep every path.
        :return: one list per relation holding ``(path, score)`` tuples
            ordered by descending score.
        """
        if len(all_paths) == 0:
            # np.vstack raises ValueError on an empty list; with nothing to
            # score, every relation simply has no paths.
            return [[] for _ in range(len(relation_index))]

        builder = self.builder
        model_parameters = self.model_parameters
        lemma_lookup = model_parameters['lemma_lookup']
        pos_lookup = model_parameters['pos_lookup']
        dep_lookup = model_parameters['dep_lookup']
        dir_lookup = model_parameters['dir_lookup']

        path_scores = []

        for i, path in enumerate(all_paths):

            if i % 1000 == 0:
                # Start a new computation graph every 1000 paths and reload
                # the parameters into it.
                dy.renew_cg()
                W1 = dy.parameter(model_parameters['W1'])
                b1 = dy.parameter(model_parameters['b1'])
                W2 = None
                b2 = None

                if self.num_hidden_layers == 1:
                    W2 = dy.parameter(model_parameters['W2'])
                    b2 = dy.parameter(model_parameters['b2'])

            path_embedding = get_path_embedding(builder, lemma_lookup,
                                                pos_lookup, dep_lookup,
                                                dir_lookup, path)

            if self.use_xy_embeddings:
                # Pad both sides with zero vectors in place of the x and y
                # embeddings.
                zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
                path_embedding = dy.concatenate(
                    [zero_word, path_embedding, zero_word])

            h = W1 * path_embedding + b1

            if self.num_hidden_layers == 1:
                h = W2 * dy.tanh(h) + b2

            path_scores.append(dy.softmax(h).npvalue().T)

        # One row per path, one column per relation.
        path_scores = np.vstack(path_scores)

        top_paths = []
        for column in range(len(relation_index)):
            ranked = np.argsort(-path_scores[:, column])
            top_paths.append([
                (all_paths[index], path_scores[index, column])
                for index in ranked
                if threshold is None or path_scores[index, column] >= threshold
            ])

        return top_paths
Ejemplo n.º 7
0
 def associate_parameters(self):
     """Wrap each stored ``_<name>`` parameter as ``self.<name>``.

     Every parameter is passed through ``dy.parameter`` in the order listed.
     """
     names = ('Wd', 'bd', 'Wa', 'Ua', 'va', 'Wr', 'Ur', 'Vr', 'Wo')
     for name in names:
         setattr(self, name, dy.parameter(getattr(self, '_' + name)))
Ejemplo n.º 8
0
def do_cpu():
    """Build a chain of matrix products and run forward and backward.

    Uses the module-level ``C`` (DyNet handle) and ``cpW`` (parameter).
    """
    C.renew_cg()
    base = C.parameter(cpW)
    # Seven factors in total: six successive multiplications by ``base``.
    product = base
    for _ in range(6):
        product = product * base
    z = C.squared_distance(product, product)
    z.value()
    z.backward()
Ejemplo n.º 9
0
def select_action(tree, policy, choose_max=False, return_prob=False, mode='train'):
    """Choose one action for ``tree`` from the policy's distribution.

    :param tree: forwarded to ``policy.selection_by_tree``.
    :param policy: object providing ``selection_by_tree``, ``saved_actions``,
        ``update_history``, ``model_parameters`` and ``history``.
    :param choose_max: take the arg-max instead of sampling.
    :param return_prob: selects the shape of the returned tuple (see below).
    :param mode: forwarded to ``policy.selection_by_tree``.
    :return: with ``return_prob``:
        ``(pair, its probability, all pairs, all probabilities)``;
        otherwise ``(pair, its probability, mean of p * log(p))``.
        Every element is ``None`` when the policy returned no pairs.
    :raises Exception: any numerical or sampling error is re-raised after
        the diagnostic ``check_error`` calls have run.
    """
    prob, pairs = policy.selection_by_tree(tree, mode)
    if pairs is None:
        if return_prob:
            return None, None, None, None
        return None, None, None

    # Turn floating-point warnings into exceptions so that bad
    # probabilities are caught here rather than propagating.
    with np.errstate(all='raise'):
        try:
            prob_v = prob.npvalue()
            if choose_max:
                idx = np.argmax(prob_v)
            else:
                # Epsilon-greedy exploration is disabled; sample from the
                # renormalised policy distribution.
                idx = np.random.choice(range(len(prob_v)), p=prob_v / np.sum(prob_v))
        except Exception:
            # Inspect the parameters and intermediate values, then re-raise:
            # continuing would hit ``prob[idx]`` with ``idx`` possibly unbound.
            for para in policy.model_parameters:
                check_error(para, dy.parameter(policy.model_parameters[para]))
            check_error('history', policy.history.output())
            check_error('pr', prob)
            raise

    action = prob[idx]
    policy.saved_actions[-1].append(action)
    policy.update_history(pairs[idx])
    if return_prob:
        return pairs[idx], prob_v[idx], pairs, prob_v
    return pairs[idx], prob_v[idx], dy.mean_elems(dy.cmult(prob, dy.log(prob)))
Ejemplo n.º 10
0
    def train_fake(self, input, targets, epsilon=1e-10):
        """Accumulate the negative log-likelihood of ``targets``.

        The LSTM is initialised from ``input`` plus a zero vector of size
        ``self.dim_lstm``. It is fed the embedding of each target in turn,
        and each following target is scored.

        :param input: first element of the LSTM's initial state list.
        :param targets: indices into ``self.lu``; the first one is only fed
            in, never predicted.
        :param epsilon: constant added to the scores before the loss.
        :return: the summed loss expression.
        """
        start = [input, dy.zeros(self.dim_lstm)]
        state = self.lstm.initial_state(start)

        total = dy.zeros(1)
        W = dy.parameter(self.h2o)
        b = dy.parameter(self.b)

        first, remaining = targets[0], targets[1:]
        state = state.add_input(self.lu[first])

        for target in remaining:
            scores = W * state.output() + b + epsilon
            total += dy.pickneglogsoftmax(scores, target)
            # Feed the gold symbol back in before the next prediction.
            state = state.add_input(self.lu[target])

        return total
Ejemplo n.º 11
0
    def __call__(self, x, soft_labels=False, temperature=None):
        """Apply the optional MLP layer and the output layer to ``x``.

        :param x: input expression.
        :param soft_labels: with a truthy ``temperature``, return
            temperature-smoothed probabilities instead of ``self.act``.
        :param temperature: divisor applied to the logits for soft labels.
        :return: soft-label distribution or ``self.act(logits)``.
        """
        hidden = x
        if self.mlp:
            W_mlp = dynet.parameter(self.W_mlp)
            b_mlp = dynet.parameter(self.b_mlp)
            hidden = self.mlp_activation(W_mlp * x + b_mlp)

        # Turn the stored parameters into expressions.
        W = dynet.parameter(self.W)
        b = dynet.parameter(self.b)

        logits = (W * hidden + b) + dynet.scalarInput(1e-15)
        if not (soft_labels and temperature):
            return self.act(logits)

        # Soft labels smoothed with the temperature, see
        # "Distilling the Knowledge in a Neural Network".
        scaled = dynet.exp(logits / temperature)
        return dynet.cdiv(scaled, dynet.sum_elems(scaled))
Ejemplo n.º 12
0
    def __call__(self, x, soft_labels=False, temperature=None, train=False):
        """Apply the optional MLP layer and the output layer to ``x``.

        :param x: input expression.
        :param soft_labels: with a truthy ``temperature``, return
            temperature-smoothed probabilities.
        :param temperature: divisor applied to the logits for soft labels.
        :param train: accepted but not read by this method.
        :return: soft-label distribution, ``self.act(logits)`` when
            ``self.act`` is truthy, otherwise the raw logits.
        """
        hidden = x
        if self.mlp:
            W_mlp = dynet.parameter(self.W_mlp)
            b_mlp = dynet.parameter(self.b_mlp)
            hidden = self.mlp_activation(W_mlp * x + b_mlp)

        # Turn the stored parameters into expressions.
        W = dynet.parameter(self.W)
        b = dynet.parameter(self.b)

        logits = W * hidden + b
        if soft_labels and temperature:
            # Soft labels smoothed with the temperature, see
            # "Distilling the Knowledge in a Neural Network".
            scaled = dynet.exp(logits / temperature)
            return dynet.cdiv(scaled, dynet.sum_elems(scaled))
        return self.act(logits) if self.act else logits
Ejemplo n.º 13
0
    def train_batch(self, words):
        """Return the mean per-word next-symbol loss for a batch of words.

        Each word is converted with ``self.word_to_indices``. The LSTM reads
        every index except the last and is scored on predicting the next one.
        A word's loss is its summed step losses divided by its index count.

        :param words: batch of words.
        :return: loss expression averaged over ``len(words)``.
        """
        W = dy.parameter(self.W)
        b = dy.parameter(self.b)

        word_losses = []
        for word in words:
            indices = self.word_to_indices(word)
            state = self.lstm.initial_state()

            step_losses = []
            for current, following in zip(indices, indices[1:]):
                state = state.add_input(self.lookup[current])
                scores = dy.affine_transform([b, W, state.output()])
                step_losses.append(dy.pickneglogsoftmax(scores, following))

            word_losses.append(dy.esum(step_losses) / len(indices))

        return dy.esum(word_losses) / len(words)
Ejemplo n.º 14
0
def do_cpu():
    """Time a chain of 1000x1000 matrix products on the default device.

    Imports ``_dynet`` locally, runs forward and backward once and prints
    the elapsed wall-clock time.
    """
    import _dynet as C
    C.init()
    model = C.Model()
    weights = model.add_parameters((1000, 1000))
    started = time.time()
    C.renew_cg()
    base = C.parameter(weights)
    # Seven factors in total: six successive multiplications by ``base``.
    product = base
    for _ in range(6):
        product = product * base
    z = C.squared_distance(product, product)
    z.value()
    z.backward()
    print("CPU time:", time.time() - started)
Ejemplo n.º 15
0
def do_cpu():
    """Time a chain of 1000x1000 matrix products on the default device.

    Imports ``_dynet`` locally, runs forward and backward once and prints
    the elapsed wall-clock time.
    """
    import _dynet as C
    C.init()
    size = 1000
    model = C.Model()
    weights = model.add_parameters((size, size))
    started = time.time()
    C.renew_cg()
    base = C.parameter(weights)
    # Seven factors in total: six successive multiplications by ``base``.
    chained = base
    for _ in range(6):
        chained = chained * base
    distance = C.squared_distance(chained, chained)
    distance.value()
    distance.backward()
    print("CPU time:", time.time() - started)
Ejemplo n.º 16
0
def do_gpu():
    """Time a chain of 1000x1000 matrix products with GPU:0 requested.

    Appends ``--dynet-devices GPU:0`` to ``sys.argv`` before calling
    ``G.init()``, runs forward and backward once and prints the elapsed
    wall-clock time.
    """
    import _dynet as G
    import sys
    sys.argv.extend(['--dynet-devices', 'GPU:0'])
    G.init()
    model = G.Model()
    weights = model.add_parameters((1000, 1000))
    started = time.time()
    G.renew_cg()
    base = G.parameter(weights)
    # Seven factors in total: six successive multiplications by ``base``.
    product = base
    for _ in range(6):
        product = product * base
    z = G.squared_distance(product, product)
    z.value()
    z.backward()
    print("GPU time:", time.time() - started)
Ejemplo n.º 17
0
def do_gpu():
    """Time a chain of 1000x1000 matrix products with GPU:0 requested.

    Appends ``--dynet-devices GPU:0`` to ``sys.argv`` before calling
    ``G.init()``, runs forward and backward once and prints the elapsed
    wall-clock time.
    """
    import _dynet as G
    import sys
    sys.argv += ['--dynet-devices', 'GPU:0']
    G.init()
    size = 1000
    model = G.Model()
    weights = model.add_parameters((size, size))
    started = time.time()
    G.renew_cg()
    base = G.parameter(weights)
    # Seven factors in total: six successive multiplications by ``base``.
    chained = base
    for _ in range(6):
        chained = chained * base
    distance = G.squared_distance(chained, chained)
    distance.value()
    distance.backward()
    print("GPU time:", time.time() - started)
Ejemplo n.º 18
0
 def associate_parameters(self):
     """Wrap every stored weight and bias with ``dy.parameter``.

     ``self.Ws`` and ``self.bs`` become new lists built from ``self._Ws``
     and ``self._bs`` respectively, in the same order.
     """
     weights = []
     for stored_weight in self._Ws:
         weights.append(dy.parameter(stored_weight))
     self.Ws = weights

     biases = []
     for stored_bias in self._bs:
         biases.append(dy.parameter(stored_bias))
     self.bs = biases
Ejemplo n.º 19
0
def process_one_instance(builder,
                         model,
                         model_parameters,
                         instance,
                         path_cache,
                         update=True,
                         dropout=0.0,
                         x_y_vectors=None,
                         num_hidden_layers=0):
    """
    Classify a single term-pair from its count-weighted average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model (not read by this function)
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths; when empty, EMPTY_PATH is
        added to it in place with count 1
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    :return: the softmax output expression of the classification network
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None

    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Add the empty path so there is always at least one embedding.
    # NOTE(review): this mutates the caller's ``instance``; kept as in the
    # original in case callers rely on it.
    if len(instance) == 0:
        instance[EMPTY_PATH] = 1

    # Compute the averaged path. ``sum``/``values``/``items`` behave the same
    # on Python 2 and 3, unlike the ``reduce`` builtin and ``iter*`` methods.
    num_paths = sum(instance.values())
    path_embeddings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup, path,
                                      update, dropout) * count
        for path, count in instance.items()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    # Feed the (possibly extended) vector to the MLP
    h = W1 * input_vec + b1

    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2

    return dy.softmax(h)
Ejemplo n.º 20
0
    def predict_greedy(self, encoder, input_seq):
        """Greedily decode an output sequence for ``input_seq``.

        At every step the most probable symbol is taken and fed back,
        together with the attention output, as the next decoder input.
        Decoding stops at ``common.END_SEQ`` or after
        ``self.max_prediction_len`` steps (no limit when it is ``None``).

        :param encoder: object providing ``encode_batch``.
        :param input_seq: the input sequence to decode from.
        :return: ``(predicted symbols without the end symbol, attention
            weights per step)``; the attention list is only filled when
            ``self.plot`` is truthy.
            NOTE(review): an empty ``input_seq`` returns a bare ``[]`` rather
            than a pair; kept for backward compatibility, confirm callers.
        """
        dn.renew_cg()

        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        alphas_mtx = []

        if len(input_seq) == 0:
            return []

        # encode input sequence
        blstm_outputs, input_masks = encoder.encode_batch([input_seq])

        # initialize the decoder rnn
        s = self.decoder_rnn.initial_state()

        # first decoder input: BEGIN_SEQ embedding concatenated with the
        # special initial padding vector
        prev_output_vec = dn.concatenate([
            self.output_lookup[self.y2int[common.BEGIN_SEQ]],
            self.init_lookup[0]
        ])
        predicted_sequence = []
        i = 0

        # run the decoder through the sequence and predict output symbols
        while (self.max_prediction_len is None) or (i <
                                                    self.max_prediction_len):

            # get current h of the decoder
            s = s.add_input(prev_output_vec)
            decoder_rnn_output = s.output()

            # perform attention step
            attention_output_vector, alphas = self.attend(
                blstm_outputs, decoder_rnn_output)

            if self.plot:
                alphas_mtx.append(alphas.vec_value())

            # compute output probabilities
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # TODO: understand why diverse needs tanh before softmax
            if self.diverse:
                h = dn.tanh(h)
            probs = dn.softmax(h)

            # find best candidate output - greedy
            next_element_index = np.argmax(probs.npvalue())
            predicted_sequence.append(self.int2y[next_element_index])

            # check if reached end of sequence
            if predicted_sequence[-1] == common.END_SEQ:
                break

            # prepare for the next iteration - "feedback"
            prev_output_vec = dn.concatenate([
                self.output_lookup[next_element_index], attention_output_vector
            ])
            i += 1

        # Remove the end symbol only when it is really there: if decoding
        # stopped at the length limit, the last element is a real prediction.
        if predicted_sequence and predicted_sequence[-1] == common.END_SEQ:
            predicted_sequence = predicted_sequence[:-1]
        return predicted_sequence, alphas_mtx
Ejemplo n.º 21
0
 def associate_parameters(self):
     """Wrap the stored parameters and create the two initial hidden states.

     ``_Ws``, ``_Us`` and ``_bs`` are passed through ``dy.parameter``;
     ``hf_0`` and ``hb_0`` are zero expressions of size ``self.hid_dim``.
     """
     for name in ('Ws', 'Us', 'bs'):
         setattr(self, name, dy.parameter(getattr(self, '_' + name)))
     self.hf_0 = dy.zeroes(self.hid_dim)
     self.hb_0 = dy.zeroes(self.hid_dim)
Ejemplo n.º 22
0
    def predict_beamsearch(self, encoder, input_seq):
        """Decode ``input_seq`` with beam search.

        Each step expands every unfinished hypothesis of the previous beam
        with its ``self.beam_size`` best symbols. Finished expansions go to
        the final states; the rest compete for the next beam.

        :param encoder: object providing ``encode_batch``.
        :param input_seq: the input sequence to decode from.
        :return: ``(n-best list of (sequence, probability), attention
            weights)``; the attention list is only filled when ``self.plot``
            is truthy. An empty ``input_seq`` returns a bare ``[]``.
        """
        if len(input_seq) == 0:
            return []

        dn.renew_cg()

        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        alphas_mtx = []

        # encode input sequence
        blstm_outputs, input_masks = encoder.encode_batch([input_seq])

        # complete sequences and their probabilities
        final_states = []

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # holds beam step index mapped to (sequence, probability, decoder state, attn_vector) tuples
        beam = {-1: [([common.BEGIN_SEQ], 1.0, s_0, self.init_lookup[0])]}
        i = 0

        # expand another step if didn't reach max length and there's still beams to expand
        while ((self.max_prediction_len is None) or
               (i < self.max_prediction_len)) and len(beam[i - 1]) > 0:

            # create all expansions from the previous beam:
            new_hypos = []
            for hypothesis in beam[i - 1]:
                prefix_seq, prefix_prob, prefix_decoder, prefix_attn = hypothesis
                last_hypo_symbol = prefix_seq[-1]

                # cant expand finished sequences
                if last_hypo_symbol == common.END_SEQ:
                    continue

                # expand from the last symbol of the hypothesis
                try:
                    prev_output_vec = self.output_lookup[
                        self.y2int[last_hypo_symbol]]
                except KeyError:
                    # Not a known symbol. The single-argument call form
                    # prints the same text on Python 2 and parses on Python 3.
                    print('impossible to expand, key error: ' + str(
                        last_hypo_symbol))
                    continue

                decoder_input = dn.concatenate([prev_output_vec, prefix_attn])
                s = prefix_decoder.add_input(decoder_input)
                decoder_rnn_output = s.output()

                # perform attention step
                attention_output_vector, alphas = self.attend(
                    blstm_outputs, decoder_rnn_output)

                # save attention weights for plotting
                # TODO: add attention weights properly to allow building the attention matrix for the best path
                if self.plot:
                    val = alphas.vec_value()
                    alphas_mtx.append(val)

                # compute output probabilities
                # h = readout * attention_output_vector + bias
                h = dn.affine_transform(
                    [self.bias, self.readout, attention_output_vector])

                # TODO: understand why diverse needs tanh before softmax
                if self.diverse:
                    h = dn.tanh(h)
                probs = dn.softmax(h)
                probs_val = probs.npvalue()

                # TODO: maybe should choose nbest from all expansions and not only from nbest of each hypothesis?
                # find best candidate outputs
                n_best_indices = common.argmax(probs_val, self.beam_size)
                for index in n_best_indices:
                    p = probs_val[index]
                    new_seq = prefix_seq + [self.int2y[index]]
                    new_prob = prefix_prob * p
                    if new_seq[-1] == common.END_SEQ or (
                        (self.max_prediction_len is not None) and
                        (i == self.max_prediction_len - 1)):
                        # TODO: add to final states only if fits in k best?
                        # complete sequence or max length: add to final
                        # states without the first and last symbol.
                        # NOTE(review): at max length the last symbol is not
                        # END_SEQ but is dropped as well -- confirm intent.
                        final_states.append((new_seq[1:-1], new_prob))
                    else:
                        new_hypos.append(
                            (new_seq, new_prob, s, attention_output_vector))

            # add the most probable expansions from all hypotheses to the beam
            new_probs = np.array([p for (s, p, r, a) in new_hypos])
            argmax_indices = common.argmax(new_probs, self.beam_size)
            beam[i] = [new_hypos[l] for l in argmax_indices]
            i += 1

        # get nbest results from final states found in search
        final_probs = np.array([p for (s, p) in final_states])
        argmax_indices = common.argmax(final_probs, self.beam_size)
        nbest_seqs = [final_states[l] for l in argmax_indices]

        return nbest_seqs, alphas_mtx
Ejemplo n.º 23
0
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                                   output_word_ids, output_masks, batch_size):
        """Build the summed decoder loss for one batch.

        The decoder is run over ``output_word_ids`` one step at a time,
        feeding back the gold symbols together with the attention output.

        :param encoded_inputs: encoder outputs, forwarded to ``self.attend``.
        :param input_masks: input masks, forwarded to ``self.attend``.
        :param output_word_ids: iterable with the gold ids of each step.
        :param output_masks: per-step masks; a step is masked when its last
            entry differs from 1. May be falsy to skip masking.
        :param batch_size: number of sequences in the batch.
        :return: loss expression summed over time steps and batch.
        """
        for name in ('readout', 'bias', 'w_c', 'u_a', 'v_a', 'w_a'):
            setattr(self, name, dn.parameter(self.params[name]))

        initial_state = self.decoder_rnn.initial_state()

        # Initial "input feeding" vectors and begin-of-sequence embeddings,
        # one per batch element.
        feeding_seed = dn.lookup_batch(self.init_lookup, [0] * batch_size)
        begin_ids = [self.y2int[common.BEGIN_SEQ]] * batch_size
        feedback_seed = dn.lookup_batch(self.output_lookup, begin_ids)

        state = initial_state.add_input(
            dn.concatenate([feedback_seed, feeding_seed]))

        step_losses = []
        for step, gold_ids in enumerate(output_word_ids):
            rnn_output = state.output()

            # Attention context for every sequence in the batch.
            context, _alphas = self.attend(encoded_inputs, rnn_output,
                                           input_masks)

            # scores = readout * context + bias
            scores = dn.affine_transform([self.bias, self.readout, context])

            if self.diverse:
                # Encourage diversity by punishing highly confident
                # predictions.
                # TODO: support batching - esp. w.r.t. scalar inputs
                soft = dn.softmax(dn.tanh(scores))
                step_loss = dn.pick_batch(-dn.log(soft), gold_ids)
                step_loss = step_loss - dn.log(
                    dn.scalarInput(1) - dn.pick_batch(soft, gold_ids))
                step_loss = step_loss - dn.log(dn.scalarInput(4))
            else:
                step_loss = dn.pickneglogsoftmax_batch(scores, gold_ids)

            # Mask the loss if at least one sequence is shorter.
            if output_masks and output_masks[step][-1] != 1:
                mask = dn.inputVector(output_masks[step])
                # noinspection PyArgumentList
                mask = dn.reshape(mask, (1, ), batch_size)
                step_loss = step_loss * mask

            # Input feeding: the gold embeddings plus the attention output
            # form the next decoder input.
            feedback = dn.lookup_batch(self.output_lookup, gold_ids)
            state = state.add_input(dn.concatenate([feedback, context]))

            step_losses.append(step_loss)

        # Sum over time steps first, then over the batch.
        return dn.sum_batches(dn.esum(step_losses))
Ejemplo n.º 24
0
 def associate_parameters(self):
     """Wrap the stored ``_W`` and ``_b`` parameters as ``self.W``/``self.b``."""
     for name in ('W', 'b'):
         setattr(self, name, dy.parameter(getattr(self, '_' + name)))
Ejemplo n.º 25
0
    def predict(self,
                word_indices,
                char_indices,
                task_id,
                train=False,
                soft_labels=False,
                temperature=None,
                orthogonality_weight=0.0,
                domain_id=None):
        """
        Predict tags for a sentence represented as char+word embeddings.

        :param word_indices: indices into ``self.wembeds``, one per token.
        :param char_indices: per-token lists of indices into
            ``self.cembeds``; only read when ``self.c_in_dim > 0``.
        :param task_id: key of the output layer to use; ``"src"`` or
            ``"trg"`` produce one output per entry of ``self.task_ids``.
        :param train: add noise to the features (training time only).
        :param soft_labels: forwarded to the output predictors.
        :param temperature: forwarded to the output predictors.
        :param orthogonality_weight: a non-zero value enables the
            orthogonality constraint between F0 and F1 (skipped for "Ft").
        :param domain_id: Predict adversarial loss if domain id is provided.
        :return: ``(output, constraint, adv_loss)``; ``output`` is a list of
            per-task outputs when ``task_id`` is ``"src"`` or ``"trg"``.
        :raises Exception: when no layer produced an output.
        """
        dynet.renew_cg()  # new graph

        word_features = [self.wembeds[w] for w in word_indices]

        if self.c_in_dim > 0:
            fwd_char_states = []
            bwd_char_states = []
            for chars_of_token in char_indices:
                char_feats = [self.cembeds[c] for c in chars_of_token]
                # The last state of each direction represents the token.
                f_char, b_char = self.char_rnn.predict_sequence(
                    char_feats, char_feats)
                fwd_char_states.append(f_char[-1])
                bwd_char_states.append(b_char[-1])

            features = [
                dynet.concatenate([word, fwd, bwd]) for word, fwd, bwd in zip(
                    word_features, fwd_char_states, bwd_char_states)
            ]
        else:
            features = word_features

        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        # The output is produced by the last hidden layer.
        final_layer = self.h_layers - 1

        forward_input = features
        backward_input = features
        constraint = 0
        adv_loss = 0
        for layer in range(0, self.h_layers):
            predictor = self.predictors["inner"][layer]
            forward_sequence, backward_sequence = predictor.predict_sequence(
                forward_input, backward_input)
            if layer > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [
                    self.activation(state) for state in forward_sequence
                ]
                backward_sequence = [
                    self.activation(state) for state in backward_sequence
                ]

            if layer != final_layer:
                forward_input = forward_sequence
                backward_input = backward_sequence
                continue

            # Pair each forward state with the backward state of the same
            # token (the backward sequence is reversed first).
            concat_layer = [
                dynet.concatenate([fwd, bwd])
                for fwd, bwd in zip(forward_sequence,
                                    reversed(backward_sequence))
            ]
            if train and self.noise_sigma > 0.0:
                concat_layer = [
                    dynet.noise(fe, self.noise_sigma) for fe in concat_layer
                ]

            output_layers = self.predictors["output_layers_dict"]
            if task_id not in ["src", "trg"]:
                output = output_layers[task_id].predict_sequence(
                    concat_layer,
                    soft_labels=soft_labels,
                    temperature=temperature)
            else:
                # one src example for all outputs: a list in this case
                output = [
                    output_layers[t_id].predict_sequence(
                        concat_layer,
                        soft_labels=soft_labels,
                        temperature=temperature)
                    for t_id in self.task_ids
                ]

            if orthogonality_weight != 0 and task_id != "Ft":
                # The orthogonality constraint sits on the hidden (MLP)
                # weights when ``add_hidden`` is set, otherwise on the output
                # weights; it is only applied between F0 and F1.
                builder = output_layers["F0"].network_builder
                task_param = builder.W_mlp if self.add_hidden else builder.W
                task_W = dynet.parameter(task_param)

                builder = output_layers["F1"].network_builder
                other_param = builder.W_mlp if self.add_hidden else builder.W
                other_task_W = dynet.parameter(other_param)

                # Squared Frobenius norm of F0^T * F1: square every element
                # of the product and sum them.
                product = dynet.transpose(task_W) * other_task_W
                constraint = dynet.sum_elems(dynet.square(product))

            if domain_id is not None:
                # flip the gradient when back-propagating through here
                adv_input = dynet.flip_gradient(concat_layer[-1])  # last state
                adv_output = self.adv_layer(adv_input)
                adv_loss = self.pick_neg_log(adv_output, domain_id)

            # output is list if task_id = 'src'
            return output, constraint, adv_loss

        raise Exception("oops should not be here")
Ejemplo n.º 26
0
 def get_graph(self, embedding):
     """Build a fresh graph computing ``U * tanh(W * embedding)``.

     :param embedding: value passed to ``dy.inputTensor``.
     :return: the output expression of the two-layer network.
     """
     dy.renew_cg()
     input_weights = dy.parameter(self.pW)
     output_weights = dy.parameter(self.pU)
     hidden = dy.tanh(input_weights * dy.inputTensor(embedding))
     return output_weights * hidden