Example #1
 def score(self, features, axis):
     """
     Calculate score for each label
     :param features: extracted feature values, of size input_size
     :param axis: axis of the label we are predicting
     :return: array with score for each label
     """
     super().score(features, axis)
     num_labels = self.num_labels[axis]
     if self.updates > 0 and num_labels > 1:
         if dynet_config.gpu():
             # RestrictedLogSoftmax is not implemented for GPU, so we move the value to CPU first
             value = dy.to_device(self.evaluate(features, axis), 'CPU')
             # then, we move it back to GPU (if the device name is '', the default device will be selected)
             value = dy.to_device(
                 dy.log_softmax(value, restrict=list(range(num_labels))),
                 '').npvalue()
         else:
             value = dy.log_softmax(self.evaluate(features, axis),
                                    restrict=list(
                                        range(num_labels))).npvalue()
         return value[:num_labels]
     self.config.print("  no updates done yet, returning zero vector.",
                       level=4)
     return np.zeros(num_labels)
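
The GPU branch above exists only because DyNet's restricted log-softmax has no GPU kernel: the expression is moved to the CPU, the restricted op is applied there, and the result is moved back. A minimal, self-contained sketch of that round trip (the vector size and the number of restricted labels are made up for illustration):

import numpy as np
import dynet as dy

dy.renew_cg()
scores = dy.inputTensor(np.random.rand(5))           # stand-in for evaluate(features, axis)
cpu_scores = dy.to_device(scores, 'CPU')             # restricted log_softmax runs on CPU only
log_probs = dy.log_softmax(cpu_scores, restrict=list(range(3)))
back = dy.to_device(log_probs, '')                   # '' selects the default device again
print(back.npvalue()[:3])
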
 def predict_chunks_by_tokens(self, w_t, chunk_batch):
     ender = [self.lattice_vocab.chunk_end.i] * self.BATCH_SIZE
     lps = []
     state = self.lattice_rnn.initial_state(dropout=self.DROPOUT)
     cs = [[self.lattice_vocab.chunk_start.i] * self.BATCH_SIZE
           ] + chunk_batch
     cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
     for i, (cc, nc) in enumerate(zip(cs, cs[1:])):
         if self.args.concat_context_vector:
             x_t = dynet.pick_batch(self.vocab_R, cc)
             state.add_input(x_t)
         else:
             if i == 0:
                 state.add_input(self.project_main_to_lattice_init_R * w_t)
             else:
                 x_t = dynet.pick_batch(self.vocab_R, cc)
                 state.add_input(x_t)
         y_t = state.output()
         y_t = dynet.to_device(y_t, self.args.param_device)
         if self.DROPOUT:
             y_t = dynet.cmult(y_t, self.dropout_mask_lattice_y_t)
         if self.args.concat_context_vector:
             y_t = dynet.concatenate([y_t, w_t])
         r_t = dynet.affine_transform([
             self.vocab_bias, self.vocab_R,
             dynet.tanh(
                 dynet.affine_transform(
                     [self.lattice_bias, self.lattice_R, y_t]))
         ])
         if i > 0:
             lps.append(cum_lp + -dynet.pickneglogsoftmax_batch(r_t, ender))
         cum_lp = cum_lp + -dynet.pickneglogsoftmax_batch(r_t, nc)
     lps.append(cum_lp)
     return lps
 def compress_chunk(self, chunks, masks=None):
     compression_batch_size = len(chunks[0])
     # token_embeddings = [dynet.reshape(dynet.select_cols(self.vocab_lookup, tokens), (self.args.dim,), compression_batch_size)
     # token_embeddings = [dynet.reshape(dynet.transpose(dynet.select_rows(self.vocab_R, tokens)), (self.args.dim,), compression_batch_size)
     token_embeddings = [
         dynet.pick_batch(self.vocab_R, tokens) for tokens in chunks
     ]
     fwd_state = self.lattice_fwd_comp_rnn.initial_state(
         mb_size=compression_batch_size, dropout=self.DROPOUT)
     bwd_state = self.lattice_bwd_comp_rnn.initial_state(
         mb_size=compression_batch_size, dropout=self.DROPOUT)
     if masks is None:
         fwd_emb = fwd_state.transduce(token_embeddings)[-1]
         bwd_emb = bwd_state.transduce(list(reversed(token_embeddings)))[-1]
     else:
         masks = [
             dynet.inputTensor(
                 mask, batched=True, device=self.args.param_device)
             if min(mask) == 0 else None for mask in masks
         ]
         fwd_emb = fwd_state.transduce(token_embeddings, masks)[-1]
         bwd_emb = bwd_state.transduce(reversed(token_embeddings),
                                       reversed(masks))[-1]
     emb = dynet.concatenate([fwd_emb, bwd_emb])
     emb = dynet.to_device(emb, self.args.param_device)
     return emb
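
For comparison, here is a rough sketch of the same forward/backward chunk compression using DyNet's built-in LSTMBuilder instead of the project's custom RNN classes; the layer sizes, sequence length, and dummy embeddings are assumptions for illustration only.

import dynet as dy

m = dy.ParameterCollection()
fwd = dy.LSTMBuilder(1, 16, 32, m)                   # 1 layer, 16-dim inputs, 32-dim state
bwd = dy.LSTMBuilder(1, 16, 32, m)

dy.renew_cg()
token_embeddings = [dy.inputTensor([0.0] * 16) for _ in range(4)]   # dummy chunk of 4 tokens
fwd_emb = fwd.initial_state().transduce(token_embeddings)[-1]
bwd_emb = bwd.initial_state().transduce(list(reversed(token_embeddings)))[-1]
emb = dy.concatenate([fwd_emb, bwd_emb])             # 64-dim chunk embedding
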
Example #4
 def score(self, features, axis):
     """
     Calculate score for each label
     :param features: extracted feature values, of size input_size
     :param axis: axis of the label we are predicting
     :return: array with score for each label
     """
     super().score(features, axis)
     num_labels = self.num_labels[axis]
     if self.updates > 0 and num_labels > 1:
         if dynet_config.gpu():
             # RestrictedLogSoftmax is not implemented for GPU, so we move the value to CPU first
             value = dy.to_device(self.evaluate(features, axis), 'CPU')
             # then, we move it back to GPU (if the device name is '', the default device will be selected)
             value = dy.to_device(dy.log_softmax(value, restrict=list(range(num_labels))), '').npvalue()
         else:
             value = dy.log_softmax(self.evaluate(features, axis), restrict=list(range(num_labels))).npvalue()
         return value[:num_labels]
     self.config.print("  no updates done yet, returning zero vector.", level=4)
     return np.zeros(num_labels)
 def calculate_c_t(self):
     if self.c_t is None:
         if len(self.c_t_sources) == 1:
             self.c_t = self.c_t_sources[0]
         elif self.path_dropout:
             self.c_t = self.c_t_sources[self.get_path(
                 [w.scalar_value() for w in self.weights])]
         else:
             self.c_t = dynet.concatenate_cols(
                 self.c_t_sources) * dynet.to_device(
                     dynet.softmax(self.weights), self.device)
     return self.c_t
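
The weighted branch of calculate_c_t is a softmax-weighted average of the candidate cell states, computed as a matrix-vector product. A tiny numeric sketch of that merge (two 2-dimensional source states with arbitrary values):

import dynet as dy

dy.renew_cg()
c_t_sources = [dy.inputTensor([0.1, 0.2]), dy.inputTensor([0.3, -0.1])]
weights = dy.softmax(dy.inputTensor([0.5, 1.5]))     # normalised merge weights
c_t = dy.concatenate_cols(c_t_sources) * weights     # (2x2 matrix) * (2-vector) -> 2-vector
print(c_t.npvalue())
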
    def process_batch(self, batch, training=False):
        self.TRAINING_ITER = training
        self.DROPOUT = self.args.dropout if (
            self.TRAINING_ITER and self.args.dropout > 0) else None
        self.BATCH_SIZE = len(batch)

        sents, masks = self.vocab.batchify(batch)

        self.instantiate_parameters()
        init_state = self.rnn.initial_state(mb_size=self.BATCH_SIZE,
                                            dropout=self.DROPOUT)

        # embeddings = [dynet.reshape(dynet.select_cols(self.vocab_lookup, toks), (self.args.dim,), self.BATCH_SIZE)
        # embeddings = [dynet.reshape(dynet.transpose(dynet.select_rows(self.vocab_R, toks)), (self.args.dim*2,), self.BATCH_SIZE)
        embeddings = [dynet.pick_batch(self.vocab_R, toks) for toks in sents]
        outputs = init_state.transduce(embeddings)
        outputs = [
            dynet.to_device(out, self.args.param_device) for out in outputs
        ]
        if self.DROPOUT:
            y_ts = [dynet.cmult(y_t, self.dropout_mask_y_t) for y_t in outputs]
        else:
            y_ts = outputs

        r_ts = [
            dynet.affine_transform([
                self.vocab_bias, self.vocab_R,
                dynet.tanh(dynet.affine_transform([self.bias, self.R, y_t]))
            ]) for y_t in y_ts
        ]
        errs = [
            dynet.pickneglogsoftmax_batch(r_t, toks)
            for r_t, toks in zip(r_ts, sents[1:])
        ]

        for tok_i, (err, mask) in enumerate(zip(errs, masks[1:])):
            if min(mask) == 0:
                errs[tok_i] = err * dynet.inputTensor(
                    mask, batched=True, device=self.args.param_device)

        err = dynet.esum(errs)
        char_count = [1 + len(self.vocab.pp(sent[1:-1])) for sent in batch]
        word_count = [len(sent[1:]) for sent in batch]
        # word_count = [2+self.vocab.pp(sent[1:-1]).count(' ') for sent in batch]
        return {"loss": err, "charcount": char_count, "wordcount": word_count}
Example #7
    def add_input(self, x_t, mask=None):
        x_t = dynet.to_device(x_t, self.device)
        if self.dropout is None:
            x_t = x_t
            h_t = self.h_t
            bias = self.bias
        else:
            x_t = dynet.cmult(x_t, self.dropout_mask_x)
            h_t = dynet.cmult(self.h_t, self.dropout_mask_h)
            bias = self.bias

        # calculate all information for all gates in one big matrix multiplication
        gates = self.W * dynet.concatenate([x_t, h_t, bias])

        # input gate
        i = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
        # forget gate
        f = 1.0 - i
        # output gate
        o = dynet.logistic(dynet.pickrange(gates, self.dim, self.dim * 2))
        # input modulation gate
        g = dynet.tanh(dynet.pickrange(gates, self.dim * 2, self.dim * 3))
        # cell state
        c_t = dynet.cmult(f, self.c_t) + dynet.cmult(i, g)
        # hidden state
        h_t = dynet.cmult(o, dynet.tanh(c_t))

        if mask is None:
            self.c_t = c_t
            self.h_t = h_t
        else:
            self.c_t = (c_t * mask) + (self.c_t * (1.0 - mask))
            self.h_t = (h_t * mask) + (self.h_t * (1.0 - mask))

        if self.next_layer is not None:
            self.next_layer.add_input(self.h_t, mask)
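
This cell couples the gates so that f = 1 - i: whatever fraction of the new candidate is written in is exactly the fraction forgotten from the old cell state. A minimal numeric sketch of that update rule outside the class (all vectors are made-up 3-dimensional values):

import dynet as dy

dy.renew_cg()
c_prev = dy.inputTensor([0.1, 0.3, -0.2])
i = dy.logistic(dy.inputTensor([0.2, -1.0, 0.5]))    # input gate
f = 1.0 - i                                          # coupled forget gate
o = dy.logistic(dy.inputTensor([0.3, 0.3, 0.3]))     # output gate
g = dy.tanh(dy.inputTensor([0.4, -0.6, 0.9]))        # input modulation
c_t = dy.cmult(f, c_prev) + dy.cmult(i, g)
h_t = dy.cmult(o, dy.tanh(c_t))
print(h_t.npvalue())
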
Example #8
pb = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

W = dy.parameter(pW)
b = dy.parameter(pb)
V = dy.parameter(pV)
a = dy.parameter(pa)

x = dy.vecInput(2, "GPU:0")
y = dy.scalarInput(0, "CPU")
h = dy.tanh((W * x) + b)
h_cpu = dy.to_device(h, "CPU")
if xsent:
    y_pred = dy.logistic((V * h_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0
    for mi in range(4):
        x1 = mi % 2
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)

x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1 * x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2 * h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V * h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0
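
The multi-device snippets above are cut off before the parameter setup and training loop are complete. For reference, a hedged, self-contained sketch of the same pattern with a single GPU layer; it assumes a DyNet build with a device named "GPU:0", and all sizes and names are illustrative rather than taken from the original script:

import dynet as dy

HIDDEN_SIZE = 8
m = dy.ParameterCollection()
trainer = dy.SimpleSGDTrainer(m)
pW = m.add_parameters((HIDDEN_SIZE, 2), device="GPU:0")
pb = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

dy.renew_cg()
W, b, V, a = dy.parameter(pW, pb, pV, pa)
x = dy.vecInput(2, "GPU:0")
y = dy.scalarInput(0, "CPU")
x.set([0, 1])
y.set(1)
h = dy.tanh((W * x) + b)              # hidden layer computed on the GPU
h_cpu = dy.to_device(h, "CPU")        # move it to the CPU for the output layer
loss = dy.binary_log_loss(dy.logistic((V * h_cpu) + a), y)
loss.value()
loss.backward()
trainer.update()
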
    def add_input(self, x_t):
        x_t = dynet.to_device(x_t, self.device)
        h_t = self.calculate_h_t()

        if self.dropout:
            x_t = dynet.cmult(x_t, self.dropout_mask_x)
            h_t = dynet.cmult(h_t, self.dropout_mask_h)

        # bias
        bias = self.bias

        # calculate all information for all gates in one big matrix multiplication
        gates = self.W * dynet.concatenate([x_t, h_t, bias])

        # input gate
        # i = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
        # output gate
        # o = dynet.logistic(dynet.pickrange(gates, self.dim, self.dim*2))
        # input modulation gate
        # g = dynet.tanh(dynet.pickrange(gates, self.dim*2, self.dim*3))

        # output gate
        o = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
        # input modulation gate
        g = dynet.tanh(dynet.pickrange(gates, self.dim, self.dim * 2))

        # forget gate
        Wfx = self.Wf * dynet.concatenate([x_t, bias])
        if len(self.h_t_sources) == 1 or self.path_dropout:
            if len(self.h_t_sources) == 1: idx = 0
            else: idx = self.get_path()
            c_t = self.c_t_sources[idx]

            f_k = dynet.logistic(Wfx + self.Uf * h_t)

            # input gate
            i = 1. - f_k

            # cell state
            c_t = dynet.cmult(f_k, c_t) + dynet.cmult(i, g)
        else:
            weights = dynet.to_device(dynet.softmax(self.weights), self.device)
            if self.dropout:
                f_k = [
                    dynet.logistic(Wfx + self.Uf *
                                   dynet.cmult(h, self.dropout_mask_h)) * w
                    for h, w in zip(self.h_t_sources, weights)
                ]
            else:
                f_k = [
                    dynet.logistic(Wfx + self.Uf * h) * w
                    for h, w in zip(self.h_t_sources, weights)
                ]

            # input gate
            i = 1. - dynet.esum(f_k)

            # cell state
            c_t = dynet.esum(
                [dynet.cmult(f, c)
                 for f, c in zip(f_k, self.c_t_sources)]) + dynet.cmult(i, g)

        # hidden state
        h_t = dynet.cmult(o, dynet.tanh(c_t))

        if self.next_layer is not None:
            c_stack, h_stack = self.next_layer.add_input(h_t)
            return [c_t] + c_stack, [h_t] + h_stack
        else:
            return [c_t], [h_t]
Example #11
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
  m.populate_from_textfile(sys.argv[1])

dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)

x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1*x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2*h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V*h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1 
    F = 0 
else:
    y_pred = (V*h2_cpu) + a 
    loss = dy.squared_distance(y_pred, y)
    T = 1 
    F = -1


for iter in range(ITERATIONS):
    def process_batch_internal(self, batch, training=False, debug=False):
        self.TRAINING_ITER = training
        self.DROPOUT = self.args.dropout if (
            self.TRAINING_ITER and self.args.dropout > 0) else None
        self.BATCH_SIZE = len(batch)
        self.instantiate_parameters()

        if self.args.use_cache: self.initialize_cache(batch)

        sents, masks = self.vocab.batchify(batch)

        # paths represent the different connections within the lattice. paths[i] contains all the state/chunk pairs that
        #  end at index i
        paths = [[] for _ in range(len(sents))]
        paths[0] = [(self.rnn.fresh_state(init_to_zero=True), sents[0],
                     dynet.scalarInput(0.0, device=self.args.param_device))]
        for tok_i in range(len(sents) - 1):
            # calculate the total probability of reaching this state
            _, _, lps = zip(*paths[tok_i])
            if len(lps) == 1:
                cum_lp = lps[0]
            else:
                cum_lp = dynet.logsumexp(list(lps))

            # add all previous state/chunk pairs to the tree_lstm
            new_state = self.rnn.fresh_state()
            if self.TRAINING_ITER and self.args.train_with_random and not self.first_time_memory_test:
                raise Exception("bruh")
            else:
                self.first_time_memory_test = False
                for state, c_t, lp in paths[tok_i]:
                    x_t = dynet.pick_batch(self.vocab_R, c_t)
                    h_t_stack, c_t_stack = state.add_input(x_t)
                    new_state.add_history(h_t_stack, c_t_stack, lp)

            # treeLSTM state merging
            new_state.concat_weights()
            if self.args.gumbel_sample:
                new_state.apply_gumbel_noise_to_weights(
                    temperature=max(.25, self.args.temperature))
                if not self.TRAINING_ITER or self.args.sample_train:
                    new_state.weights_to_argmax()
                # new_state.weights_to_argmax()

            # output of tree_lstm
            y_t = dynet.to_device(new_state.output(), self.args.param_device)
            if self.DROPOUT: y_t = dynet.cmult(y_t, self.dropout_mask_y_t)

            # get the list of next tokens to consider
            base_is = sents[tok_i + 1]
            n_ts = [[nt + (i * self.vocab.size) for nt in base_is]
                    for i in range(self.args.multi_size)]

            r_t = dynet.affine_transform([
                self.vocab_bias, self.vocab_R,
                dynet.tanh(dynet.affine_transform([self.bias, self.R, y_t]))
            ])
            for n_t in n_ts:
                lp = -dynet.pickneglogsoftmax_batch(r_t, n_t)
                paths[tok_i + 1].append((new_state, n_t, cum_lp + lp))

        ending_masks = [[0.0] * self.BATCH_SIZE for _ in range(len(masks))]
        for sent_i in range(len(batch)):
            ending_masks[batch[sent_i].index(
                self.vocab.end_token.s)][sent_i] = 1.0

        # put together all of the final path states to get the final error
        cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
        for path, mask in zip(paths, ending_masks):
            if max(mask) == 1:
                assert len(path) != 0
                _, _, lps = zip(*path)
                if len(lps) == 1:
                    local_cum_lp = lps[0]
                else:
                    local_cum_lp = dynet.logsumexp(list(lps))
                cum_lp += local_cum_lp * dynet.inputTensor(
                    mask, batched=True, device=self.args.param_device)

        if debug: return paths

        err = -cum_lp
        char_count = [1 + len(self.vocab.pp(sent[1:-1])) for sent in batch]
        word_count = [len(sent[1:]) for sent in batch]
        # word_count = [2+self.lattice_vocab.pp(sent[1:-1]).count(' ') for sent in batch]
        return {"loss": err, "charcount": char_count, "wordcount": word_count}
    def process_batch_internal(self, batch, training=False, debug=False):
        self.TRAINING_ITER = training
        self.DROPOUT = self.args.dropout if (
            self.TRAINING_ITER and self.args.dropout > 0) else None
        self.BATCH_SIZE = len(batch)
        self.instantiate_parameters()

        if self.args.use_cache: self.initialize_cache(batch)

        sents, masks = self.lattice_vocab.batchify(batch)

        # paths represent the different connections within the lattice. paths[i] contains all the state/chunk pairs that
        #  end at index i
        paths = [[] for _ in range(len(sents))]
        paths[0] = [(self.rnn.fresh_state(init_to_zero=True), [sents[0]],
                     dynet.scalarInput(0.0, device=self.args.param_device))]
        for tok_i in range(len(sents) - 1):
            # calculate the total probability of reaching this state
            _, _, lps = zip(*paths[tok_i])
            if len(lps) == 1: cum_lp = lps[0]
            else: cum_lp = dynet.logsumexp(list(lps))

            # add all previous state/chunk pairs to the tree_lstm
            new_state = self.rnn.fresh_state()
            if self.TRAINING_ITER and self.args.train_with_random and not self.first_time_memory_test:
                state, c_t, lp = random.choice(paths[tok_i])
                if self.args.use_cache: x_t = self.cached_embedding_lookup(c_t)
                else: x_t = self.get_chunk_embedding(c_t)
                h_t_stack, c_t_stack = state.add_input(x_t)
                new_state.add_history(h_t_stack, c_t_stack, lp)
            else:
                self.first_time_memory_test = False
                for state, c_t, lp in paths[tok_i]:
                    if self.args.use_cache:
                        x_t = self.cached_embedding_lookup(c_t)
                    else:
                        x_t = self.get_chunk_embedding(c_t)
                    h_t_stack, c_t_stack = state.add_input(x_t)
                    new_state.add_history(h_t_stack, c_t_stack, lp)

            # treeLSTM state merging
            new_state.concat_weights()
            if self.args.gumbel_sample:
                new_state.apply_gumbel_noise_to_weights(
                    temperature=max(.25, self.args.temperature))
                if not self.TRAINING_ITER: new_state.weights_to_argmax()
                # new_state.weights_to_argmax()

            # output of tree_lstm
            y_t = new_state.output()
            y_t = dynet.to_device(y_t, self.args.param_device)
            if self.DROPOUT: y_t = dynet.cmult(y_t, self.dropout_mask_y_t)

            # based on lattice_size, decide what set of chunks to consider from here
            if self.args.lattice_size < 1: end_tok_i = len(sents)
            else:
                end_tok_i = min(tok_i + 1 + self.args.lattice_size, len(sents))
            next_chunks = sents[tok_i + 1:end_tok_i]

            # for each chunk, calculate the probability of that chunk, and then add a pointer to the state/chunk into
            #  the place in the sentence where the chunk will end
            assert not (self.args.no_fixed_preds
                        and self.args.no_dynamic_preds)
            if not self.args.no_fixed_preds:
                fixed_chunk_lps, use_dynamic_lp = self.predict_chunks(
                    y_t, next_chunks)
            if not self.args.no_dynamic_preds:
                dynamic_chunk_lps = self.predict_chunks_by_tokens(
                    y_t, next_chunks)
            for chunk_i, tok_loc in enumerate(range(tok_i + 1, end_tok_i)):
                if self.args.no_fixed_preds:
                    lp = dynamic_chunk_lps[chunk_i]
                elif self.args.no_dynamic_preds:
                    lp = fixed_chunk_lps[chunk_i]
                else:  # we are using both fixed & dynamic predictions
                    lp = dynet.logsumexp([
                        fixed_chunk_lps[chunk_i],
                        use_dynamic_lp + dynamic_chunk_lps[chunk_i]
                    ])
                paths[tok_loc].append(
                    (new_state, sents[tok_i + 1:tok_loc + 1], cum_lp + lp))

        ending_masks = [[0.0] * self.BATCH_SIZE for _ in range(len(masks))]
        for sent_i in range(len(batch)):
            ending_masks[batch[sent_i].index(
                self.lattice_vocab.end_token.s)][sent_i] = 1.0

        # put together all of the final path states to get the final error
        cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
        for path, mask in zip(paths, ending_masks):
            if max(mask) == 1:
                assert len(path) != 0
                _, _, lps = zip(*path)
                if len(lps) == 1: local_cum_lp = lps[0]
                else: local_cum_lp = dynet.logsumexp(list(lps))
                cum_lp += local_cum_lp * dynet.inputTensor(
                    mask, batched=True, device=self.args.param_device)

        if debug: return paths

        err = -cum_lp
        char_count = [
            1 + len(self.lattice_vocab.pp(sent[1:-1])) for sent in batch
        ]
        word_count = [len(sent[1:]) for sent in batch]
        # word_count = [2+self.lattice_vocab.pp(sent[1:-1]).count(' ') for sent in batch]
        return {"loss": err, "charcount": char_count, "wordcount": word_count}