Example #1
    def __init__(self, config, encodings, embeddings, runtime=False):
        self.config = config
        self.encodings = encodings
        # Bug in encodings - this will be removed after the UD Shared Task
        self.has_bug = False
        if self.encodings.char2int[' '] != 1:
            self.has_bug = True
            import sys
            sys.stdout.write("Detected encodings BUG!\n")
        self.embeddings = embeddings
        self.losses = []
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)
        self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings,
                                                  rnn_size=self.config.char_rnn_size,
                                                  rnn_layers=self.config.char_rnn_layers,
                                                  embeddings_size=self.config.char_embeddings,
                                                  model=self.model, runtime=runtime)
        self.word2lemma = {}

        self.upos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.upos2int), self.config.tag_embeddings_size))
        self.xpos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.xpos2int), self.config.tag_embeddings_size))
        self.attrs_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.attrs2int), self.config.tag_embeddings_size))
        self.char_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.char2int), self.config.char_embeddings))
        if runtime:
            self.rnn = dy.LSTMBuilder(self.config.rnn_layers,
                                      self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size,
                                      self.config.rnn_size,
                                      self.model)
        else:
            from utils import orthonormal_VanillaLSTMBuilder
            self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers,
                                                      self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size,
                                                      self.config.rnn_size,
                                                      self.model)

        # self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2))
        # self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size))
        # self.att_v = self.model.add_parameters((1, 200))

        self.start_lookup = self.model.add_lookup_parameters(
            (1, self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size))

        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 3, self.config.rnn_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 3))

        ofs = len(self.encodings.char2int)
        self.label2int = {}
        self.label2int['<EOS>'] = ofs
        self.label2int['<COPY>'] = ofs + 1
        self.label2int['<INC>'] = ofs + 2
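
For clarity, this is the output-vocabulary layout the constructor above sets up, using a toy vocabulary size (the number 100 is assumed purely for illustration):

num_chars = 100                        # pretend len(encodings.char2int) == 100
label2int = {'<EOS>': num_chars,       # 100
             '<COPY>': num_chars + 1,  # 101
             '<INC>': num_chars + 2}   # 102
# softmax_w therefore has num_chars + 3 == 103 rows: one per known character
# plus one per special label.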
Example #2
    def __init__(self, config, encodings, embeddings, runtime=False):
        self.config = config
        self.word_embeddings = embeddings
        self.encodings = encodings
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model,
                                      alpha=2e-3,
                                      beta_1=0.9,
                                      beta_2=0.9)

        from character_embeddings import CharacterNetwork
        self.encoder = CharacterNetwork(self.config.character_embeddings_size,
                                        encodings,
                                        self.config.encoder_size,
                                        self.config.encoder_layers,
                                        self.config.character_embeddings_size,
                                        self.model,
                                        runtime=runtime)

        self.decoder = dy.VanillaLSTMBuilder(self.config.decoder_layers,
                                             self.config.encoder_size * 2,
                                             self.config.decoder_size,
                                             self.model)
        self.decoder_start_lookup = self.model.add_lookup_parameters(
            (1, self.config.encoder_size * 2))

        self.att_w1 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2,
             self.config.encoder_size * 2))
        self.att_w2 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2,
             self.config.decoder_size))
        self.att_v = self.model.add_parameters(
            (1, self.config.character_embeddings_size * 2))

        self.softmax_w = self.model.add_parameters(
            (len(self.encodings.char2int) + 4, self.config.decoder_size)
        )  # all known characters (digits excluded) plus the <COPY>, <INC>, <TOK> and <EOS> labels
        self.softmax_b = self.model.add_parameters(
            (len(self.encodings.char2int) + 4))

        self.softmax_comp_w = self.model.add_parameters(
            (2, self.config.character_embeddings_size))
        self.softmax_comp_b = self.model.add_parameters((2))

        self.label2int = {}
        ofs = len(self.encodings.char2int)
        self.label2int['<EOS>'] = ofs
        self.label2int['<TOK>'] = ofs + 1
        self.label2int['<COPY>'] = ofs + 2
        self.label2int['<INC>'] = ofs + 3

        self.losses = []
Example #3
    def __init__(self, lemmatizer_config, encodings, embeddings, runtime=False):
        self.config = lemmatizer_config
        self.encodings = encodings
        # Bug in encodings - this will be removed after UD Shared Task
        self.has_bug = False
        if self.encodings.char2int[' '] != 1:
            self.has_bug = True
        self.embeddings = embeddings
        self.losses = []

        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)

        self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings,
                                                  rnn_size=self.config.char_rnn_size,
                                                  rnn_layers=self.config.char_rnn_layers,
                                                  embeddings_size=self.config.char_embeddings,
                                                  model=self.model, runtime=runtime)

        self.upos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.upos2int), self.config.tag_embeddings_size))
        self.xpos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.xpos2int), self.config.tag_embeddings_size))
        self.attrs_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.attrs2int), self.config.tag_embeddings_size))
        self.char_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.char2int), self.config.char_embeddings))

        if runtime:
            self.rnn = dy.LSTMBuilder(self.config.rnn_layers,
                                      self.config.char_rnn_size * 2 + self.config.char_embeddings, self.config.rnn_size,
                                      self.model)
        else:
            from utils import orthonormal_VanillaLSTMBuilder
            self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers,
                                                      self.config.char_rnn_size * 2 + self.config.char_embeddings,
                                                      self.config.rnn_size,
                                                      self.model)

        self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2))
        self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size))
        self.att_v = self.model.add_parameters((1, 200))

        self.start_lookup = self.model.add_lookup_parameters(
            (1, self.config.char_rnn_size * 2 + self.config.char_embeddings))

        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 1, self.config.rnn_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 1))
        self.softmax_casing_w = self.model.add_parameters((2, self.config.rnn_size))
        self.softmax_casing_b = self.model.add_parameters((2))
Example #4
class CompoundWordExpander:
    def __init__(self, config, encodings, embeddings, runtime=False):
        self.config = config
        self.word_embeddings = embeddings
        self.encodings = encodings
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model,
                                      alpha=2e-3,
                                      beta_1=0.9,
                                      beta_2=0.9)

        from character_embeddings import CharacterNetwork
        self.encoder = CharacterNetwork(self.config.character_embeddings_size,
                                        encodings,
                                        self.config.encoder_size,
                                        self.config.encoder_layers,
                                        self.config.character_embeddings_size,
                                        self.model,
                                        runtime=runtime)

        self.decoder = dy.VanillaLSTMBuilder(self.config.decoder_layers,
                                             self.config.encoder_size * 2,
                                             self.config.decoder_size,
                                             self.model)
        self.decoder_start_lookup = self.model.add_lookup_parameters(
            (1, self.config.encoder_size * 2))

        self.att_w1 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2,
             self.config.encoder_size * 2))
        self.att_w2 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2,
             self.config.decoder_size))
        self.att_v = self.model.add_parameters(
            (1, self.config.character_embeddings_size * 2))

        self.softmax_w = self.model.add_parameters(
            (len(self.encodings.char2int) + 4, self.config.decoder_size)
        )  # all known characters (digits excluded) plus the <COPY>, <INC>, <TOK> and <EOS> labels
        self.softmax_b = self.model.add_parameters(
            (len(self.encodings.char2int) + 4))

        self.softmax_comp_w = self.model.add_parameters(
            (2, self.config.character_embeddings_size))
        self.softmax_comp_b = self.model.add_parameters((2))

        self.label2int = {}
        ofs = len(self.encodings.char2int)
        self.label2int['<EOS>'] = ofs
        self.label2int['<TOK>'] = ofs + 1
        self.label2int['<COPY>'] = ofs + 2
        self.label2int['<INC>'] = ofs + 3

        self.losses = []

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) != 0:
            loss = dy.esum(self.losses)
            self.losses = []
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        dy.renew_cg()
        return total_loss

    def learn(self, seq):
        losses = []
        examples = self._get_examples(seq)

        for example in examples:
            y_pred, encoder_states = self._predict_is_compound_entry(
                example.source, runtime=False)
            if not example.should_expand:
                losses.append(-dy.log(dy.pick(y_pred, 0)))
            else:
                losses.append(-dy.log(dy.pick(y_pred, 1)))
                losses.append(
                    self._learn_transduction(example.source,
                                             example.destination,
                                             encoder_states))
        loss = dy.esum(losses)
        self.losses.append(loss)

    def _compute_transduction_states(self, source, destination):
        a = np.zeros((len(source) + 1, len(destination) + 1))
        for i in xrange(len(source) + 1):
            a[i, 0] = i

        for i in xrange(len(destination) + 1):
            a[0, i] = i

        for i in xrange(1, len(source) + 1):
            for j in xrange(1, len(destination) + 1):
                cost = 0
                if source[i - 1] != destination[j - 1]:
                    cost = 1
                m = min([a[i - 1, j - 1], a[i - 1, j], a[i, j - 1]])
                a[i, j] = m + cost

        alignments = [-1] * len(destination)

        i = len(source)
        j = len(destination)
        while i > 1 or j > 1:
            if source[i - 1] == destination[j - 1]:
                alignments[j - 1] = i - 1
            if i == 1:
                j -= 1
            elif j == 1:
                i -= 1
            else:
                if (a[i - 1, j - 1] <= a[i - 1, j]
                        and a[i - 1, j - 1] <= a[i, j - 1]):
                    i -= 1
                    j -= 1
                elif (a[i - 1, j] <= a[i - 1, j - 1]
                      and a[i - 1, j] <= a[i, j - 1]):
                    i -= 1
                else:
                    j -= 1
        if source[i - 1] == destination[j - 1]:
            alignments[j - 1] = i - 1

        y_pred = []
        index_src = 0
        index_dst = 0
        while index_dst < len(destination):
            if alignments[index_dst] == index_src:
                y_pred.append("<COPY>")
                index_dst += 1
            elif alignments[index_dst] == -1:
                if destination[index_dst] == "\t":
                    y_pred.append("<TOK>")
                    index_dst += 1
                else:
                    y_pred.append(destination[index_dst])
                    index_dst += 1
            else:
                y_pred.append("<INC>")
                index_src += 1

        y_pred.append("<EOS>")
        return y_pred

    def _attend(self, input_vectors, state):
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        v = self.att_v.expr()
        attention_weights = []

        w2dt = w2 * state.h()[-1]
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)

        attention_weights = dy.softmax(dy.concatenate(attention_weights))

        output_vectors = dy.esum([
            vector * attention_weight for vector, attention_weight in zip(
                input_vectors, attention_weights)
        ])

        return output_vectors
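
    # For reference, _attend implements standard additive (Bahdanau-style)
    # attention over the encoder states; in formula form (notation is mine,
    # not from the source):
    #
    #   e_i = v^T tanh(W_1 h_i^{enc} + W_2 h^{dec}),
    #   \alpha = softmax(e),   c = \sum_i \alpha_i h_i^{enc}
    #
    # where h_i^{enc} are the input_vectors, h^{dec} is the last hidden state
    # of the decoder LSTM, and c is the returned context vector.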

    def _decode(self, encoder_states, runtime=True, max_preds=-1):
        y_pred = []
        num_preds = 0
        lstm = self.decoder.initial_state().add_input(
            self.decoder_start_lookup[0])
        while num_preds < max_preds:
            input = self._attend(encoder_states, lstm)
            lstm = lstm.add_input(input)
            softmax_out = dy.softmax(self.softmax_w.expr() * lstm.output() +
                                     self.softmax_b.expr())
            y_pred.append(softmax_out)
            num_preds += 1
            if max_preds == -1 or runtime:
                if np.argmax(softmax_out.npvalue()) == self.label2int['<EOS>']:
                    return y_pred
        return y_pred

    def _learn_transduction(self, source, destination, encoder_states):
        losses = []
        y_target = self._compute_transduction_states(source, destination)
        y_predicted = self._decode(encoder_states,
                                   runtime=False,
                                   max_preds=len(y_target))
        for y_real, y_pred in zip(y_target, y_predicted):
            if y_real in self.label2int:
                losses.append(-dy.log(dy.pick(y_pred, self.label2int[y_real])))
            else:
                if y_real in self.encodings.char2int:
                    losses.append(-dy.log(
                        dy.pick(y_pred, self.encodings.char2int[y_real])))
                # else:
                #    print source + "\t\t" + destination

        return dy.esum(losses)

    def _predict_is_compound_entry(self, word, runtime=True):
        emb, states = self.encoder.compute_embeddings(word, runtime=runtime)
        output = dy.softmax(self.softmax_comp_w.expr() * emb +
                            self.softmax_comp_b.expr())
        return output, states

    def _transduce(self, source, encoder_states):
        tokens = []
        y_pred = self._decode(encoder_states, runtime=True, max_preds=100)

        i_src = 0
        token = ""
        for y in y_pred:
            y = np.argmax(y.npvalue())
            if y == self.label2int['<INC>']:
                i_src += 1
            elif y == self.label2int['<COPY>']:
                if i_src < len(source):
                    token += source[i_src]
            elif y == self.label2int['<TOK>'] or y == self.label2int['<EOS>']:
                tokens.append(token)
                token = ""
            else:
                token += self.encodings.characters[y]

        return tokens
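
    # Toy walk-through of _transduce, expanding the French compound token "du"
    # into "de" + "le". The label sequence below is hypothetical (what the
    # decoder would have to predict), not a guaranteed model output:
    #
    #   source = u"du"
    #   decoder labels: ['<COPY>', 'e', '<TOK>', 'l', 'e', '<EOS>']
    #     <COPY>   -> copy source[0] ('d')      token = "d"
    #     'e'      -> emit literal character    token = "de"
    #     <TOK>    -> close current token       tokens = ["de"], token = ""
    #     'l', 'e' -> emit literal characters   token = "le"
    #     <EOS>    -> close last token          tokens = ["de", "le"]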

    def tag_token(self, word):
        dy.renew_cg()
        compound = False
        word = unicode(word, 'utf-8')
        tokens = []
        ce_out, encoder_states = self._predict_is_compound_entry(word,
                                                                 runtime=True)
        if np.argmax(ce_out.npvalue()) == 1:
            tokens = self._transduce(word, encoder_states)
            compound = True
        return compound, tokens

    def tag(self, seq):
        dy.renew_cg()
        new_seq = []
        index = 1
        for entry in seq:
            if not entry.is_compound_entry:
                ce_out, encoder_states = self._predict_is_compound_entry(
                    unicode(entry.word, 'utf-8'), runtime=True)
                if np.argmax(ce_out.npvalue()) == 0:
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    compounds = self._transduce(unicode(entry.word, 'utf-8'),
                                                encoder_states)
                    entry.index = str(index) + '-' + str(index +
                                                         len(compounds))
                    new_seq.append(entry)
                    for word in compounds:
                        from io_utils.conll import ConllEntry
                        entry = ConllEntry(index, word.encode('utf-8'),
                                           word.encode('utf-8'), '_', '_', '_',
                                           '0', '_', '_', '')
                        new_seq.append(entry)
                        index += 1

        return new_seq

    def _get_examples(self, seq):
        examples = []
        cww = 0
        for entry in seq:
            if cww == 0:
                et = ExpandedToken(source=unicode(entry.word, 'utf-8'))
                if entry.is_compound_entry:
                    et.should_expand = True
                    et.destination = u''
                    interval = entry.index
                    interval = interval.split("-")
                    stop = int(interval[1])
                    start = int(interval[0])
                    cww = stop - start + 1
                else:
                    et.destination = et.source
                    examples.append(et)
            else:
                et.destination += "\t" + unicode(entry.word, 'utf-8')
                cww -= 1
                if cww == 0:
                    et.destination = et.destination.strip()
                    examples.append(et)

        return examples

    def save(self, filename):
        self.model.save(filename)
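
A minimal training-loop sketch for the class above (the `config`, `encodings`, `embeddings` and `train_sentences` objects are assumed to exist and are not defined here; each sentence is a list of ConllEntry-like objects):

expander = CompoundWordExpander(config, encodings, embeddings, runtime=False)
for epoch in xrange(10):
    for sentence in train_sentences:
        expander.start_batch()             # renews the DyNet graph and clears accumulated losses
        expander.learn(sentence)           # accumulates expansion losses for this sentence
        batch_loss = expander.end_batch()  # backward pass + Adam update, returns the loss value
expander.save("expander.model")
# at prediction time, instantiate with runtime=True and call tag(seq) or tag_token(word)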
Example #5
    def __init__(self,
                 tagger_config,
                 encodings,
                 embeddings,
                 aux_softmax_weight=0.2,
                 runtime=False):
        self.config = tagger_config
        self.encodings = encodings
        self.embeddings = embeddings

        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(
            self.model, alpha=2e-3, beta_1=0.9,
            beta_2=0.9)  # dy.MomentumSGDTrainer(self.model)
        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(
            100,
            encodings,
            rnn_size=200,
            rnn_layers=1,
            embeddings_size=self.embeddings.word_embeddings_size,
            model=self.model,
            runtime=runtime)

        self.unknown_word_embedding = self.model.add_lookup_parameters(
            (1, self.embeddings.word_embeddings_size))
        self.holistic_word_embedding = self.model.add_lookup_parameters(
            (len(encodings.word2int), self.embeddings.word_embeddings_size))

        self.char_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))
        self.emb_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))
        self.hol_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))

        self.bdrnn_fw = []
        self.bdrnn_bw = []
        rnn_input_size = self.config.input_size  # self.embeddings.word_embeddings_size

        aux_softmax_input_size = 0
        index = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
                self.bdrnn_bw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
            else:
                self.bdrnn_fw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
                self.bdrnn_bw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_softmax_input_size = rnn_input_size

        self.mlps = []
        for _ in xrange(3):  # upos, xpos and attrs
            mlp_w = []
            mlp_b = []
            input_sz = self.config.layers[-1] * 2
            for l_size in self.config.presoftmax_mlp_layers:
                mlp_w.append(self.model.add_parameters((l_size, input_sz)))
                mlp_b.append(self.model.add_parameters((l_size)))
                input_sz = l_size
            self.mlps.append([mlp_w, mlp_b])

        softmax_input_size = self.config.presoftmax_mlp_layers[-1]
        self.softmax_upos_w = self.model.add_parameters(
            (len(self.encodings.upos2int), softmax_input_size))
        self.softmax_upos_b = self.model.add_parameters(
            (len(self.encodings.upos2int)))
        self.softmax_xpos_w = self.model.add_parameters(
            (len(self.encodings.xpos2int), softmax_input_size))
        self.softmax_xpos_b = self.model.add_parameters(
            (len(self.encodings.xpos2int)))
        self.softmax_attrs_w = self.model.add_parameters(
            (len(self.encodings.attrs2int), softmax_input_size))
        self.softmax_attrs_b = self.model.add_parameters(
            (len(self.encodings.attrs2int)))

        self.aux_softmax_upos_w = self.model.add_parameters(
            (len(self.encodings.upos2int), aux_softmax_input_size))
        self.aux_softmax_upos_b = self.model.add_parameters(
            (len(self.encodings.upos2int)))
        self.aux_softmax_xpos_w = self.model.add_parameters(
            (len(self.encodings.xpos2int), aux_softmax_input_size))
        self.aux_softmax_xpos_b = self.model.add_parameters(
            (len(self.encodings.xpos2int)))
        self.aux_softmax_attrs_w = self.model.add_parameters(
            (len(self.encodings.attrs2int), aux_softmax_input_size))
        self.aux_softmax_attrs_b = self.model.add_parameters(
            (len(self.encodings.attrs2int)))

        self.aux_softmax_weight = aux_softmax_weight
        self.losses = []
Example #6
class BDRNNTagger:
    def __init__(self,
                 tagger_config,
                 encodings,
                 embeddings,
                 aux_softmax_weight=0.2,
                 runtime=False):
        self.config = tagger_config
        self.encodings = encodings
        self.embeddings = embeddings

        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(
            self.model, alpha=2e-3, beta_1=0.9,
            beta_2=0.9)  # dy.MomentumSGDTrainer(self.model)
        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(
            100,
            encodings,
            rnn_size=200,
            rnn_layers=1,
            embeddings_size=self.embeddings.word_embeddings_size,
            model=self.model,
            runtime=runtime)

        self.unknown_word_embedding = self.model.add_lookup_parameters(
            (1, self.embeddings.word_embeddings_size))
        self.holistic_word_embedding = self.model.add_lookup_parameters(
            (len(encodings.word2int), self.embeddings.word_embeddings_size))

        self.char_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))
        self.emb_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))
        self.hol_proj_w = self.model.add_parameters(
            (self.config.input_size, self.embeddings.word_embeddings_size))

        self.bdrnn_fw = []
        self.bdrnn_bw = []
        rnn_input_size = self.config.input_size  # self.embeddings.word_embeddings_size

        aux_softmax_input_size = 0
        index = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
                self.bdrnn_bw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
            else:
                self.bdrnn_fw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
                self.bdrnn_bw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_softmax_input_size = rnn_input_size

        self.mlps = []
        for _ in xrange(3):  # upos, xpos and attrs
            mlp_w = []
            mlp_b = []
            input_sz = self.config.layers[-1] * 2
            for l_size in self.config.presoftmax_mlp_layers:
                mlp_w.append(self.model.add_parameters((l_size, input_sz)))
                mlp_b.append(self.model.add_parameters((l_size)))
                input_sz = l_size
            self.mlps.append([mlp_w, mlp_b])

        softmax_input_size = self.config.presoftmax_mlp_layers[-1]
        self.softmax_upos_w = self.model.add_parameters(
            (len(self.encodings.upos2int), softmax_input_size))
        self.softmax_upos_b = self.model.add_parameters(
            (len(self.encodings.upos2int)))
        self.softmax_xpos_w = self.model.add_parameters(
            (len(self.encodings.xpos2int), softmax_input_size))
        self.softmax_xpos_b = self.model.add_parameters(
            (len(self.encodings.xpos2int)))
        self.softmax_attrs_w = self.model.add_parameters(
            (len(self.encodings.attrs2int), softmax_input_size))
        self.softmax_attrs_b = self.model.add_parameters(
            (len(self.encodings.attrs2int)))

        self.aux_softmax_upos_w = self.model.add_parameters(
            (len(self.encodings.upos2int), aux_softmax_input_size))
        self.aux_softmax_upos_b = self.model.add_parameters(
            (len(self.encodings.upos2int)))
        self.aux_softmax_xpos_w = self.model.add_parameters(
            (len(self.encodings.xpos2int), aux_softmax_input_size))
        self.aux_softmax_xpos_b = self.model.add_parameters(
            (len(self.encodings.xpos2int)))
        self.aux_softmax_attrs_w = self.model.add_parameters(
            (len(self.encodings.attrs2int), aux_softmax_input_size))
        self.aux_softmax_attrs_b = self.model.add_parameters(
            (len(self.encodings.attrs2int)))

        self.aux_softmax_weight = aux_softmax_weight
        self.losses = []

    def tag(self, seq):
        dy.renew_cg()
        softmax_list, aux_softmax_list = self._predict(seq)
        label_list = []
        for softmax in softmax_list:
            label_list.append([
                self.encodings.upos_list[np.argmax(softmax[0].npvalue())],
                self.encodings.xpos_list[np.argmax(softmax[1].npvalue())],
                self.encodings.attrs_list[np.argmax(softmax[2].npvalue())]
            ])
        return label_list

    def learn(self, seq):
        # dy.renew_cg()
        softmax_list, aux_softmax_list = self._predict(seq, runtime=False)
        losses = []
        for entry, softmax, aux_softmax in zip(seq, softmax_list,
                                               aux_softmax_list):
            upos_index = self.encodings.upos2int[entry.upos]
            xpos_index = self.encodings.xpos2int[entry.xpos]
            attrs_index = self.encodings.attrs2int[entry.attrs]

            losses.append(-dy.log(dy.pick(softmax[0], upos_index)))
            losses.append(-dy.log(dy.pick(softmax[1], xpos_index)))
            losses.append(-dy.log(dy.pick(softmax[2], attrs_index)))
            losses.append(-dy.log(dy.pick(aux_softmax[0], upos_index)) *
                          (self.aux_softmax_weight / 3))
            losses.append(-dy.log(dy.pick(aux_softmax[1], xpos_index)) *
                          (self.aux_softmax_weight / 3))
            losses.append(-dy.log(dy.pick(aux_softmax[2], attrs_index)) *
                          (self.aux_softmax_weight / 3))

        # loss = dy.average(losses)
        # loss_val = loss.value()
        # loss.backward()
        # self.trainer.update()
        # return loss_val
        self.losses.append(dy.esum(losses))

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss_val = 0
        if len(self.losses) > 0:
            total_loss = dy.esum(self.losses)
            self.losses = []
            total_loss_val = total_loss.value()
            total_loss.backward()
            self.trainer.update()
        return total_loss_val

    def _predict(self, seq, runtime=True):
        softmax_list = []
        aux_softmax_list = []
        x_list = []
        for entry in seq:
            word = entry.word
            char_emb, _ = self.character_network.compute_embeddings(
                word, runtime=runtime)

            word_emb, found = self.embeddings.get_word_embeddings(
                word.decode('utf-8'))
            if not found:
                word_emb = self.unknown_word_embedding[0]
            else:
                word_emb = dy.inputVector(word_emb)

            holistic_word = word.decode('utf-8').lower()
            if holistic_word in self.encodings.word2int:
                hol_emb = self.holistic_word_embedding[
                    self.encodings.word2int[holistic_word]]
            else:
                hol_emb = self.holistic_word_embedding[
                    self.encodings.word2int['<UNK>']]
            proj_emb = self.emb_proj_w.expr() * word_emb
            proj_hol = self.hol_proj_w.expr() * hol_emb
            proj_char = self.char_proj_w.expr() * char_emb
            # x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))

            if runtime:
                x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
            else:
                p1 = random.random()
                p2 = random.random()
                p3 = random.random()
                m1 = 1
                m2 = 1
                m3 = 1
                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                if p3 < self.config.input_dropout_prob:
                    m3 = 0

                scale = 1.0
                if m1 + m2 + m3 > 0:
                    scale = float(3) / (m1 + m2 + m3)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)
                m3 = dy.scalarInput(m3)
                scale = dy.scalarInput(scale)
                x_list.append(
                    dy.tanh((proj_char * m1 + proj_emb * m2 + proj_hol * m3) *
                            scale))

        # BDLSTM
        rnn_outputs = []
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw,
                                   self.config.layer_dropouts):
            if not runtime:
                fw.set_dropouts(0, dropout)
                bw.set_dropouts(0, dropout)
            else:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(
                reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [
                dy.concatenate([x_fw, x_bw])
                for x_fw, x_bw in zip(fw_list, bw_list)
            ]
            # if runtime:
            #    x_out = x_list
            # else:
            #    x_out = [dy.dropout(x, dropout) for x in x_list]
            rnn_outputs.append(x_list)

        # SOFTMAX
        mlp_output = []
        for x in rnn_outputs[-1]:
            pre_softmax = []
            for iMLP in xrange(3):
                mlp_w = self.mlps[iMLP][0]
                mlp_b = self.mlps[iMLP][1]
                inp = x
                for w, b, drop in zip(mlp_w, mlp_b,
                                      self.config.presoftmax_mlp_dropouts):
                    inp = dy.tanh(w.expr() * inp + b.expr())
                    if not runtime:
                        inp = dy.dropout(inp, drop)
                pre_softmax.append(inp)
            mlp_output.append(pre_softmax)

        for softmax_inp, aux_softmax_inp in zip(
                mlp_output, rnn_outputs[self.config.aux_softmax_layer - 1]):
            softmax_list.append([
                dy.softmax(self.softmax_upos_w.expr() * softmax_inp[0] +
                           self.softmax_upos_b.expr()),
                dy.softmax(self.softmax_xpos_w.expr() * softmax_inp[1] +
                           self.softmax_xpos_b.expr()),
                dy.softmax(self.softmax_attrs_w.expr() * softmax_inp[2] +
                           self.softmax_attrs_b.expr())
            ])
            aux_softmax_list.append([
                dy.softmax(self.aux_softmax_upos_w.expr() * aux_softmax_inp +
                           self.aux_softmax_upos_b.expr()),
                dy.softmax(self.aux_softmax_xpos_w.expr() * aux_softmax_inp +
                           self.aux_softmax_xpos_b.expr()),
                dy.softmax(self.aux_softmax_attrs_w.expr() * aux_softmax_inp +
                           self.aux_softmax_attrs_b.expr())
            ])

        return softmax_list, aux_softmax_list

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)

    def tag_sequences(self, sequences):
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags = self.tag(new_sequence)
            for entryIndex, pred in enumerate(predicted_tags):
                new_sequence[entryIndex].upos = pred[0]
                new_sequence[entryIndex].xpos = pred[1]
                new_sequence[entryIndex].attrs = pred[2]
            new_sequences.append(new_sequence)
        return new_sequences
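
A minimal usage sketch for the tagger above (again, `config`, `encodings`, `embeddings` and `sentences` are assumed to exist; "tagger.model" stands for a hypothetical file produced earlier by `save`):

tagger = BDRNNTagger(config, encodings, embeddings, runtime=True)
tagger.load("tagger.model")               # populate previously trained parameters
tagged = tagger.tag_sequences(sentences)  # deep-copies each sentence and fills upos/xpos/attrs
for entry in tagged[0]:
    print entry.word + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs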
Example #7
    def __init__(self,
                 parser_config,
                 encodings,
                 embeddings,
                 aux_softmax_weight=0.2,
                 runtime=False):
        self.config = parser_config
        self.encodings = encodings
        self.embeddings = embeddings
        self.decoder = GreedyDecoder()

        self.model = dy.Model()

        # self.trainer = dy.SimpleSGDTrainer(self.model)
        self.trainer = dy.AdamTrainer(self.model,
                                      alpha=2e-3,
                                      beta_1=0.9,
                                      beta_2=0.9)

        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(
            100,
            encodings,
            rnn_size=200,
            rnn_layers=1,
            embeddings_size=self.config.input_embeddings_size,
            model=self.model,
            runtime=runtime)

        self.holistic_embeddings = self.model.add_lookup_parameters(
            (len(self.encodings.word2int), self.config.input_embeddings_size))

        self.input_proj_w_word = self.model.add_parameters(
            (self.config.input_embeddings_size,
             self.embeddings.word_embeddings_size))
        self.input_proj_b_word = self.model.add_parameters(
            (self.config.input_embeddings_size))

        self.unknown_word_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding lexical
        self.pad_tag_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding morphology

        self.bdrnn_fw = []
        self.bdrnn_bw = []

        rnn_input_size = 0
        if self.config.use_lexical:
            rnn_input_size += self.config.input_embeddings_size

        if self.config.use_morphology:
            rnn_input_size += self.config.input_embeddings_size
            self.upos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.upos2int),
                 self.config.input_embeddings_size))
            self.xpos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.xpos2int),
                 self.config.input_embeddings_size))
            self.attrs_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.attrs2int),
                 self.config.input_embeddings_size))

        index = 0
        aux_proj_input_size = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
                self.bdrnn_bw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
            else:
                self.bdrnn_fw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
                self.bdrnn_bw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_proj_input_size = rnn_input_size

        proj_input_size = self.config.layers[-1] * 2

        self.proj_arc_w_head = self.model.add_parameters(
            (self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_head = self.model.add_parameters(
            (self.config.arc_proj_size))
        self.proj_arc_w_dep = self.model.add_parameters(
            (self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_dep = self.model.add_parameters(
            (self.config.arc_proj_size))
        self.proj_label_w_head = self.model.add_parameters(
            (self.config.label_proj_size, proj_input_size))
        self.proj_label_b_head = self.model.add_parameters(
            (self.config.label_proj_size))
        self.proj_label_w_dep = self.model.add_parameters(
            (self.config.label_proj_size, proj_input_size))
        self.proj_label_b_dep = self.model.add_parameters(
            (self.config.label_proj_size))
        if not self.config.predict_morphology:
            self.aux_proj_arc_w_head = self.model.add_parameters(
                (self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_head = self.model.add_parameters(
                (self.config.arc_proj_size))
            self.aux_proj_arc_w_dep = self.model.add_parameters(
                (self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_dep = self.model.add_parameters(
                (self.config.arc_proj_size))
        else:
            self.upos_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.xpos_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.attrs_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.upos_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))
            self.xpos_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))
            self.attrs_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))

        self.link_b = self.model.add_parameters((1, self.config.arc_proj_size))
        self.link_w = self.model.add_parameters(
            (self.config.arc_proj_size, self.config.arc_proj_size))

        self.label_ww = self.model.add_parameters(
            (1, len(self.encodings.label2int)))
        self.label_w = self.model.add_parameters(
            (len(self.encodings.label2int), self.config.label_proj_size * 2))
        self.label_bb = self.model.add_parameters(
            (len(self.encodings.label2int)))

        if not self.config.predict_morphology:
            self.aux_link_w = self.model.add_parameters(
                (self.config.arc_proj_size, self.config.arc_proj_size))
            self.aux_link_b = self.model.add_parameters(
                (1, self.config.arc_proj_size))
        else:
            self.upos_softmax_w = self.model.add_parameters(
                (len(self.encodings.upos2int), self.config.label_proj_size))
            self.xpos_softmax_w = self.model.add_parameters(
                (len(self.encodings.xpos2int), self.config.label_proj_size))
            self.attrs_softmax_w = self.model.add_parameters(
                (len(self.encodings.attrs2int), self.config.label_proj_size))

            self.upos_softmax_b = self.model.add_parameters(
                (len(self.encodings.upos2int)))
            self.xpos_softmax_b = self.model.add_parameters(
                (len(self.encodings.xpos2int)))
            self.attrs_softmax_b = self.model.add_parameters(
                (len(self.encodings.attrs2int)))
            self.lemma_softmax_b = self.model.add_parameters(
                (len(self.encodings.char2int) + 1))
            self.lemma_softmax_casing_b = self.model.add_parameters((2))

        self.aux_softmax_weight = aux_softmax_weight
        self.batch_loss = []
Example #8
class BDRNNParser:
    def __init__(self,
                 parser_config,
                 encodings,
                 embeddings,
                 aux_softmax_weight=0.2,
                 runtime=False):
        self.config = parser_config
        self.encodings = encodings
        self.embeddings = embeddings
        self.decoder = GreedyDecoder()

        self.model = dy.Model()

        # self.trainer = dy.SimpleSGDTrainer(self.model)
        self.trainer = dy.AdamTrainer(self.model,
                                      alpha=2e-3,
                                      beta_1=0.9,
                                      beta_2=0.9)

        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(
            100,
            encodings,
            rnn_size=200,
            rnn_layers=1,
            embeddings_size=self.config.input_embeddings_size,
            model=self.model,
            runtime=runtime)

        self.holistic_embeddings = self.model.add_lookup_parameters(
            (len(self.encodings.word2int), self.config.input_embeddings_size))

        self.input_proj_w_word = self.model.add_parameters(
            (self.config.input_embeddings_size,
             self.embeddings.word_embeddings_size))
        self.input_proj_b_word = self.model.add_parameters(
            (self.config.input_embeddings_size))

        self.unknown_word_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding lexical
        self.pad_tag_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding morphology

        self.bdrnn_fw = []
        self.bdrnn_bw = []

        rnn_input_size = 0
        if self.config.use_lexical:
            rnn_input_size += self.config.input_embeddings_size

        if self.config.use_morphology:
            rnn_input_size += self.config.input_embeddings_size
            self.upos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.upos2int),
                 self.config.input_embeddings_size))
            self.xpos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.xpos2int),
                 self.config.input_embeddings_size))
            self.attrs_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.attrs2int),
                 self.config.input_embeddings_size))

        index = 0
        aux_proj_input_size = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
                self.bdrnn_bw.append(
                    dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size,
                                          self.model))
            else:
                self.bdrnn_fw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
                self.bdrnn_bw.append(
                    orthonormal_VanillaLSTMBuilder(1, rnn_input_size,
                                                   layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_proj_input_size = rnn_input_size

        proj_input_size = self.config.layers[-1] * 2

        self.proj_arc_w_head = self.model.add_parameters(
            (self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_head = self.model.add_parameters(
            (self.config.arc_proj_size))
        self.proj_arc_w_dep = self.model.add_parameters(
            (self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_dep = self.model.add_parameters(
            (self.config.arc_proj_size))
        self.proj_label_w_head = self.model.add_parameters(
            (self.config.label_proj_size, proj_input_size))
        self.proj_label_b_head = self.model.add_parameters(
            (self.config.label_proj_size))
        self.proj_label_w_dep = self.model.add_parameters(
            (self.config.label_proj_size, proj_input_size))
        self.proj_label_b_dep = self.model.add_parameters(
            (self.config.label_proj_size))
        if not self.config.predict_morphology:
            self.aux_proj_arc_w_head = self.model.add_parameters(
                (self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_head = self.model.add_parameters(
                (self.config.arc_proj_size))
            self.aux_proj_arc_w_dep = self.model.add_parameters(
                (self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_dep = self.model.add_parameters(
                (self.config.arc_proj_size))
        else:
            self.upos_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.xpos_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.attrs_proj_w = self.model.add_parameters(
                (self.config.label_proj_size, aux_proj_input_size))
            self.upos_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))
            self.xpos_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))
            self.attrs_proj_b = self.model.add_parameters(
                (self.config.label_proj_size))

        self.link_b = self.model.add_parameters((1, self.config.arc_proj_size))
        self.link_w = self.model.add_parameters(
            (self.config.arc_proj_size, self.config.arc_proj_size))

        self.label_ww = self.model.add_parameters(
            (1, len(self.encodings.label2int)))
        self.label_w = self.model.add_parameters(
            (len(self.encodings.label2int), self.config.label_proj_size * 2))
        self.label_bb = self.model.add_parameters(
            (len(self.encodings.label2int)))

        if not self.config.predict_morphology:
            self.aux_link_w = self.model.add_parameters(
                (self.config.arc_proj_size, self.config.arc_proj_size))
            self.aux_link_b = self.model.add_parameters(
                (1, self.config.arc_proj_size))
        else:
            self.upos_softmax_w = self.model.add_parameters(
                (len(self.encodings.upos2int), self.config.label_proj_size))
            self.xpos_softmax_w = self.model.add_parameters(
                (len(self.encodings.xpos2int), self.config.label_proj_size))
            self.attrs_softmax_w = self.model.add_parameters(
                (len(self.encodings.attrs2int), self.config.label_proj_size))

            self.upos_softmax_b = self.model.add_parameters(
                (len(self.encodings.upos2int)))
            self.xpos_softmax_b = self.model.add_parameters(
                (len(self.encodings.xpos2int)))
            self.attrs_softmax_b = self.model.add_parameters(
                (len(self.encodings.attrs2int)))
            self.lemma_softmax_b = self.model.add_parameters(
                (len(self.encodings.char2int) + 1))
            self.lemma_softmax_casing_b = self.model.add_parameters((2))

        self.aux_softmax_weight = aux_softmax_weight
        self.batch_loss = []

    def start_batch(self):
        dy.renew_cg()
        self.batch_loss = []

    def end_batch(self):
        if len(self.batch_loss) > 0:
            loss = dy.esum(self.batch_loss)
            loss_val = loss.value()
            loss.backward()
            self.trainer.update()
            return loss_val
        else:
            return 0

    def learn(self, seq):
        # remove compound words
        tmp = []
        for ss in seq:
            if not ss.is_compound_entry:
                tmp.append(ss)
        seq = tmp
        arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(
            seq, runtime=False)
        gold_heads = [entry.head for entry in seq]
        gold_labels = [entry.label for entry in seq]

        softmax_labels = self._predict_label(gold_heads,
                                             proj_labels,
                                             runtime=False)

        losses = []

        for gold_head, gold_label, arc_probs, softmax_label, entry in zip(
                gold_heads, gold_labels, arc_matrix[1:], softmax_labels, seq):
            label_index = self.encodings.label2int[gold_label]
            losses.append(-dy.log(arc_probs[gold_head]))
            losses.append(-dy.log(dy.pick(softmax_label, label_index)))

        if not self.config.predict_morphology:
            for gold_head, aux_probs, entry in zip(gold_heads,
                                                   aux_arc_matrix[1:], seq):
                losses.append(-dy.log(aux_probs[gold_head]) *
                              self.aux_softmax_weight)

        else:
            for softmax_morph, entry in zip(softmax_morphology, seq):
                loss_upos = -dy.log(
                    dy.pick(softmax_morph[0],
                            self.encodings.upos2int[entry.upos]))
                losses.append(loss_upos * (self.aux_softmax_weight / 3))

                # stability check: some languages lack XPOS or attributes,
                # which would lead to numerical overflow during backpropagation
                if len(self.encodings.xpos2int) > 1:
                    loss_xpos = -dy.log(
                        dy.pick(softmax_morph[1],
                                self.encodings.xpos2int[entry.xpos]))
                    losses.append(loss_xpos * (self.aux_softmax_weight / 3))

                if len(self.encodings.attrs2int) > 1:
                    loss_attrs = -dy.log(
                        dy.pick(softmax_morph[2],
                                self.encodings.attrs2int[entry.attrs]))
                    losses.append(loss_attrs * (self.aux_softmax_weight / 3))

        loss = dy.esum(losses)
        self.batch_loss.append(loss)

    def _attend(self, input_vectors, state, aux_embeddings):
        w1 = self.lemma_att_w1.expr()
        w2 = self.lemma_att_w2.expr()
        v = self.lemma_att_v.expr()
        attention_weights = []

        w2dt = w2 * dy.concatenate([state.h()[-1], aux_embeddings])
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)

        attention_weights = dy.softmax(dy.concatenate(attention_weights))

        output_vectors = dy.esum([
            vector * attention_weight for vector, attention_weight in zip(
                input_vectors, attention_weights)
        ])

        return output_vectors

    def tag(self, seq):
        tmp = []
        for ss in seq:
            if not ss.is_compound_entry:
                tmp.append(ss)

        # if len(tmp)<2:
        #     print "ERRRORR"
        #     for entry in seq:
        #         print str(entry.index)+"\t"+str(entry.word)
        seq = tmp

        dy.renew_cg()
        arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(
            seq)
        pred_heads = self.decoder.decode(arc_matrix)
        softmax_labels = self._predict_label(pred_heads, proj_labels)

        tag_list = []
        for pred_head, softmax_label in zip(pred_heads, softmax_labels):
            label_index = np.argmax(softmax_label.npvalue())
            tag = ParserTag(pred_head, self.encodings.labels[label_index],
                            None, None, None)
            tag_list.append(tag)

        if self.config.predict_morphology:
            for tag, softmax_morph in zip(tag_list, softmax_morphology):
                tag.upos = self.encodings.upos_list[np.argmax(
                    softmax_morph[0].npvalue())]
                tag.xpos = self.encodings.xpos_list[np.argmax(
                    softmax_morph[1].npvalue())]
                tag.attrs = self.encodings.attrs_list[np.argmax(
                    softmax_morph[2].npvalue())]

        return tag_list

    def _predict_label(self, heads, proj_labels, runtime=True):
        s_labels = []
        for iDep, iHead in zip(range(1, len(heads) + 1), heads):
            modw = dy.transpose(
                dy.reshape(proj_labels[iHead][1],
                           (self.config.label_proj_size, 1)) *
                self.label_ww.expr())
            term1 = modw * proj_labels[iDep][0]
            term2 = self.label_w.expr() * dy.concatenate(
                [proj_labels[iHead][1], proj_labels[iDep][0]])
            term3 = self.label_bb.expr()
            s_labels.append(dy.softmax(term1 + term2 + term3))

        return s_labels

    def _make_input(self, seq, runtime):
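        # build one input vector per token from lexical embeddings (character RNN,
        # external word embeddings and trainable holistic embeddings) and/or
        # morphology tag embeddings, with random input dropout applied at training
        # time; boundary vectors are added before and after the sequence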
        x_list = []
        encoder_states_list = [None]
        # add the root
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[1])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[1])
        else:  # both lexical and morphology are used
            x_list.append(
                dy.concatenate([
                    self.unknown_word_embedding[1], self.pad_tag_embedding[1]
                ]))

        for entry in seq:
            word = entry.word

            if self.config.use_lexical:
                # prepare lexical embeddings
                char_emb, encoder_states = self.character_network.compute_embeddings(
                    word, runtime=runtime)
                encoder_states_list.append(encoder_states)

                word_emb, found = self.embeddings.get_word_embeddings(
                    word.decode('utf-8'))
                if not found:
                    word_emb = self.unknown_word_embedding[0]
                else:
                    word_emb = dy.tanh(self.input_proj_w_word.expr() *
                                       dy.inputVector(word_emb) +
                                       self.input_proj_b_word.expr())

                word = word.decode('utf-8').lower()
                if word in self.encodings.word2int:
                    holistic_emb = self.holistic_embeddings[
                        self.encodings.word2int[word]]
                else:
                    holistic_emb = self.holistic_embeddings[
                        self.encodings.word2int['<UNK>']]

                # dropout lexical embeddings
                if runtime:
                    w_emb = word_emb + char_emb + holistic_emb
                else:
                    p1 = random.random()
                    p2 = random.random()
                    p3 = random.random()
                    m1 = 1
                    m2 = 1
                    m3 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if p3 < self.config.input_dropout_prob:
                        m3 = 0

                    scale = 1.0
                    if m1 + m2 + m3 > 0:
                        scale = float(3) / (m1 + m2 + m3)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    m3 = dy.scalarInput(m3)
                    scale = dy.scalarInput(scale)
                    w_emb = (word_emb * m1 + char_emb * m2 +
                             holistic_emb * m3) * scale

            if self.config.use_morphology:
                if entry.upos in self.encodings.upos2int:
                    upos_emb = self.upos_lookup[self.encodings.upos2int[
                        entry.upos]]
                else:
                    upos_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                if entry.xpos in self.encodings.xpos2int:
                    xpos_emb = self.xpos_lookup[self.encodings.xpos2int[
                        entry.xpos]]
                else:
                    xpos_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                if entry.attrs in self.encodings.attrs2int:
                    attrs_emb = self.attrs_lookup[self.encodings.attrs2int[
                        entry.attrs]]
                else:
                    attrs_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                # no per-tag dropout here; dropout is applied later with a shared
                # ("same-mask") dropout over the lexical and morphological parts
                t_emb = upos_emb + xpos_emb + attrs_emb

            # compose embeddings, if necessary
            if self.config.use_lexical and self.config.use_morphology:
                if not runtime:
                    p1 = random.random()
                    p2 = random.random()
                    m1 = 1
                    m2 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if m1 + m2 > 0:
                        scale = float(2.0) / (m1 + m2)
                    else:
                        scale = 1.0
                    scale = dy.scalarInput(scale)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    x_list.append(
                        dy.concatenate(
                            [w_emb * m1 * scale, t_emb * m2 * scale]))
                else:
                    x_list.append(dy.concatenate([w_emb, t_emb]))
            elif self.config.use_lexical:  # just use_lexical == True
                x_list.append(w_emb)
            else:  # just use_morphology == True
                x_list.append(t_emb)

        # close sequence
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[2])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[2])
        else:
            x_list.append(
                dy.concatenate([
                    self.unknown_word_embedding[2], self.pad_tag_embedding[2]
                ]))

        encoder_states_list.append(None)
        return x_list, encoder_states_list

    def _predict_arc(self, seq, runtime=True):
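        # run the stacked BiLSTM encoder over the token inputs, project the top layer
        # into arc/label spaces and score every (head, dependent) arc; depending on the
        # configuration, an auxiliary softmax is computed either for arcs (from an
        # intermediate layer) or for UPOS/XPOS/attribute tagging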
        x_list, encoder_states_list = self._make_input(seq, runtime)

        # BDLSTM
        rnn_outputs = [x_list]
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw,
                                   self.config.layer_dropouts):
            if runtime:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            else:
                fw.set_dropouts(dropout, dropout)
                bw.set_dropouts(dropout, dropout)

            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(
                reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [
                dy.concatenate([x_fw, x_bw])
                for x_fw, x_bw in zip(fw_list, bw_list)
            ]

            rnn_outputs.append(x_list)

        # projections
        arc_projections = [[
            dy.tanh(self.proj_arc_w_dep.expr() * x +
                    self.proj_arc_b_dep.expr()),
            dy.tanh(self.proj_arc_w_head.expr() * x +
                    self.proj_arc_b_head.expr())
        ] for x in rnn_outputs[-1]]
        label_projections = [[
            dy.tanh(self.proj_label_w_dep.expr() * x +
                    self.proj_label_b_dep.expr()),
            dy.tanh(self.proj_label_w_head.expr() * x +
                    self.proj_label_b_head.expr())
        ] for x in rnn_outputs[-1]]
        if not runtime:
            arc_projections = [[
                dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                dy.dropout(x2, self.config.presoftmax_mlp_dropout)
            ] for x1, x2 in arc_projections]
            label_projections = [[
                dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                dy.dropout(x2, self.config.presoftmax_mlp_dropout)
            ] for x1, x2 in label_projections]
        if not self.config.predict_morphology:
            aux_arc_projections = [[
                dy.tanh(self.aux_proj_arc_w_dep.expr() * x +
                        self.aux_proj_arc_b_dep.expr()),
                dy.tanh(self.aux_proj_arc_w_head.expr() * x +
                        self.aux_proj_arc_b_head.expr())
            ] for x in rnn_outputs[self.config.aux_softmax_layer]]
            if not runtime:
                aux_arc_projections = [[
                    dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                    dy.dropout(x2, self.config.presoftmax_mlp_dropout)
                ] for x1, x2 in aux_arc_projections]

        else:
            drp = self.config.presoftmax_mlp_dropout
            if runtime:
                drp = 0
            upos_softmax = [
                dy.softmax(self.upos_softmax_w.expr() * dy.dropout(
                    dy.tanh(self.upos_proj_w.expr() * x +
                            self.upos_proj_b.expr()), drp) +
                           self.upos_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]
            xpos_softmax = [
                dy.softmax(self.xpos_softmax_w.expr() * dy.dropout(
                    dy.tanh(self.xpos_proj_w.expr() * x +
                            self.xpos_proj_b.expr()), drp) +
                           self.xpos_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]
            attrs_softmax = [
                dy.softmax(self.attrs_softmax_w.expr() * dy.dropout(
                    dy.tanh(self.attrs_proj_w.expr() * x +
                            self.attrs_proj_b.expr()), drp) +
                           self.attrs_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]

            morphology_softmax = [
                [upos, xpos, attrs] for upos, xpos, attrs in zip(
                    upos_softmax, xpos_softmax, attrs_softmax)
            ]

        n = len(seq) + 1
        arc_matrix = [[None] * n for _ in xrange(n)]
        if not self.config.predict_morphology:
            aux_arc_matrix = [[None] * n for _ in xrange(n)]
        for iDst in xrange(n):
            term_bias = self.link_b.expr() * arc_projections[iDst][1]
            term_weight = self.link_w.expr() * arc_projections[iDst][1]
            if not self.config.predict_morphology:
                aux_term_bias = self.aux_link_b.expr() * aux_arc_projections[iDst][1]
                aux_term_weight = self.aux_link_w.expr() * aux_arc_projections[iDst][1]
            for iSrc in xrange(n):
                if iSrc != iDst:
                    attention = dy.reshape(term_weight, (1, self.config.arc_proj_size)) * \
                                arc_projections[iSrc][0] + term_bias
                    arc_matrix[iSrc][iDst] = attention
                    if not self.config.predict_morphology:
                        aux_attention = dy.reshape(aux_term_weight, (1, self.config.arc_proj_size)) * \
                                        aux_arc_projections[iSrc][0] + aux_term_bias
                        aux_arc_matrix[iSrc][iDst] = aux_attention

        # compute softmax for arcs
        a_m = [[None] * n for _ in xrange(n)]
        if not self.config.predict_morphology:
            aux_a_m = [[None] * n for _ in xrange(n)]

        for iSrc in xrange(n):
            s_max = []
            if not self.config.predict_morphology:
                aux_s_max = []
            for iDst in xrange(n):
                if iSrc != iDst:
                    s_max.append(arc_matrix[iSrc][iDst])
                    if not self.config.predict_morphology:
                        aux_s_max.append(aux_arc_matrix[iSrc][iDst])
            s_max = dy.softmax(dy.concatenate(s_max))
            if not self.config.predict_morphology:
                aux_s_max = dy.softmax(dy.concatenate(aux_s_max))
            ofs = 0
            for iDst in xrange(n):
                if iSrc == iDst:
                    ofs = -1
                else:
                    a_m[iSrc][iDst] = s_max[iDst + ofs]
                    if not self.config.predict_morphology:
                        aux_a_m[iSrc][iDst] = aux_s_max[iDst + ofs]
        if not self.config.predict_morphology:
            return a_m, aux_a_m, label_projections, None
        else:
            return a_m, None, label_projections, morphology_softmax[1:-1]

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)

    def parse_sequences(self, sequences):
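        # tag a deep copy of every sequence and write the predicted heads, labels and
        # (optionally) morphology back onto the non-compound entries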
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags = self.tag(new_sequence)
            iOrig, iTags = 0, 0
            while iOrig < len(new_sequence):
                # guard against sequences that end with compound entries
                while iOrig < len(new_sequence) and new_sequence[iOrig].is_compound_entry:
                    iOrig += 1
                if iOrig >= len(new_sequence):
                    break
                new_sequence[iOrig].head = predicted_tags[iTags].head
                new_sequence[iOrig].label = predicted_tags[iTags].label
                if self.config.predict_morphology:
                    new_sequence[iOrig].upos = predicted_tags[iTags].upos
                    new_sequence[iOrig].xpos = predicted_tags[iTags].xpos
                    new_sequence[iOrig].attrs = predicted_tags[iTags].attrs
                iTags += 1
                iOrig += 1

            new_sequences.append(new_sequence)
        return new_sequences
Ejemplo n.º 9
0
class BDRNNLemmatizer:
    def __init__(self, lemmatizer_config, encodings, embeddings, runtime=False):
        self.config = lemmatizer_config
        self.encodings = encodings
        # Bug in encodings - this will be removed after UD Shared Task
        self.has_bug = False
        if self.encodings.char2int[' '] != 1:
            self.has_bug = True
        self.embeddings = embeddings
        self.losses = []

        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)

        self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings,
                                                  rnn_size=self.config.char_rnn_size,
                                                  rnn_layers=self.config.char_rnn_layers,
                                                  embeddings_size=self.config.char_embeddings,
                                                  model=self.model, runtime=runtime)

        self.upos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.upos2int), self.config.tag_embeddings_size))
        self.xpos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.xpos2int), self.config.tag_embeddings_size))
        self.attrs_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.attrs2int), self.config.tag_embeddings_size))
        self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), self.config.char_embeddings))

        if runtime:
            self.rnn = dy.LSTMBuilder(self.config.rnn_layers,
                                      self.config.char_rnn_size * 2 + self.config.char_embeddings, self.config.rnn_size,
                                      self.model)
        else:
            from utils import orthonormal_VanillaLSTMBuilder
            self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers,
                                                      self.config.char_rnn_size * 2 + self.config.char_embeddings,
                                                      self.config.rnn_size,
                                                      self.model)

        self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2))
        self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size))
        self.att_v = self.model.add_parameters((1, 200))

        self.start_lookup = self.model.add_lookup_parameters(
            (1, self.config.char_rnn_size * 2 + self.config.char_embeddings))

        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 1, self.config.rnn_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 1))
        self.softmax_casing_w = self.model.add_parameters((2, self.config.rnn_size))
        self.softmax_casing_b = self.model.add_parameters((2))

    def _attend(self, input_vectors, state, embeddings):
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        v = self.att_v.expr()
        attention_weights = []

        w2dt = w2 * dy.concatenate([state.h()[-1], embeddings])
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)

        attention_weights = dy.softmax(dy.concatenate(attention_weights))

        output_vectors = dy.esum(
            [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])

        return output_vectors

    def _predict(self, word, upos, xpos, attrs, num_chars=0, gs_chars=None):
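        # character-level decoder with attention over the encoder states of the input
        # word; num_chars == 0 means greedy runtime decoding (stop on the EOS index),
        # otherwise the gold characters in gs_chars are fed back (teacher forcing)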
        if num_chars == 0:
            runtime = True
        else:
            runtime = False

        char_emb, states = self.character_network.compute_embeddings(word, runtime=runtime)

        num_predictions = 0
        softmax_list = []
        m1, m2, m3 = 0, 0, 0
        zero_vec = dy.vecInput(self.config.tag_embeddings_size)
        if upos in self.encodings.upos2int:
            upos_emb = self.upos_lookup[self.encodings.upos2int[upos]]
            m1 = 1
        else:
            upos_emb = zero_vec

        if xpos in self.encodings.xpos2int:
            xpos_emb = self.xpos_lookup[self.encodings.xpos2int[xpos]]
            m2 = 1
        else:
            xpos_emb = zero_vec

        if attrs in self.encodings.attrs2int:
            attrs_emb = self.attrs_lookup[self.encodings.attrs2int[attrs]]
            m3 = 1
        else:
            attrs_emb = zero_vec

        scale = float(4.0) / (m1 + m2 + m3 + 1.0)

        scale = dy.scalarInput(scale)
        tag_emb = (upos_emb + xpos_emb + attrs_emb + char_emb) * scale
        rnn = self.rnn.initial_state().add_input(self.start_lookup[0])
        char_emb = dy.inputVector([0] * self.config.char_embeddings)

        while True:
            attention = self._attend(states, rnn, tag_emb)

            input = dy.concatenate([attention, char_emb])
            rnn = rnn.add_input(input)

            softmax = dy.softmax(self.softmax_w.expr() * rnn.output() + self.softmax_b.expr())
            softmax_casing = dy.softmax(self.softmax_casing_w.expr() * rnn.output() + self.softmax_casing_b.expr())
            softmax_list.append([softmax, softmax_casing])
            if num_chars == 0:
                s_index = np.argmax(softmax.npvalue())
                if s_index == len(self.encodings.char2int):
                    break
                char_emb = self.char_lookup[s_index]
            else:
                if num_predictions < len(gs_chars):
                    char = gs_chars[num_predictions]
                    if char in self.encodings.char2int:
                        char_emb = self.char_lookup[self.encodings.char2int[char]]
                    else:
                        char_emb = self.char_lookup[self.encodings.char2int["<UNK>"]]

            num_predictions += 1
            if num_predictions == num_chars or num_predictions > 255:
                break

        return softmax_list

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) > 0:
            loss = dy.esum(self.losses)
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        self.losses = []
        return total_loss

    def learn(self, seq):
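        # for every entry that is not NUM or PROPN, accumulate the negative
        # log-likelihood of each gold lemma character (when the character is known to
        # the encodings) plus a separate casing loss, and an end-of-sequence loss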

        for entry in seq:
            if entry.upos != 'NUM' and entry.upos != 'PROPN':
                losses = []
                unilemma = unicode(entry.lemma, 'utf-8')
                n_chars = len(unilemma)
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs,
                                                    num_chars=n_chars + 1, gs_chars=unilemma)
                # print unilemma.encode('utf-8')#, softmax_output_list
                for softmax, char in zip(softmax_output_list[:-1], unilemma):

                    char_index = -1
                    if char.lower() == char:
                        casing = 0
                    else:
                        casing = 1
                    char = char.lower()
                    if char in self.encodings.char2int:
                        char_index = self.encodings.char2int[char]
                    if char_index != -1:
                        losses.append(-dy.log(dy.pick(softmax[0], char_index)))
                    losses.append(-dy.log(dy.pick(softmax[1], casing)))
                    # print np.argmax(softmax[0].npvalue()), char_index, softmax

                losses.append(-dy.log(dy.pick(softmax_output_list[-1][0], len(self.encodings.char2int))))
                loss = dy.esum(losses)
                self.losses.append(loss)

    def tag(self, seq):
        dy.renew_cg()
        lemmas = []
        for entry in seq:
            if entry.upos == 'NUM' or entry.upos == 'PROPN':
                lemma = entry.word.decode('utf-8')
            else:
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs)
                lemma = ""
                for softmax in softmax_output_list[:-1]:
                    char_index = np.argmax(softmax[0].npvalue())
                    if char_index < len(self.encodings.characters):
                        char = self.encodings.characters[char_index]
                        if np.argmax(softmax[1].npvalue()) == 1:
                            char = char.upper()
                        lemma += char
            lemmas.append(lemma)
        return lemmas

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)
Ejemplo n.º 10
0
class FSTLemmatizer:
    def __init__(self, config, encodings, embeddings, runtime=False):
        self.config = config
        self.encodings = encodings
        # Bug in encodings - will be removed after UD
        self.has_bug=False
        if self.encodings.char2int[' ']!=1:
            self.has_bug=True
            import sys
            sys.stdout.write("Detected encodings BUG!")
        self.embeddings = embeddings
        self.losses = []
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)
        self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings,
                                                  rnn_size=self.config.char_rnn_size,
                                                  rnn_layers=self.config.char_rnn_layers,
                                                  embeddings_size=self.config.char_embeddings,
                                                  model=self.model, runtime=runtime)
        self.word2lemma={}

        self.upos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.upos2int), self.config.tag_embeddings_size))
        self.xpos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.xpos2int), self.config.tag_embeddings_size))
        self.attrs_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.attrs2int), self.config.tag_embeddings_size))
        self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), self.config.char_embeddings))
        if runtime:
            self.rnn = dy.LSTMBuilder(self.config.rnn_layers,
                                      self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size,
                                      self.config.rnn_size,
                                      self.model)
        else:
            from utils import orthonormal_VanillaLSTMBuilder
            self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers,
                                                      self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size,
                                                      self.config.rnn_size,
                                                      self.model)

        # self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2))
        # self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size))
        # self.att_v = self.model.add_parameters((1, 200))

        self.start_lookup = self.model.add_lookup_parameters(
            (1, self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size))

        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 3, self.config.rnn_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 3))

        ofs = len(self.encodings.char2int)
        self.label2int = {}
        self.label2int['<EOS>'] = ofs
        self.label2int['<COPY>'] = ofs + 1
        self.label2int['<INC>'] = ofs + 2

    def _attend(self, input_vectors, state, embeddings):
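        # NOTE: currently unused in this class - the attention parameters
        # (att_w1/att_w2/att_v) are commented out in __init__ and the only call
        # site in _predict is commented out as well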
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        v = self.att_v.expr()
        attention_weights = []

        w2dt = w2 * dy.concatenate([state.h()[-1], embeddings])
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)

        attention_weights = dy.softmax(dy.concatenate(attention_weights))

        output_vectors = dy.esum(
            [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])

        return output_vectors

    def _predict(self, word, upos, xpos, attrs, max_predictions=0, runtime=True, gs_labels=None):
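        # decode a sequence of edit labels (<COPY>, <INC>, literal characters, <EOS>)
        # conditioned on the character encoder states and the tag embeddings; at
        # runtime the argmax label drives the source pointer, at training time the
        # gold labels in gs_labels do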
        char_emb, states = self.character_network.compute_embeddings(word, runtime=runtime)

        softmax_list = []
        m1, m2, m3 = 0, 0, 0
        zero_vec = dy.vecInput(self.config.tag_embeddings_size)
        if upos in self.encodings.upos2int:
            upos_emb = self.upos_lookup[self.encodings.upos2int[upos]]
            m1 = 1
        else:
            upos_emb = zero_vec

        if xpos in self.encodings.xpos2int:
            xpos_emb = self.xpos_lookup[self.encodings.xpos2int[xpos]]
            m2 = 1
        else:
            xpos_emb = zero_vec

        if attrs in self.encodings.attrs2int:
            attrs_emb = self.attrs_lookup[self.encodings.attrs2int[attrs]]
            m3 = 1
        else:
            attrs_emb = zero_vec

        scale = float(4.0) / (m1 + m2 + m3 + 1.0)

        scale = dy.scalarInput(scale)
        tag_emb = (upos_emb + xpos_emb + attrs_emb + char_emb) * scale
        rnn = self.rnn.initial_state().add_input(self.start_lookup[0])
        num_predictions = 0
        i_src = 0
        i_labels = 0
        while num_predictions < max_predictions:
            # attention = self._attend(states, rnn, tag_emb)

            input = dy.concatenate([char_emb, states[i_src], tag_emb])
            rnn = rnn.add_input(input)

            softmax = dy.softmax(self.softmax_w.expr() * rnn.output() + self.softmax_b.expr())
            softmax_list.append(softmax)
            num_predictions += 1
            if runtime:
                l_index = np.argmax(softmax.npvalue())
                if l_index == self.label2int['<EOS>']:
                    break
                elif l_index == self.label2int['<INC>'] and i_src < len(states) - 1:
                    i_src += 1
            else:
                if gs_labels[i_labels] == '<INC>' and i_src < len(states) - 1:
                    i_src += 1
            i_labels += 1

        return softmax_list

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) > 0:
            loss = dy.esum(self.losses)
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        self.losses = []
        return total_loss

    def learn(self, seq):
        for entry in seq:
            if entry.upos != 'NUM' and entry.upos != 'PROPN':
                # print entry.word+"\t"+entry.lemma
                y_real = self._compute_transduction_states(unicode(entry.word, 'utf-8').lower(),
                                                           unicode(entry.lemma, 'utf-8').lower())
                # print y_real
                losses = []
                n_chars = len(y_real)
                # print entry.word, entry.lemma
                # print y_real
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs,
                                                    max_predictions=n_chars, runtime=False, gs_labels=y_real)
                # print unilemma.encode('utf-8')#, softmax_output_list
                for softmax, y_target in zip(softmax_output_list, y_real):
                    if y_target in self.label2int:
                        losses.append(-dy.log(dy.pick(softmax, self.label2int[y_target])))
                    elif y_target in self.encodings.char2int:
                        losses.append(-dy.log(dy.pick(softmax, self.encodings.char2int[y_target])))

                # only accumulate a loss when at least one target produced one;
                # otherwise `loss` would be undefined here
                if len(losses) > 0:
                    loss = dy.esum(losses)
                    self.losses.append(loss)

    def _compute_transduction_states(self, source, destination):
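        # align source and destination with a Levenshtein-style dynamic program and
        # convert the alignment into a label sequence: <COPY> keeps the aligned source
        # character, <INC> advances the source pointer, unaligned destination
        # characters are emitted literally (or as <TOK> for tabs) and <EOS> closes it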
        a = np.zeros((len(source) + 1, len(destination) + 1))
        for i in xrange(len(source) + 1):
            a[i, 0] = i

        for i in xrange(len(destination) + 1):
            a[0, i] = i

        for i in xrange(1, len(source) + 1):
            for j in xrange(1, len(destination) + 1):
                cost = 0
                if source[i - 1] != destination[j - 1]:
                    cost = 1
                m = min([a[i - 1, j - 1], a[i - 1, j], a[i, j - 1]])
                a[i, j] = m + cost

        alignments = [-1] * len(destination)

        i = len(source)
        j = len(destination)
        while i > 1 or j > 1:
            if source[i - 1] == destination[j - 1]:
                alignments[j - 1] = i - 1
            if i == 1:
                j -= 1
            elif j == 1:
                i -= 1
            else:
                if a[i - 1, j - 1] <= a[i - 1, j] and a[i - 1, j - 1] <= a[i, j - 1]:
                    i -= 1
                    j -= 1
                elif a[i - 1, j] <= a[i - 1, j - 1] and a[i - 1, j] <= a[i, j - 1]:
                    i -= 1
                else:
                    j -= 1
        if source[i - 1] == destination[j - 1]:
            alignments[j - 1] = i - 1

        y_pred = []
        index_src = 0
        index_dst = 0
        while index_dst < len(destination):
            if alignments[index_dst] == index_src:
                y_pred.append("<COPY>")
                index_dst += 1
            elif alignments[index_dst] == -1:
                if destination[index_dst] == "\t":
                    y_pred.append("<TOK>")
                    index_dst += 1
                else:
                    y_pred.append(destination[index_dst])
                    index_dst += 1
            else:
                y_pred.append("<INC>")
                index_src += 1

        y_pred.append("<EOS>")
        return y_pred

    def tag(self, seq):
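        # NUM and PROPN entries keep their surface form; other words are looked up in
        # the lemma dictionary first and otherwise lemmatized by decoding edit labels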
        dy.renew_cg()
        lemmas = []
        for entry in seq:
            if entry.upos == 'NUM' or entry.upos == 'PROPN':
                lemma = entry.word.decode('utf-8')
            else:
                # check the lemma dictionary first; load_dict stores keys as word + "\t" + UPOS
                key = entry.word.decode('utf-8').lower().encode('utf-8') + "\t" + entry.upos
                if key in self.word2lemma:
                    lemma = unicode(self.word2lemma[key], 'utf-8')
                else:
                    uniword = unicode(entry.word, 'utf-8')
                    softmax_output_list = self._predict(uniword, entry.upos, entry.xpos, entry.attrs,
                                                        max_predictions=500, runtime=True)
                    lemma = ""
                    src_index = 0
                    for softmax in softmax_output_list[:-1]:
                        label_index = np.argmax(softmax.npvalue())
                        if label_index == self.label2int['<COPY>'] and src_index < len(uniword):
                            lemma += uniword[src_index]
                        elif label_index == self.label2int['<INC>'] or label_index == self.label2int['<EOS>']:
                            src_index += 1
                        elif label_index < len(self.encodings.characters):
                            #if self.has_bug and label_index >= self.encodings.char2int[' ']:
                            #     label_index += 1
                            lemma += self.encodings.characters[label_index]
            # print entry.word+"\t"+lemma.encode('utf-8')
            if entry.upos != 'PROPN':
                lemmas.append(lemma.lower())
            else:
                lemmas.append(lemma)
        return lemmas

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)
        dict_path=path.replace(".bestACC", ".dict")
        import os.path
        if os.path.exists(dict_path):
            self.load_dict(dict_path)

    def load_dict(self, path):
        print "Loading lemma dictionary"
        with open(path, "r") as f:
            lines = f.readlines()
            for line in lines:
                parts = line.strip().split('\t')
                if len(parts) == 5:
                    word = unicode(parts[0], 'utf-8').lower().encode('utf-8')
                    upos = parts[1]
                    key = word + '\t' + upos
                    self.word2lemma[key] = parts[4]
        print "Loaded " + str(len(self.word2lemma)) + " pairs"


    def lemmatize_sequences(self, sequences):
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_lemmas = self.tag(new_sequence)

            for entry, lemma in zip(new_sequence, predicted_lemmas):
                if not entry.is_compound_entry:
                    entry.lemma = lemma if lemma is not None else "_"  # lemma.encode('utf-8')
                else:
                    entry.lemma = "_"
            # for entryIndex, lemma in enumerate(predicted_lemmas):
            #    new_sequence[entryIndex].lemma = lemma if lemma is not None else "_"
            new_sequences.append(new_sequence)
        return new_sequences
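
# A minimal usage sketch, assuming `config`, `encodings` and `embeddings` objects
# compatible with the constructor above and a list of already-loaded `sequences`
# (the model path "lemmatizer.bestACC" is purely illustrative):
#
#     lemmatizer = FSTLemmatizer(config, encodings, embeddings, runtime=True)
#     lemmatizer.load("lemmatizer.bestACC")   # also loads "lemmatizer.dict" if present
#     lemmatized = lemmatizer.lemmatize_sequences(sequences)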