# NOTE: the classes below are excerpts from several modules of the same project;
# the imports cover the external names they use. CharacterNetwork, ConllEntry,
# ExpandedToken, GreedyDecoder, ParserTag and orthonormal_VanillaLSTMBuilder are
# assumed to be provided by the surrounding code base.
import copy
import random

import dynet as dy
import numpy as np


class CompoundWordExpander:
    def __init__(self, config, encodings, embeddings, runtime=False):
        self.config = config
        self.word_embeddings = embeddings
        self.encodings = encodings
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)
        from character_embeddings import CharacterNetwork
        self.encoder = CharacterNetwork(self.config.character_embeddings_size, encodings,
                                        self.config.encoder_size, self.config.encoder_layers,
                                        self.config.character_embeddings_size, self.model, runtime=runtime)
        self.decoder = dy.VanillaLSTMBuilder(self.config.decoder_layers, self.config.encoder_size * 2,
                                             self.config.decoder_size, self.model)
        self.decoder_start_lookup = self.model.add_lookup_parameters((1, self.config.encoder_size * 2))
        self.att_w1 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2, self.config.encoder_size * 2))
        self.att_w2 = self.model.add_parameters(
            (self.config.character_embeddings_size * 2, self.config.decoder_size))
        self.att_v = self.model.add_parameters((1, self.config.character_embeddings_size * 2))
        # all known characters plus the COPY, INC, TOK and EOS labels
        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 4, self.config.decoder_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 4))
        self.softmax_comp_w = self.model.add_parameters((2, self.config.character_embeddings_size))
        self.softmax_comp_b = self.model.add_parameters((2))
        self.label2int = {}
        ofs = len(self.encodings.char2int)
        self.label2int['<EOS>'] = ofs
        self.label2int['<TOK>'] = ofs + 1
        self.label2int['<COPY>'] = ofs + 2
        self.label2int['<INC>'] = ofs + 3
        self.losses = []

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) != 0:
            loss = dy.esum(self.losses)
            self.losses = []
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        dy.renew_cg()
        return total_loss

    def learn(self, seq):
        losses = []
        examples = self._get_examples(seq)
        for example in examples:
            y_pred, encoder_states = self._predict_is_compound_entry(example.source, runtime=False)
            if not example.should_expand:
                losses.append(-dy.log(dy.pick(y_pred, 0)))
            else:
                losses.append(-dy.log(dy.pick(y_pred, 1)))
                losses.append(self._learn_transduction(example.source, example.destination, encoder_states))
        loss = dy.esum(losses)
        self.losses.append(loss)

    def _compute_transduction_states(self, source, destination):
        # edit-distance alignment between surface form and its expansion
        a = np.zeros((len(source) + 1, len(destination) + 1))
        for i in xrange(len(source) + 1):
            a[i, 0] = i
        for i in xrange(len(destination) + 1):
            a[0, i] = i
        for i in xrange(1, len(source) + 1):
            for j in xrange(1, len(destination) + 1):
                cost = 0
                if source[i - 1] != destination[j - 1]:
                    cost = 1
                m = min([a[i - 1, j - 1], a[i - 1, j], a[i, j - 1]])
                a[i, j] = m + cost
        alignments = [-1] * len(destination)
        i = len(source)
        j = len(destination)
        while i > 1 or j > 1:
            if source[i - 1] == destination[j - 1]:
                alignments[j - 1] = i - 1
            if i == 1:
                j -= 1
            elif j == 1:
                i -= 1
            else:
                if a[i - 1, j - 1] <= a[i - 1, j] and a[i - 1, j - 1] <= a[i, j - 1]:
                    i -= 1
                    j -= 1
                elif a[i - 1][j] <= a[i - 1, j - 1] and a[i - 1, j] <= a[i, j - 1]:
                    i -= 1
                else:
                    j -= 1
        if source[i - 1] == destination[j - 1]:
            alignments[j - 1] = i - 1
        # convert the alignment into a COPY/INC/char/TOK/EOS label sequence
        y_pred = []
        index_src = 0
        index_dst = 0
        while index_dst < len(destination):
            if alignments[index_dst] == index_src:
                y_pred.append("<COPY>")
                index_dst += 1
            elif alignments[index_dst] == -1:
                if destination[index_dst] == "\t":
                    y_pred.append("<TOK>")
                    index_dst += 1
                else:
                    y_pred.append(destination[index_dst])
                    index_dst += 1
            else:
                y_pred.append("<INC>")
                index_src += 1
        y_pred.append("<EOS>")
        return y_pred

    def _attend(self, input_vectors, state):
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        v = self.att_v.expr()
        attention_weights = []
        w2dt = w2 * state.h()[-1]
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)
        attention_weights = dy.softmax(dy.concatenate(attention_weights))
        output_vectors = dy.esum(
            [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
        return output_vectors

    def _decode(self, encoder_states, runtime=True, max_preds=-1):
        y_pred = []
        num_preds = 0
        lstm = self.decoder.initial_state().add_input(self.decoder_start_lookup[0])
        while num_preds < max_preds:
            input = self._attend(encoder_states, lstm)
            lstm = lstm.add_input(input)
            softmax_out = dy.softmax(self.softmax_w.expr() * lstm.output() + self.softmax_b.expr())
            y_pred.append(softmax_out)
            num_preds += 1
            if max_preds == -1 or runtime:
                if np.argmax(softmax_out.npvalue()) == self.label2int['<EOS>']:
                    return y_pred
        return y_pred

    def _learn_transduction(self, source, destination, encoder_states):
        losses = []
        y_target = self._compute_transduction_states(source, destination)
        y_predicted = self._decode(encoder_states, runtime=False, max_preds=len(y_target))
        for y_real, y_pred in zip(y_target, y_predicted):
            if y_real in self.label2int:
                losses.append(-dy.log(dy.pick(y_pred, self.label2int[y_real])))
            else:
                if y_real in self.encodings.char2int:
                    losses.append(-dy.log(dy.pick(y_pred, self.encodings.char2int[y_real])))
                # else:
                #     print source + "\t\t" + destination
        return dy.esum(losses)

    def _predict_is_compound_entry(self, word, runtime=True):
        emb, states = self.encoder.compute_embeddings(word, runtime=runtime)
        output = dy.softmax(self.softmax_comp_w.expr() * emb + self.softmax_comp_b.expr())
        return output, states

    def _transduce(self, source, encoder_states):
        tokens = []
        y_pred = self._decode(encoder_states, runtime=True, max_preds=100)
        i_src = 0
        token = ""
        for y in y_pred:
            y = np.argmax(y.npvalue())
            if y == self.label2int['<INC>']:
                i_src += 1
            elif y == self.label2int['<COPY>']:
                if i_src < len(source):
                    token += source[i_src]
            elif y == self.label2int['<TOK>'] or y == self.label2int['<EOS>']:
                tokens.append(token)
                token = ""
            else:
                token += self.encodings.characters[y]
        return tokens

    def tag_token(self, word):
        dy.renew_cg()
        compound = False
        word = unicode(word, 'utf-8')
        tokens = []
        ce_out, encoder_states = self._predict_is_compound_entry(word, runtime=True)
        if np.argmax(ce_out.npvalue()) == 1:
            tokens = self._transduce(word, encoder_states)
            compound = True
        return compound, tokens

    def tag(self, seq):
        dy.renew_cg()
        new_seq = []
        index = 1
        for entry in seq:
            if not entry.is_compound_entry:
                ce_out, encoder_states = self._predict_is_compound_entry(unicode(entry.word, 'utf-8'), runtime=True)
                if np.argmax(ce_out.npvalue()) == 0:
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    compounds = self._transduce(unicode(entry.word, 'utf-8'), encoder_states)
                    entry.index = str(index) + '-' + str(index + len(compounds))
                    new_seq.append(entry)
                    for word in compounds:
                        from io_utils.conll import ConllEntry
                        entry = ConllEntry(index, word.encode('utf-8'), word.encode('utf-8'),
                                           '_', '_', '_', '0', '_', '_', '')
                        new_seq.append(entry)
                        index += 1
        return new_seq

    def _get_examples(self, seq):
        examples = []
        cww = 0
        for entry in seq:
            if cww == 0:
                et = ExpandedToken(source=unicode(entry.word, 'utf-8'))
                if entry.is_compound_entry:
                    et.should_expand = True
                    et.destination = u''
                    interval = entry.index
                    interval = interval.split("-")
                    stop = int(interval[1])
                    start = int(interval[0])
                    cww = stop - start + 1
                else:
                    et.destination = et.source
                    examples.append(et)
            else:
                et.destination += "\t" + unicode(entry.word, 'utf-8')
                cww -= 1
                if cww == 0:
                    et.destination = et.destination.strip()
                    examples.append(et)
        return examples

    def save(self, filename):
        self.model.save(filename)
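# Illustrative sketch (not part of the original module): how a predicted
# <COPY>/<INC>/<TOK>/<EOS> label sequence is turned back into tokens, mirroring
# CompoundWordExpander._transduce. The label sequence in the example is
# hand-made, not real model output.
def apply_transduction_labels(source, labels):
    tokens, token, i_src = [], "", 0
    for label in labels:
        if label == "<INC>":
            i_src += 1                      # move to the next source character
        elif label == "<COPY>":
            if i_src < len(source):
                token += source[i_src]      # copy the current source character
        elif label in ("<TOK>", "<EOS>"):
            tokens.append(token)            # close the current output token
            token = ""
        else:
            token += label                  # the label is a literal character to emit
    return tokens


# Example: Spanish "del" expands to "de" + "el".
# apply_transduction_labels("del", ["<COPY>", "<INC>", "<COPY>", "<TOK>",
#                                   "e", "<INC>", "<COPY>", "<EOS>"]) -> ['de', 'el']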
class BDRNNTagger:
    def __init__(self, tagger_config, encodings, embeddings, aux_softmax_weight=0.2, runtime=False):
        self.config = tagger_config
        self.encodings = encodings
        self.embeddings = embeddings
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)  # dy.MomentumSGDTrainer(self.model)
        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=1,
                                                  embeddings_size=self.embeddings.word_embeddings_size,
                                                  model=self.model, runtime=runtime)
        self.unknown_word_embedding = self.model.add_lookup_parameters((1, self.embeddings.word_embeddings_size))
        self.holistic_word_embedding = self.model.add_lookup_parameters(
            (len(encodings.word2int), self.embeddings.word_embeddings_size))
        self.char_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size))
        self.emb_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size))
        self.hol_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size))
        self.bdrnn_fw = []
        self.bdrnn_bw = []
        rnn_input_size = self.config.input_size  # self.embeddings.word_embeddings_size
        aux_softmax_input_size = 0
        index = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
                self.bdrnn_bw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
            else:
                self.bdrnn_fw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
                self.bdrnn_bw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_softmax_input_size = rnn_input_size
        self.mlps = []
        for _ in xrange(3):  # upos, xpos and attrs
            mlp_w = []
            mlp_b = []
            input_sz = self.config.layers[-1] * 2
            for l_size in self.config.presoftmax_mlp_layers:
                mlp_w.append(self.model.add_parameters((l_size, input_sz)))
                mlp_b.append(self.model.add_parameters((l_size)))
                input_sz = l_size
            self.mlps.append([mlp_w, mlp_b])
        softmax_input_size = self.config.presoftmax_mlp_layers[-1]
        self.softmax_upos_w = self.model.add_parameters((len(self.encodings.upos2int), softmax_input_size))
        self.softmax_upos_b = self.model.add_parameters((len(self.encodings.upos2int)))
        self.softmax_xpos_w = self.model.add_parameters((len(self.encodings.xpos2int), softmax_input_size))
        self.softmax_xpos_b = self.model.add_parameters((len(self.encodings.xpos2int)))
        self.softmax_attrs_w = self.model.add_parameters((len(self.encodings.attrs2int), softmax_input_size))
        self.softmax_attrs_b = self.model.add_parameters((len(self.encodings.attrs2int)))
        self.aux_softmax_upos_w = self.model.add_parameters((len(self.encodings.upos2int), aux_softmax_input_size))
        self.aux_softmax_upos_b = self.model.add_parameters((len(self.encodings.upos2int)))
        self.aux_softmax_xpos_w = self.model.add_parameters((len(self.encodings.xpos2int), aux_softmax_input_size))
        self.aux_softmax_xpos_b = self.model.add_parameters((len(self.encodings.xpos2int)))
        self.aux_softmax_attrs_w = self.model.add_parameters((len(self.encodings.attrs2int), aux_softmax_input_size))
        self.aux_softmax_attrs_b = self.model.add_parameters((len(self.encodings.attrs2int)))
        self.aux_softmax_weight = aux_softmax_weight
        self.losses = []

    def tag(self, seq):
        dy.renew_cg()
        softmax_list, aux_softmax_list = self._predict(seq)
        label_list = []
        for softmax in softmax_list:
            label_list.append([self.encodings.upos_list[np.argmax(softmax[0].npvalue())],
                               self.encodings.xpos_list[np.argmax(softmax[1].npvalue())],
                               self.encodings.attrs_list[np.argmax(softmax[2].npvalue())]])
        return label_list

    def learn(self, seq):
        # dy.renew_cg()
        softmax_list, aux_softmax_list = self._predict(seq, runtime=False)
        losses = []
        for entry, softmax, aux_softmax in zip(seq, softmax_list, aux_softmax_list):
            upos_index = self.encodings.upos2int[entry.upos]
            xpos_index = self.encodings.xpos2int[entry.xpos]
            attrs_index = self.encodings.attrs2int[entry.attrs]
            losses.append(-dy.log(dy.pick(softmax[0], upos_index)))
            losses.append(-dy.log(dy.pick(softmax[1], xpos_index)))
            losses.append(-dy.log(dy.pick(softmax[2], attrs_index)))
            losses.append(-dy.log(dy.pick(aux_softmax[0], upos_index)) * (self.aux_softmax_weight / 3))
            losses.append(-dy.log(dy.pick(aux_softmax[1], xpos_index)) * (self.aux_softmax_weight / 3))
            losses.append(-dy.log(dy.pick(aux_softmax[2], attrs_index)) * (self.aux_softmax_weight / 3))
        # loss = dy.average(losses)
        # loss_val = loss.value()
        # loss.backward()
        # self.trainer.update()
        # return loss_val
        self.losses.append(dy.esum(losses))

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss_val = 0
        if len(self.losses) > 0:
            total_loss = dy.esum(self.losses)
            self.losses = []
            total_loss_val = total_loss.value()
            total_loss.backward()
            self.trainer.update()
        return total_loss_val

    def _predict(self, seq, runtime=True):
        softmax_list = []
        aux_softmax_list = []
        x_list = []
        for entry in seq:
            word = entry.word
            char_emb, _ = self.character_network.compute_embeddings(word, runtime=runtime)
            word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8'))
            if not found:
                word_emb = self.unknown_word_embedding[0]
            else:
                word_emb = dy.inputVector(word_emb)
            holistic_word = word.decode('utf-8').lower()
            if holistic_word in self.encodings.word2int:
                hol_emb = self.holistic_word_embedding[self.encodings.word2int[holistic_word]]
            else:
                hol_emb = self.holistic_word_embedding[self.encodings.word2int['<UNK>']]
            proj_emb = self.emb_proj_w.expr() * word_emb
            proj_hol = self.hol_proj_w.expr() * hol_emb
            proj_char = self.char_proj_w.expr() * char_emb
            # x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
            if runtime:
                x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
            else:
                # drop each input channel independently and rescale the sum
                p1 = random.random()
                p2 = random.random()
                p3 = random.random()
                m1 = 1
                m2 = 1
                m3 = 1
                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                if p3 < self.config.input_dropout_prob:
                    m3 = 0
                scale = 1.0
                if m1 + m2 + m3 > 0:
                    scale = float(3) / (m1 + m2 + m3)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)
                m3 = dy.scalarInput(m3)
                scale = dy.scalarInput(scale)
                x_list.append(dy.tanh((proj_char * m1 + proj_emb * m2 + proj_hol * m3) * scale))

        # BDLSTM
        rnn_outputs = []
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts):
            if not runtime:
                fw.set_dropouts(0, dropout)
                bw.set_dropouts(0, dropout)
            else:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)]
            # if runtime:
            #     x_out = x_list
            # else:
            #     x_out = [dy.dropout(x, dropout) for x in x_list]
            rnn_outputs.append(x_list)

        # SOFTMAX
        mlp_output = []
        for x in rnn_outputs[-1]:
            pre_softmax = []
            for iMLP in xrange(3):
                mlp_w = self.mlps[iMLP][0]
                mlp_b = self.mlps[iMLP][1]
                inp = x
                for w, b, drop in zip(mlp_w, mlp_b, self.config.presoftmax_mlp_dropouts):
                    inp = dy.tanh(w.expr() * inp + b.expr())
                    if not runtime:
                        inp = dy.dropout(inp, drop)
                pre_softmax.append(inp)
            mlp_output.append(pre_softmax)

        for softmax_inp, aux_softmax_inp in zip(mlp_output, rnn_outputs[self.config.aux_softmax_layer - 1]):
            softmax_list.append([
                dy.softmax(self.softmax_upos_w.expr() * softmax_inp[0] + self.softmax_upos_b.expr()),
                dy.softmax(self.softmax_xpos_w.expr() * softmax_inp[1] + self.softmax_xpos_b.expr()),
                dy.softmax(self.softmax_attrs_w.expr() * softmax_inp[2] + self.softmax_attrs_b.expr())])
            aux_softmax_list.append([
                dy.softmax(self.aux_softmax_upos_w.expr() * aux_softmax_inp + self.aux_softmax_upos_b.expr()),
                dy.softmax(self.aux_softmax_xpos_w.expr() * aux_softmax_inp + self.aux_softmax_xpos_b.expr()),
                dy.softmax(self.aux_softmax_attrs_w.expr() * aux_softmax_inp + self.aux_softmax_attrs_b.expr())])
        return softmax_list, aux_softmax_list

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)

    def tag_sequences(self, sequences):
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags = self.tag(new_sequence)
            for entryIndex, pred in enumerate(predicted_tags):
                new_sequence[entryIndex].upos = pred[0]
                new_sequence[entryIndex].xpos = pred[1]
                new_sequence[entryIndex].attrs = pred[2]
            new_sequences.append(new_sequence)
        return new_sequences
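# Illustrative sketch (not part of the original module): the input-dropout
# scheme used in BDRNNTagger._predict drops each of the three input channels
# (character, pre-trained word and holistic embeddings) independently during
# training and rescales the sum so its expected magnitude is preserved. The
# numpy helper below only demonstrates the masking/rescaling arithmetic.
import random

import numpy as np


def channel_dropout(channels, drop_prob):
    masks = [0 if random.random() < drop_prob else 1 for _ in channels]
    kept = sum(masks)
    scale = float(len(channels)) / kept if kept > 0 else 1.0
    mixed = np.zeros_like(channels[0], dtype=float)
    for m, c in zip(masks, channels):
        mixed += m * scale * c      # dropped channels contribute nothing, kept ones are scaled up
    return mixed


# Example with three toy "embeddings":
# channel_dropout([np.ones(4), 2 * np.ones(4), 3 * np.ones(4)], drop_prob=0.33)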
class BDRNNParser:
    def __init__(self, parser_config, encodings, embeddings, aux_softmax_weight=0.2, runtime=False):
        self.config = parser_config
        self.encodings = encodings
        self.embeddings = embeddings
        self.decoder = GreedyDecoder()
        self.model = dy.Model()
        # self.trainer = dy.SimpleSGDTrainer(self.model)
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)
        self.trainer.set_sparse_updates(False)
        self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=1,
                                                  embeddings_size=self.config.input_embeddings_size,
                                                  model=self.model, runtime=runtime)
        self.holistic_embeddings = self.model.add_lookup_parameters(
            (len(self.encodings.word2int), self.config.input_embeddings_size))
        self.input_proj_w_word = self.model.add_parameters(
            (self.config.input_embeddings_size, self.embeddings.word_embeddings_size))
        self.input_proj_b_word = self.model.add_parameters((self.config.input_embeddings_size))
        self.unknown_word_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding lexical
        self.pad_tag_embedding = self.model.add_lookup_parameters(
            (3, self.config.input_embeddings_size))  # for padding morphology
        self.bdrnn_fw = []
        self.bdrnn_bw = []
        rnn_input_size = 0
        if self.config.use_lexical:
            rnn_input_size += self.config.input_embeddings_size
        if self.config.use_morphology:
            rnn_input_size += self.config.input_embeddings_size
            self.upos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.upos2int), self.config.input_embeddings_size))
            self.xpos_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.xpos2int), self.config.input_embeddings_size))
            self.attrs_lookup = self.model.add_lookup_parameters(
                (len(self.encodings.attrs2int), self.config.input_embeddings_size))
        index = 0
        aux_proj_input_size = 0
        for layer_size in self.config.layers:
            if runtime:
                self.bdrnn_fw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
                self.bdrnn_bw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
            else:
                self.bdrnn_fw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
                self.bdrnn_bw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model))
            rnn_input_size = layer_size * 2
            index += 1
            if index == self.config.aux_softmax_layer:
                aux_proj_input_size = rnn_input_size
        proj_input_size = self.config.layers[-1] * 2
        self.proj_arc_w_head = self.model.add_parameters((self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_head = self.model.add_parameters((self.config.arc_proj_size))
        self.proj_arc_w_dep = self.model.add_parameters((self.config.arc_proj_size, proj_input_size))
        self.proj_arc_b_dep = self.model.add_parameters((self.config.arc_proj_size))
        self.proj_label_w_head = self.model.add_parameters((self.config.label_proj_size, proj_input_size))
        self.proj_label_b_head = self.model.add_parameters((self.config.label_proj_size))
        self.proj_label_w_dep = self.model.add_parameters((self.config.label_proj_size, proj_input_size))
        self.proj_label_b_dep = self.model.add_parameters((self.config.label_proj_size))
        if not self.config.predict_morphology:
            self.aux_proj_arc_w_head = self.model.add_parameters((self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_head = self.model.add_parameters((self.config.arc_proj_size))
            self.aux_proj_arc_w_dep = self.model.add_parameters((self.config.arc_proj_size, aux_proj_input_size))
            self.aux_proj_arc_b_dep = self.model.add_parameters((self.config.arc_proj_size))
        else:
            self.upos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size))
            self.xpos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size))
            self.attrs_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size))
            self.upos_proj_b = self.model.add_parameters((self.config.label_proj_size))
            self.xpos_proj_b = self.model.add_parameters((self.config.label_proj_size))
            self.attrs_proj_b = self.model.add_parameters((self.config.label_proj_size))
        self.link_b = self.model.add_parameters((1, self.config.arc_proj_size))
        self.link_w = self.model.add_parameters((self.config.arc_proj_size, self.config.arc_proj_size))
        self.label_ww = self.model.add_parameters((1, len(self.encodings.label2int)))
        self.label_w = self.model.add_parameters((len(self.encodings.label2int), self.config.label_proj_size * 2))
        self.label_bb = self.model.add_parameters((len(self.encodings.label2int)))
        if not self.config.predict_morphology:
            self.aux_link_w = self.model.add_parameters((self.config.arc_proj_size, self.config.arc_proj_size))
            self.aux_link_b = self.model.add_parameters((1, self.config.arc_proj_size))
        else:
            self.upos_softmax_w = self.model.add_parameters((len(self.encodings.upos2int), self.config.label_proj_size))
            self.xpos_softmax_w = self.model.add_parameters((len(self.encodings.xpos2int), self.config.label_proj_size))
            self.attrs_softmax_w = self.model.add_parameters((len(self.encodings.attrs2int), self.config.label_proj_size))
            self.upos_softmax_b = self.model.add_parameters((len(self.encodings.upos2int)))
            self.xpos_softmax_b = self.model.add_parameters((len(self.encodings.xpos2int)))
            self.attrs_softmax_b = self.model.add_parameters((len(self.encodings.attrs2int)))
        self.lemma_softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 1))
        self.lemma_softmax_casing_b = self.model.add_parameters((2))
        self.aux_softmax_weight = aux_softmax_weight
        self.batch_loss = []

    def start_batch(self):
        dy.renew_cg()
        self.batch_loss = []

    def end_batch(self):
        if len(self.batch_loss) > 0:
            loss = dy.esum(self.batch_loss)
            loss_val = loss.value()
            loss.backward()
            self.trainer.update()
            return loss_val
        else:
            return 0

    def learn(self, seq):
        # remove compound words
        tmp = []
        for ss in seq:
            if not ss.is_compound_entry:
                tmp.append(ss)
        seq = tmp
        arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq, runtime=False)
        gold_heads = [entry.head for entry in seq]
        gold_labels = [entry.label for entry in seq]
        softmax_labels = self._predict_label(gold_heads, proj_labels, runtime=False)
        losses = []
        for gold_head, gold_label, arc_probs, softmax_label, entry in zip(gold_heads, gold_labels, arc_matrix[1:],
                                                                          softmax_labels, seq):
            label_index = self.encodings.label2int[gold_label]
            losses.append(-dy.log(arc_probs[gold_head]))
            losses.append(-dy.log(dy.pick(softmax_label, label_index)))
        if not self.config.predict_morphology:
            for gold_head, aux_probs, entry in zip(gold_heads, aux_arc_matrix[1:], seq):
                losses.append(-dy.log(aux_probs[gold_head]) * self.aux_softmax_weight)
        else:
            for softmax_morph, entry in zip(softmax_morphology, seq):
                loss_upos = -dy.log(dy.pick(softmax_morph[0], self.encodings.upos2int[entry.upos]))
                losses.append(loss_upos * (self.aux_softmax_weight / 3))
                # stability check: some languages are missing attributes or XPOS, which can cause
                # numerical overflow during backpropagation
                if len(self.encodings.xpos2int) > 1:
                    loss_xpos = -dy.log(dy.pick(softmax_morph[1], self.encodings.xpos2int[entry.xpos]))
                    losses.append(loss_xpos * (self.aux_softmax_weight / 3))
                if len(self.encodings.attrs2int) > 1:
                    loss_attrs = -dy.log(dy.pick(softmax_morph[2], self.encodings.attrs2int[entry.attrs]))
                    losses.append(loss_attrs * (self.aux_softmax_weight / 3))
        loss = dy.esum(losses)
        self.batch_loss.append(loss)

    def _attend(self, input_vectors, state, aux_embeddings):
        w1 = self.lemma_att_w1.expr()
        w2 = self.lemma_att_w2.expr()
        v = self.lemma_att_v.expr()
        attention_weights = []
        w2dt = w2 * dy.concatenate([state.h()[-1], aux_embeddings])
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)
        attention_weights = dy.softmax(dy.concatenate(attention_weights))
        output_vectors = dy.esum(
            [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
        return output_vectors

    def tag(self, seq):
        tmp = []
        for ss in seq:
            if not ss.is_compound_entry:
                tmp.append(ss)
        # if len(tmp) < 2:
        #     print "ERRRORR"
        #     for entry in seq:
        #         print str(entry.index) + "\t" + str(entry.word)
        seq = tmp
        dy.renew_cg()
        arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq)
        pred_heads = self.decoder.decode(arc_matrix)
        softmax_labels = self._predict_label(pred_heads, proj_labels)
        tag_list = []
        for pred_head, softmax_label in zip(pred_heads, softmax_labels):
            label_index = np.argmax(softmax_label.npvalue())
            tag = ParserTag(pred_head, self.encodings.labels[label_index], None, None, None)
            tag_list.append(tag)
        if self.config.predict_morphology:
            for tag, softmax_morph in zip(tag_list, softmax_morphology):
                tag.upos = self.encodings.upos_list[np.argmax(softmax_morph[0].npvalue())]
                tag.xpos = self.encodings.xpos_list[np.argmax(softmax_morph[1].npvalue())]
                tag.attrs = self.encodings.attrs_list[np.argmax(softmax_morph[2].npvalue())]
        return tag_list

    def _predict_label(self, heads, proj_labels, runtime=True):
        s_labels = []
        for iDep, iHead in zip(range(1, len(heads) + 1), heads):
            modw = dy.transpose(
                dy.reshape(proj_labels[iHead][1], (self.config.label_proj_size, 1)) * self.label_ww.expr())
            term1 = modw * proj_labels[iDep][0]
            term2 = self.label_w.expr() * dy.concatenate([proj_labels[iHead][1], proj_labels[iDep][0]])
            term3 = self.label_bb.expr()
            s_labels.append(dy.softmax(term1 + term2 + term3))
        return s_labels

    def _make_input(self, seq, runtime):
        x_list = []
        encoder_states_list = [None]
        # add the root
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[1])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[1])
        else:  # both lexical and morphology are used
            x_list.append(dy.concatenate([self.unknown_word_embedding[1], self.pad_tag_embedding[1]]))
        for entry in seq:
            word = entry.word
            if self.config.use_lexical:
                # prepare lexical embeddings
                char_emb, encoder_states = self.character_network.compute_embeddings(word, runtime=runtime)
                encoder_states_list.append(encoder_states)
                word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8'))
                if not found:
                    word_emb = self.unknown_word_embedding[0]
                else:
                    word_emb = dy.tanh(self.input_proj_w_word.expr() * dy.inputVector(word_emb) +
                                       self.input_proj_b_word.expr())
                word = word.decode('utf-8').lower()
                if word in self.encodings.word2int:
                    holistic_emb = self.holistic_embeddings[self.encodings.word2int[word]]
                else:
                    holistic_emb = self.holistic_embeddings[self.encodings.word2int['<UNK>']]
                # dropout lexical embeddings
                if runtime:
                    w_emb = word_emb + char_emb + holistic_emb
                else:
                    p1 = random.random()
                    p2 = random.random()
                    p3 = random.random()
                    m1 = 1
                    m2 = 1
                    m3 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if p3 < self.config.input_dropout_prob:
                        m3 = 0
                    scale = 1.0
                    if m1 + m2 + m3 > 0:
                        scale = float(3) / (m1 + m2 + m3)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    m3 = dy.scalarInput(m3)
                    scale = dy.scalarInput(scale)
                    w_emb = (word_emb * m1 + char_emb * m2 + holistic_emb * m3) * scale
            if self.config.use_morphology:
                if entry.upos in self.encodings.upos2int:
                    upos_emb = self.upos_lookup[self.encodings.upos2int[entry.upos]]
                else:
                    upos_emb = dy.inputVector([0] * self.config.input_embeddings_size)
                if entry.xpos in self.encodings.xpos2int:
                    xpos_emb = self.xpos_lookup[self.encodings.xpos2int[entry.xpos]]
                else:
                    xpos_emb = dy.inputVector([0] * self.config.input_embeddings_size)
                if entry.attrs in self.encodings.attrs2int:
                    attrs_emb = self.attrs_lookup[self.encodings.attrs2int[entry.attrs]]
                else:
                    attrs_emb = dy.inputVector([0] * self.config.input_embeddings_size)
                # overwrite all dropouts. it will later be handled by "same-mask"
                t_emb = upos_emb + xpos_emb + attrs_emb
                # w_emb = word_emb + char_emb + holistic_emb
            # compose embeddings, if necessary
            if self.config.use_lexical and self.config.use_morphology:
                if not runtime:
                    p1 = random.random()
                    p2 = random.random()
                    m1 = 1
                    m2 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if m1 + m2 > 0:
                        scale = float(2.0) / (m1 + m2)
                    else:
                        scale = 1.0
                    scale = dy.scalarInput(scale)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    x_list.append(dy.concatenate([w_emb * m1 * scale, t_emb * m2 * scale]))
                else:
                    x_list.append(dy.concatenate([w_emb, t_emb]))
            elif self.config.use_lexical:  # just use_lexical == True
                x_list.append(w_emb)
            else:  # just use_morphology == True
                x_list.append(t_emb)
        # close sequence
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[2])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[2])
        else:
            x_list.append(dy.concatenate([self.unknown_word_embedding[2], self.pad_tag_embedding[2]]))
        encoder_states_list.append(None)
        return x_list, encoder_states_list

    def _predict_arc(self, seq, runtime=True):
        x_list, encoder_states_list = self._make_input(seq, runtime)
        # BDLSTM
        rnn_outputs = [x_list]
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts):
            if runtime:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            else:
                fw.set_dropouts(dropout, dropout)
                bw.set_dropouts(dropout, dropout)
            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)]
            rnn_outputs.append(x_list)
        # projections
        arc_projections = [[dy.tanh(self.proj_arc_w_dep.expr() * x + self.proj_arc_b_dep.expr()),
                            dy.tanh(self.proj_arc_w_head.expr() * x + self.proj_arc_b_head.expr())]
                           for x in rnn_outputs[-1]]
        label_projections = [[dy.tanh(self.proj_label_w_dep.expr() * x + self.proj_label_b_dep.expr()),
                              dy.tanh(self.proj_label_w_head.expr() * x + self.proj_label_b_head.expr())]
                             for x in rnn_outputs[-1]]
        if not runtime:
            arc_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                                dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                               for x1, x2 in arc_projections]
            label_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                                  dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                                 for x1, x2 in label_projections]
        if not self.config.predict_morphology:
            aux_arc_projections = [[dy.tanh(self.aux_proj_arc_w_dep.expr() * x + self.aux_proj_arc_b_dep.expr()),
                                    dy.tanh(self.aux_proj_arc_w_head.expr() * x + self.aux_proj_arc_b_head.expr())]
                                   for x in rnn_outputs[self.config.aux_softmax_layer]]
            if not runtime:
                aux_arc_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                                        dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                                       for x1, x2 in aux_arc_projections]
        else:
            drp = self.config.presoftmax_mlp_dropout
            if runtime:
                drp = 0
            upos_softmax = [dy.softmax(self.upos_softmax_w.expr() * dy.dropout(
                dy.tanh(self.upos_proj_w.expr() * x + self.upos_proj_b.expr()), drp) + self.upos_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]]
            xpos_softmax = [dy.softmax(self.xpos_softmax_w.expr() * dy.dropout(
                dy.tanh(self.xpos_proj_w.expr() * x + self.xpos_proj_b.expr()), drp) + self.xpos_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]]
            attrs_softmax = [dy.softmax(self.attrs_softmax_w.expr() * dy.dropout(
                dy.tanh(self.attrs_proj_w.expr() * x + self.attrs_proj_b.expr()), drp) + self.attrs_softmax_b.expr())
                for x in rnn_outputs[self.config.aux_softmax_layer]]
            morphology_softmax = [[upos, xpos, attrs] for upos, xpos, attrs in
                                  zip(upos_softmax, xpos_softmax, attrs_softmax)]
        n = len(seq) + 1
        arc_matrix = [[None] * n for _ in xrange(n)]
        if not self.config.predict_morphology:
            aux_arc_matrix = [[None] * n for _ in xrange(n)]
        for iDst in xrange(n):
            term_bias = self.link_b.expr() * arc_projections[iDst][1]
            term_weight = self.link_w.expr() * arc_projections[iDst][1]
            if not self.config.predict_morphology:
                aux_term_bias = self.aux_link_b.expr() * aux_arc_projections[iDst][1]
                aux_term_weight = self.aux_link_w.expr() * aux_arc_projections[iDst][1]
            for iSrc in xrange(n):
                if iSrc != iDst:
                    attention = dy.reshape(term_weight,
                                           (1, self.config.arc_proj_size)) * arc_projections[iSrc][0] + term_bias
                    arc_matrix[iSrc][iDst] = attention
                    if not self.config.predict_morphology:
                        aux_attention = dy.reshape(aux_term_weight, (1, self.config.arc_proj_size)) * \
                                        aux_arc_projections[iSrc][0] + aux_term_bias
                        aux_arc_matrix[iSrc][iDst] = aux_attention
        # compute softmax for arcs
        a_m = [[None] * n for _ in xrange(n)]
        if not self.config.predict_morphology:
            aux_a_m = [[None] * n for _ in xrange(n)]
        for iSrc in xrange(n):
            s_max = []
            if not self.config.predict_morphology:
                aux_s_max = []
            for iDst in xrange(n):
                if iSrc != iDst:
                    s_max.append(arc_matrix[iSrc][iDst])
                    if not self.config.predict_morphology:
                        aux_s_max.append(aux_arc_matrix[iSrc][iDst])
            s_max = dy.softmax(dy.concatenate(s_max))
            if not self.config.predict_morphology:
                aux_s_max = dy.softmax(dy.concatenate(aux_s_max))
            ofs = 0
            for iDst in xrange(n):
                if iSrc == iDst:
                    ofs = -1
                else:
                    a_m[iSrc][iDst] = s_max[iDst + ofs]
                    if not self.config.predict_morphology:
                        aux_a_m[iSrc][iDst] = aux_s_max[iDst + ofs]
        if not self.config.predict_morphology:
            return a_m, aux_a_m, label_projections, None
        else:
            return a_m, None, label_projections, morphology_softmax[1:-1]

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)

    def parse_sequences(self, sequences):
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags = self.tag(new_sequence)
            iOrig, iTags = 0, 0
            while iOrig < len(new_sequence):
                while new_sequence[iOrig].is_compound_entry:
                    iOrig += 1
                new_sequence[iOrig].head = predicted_tags[iTags].head
                new_sequence[iOrig].label = predicted_tags[iTags].label
                if self.config.predict_morphology:
                    new_sequence[iOrig].upos = predicted_tags[iTags].upos
                    new_sequence[iOrig].xpos = predicted_tags[iTags].xpos
                    new_sequence[iOrig].attrs = predicted_tags[iTags].attrs
                iTags += 1
                iOrig += 1
            new_sequences.append(new_sequence)
        return new_sequences
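# Illustrative sketch (not part of the original module): BDRNNParser._predict_arc
# normalises arc scores per dependent with a softmax over every candidate head
# except the word itself (the diagonal). The numpy helper below reproduces that
# normalisation on a plain score matrix, where row i holds the scores of all
# candidate heads for word i (index 0 being the artificial root).
import numpy as np


def arc_probabilities(scores):
    n = scores.shape[0]
    probs = np.zeros_like(scores, dtype=float)
    for dep in range(n):
        candidates = [h for h in range(n) if h != dep]
        row = scores[dep, candidates]
        row = np.exp(row - row.max())           # numerically stable softmax
        probs[dep, candidates] = row / row.sum()
    return probs


# Example: probs = arc_probabilities(np.random.randn(5, 5)); each row sums to 1
# and probs[i, i] stays 0 because a word cannot be its own head.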
class BDRNNLemmatizer:
    def __init__(self, lemmatizer_config, encodings, embeddings, runtime=False):
        self.config = lemmatizer_config
        self.encodings = encodings
        # Bug in encodings - this will be removed after UD Shared Task
        self.has_bug = False
        if self.encodings.char2int[' '] != 1:
            self.has_bug = True
        self.embeddings = embeddings
        self.losses = []
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9)
        self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings,
                                                  rnn_size=self.config.char_rnn_size,
                                                  rnn_layers=self.config.char_rnn_layers,
                                                  embeddings_size=self.config.char_embeddings,
                                                  model=self.model, runtime=runtime)
        self.upos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.upos2int), self.config.tag_embeddings_size))
        self.xpos_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.xpos2int), self.config.tag_embeddings_size))
        self.attrs_lookup = self.model.add_lookup_parameters(
            (len(self.encodings.attrs2int), self.config.tag_embeddings_size))
        self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), self.config.char_embeddings))
        if runtime:
            self.rnn = dy.LSTMBuilder(self.config.rnn_layers,
                                      self.config.char_rnn_size * 2 + self.config.char_embeddings,
                                      self.config.rnn_size, self.model)
        else:
            from utils import orthonormal_VanillaLSTMBuilder
            self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers,
                                                      self.config.char_rnn_size * 2 + self.config.char_embeddings,
                                                      self.config.rnn_size, self.model)
        self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2))
        self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size))
        self.att_v = self.model.add_parameters((1, 200))
        self.start_lookup = self.model.add_lookup_parameters(
            (1, self.config.char_rnn_size * 2 + self.config.char_embeddings))
        self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 1, self.config.rnn_size))
        self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 1))
        self.softmax_casing_w = self.model.add_parameters((2, self.config.rnn_size))
        self.softmax_casing_b = self.model.add_parameters((2))

    def _attend(self, input_vectors, state, embeddings):
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        v = self.att_v.expr()
        attention_weights = []
        w2dt = w2 * dy.concatenate([state.h()[-1], embeddings])
        for input_vector in input_vectors:
            attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
            attention_weights.append(attention_weight)
        attention_weights = dy.softmax(dy.concatenate(attention_weights))
        output_vectors = dy.esum(
            [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
        return output_vectors

    def _predict(self, word, upos, xpos, attrs, num_chars=0, gs_chars=None):
        if num_chars == 0:
            runtime = True
        else:
            runtime = False
        char_emb, states = self.character_network.compute_embeddings(word, runtime=runtime)
        num_predictions = 0
        softmax_list = []
        m1, m2, m3 = 0, 0, 0
        zero_vec = dy.vecInput(self.config.tag_embeddings_size)
        if upos in self.encodings.upos2int:
            upos_emb = self.upos_lookup[self.encodings.upos2int[upos]]
            m1 = 1
        else:
            upos_emb = zero_vec
        if xpos in self.encodings.xpos2int:
            xpos_emb = self.xpos_lookup[self.encodings.xpos2int[xpos]]
            m2 = 1
        else:
            xpos_emb = zero_vec
        if attrs in self.encodings.attrs2int:
            attrs_emb = self.attrs_lookup[self.encodings.attrs2int[attrs]]
            m3 = 1
        else:
            attrs_emb = zero_vec
        scale = float(4.0) / (m1 + m2 + m3 + 1.0)
        scale = dy.scalarInput(scale)
        tag_emb = (upos_emb + xpos_emb + attrs_emb + char_emb) * scale
        rnn = self.rnn.initial_state().add_input(self.start_lookup[0])
        char_emb = dy.inputVector([0] * self.config.char_embeddings)
        while True:
            attention = self._attend(states, rnn, tag_emb)
            input = dy.concatenate([attention, char_emb])
            rnn = rnn.add_input(input)
            softmax = dy.softmax(self.softmax_w.expr() * rnn.output() + self.softmax_b.expr())
            softmax_casing = dy.softmax(self.softmax_casing_w.expr() * rnn.output() + self.softmax_casing_b.expr())
            softmax_list.append([softmax, softmax_casing])
            if num_chars == 0:
                s_index = np.argmax(softmax.npvalue())
                if s_index == len(self.encodings.char2int):
                    break
                char_emb = self.char_lookup[s_index]
            else:
                if num_predictions < len(gs_chars):
                    char = gs_chars[num_predictions]
                    if char in self.encodings.char2int:
                        char_emb = self.char_lookup[self.encodings.char2int[char]]
                    else:
                        char_emb = self.char_lookup[self.encodings.char2int["<UNK>"]]
            num_predictions += 1
            if num_predictions == num_chars or num_predictions > 255:
                break
        return softmax_list

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) > 0:
            loss = dy.esum(self.losses)
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        self.losses = []
        return total_loss

    def learn(self, seq):
        for entry in seq:
            if entry.upos != 'NUM' and entry.upos != 'PROPN':
                losses = []
                unilemma = unicode(entry.lemma, 'utf-8')
                n_chars = len(unilemma)
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs,
                                                    num_chars=n_chars + 1, gs_chars=unilemma)
                # print unilemma.encode('utf-8')  #, softmax_output_list
                for softmax, char in zip(softmax_output_list[:-1], unilemma):
                    char_index = -1
                    if char.lower() == char:
                        casing = 0
                    else:
                        casing = 1
                    char = char.lower()
                    if char in self.encodings.char2int:
                        char_index = self.encodings.char2int[char]
                    if char_index != -1:
                        losses.append(-dy.log(dy.pick(softmax[0], char_index)))
                    losses.append(-dy.log(dy.pick(softmax[1], casing)))
                    # print np.argmax(softmax[0].npvalue()), char_index, softmax
                losses.append(-dy.log(dy.pick(softmax_output_list[-1][0], len(self.encodings.char2int))))
                loss = dy.esum(losses)
                self.losses.append(loss)

    def tag(self, seq):
        dy.renew_cg()
        lemmas = []
        for entry in seq:
            if entry.upos == 'NUM' or entry.upos == 'PROPN':
                lemma = entry.word.decode('utf-8')
            else:
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs)
                lemma = ""
                for softmax in softmax_output_list[:-1]:
                    char_index = np.argmax(softmax[0].npvalue())
                    if char_index < len(self.encodings.characters):
                        char = self.encodings.characters[char_index]
                        if np.argmax(softmax[1].npvalue()) == 1:
                            char = char.upper()
                        lemma += char
            lemmas.append(lemma)
        return lemmas

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)
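# Illustrative sketch (not part of the original module): BDRNNLemmatizer predicts
# a lower-cased character plus a separate casing decision at every decoding step;
# the lemma is rebuilt by upper-casing exactly the characters whose casing softmax
# picked class 1. The helper below shows that recombination on hand-made
# predictions instead of real softmax outputs.
def rebuild_lemma(chars, casing_flags):
    lemma = ""
    for ch, flag in zip(chars, casing_flags):
        lemma += ch.upper() if flag == 1 else ch
    return lemma


# Example: rebuild_lemma(list("london"), [1, 0, 0, 0, 0, 0]) -> "London"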
class FSTLemmatizer: def __init__(self, config, encodings, embeddings, runtime=False): self.config = config self.encodings = encodings # Bug in encodings - will be removed after UD self.has_bug=False if self.encodings.char2int[' ']!=1: self.has_bug=True import sys sys.stdout.write("Detected encodings BUG!") self.embeddings = embeddings self.losses = [] self.model = dy.Model() self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9) self.character_network = CharacterNetwork(self.config.tag_embeddings_size, encodings, rnn_size=self.config.char_rnn_size, rnn_layers=self.config.char_rnn_layers, embeddings_size=self.config.char_embeddings, model=self.model, runtime=runtime) self.word2lemma={} self.upos_lookup = self.model.add_lookup_parameters( (len(self.encodings.upos2int), self.config.tag_embeddings_size)) self.xpos_lookup = self.model.add_lookup_parameters( (len(self.encodings.xpos2int), self.config.tag_embeddings_size)) self.attrs_lookup = self.model.add_lookup_parameters( (len(self.encodings.attrs2int), self.config.tag_embeddings_size)) self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), self.config.char_embeddings)) if runtime: self.rnn = dy.LSTMBuilder(self.config.rnn_layers, self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size, self.config.rnn_size, self.model) else: from utils import orthonormal_VanillaLSTMBuilder self.rnn = orthonormal_VanillaLSTMBuilder(self.config.rnn_layers, self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size, self.config.rnn_size, self.model) # self.att_w1 = self.model.add_parameters((200, self.config.char_rnn_size * 2)) # self.att_w2 = self.model.add_parameters((200, self.config.rnn_size + self.config.tag_embeddings_size)) # self.att_v = self.model.add_parameters((1, 200)) self.start_lookup = self.model.add_lookup_parameters( (1, self.config.char_rnn_size * 2 + self.config.char_embeddings + self.config.tag_embeddings_size)) self.softmax_w = self.model.add_parameters((len(self.encodings.char2int) + 3, self.config.rnn_size)) self.softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 3)) ofs = len(self.encodings.char2int) self.label2int = {} self.label2int['<EOS>'] = ofs self.label2int['<COPY>'] = ofs + 1 self.label2int['<INC>'] = ofs + 2 def _attend(self, input_vectors, state, embeddings): w1 = self.att_w1.expr() w2 = self.att_w2.expr() v = self.att_v.expr() attention_weights = [] w2dt = w2 * dy.concatenate([state.h()[-1], embeddings]) for input_vector in input_vectors: attention_weight = v * dy.tanh(w1 * input_vector + w2dt) attention_weights.append(attention_weight) attention_weights = dy.softmax(dy.concatenate(attention_weights)) output_vectors = dy.esum( [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)]) return output_vectors def _predict(self, word, upos, xpos, attrs, max_predictions=0, runtime=True, gs_labels=None): char_emb, states = self.character_network.compute_embeddings(word, runtime=runtime) softmax_list = [] m1, m2, m3 = 0, 0, 0 zero_vec = dy.vecInput(self.config.tag_embeddings_size) if upos in self.encodings.upos2int: upos_emb = self.upos_lookup[self.encodings.upos2int[upos]] m1 = 1 else: upos_emb = zero_vec if xpos in self.encodings.xpos2int: xpos_emb = self.xpos_lookup[self.encodings.xpos2int[xpos]] m2 = 1 else: xpos_emb = zero_vec if attrs in self.encodings.attrs2int: attrs_emb = self.attrs_lookup[self.encodings.attrs2int[attrs]] m3 = 1 else: attrs_emb 
        scale = float(4.0) / (m1 + m2 + m3 + 1.0)
        scale = dy.scalarInput(scale)
        tag_emb = (upos_emb + xpos_emb + attrs_emb + char_emb) * scale

        rnn = self.rnn.initial_state().add_input(self.start_lookup[0])
        num_predictions = 0
        i_src = 0
        i_labels = 0
        while num_predictions < max_predictions:
            # attention = self._attend(states, rnn, tag_emb)
            input = dy.concatenate([char_emb, states[i_src], tag_emb])
            rnn = rnn.add_input(input)

            softmax = dy.softmax(self.softmax_w.expr() * rnn.output() + self.softmax_b.expr())
            softmax_list.append(softmax)
            num_predictions += 1
            if runtime:
                l_index = np.argmax(softmax.npvalue())
                if l_index == self.label2int['<EOS>']:
                    break
                elif l_index == self.label2int['<INC>'] and i_src < len(states) - 1:
                    i_src += 1
            else:
                if gs_labels[i_labels] == '<INC>' and i_src < len(states) - 1:
                    i_src += 1
                i_labels += 1

        return softmax_list

    def start_batch(self):
        self.losses = []
        dy.renew_cg()

    def end_batch(self):
        total_loss = 0
        if len(self.losses) > 0:
            loss = dy.esum(self.losses)
            total_loss = loss.value()
            loss.backward()
            self.trainer.update()
        self.losses = []
        return total_loss

    def learn(self, seq):
        for entry in seq:
            if entry.upos != 'NUM' and entry.upos != 'PROPN':
                y_real = self._compute_transduction_states(unicode(entry.word, 'utf-8').lower(),
                                                           unicode(entry.lemma, 'utf-8').lower())
                losses = []
                n_chars = len(y_real)
                softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs,
                                                    max_predictions=n_chars, runtime=False, gs_labels=y_real)
                for softmax, y_target in zip(softmax_output_list, y_real):
                    if y_target in self.label2int:
                        losses.append(-dy.log(dy.pick(softmax, self.label2int[y_target])))
                    elif y_target in self.encodings.char2int:
                        losses.append(-dy.log(dy.pick(softmax, self.encodings.char2int[y_target])))
                if len(losses) > 0:
                    loss = dy.esum(losses)
                    self.losses.append(loss)

    def _compute_transduction_states(self, source, destination):
        # edit-distance table between source (word) and destination (lemma)
        a = np.zeros((len(source) + 1, len(destination) + 1))
        for i in xrange(len(source) + 1):
            a[i, 0] = i
        for i in xrange(len(destination) + 1):
            a[0, i] = i
        for i in xrange(1, len(source) + 1):
            for j in xrange(1, len(destination) + 1):
                cost = 0
                if source[i - 1] != destination[j - 1]:
                    cost = 1
                m = min([a[i - 1, j - 1], a[i - 1, j], a[i, j - 1]])
                a[i, j] = m + cost

        # backtrace to align destination characters to source characters
        alignments = [-1] * len(destination)
        i = len(source)
        j = len(destination)
        while i > 1 or j > 1:
            if source[i - 1] == destination[j - 1]:
                alignments[j - 1] = i - 1
            if i == 1:
                j -= 1
            elif j == 1:
                i -= 1
            else:
                if a[i - 1, j - 1] <= a[i - 1, j] and a[i - 1, j - 1] <= a[i, j - 1]:
                    i -= 1
                    j -= 1
                elif a[i - 1, j] <= a[i - 1, j - 1] and a[i - 1, j] <= a[i, j - 1]:
                    i -= 1
                else:
                    j -= 1
        if source[i - 1] == destination[j - 1]:
            alignments[j - 1] = i - 1

        # convert the alignment into a sequence of <COPY>/<INC>/character/<EOS> labels
        y_pred = []
        index_src = 0
        index_dst = 0
        while index_dst < len(destination):
            if alignments[index_dst] == index_src:
                y_pred.append("<COPY>")
                index_dst += 1
            elif alignments[index_dst] == -1:
                if destination[index_dst] == "\t":
                    # token boundary marker; '<TOK>' has no output slot in this model,
                    # so such targets are simply skipped when computing the loss in learn()
                    y_pred.append("<TOK>")
                    index_dst += 1
                else:
                    y_pred.append(destination[index_dst])
                    index_dst += 1
            else:
                y_pred.append("<INC>")
                index_src += 1
        y_pred.append("<EOS>")
        return y_pred

    def tag(self, seq):
        dy.renew_cg()
        lemmas = []
        for entry in seq:
            if entry.upos == 'NUM' or entry.upos == 'PROPN':
                lemma = entry.word.decode('utf-8')
            else:
                # check the lemma dictionary first; keys are built as word + "\t" + upos in load_dict
                key = entry.word.decode('utf-8').lower().encode('utf-8') + "\t" + entry.upos
                if key in self.word2lemma:
                    lemma = unicode(self.word2lemma[key], 'utf-8')
                else:
                    uniword = unicode(entry.word, 'utf-8')
                    softmax_output_list = self._predict(uniword, entry.upos, entry.xpos, entry.attrs,
                                                        max_predictions=500, runtime=True)
                    lemma = ""
                    src_index = 0
                    for softmax in softmax_output_list[:-1]:
                        label_index = np.argmax(softmax.npvalue())
                        if label_index == self.label2int['<COPY>'] and src_index < len(uniword):
                            lemma += uniword[src_index]
                        elif label_index == self.label2int['<INC>'] or label_index == self.label2int['<EOS>']:
                            src_index += 1
                        elif label_index < len(self.encodings.characters):
                            # if self.has_bug and label_index >= self.encodings.char2int[' ']:
                            #     label_index += 1
                            lemma += self.encodings.characters[label_index]

            if entry.upos != 'PROPN':
                lemmas.append(lemma.lower())
            else:
                lemmas.append(lemma)
        return lemmas

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        self.model.populate(path)
        dict_path = path.replace(".bestACC", ".dict")
        import os.path
        if os.path.exists(dict_path):
            self.load_dict(dict_path)

    def load_dict(self, path):
        print "Loading lemma dictionary"
        with open(path, "r") as f:
            lines = f.readlines()
            for line in lines:
                parts = line.strip().split('\t')
                if len(parts) == 5:
                    word = unicode(parts[0], 'utf-8').lower().encode('utf-8')
                    upos = parts[1]
                    key = word + '\t' + upos
                    self.word2lemma[key] = parts[4]
        print "Loaded " + str(len(self.word2lemma)) + " pairs"

    def lemmatize_sequences(self, sequences):
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_lemmas = self.tag(new_sequence)

            for entry, lemma in zip(new_sequence, predicted_lemmas):
                if not entry.is_compound_entry:
                    entry.lemma = lemma if lemma is not None else "_"
                else:
                    entry.lemma = "_"
            new_sequences.append(new_sequence)
        return new_sequences
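
# Usage sketch (illustrative only): `config`, `encodings`, `embeddings` and the sequence
# objects come from the surrounding project and are assumed to expose the fields used
# above (upos2int, char2int, entry.word, entry.upos, ...); names such as num_epochs and
# train_sequences are placeholders.
#
#   lemmatizer = FSTLemmatizer(config, encodings, embeddings, runtime=False)
#   for epoch in xrange(num_epochs):
#       for seq in train_sequences:
#           lemmatizer.start_batch()
#           lemmatizer.learn(seq)
#           total_loss = lemmatizer.end_batch()
#   lemmatizer.save("lemmatizer.bestACC")
#
#   lemmatizer = FSTLemmatizer(config, encodings, embeddings, runtime=True)
#   lemmatizer.load("lemmatizer.bestACC")  # also loads "lemmatizer.dict" if it exists
#   lemmatized = lemmatizer.lemmatize_sequences(test_sequences)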