def decode(model, dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = pc.parameter(model["decoder_w"])
    b = pc.parameter(model["decoder_b"])

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    loss = []
    for char in output:
        vector = attend(model, vectors, s)
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        loss.append(-pc.log(pc.pick(probs, char)))
    loss = pc.esum(loss)
    return loss
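# Minimal training-step sketch (not part of the original code). It assumes that
# pc.renew_cg() has been called and `encoded` (the encoder outputs passed to
# decode() as `vectors`) has been built on the fresh graph just before this
# call, and that `trainer` is a pycnn trainer, e.g. pc.SimpleSGDTrainer(model).
# It only illustrates how the loss expression returned by decode() is turned
# into a gradient update.
def train_one_example(model, dec_lstm, trainer, encoded, output):
    loss = decode(model, dec_lstm, encoded, output)
    loss_value = loss.scalar_value()  # forward pass
    loss.backward()                   # backprop through the unrolled decoder
    trainer.update()                  # apply the gradient update
    return loss_value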
def one_word_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index,
                  aligned_pair, feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])

    padded_lemma = BEGIN_WORD + lemma + END_WORD

    # convert characters to matching embeddings
    lemma_char_vecs = []
    for char in padded_lemma:
        try:
            lemma_char_vecs.append(char_lookup[alphabet_index[char]])
        except KeyError:
            # handle UNK
            lemma_char_vecs.append(char_lookup[alphabet_index[UNK]])

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = []
    for feat in sorted(feature_types):
        # TODO: is it OK to use same UNK for all feature types? and for unseen feats as well?
        # if this feature has a value, take it from the lookup. otherwise use UNK
        if feat in feats:
            feat_str = feat + ':' + feats[feat]
            try:
                feat_vecs.append(feat_lookup[feat_index[feat_str]])
            except KeyError:
                # handle UNK or dropout
                feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
        else:
            feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
    feats_input = pc.concatenate(feat_vecs)

    # BiLSTM forward pass
    s_0 = encoder_frnn.initial_state()
    s = s_0
    frnn_outputs = []
    for c in lemma_char_vecs:
        s = s.add_input(c)
        frnn_outputs.append(s.output())

    # BiLSTM backward pass
    s_0 = encoder_rrnn.initial_state()
    s = s_0
    rrnn_outputs = []
    for c in reversed(lemma_char_vecs):
        s = s.add_input(c)
        rrnn_outputs.append(s.output())

    # BiLSTM outputs
    blstm_outputs = []
    lemma_char_vecs_len = len(lemma_char_vecs)
    for i in xrange(lemma_char_vecs_len):
        blstm_outputs.append(pc.concatenate([frnn_outputs[i], rrnn_outputs[lemma_char_vecs_len - i - 1]]))

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    prev_char_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []

    # i is input index, j is output index
    i = 0
    j = 0

    # go through alignments, progress j when new output is introduced, progress i when new char is seen on lemma (no ~)
    # TODO: try sutskever flip trick?
    # TODO: attention on the lemma chars/feats could help here?
    aligned_lemma, aligned_word = aligned_pair
    aligned_lemma += END_WORD
    aligned_word += END_WORD

    # run through the alignments
    for index, (input_char, output_char) in enumerate(zip(aligned_lemma, aligned_word)):
        possible_outputs = []

        # feedback, i, j, blstm[i], feats
        decoder_input = pc.concatenate([prev_output_vec,
                                        prev_char_vec,
                                        # char_lookup[alphabet_index[str(i)]],
                                        # char_lookup[alphabet_index[str(j)]],
                                        blstm_outputs[i],
                                        feats_input])

        # if reached the end word symbol
        if output_char == END_WORD:
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[END_WORD])))
            continue

        # if there is no prefix, step
        if padded_lemma[i] == BEGIN_WORD and aligned_lemma[index] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, i, j, blstm[i], feats
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            i += 1

        # if there is new output
        if aligned_word[index] != ALIGN_SYMBOL:
            decoder_input = pc.concatenate([prev_output_vec,
                                            prev_char_vec,
                                            # char_lookup[alphabet_index[str(i)]],
                                            # char_lookup[alphabet_index[str(j)]],
                                            blstm_outputs[i],
                                            feats_input])

            # copy i action - maybe model as a single action?
            if padded_lemma[i] == aligned_word[j]:
                possible_outputs.append(str(i))
                possible_outputs.append(padded_lemma[i])
            else:
                possible_outputs.append(aligned_word[index])

            # perform rnn step
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            local_loss = pc.scalarInput(0)
            max_output_loss = -pc.log(pc.pick(probs, alphabet_index[possible_outputs[0]]))
            max_likelihood_output = possible_outputs[0]

            # sum over all correct output possibilities and pick feedback output to be the one with the highest
            # probability
            for output in possible_outputs:
                neg_log_likelihood = -pc.log(pc.pick(probs, alphabet_index[output]))
                if neg_log_likelihood < max_output_loss:
                    max_likelihood_output = output
                    max_output_loss = neg_log_likelihood

                local_loss += neg_log_likelihood
            loss.append(local_loss)

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[max_likelihood_output]]
            prev_char_vec = char_lookup[alphabet_index[aligned_word[index]]]
            j += 1

        # now check if it's time to progress on input
        if i < len(padded_lemma) - 1 and aligned_lemma[index + 1] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, i, j, blstm[i], feats
            decoder_input = pc.concatenate([prev_output_vec,
                                            prev_char_vec,
                                            # char_lookup[alphabet_index[str(i)]],
                                            # char_lookup[alphabet_index[str(j)]],
                                            blstm_outputs[i],
                                            feats_input])

            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            i += 1

    # TODO: maybe here a "special" loss function is appropriate?
    # loss = esum(loss)
    loss = pc.average(loss)

    return loss
def train(
    feature_mapper,
    word_dims,
    tag_dims,
    lstm_units,
    hidden_units,
    epochs,
    batch_size,
    train_data_file,
    dev_data_file,
    model_save_file,
    droprate,
    unk_param,
    alpha=1.0,
    beta=0.0,
):
    start_time = time.time()

    fm = feature_mapper
    word_count = fm.total_words()
    tag_count = fm.total_tags()

    network = Network(
        word_count=word_count,
        tag_count=tag_count,
        word_dims=word_dims,
        tag_dims=tag_dims,
        lstm_units=lstm_units,
        hidden_units=hidden_units,
        struct_out=2,
        label_out=fm.total_label_actions(),
        droprate=droprate,
    )
    network.init_params()

    print('Hidden units: {}, per-LSTM units: {}'.format(
        hidden_units,
        lstm_units,
    ))
    print('Embeddings: word={} tag={}'.format(
        (word_count, word_dims),
        (tag_count, tag_dims),
    ))
    print('Dropout rate: {}'.format(droprate))
    print('Parameters initialized in [-0.01, 0.01]')
    print('Random UNKing parameter z = {}'.format(unk_param))
    print('Exploration: alpha={} beta={}'.format(alpha, beta))

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)

    dev_trees = PhraseTree.load_treefile(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()

    for epoch in xrange(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in xrange(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                ) for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            pycnn.renew_cg()
            network.prep_params()

            errors = []

            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue

                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0

                fwd, back = network.evaluate_recurrent(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network.evaluate_struct(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network.evaluate_label(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['label_data'])

            batch_error = pycnn.esum(errors)
            total_cost += batch_error.scalar_value()
            batch_error.backward()
            network.trainer.update()

            mean_cost = total_cost / total_states

            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    network.save(model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))
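# Hypothetical invocation sketch (not part of the original code): the file paths
# and hyperparameter values below are placeholders, and `fm` is assumed to be a
# feature mapper built elsewhere by the project's vocabulary/feature code.
train(
    feature_mapper=fm,
    word_dims=50,
    tag_dims=20,
    lstm_units=200,
    hidden_units=200,
    epochs=10,
    batch_size=10,
    train_data_file='data/train.trees',
    dev_data_file='data/dev.trees',
    model_save_file='models/parser.model',
    droprate=0.5,
    unk_param=0.8375,
    alpha=1.0,
    beta=0.0,
)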
def pick_neg_log(self, pred, gold):
    return -pycnn.log(pycnn.pick(pred, gold))
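# Usage sketch (hypothetical names): pick_neg_log expects a probability
# distribution, so raw scores are first passed through a softmax and `gold`
# is the integer index of the correct class, e.g.:
#
#     probs = pycnn.softmax(scores)
#     loss = self.pick_neg_log(probs, gold_index)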