def forward(self, lemma_indices, tag_indices, inflected_form_indices=None,
            a_ls_true=None, p_gens_true=None):
    """
    Args:
        lemma_indices: list of lists containing lemma indices
        tag_indices: list of lists containing tag indices
        inflected_form_indices: list of lists containing inflected form indices
            (for teacher forcing)
        a_ls_true: true alignments (for teacher forcing)
        p_gens_true: true p_gens (for teacher forcing)
    Returns:
        p_ws: log probabilities, of shape (bsz, max_decode_len, char_vocab_size)
        a_ls: attention over lemmas, of shape (bsz, max_decode_len, max_lemma_len)
        p_gens: p_gens, of shape (bsz, max_decode_len)
    """
    # (bsz, max_lemma_len, 2*hidden_size), (bsz, max_lemma_len), (1, bsz, hidden_size)
    h_l, mask_l, (h_l_n, c_l_n) = self.lemma_encoder(lemma_indices)
    # (bsz, max_tag_len, 2*hidden_size), (bsz, max_tag_len), (1, bsz, hidden_size)
    h_tg, mask_tg, (h_tg_n, c_tg_n) = self.tag_encoder(tag_indices)
    # (1, bsz, hidden_size) & (1, bsz, hidden_size) -> (1, bsz, hidden_size)
    s_0 = self.bridge_h(torch.cat([h_l_n, h_tg_n], dim=2))
    # (1, bsz, hidden_size) & (1, bsz, hidden_size) -> (1, bsz, hidden_size)
    c_0 = self.bridge_c(torch.cat([c_l_n, c_tg_n], dim=2))
    lemma_indices_padded = pad_lists(lemma_indices,
                                     self.vocab.padding_idx,
                                     dtype=torch.long,
                                     device=device)
    if inflected_form_indices is not None:
        inflected_form_indices = [
            [self.vocab.char_to_index(self.vocab.START_CHAR)] + seq_indices
            for seq_indices in inflected_form_indices
        ]
        inflected_form_indices = pad_lists(
            inflected_form_indices,
            self.vocab.padding_idx,
            dtype=torch.long,
            device=device)  # (bsz, max_tgt_len)
        decoder_input = self.lemma_encoder.embedder(
            inflected_form_indices)  # (bsz, max_tgt_len, embedding_size)
    else:
        decoder_input = None
    p_ws, a_ls, p_gens = self.decoder(lemma_indices_padded, h_l, h_tg, mask_l,
                                      mask_tg, (s_0, c_0), decoder_input,
                                      a_ls_true, p_gens_true)
    return p_ws, a_ls, p_gens
def forward(self, lemma_indices, tag_indices, inflected_form_indices=None,
            a_ls_true=None, p_gens_true=None):
    """
    Args:
        lemma_indices: list of lists containing lemma indices
        tag_indices: list of lists containing tag indices
        inflected_form_indices: list of lists containing inflected form indices
            (for teacher forcing)
        a_ls_true: true alignments (for teacher forcing)
        p_gens_true: true p_gens (for teacher forcing)
    Returns:
        p_ws: log probabilities, of shape (bsz, max_decode_len, char_vocab_size)
        a_ls: attention over lemmas, of shape (bsz, max_decode_len, max_lemma_len)
        p_gens: p_gens, of shape (bsz, max_decode_len)
    """
    # (bsz, max_lemma_tag_len, 2*hidden_size), (bsz, max_lemma_tag_len), (1, bsz, hidden_size)
    h, mask, (h_n, c_n) = self.encoder(
        [x + y for x, y in zip(lemma_indices, tag_indices)])
    s_0 = h_n  # (1, bsz, hidden_size)
    c_0 = c_n  # (1, bsz, hidden_size)
    input_indices_padded = pad_lists(
        [x + y for x, y in zip(lemma_indices, tag_indices)],
        self.vocab.padding_idx,
        dtype=torch.long,
        device=device)
    if inflected_form_indices is not None:
        inflected_form_indices = [
            [self.vocab.char_to_index(self.vocab.START_CHAR)] + seq_indices
            for seq_indices in inflected_form_indices
        ]
        inflected_form_indices = pad_lists(
            inflected_form_indices,
            self.vocab.padding_idx,
            dtype=torch.long,
            device=device)  # (bsz, max_tgt_len)
        decoder_input = self.encoder.embedder(
            inflected_form_indices)  # (bsz, max_tgt_len, embedding_size)
    else:
        decoder_input = None
    p_ws, a_ls, p_gens = self.decoder(input_indices_padded, h, mask,
                                      (s_0, c_0), decoder_input, a_ls_true,
                                      p_gens_true)
    return p_ws, a_ls, p_gens
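# The two forward() variants above, and the loss modules further down, all rely on a
# pad_lists helper that turns ragged Python lists of indices into a rectangular tensor.
# That helper is not shown in this collection; the sketch below is only an assumed,
# minimal implementation inferred from the call sites
# (pad_lists(lists, pad_value, pad_len=..., dtype=..., device=...)), not the project's code.
import torch


def pad_lists(lists, pad_value, pad_len=None, dtype=torch.long, device=None):
    """Hypothetical sketch: right-pad each sequence and stack into a (bsz, pad_len) tensor."""
    max_len = pad_len if pad_len is not None else max(len(seq) for seq in lists)
    padded = torch.full((len(lists), max_len), pad_value, dtype=dtype, device=device)
    for i, seq in enumerate(lists):
        seq = seq[:max_len]
        padded[i, :len(seq)] = torch.tensor(seq, dtype=dtype, device=device)
    return padded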
def Evaluate(sess, model, dev_data, transition_params_trained, parameters):
    total_token_num = 0
    correct_token_num = 0
    start = time.time()
    epoch_num = 0
    while True:
        step = 0
        sent_list = []
        sentences = []
        tags = []
        sentence_lengths = []
        word_lengths = []
        while len(sentences) < parameters['batch_size']:
            sent, epoch_num = advance_sent(epoch_num, dev_data)
            sent_list.append(sent)
            sentences.append(sent.word_ids)
            tags.append(sent.tag_ids)
            sentence_lengths.append(sent.get_sent_len())
        feed_dict = {
            model.input_token_indices: utils.pad_lists(sentences),
            model.input_sent_lengths: sentence_lengths,
            model.input_label_indices: utils.pad_lists(tags),
            model.dropout_keep_prob: 1.0  # disable dropout during evaluation
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        for index in range(parameters["batch_size"]):
            if parameters['use_crf']:
                outputs, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores[index], transition_params_trained)
                sent_list[index].set_sent_tags(outputs[1:-1])
            else:
                outputs = predictions[index]
                sent_list[index].set_sent_tags(outputs)
        if epoch_num >= 1:
            break
    num_tokens = 0
    num_correct = 0
    dev_data.reset_index()
    while dev_data.has_next_sent():
        sent = dev_data.get_next_sent()
        gold_tags = sent.tag_ids
        output_tags = sent.get_tag_output()
        assert len(gold_tags) == len(output_tags)
        for idx, tag in enumerate(gold_tags):
            num_tokens += 1
            if gold_tags[idx] == output_tags[idx]:
                num_correct += 1
    dev_data.reset_index()
    logging.info(num_correct)
    logging.info(num_tokens)
    logging.info('token number is %d, accuracy is %.2f%%', num_tokens,
                 (100.0 * num_correct / num_tokens))
    return 100.0 * num_correct / num_tokens
def Evaluate(sess, model, data, transition_params_trained, parameters):
    total_token_num = 0
    correct_token_num = 0
    start = time.time()
    while data.has_next_sent('dev'):
        sent = data.get_next_sent('dev')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_label_indices: sent.ner_ids,
            model.input_pos_indices: sent.pos_ids,
            model.input_token_character_indices: utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1.0  # disable dropout during evaluation
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        else:
            predictions = predictions.tolist()
        gold_labels = sent.ner_ids
        total_token_num += len(predictions)
        for idx, p in enumerate(predictions):
            if p == gold_labels[idx]:
                correct_token_num += 1
    data.reset_index('dev')
    return 100.0 * correct_token_num / total_token_num
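# The TensorFlow snippets in this collection feed utils.pad_lists(...) into fixed-shape
# placeholders (character ids per word, word ids per sentence, tag ids per sentence).
# That utility is not reproduced here; below is a minimal assumed sketch that zero-pads
# a list of lists to the length of the longest row. The real utils.pad_lists may differ.
def pad_lists(lists, pad_value=0):
    """Hypothetical sketch: right-pad every inner list with pad_value so that all rows
    have the same length, giving a rectangular nested list suitable for a feed_dict."""
    max_len = max(len(row) for row in lists)
    return [list(row) + [pad_value] * (max_len - len(row)) for row in lists]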
def forward(self, a_ls, a_ls_true):
    max_decoder_len = a_ls.shape[1]
    max_lemma_len = a_ls.shape[2]
    target = pad_lists(a_ls_true,
                       -1,
                       pad_len=max_decoder_len,
                       dtype=torch.long,
                       device=device)
    loss = self.criterion(
        torch.log(a_ls + 1e-6).view(-1, max_lemma_len), target.view(-1))
    return loss
def forward(self, indices):
    """
    Args:
        indices: list containing sequences of indices, of length bsz
    Returns:
        h: hidden state at each time step, of shape (bsz, max_src_len, 2*hidden_size)
        mask: 1 where input index is 0 (bsz, max_src_len)
        (h_n, c_n): final hidden state, a tuple ((1, bsz, hidden_size), (1, bsz, hidden_size))
    """
    # Inspired from here, https://discuss.pytorch.org/t/rnns-sorting-operations-autograd-safe/1461
    # See also, https://discuss.pytorch.org/t/solved-multiple-packedsequence-input-ordering/2106
    lengths = torch.tensor([len(x) for x in indices],
                           dtype=torch.long,
                           device=device)
    indices_padded = pad_lists(indices,
                               self.vocab.padding_idx,
                               dtype=torch.long,
                               device=device)
    lengths_sorted, sorted_idx = lengths.sort(descending=True)
    indices_sorted = indices_padded[sorted_idx]
    embeddings_padded = self.embedder(indices_sorted)
    embeddings_padded = self.dropout_input(embeddings_padded)
    embeddings_packed = pack_padded_sequence(embeddings_padded,
                                             lengths_sorted.tolist(),
                                             batch_first=True)
    h, (h_n, c_n) = self.lstm(embeddings_packed)
    h, _ = pad_packed_sequence(h,
                               batch_first=True,
                               padding_value=self.vocab.padding_idx)
    h = torch.zeros_like(h).scatter_(
        0,
        sorted_idx.unsqueeze(1).unsqueeze(1).expand(-1, h.shape[1], h.shape[2]),
        h)  # Revert sorting
    h_n = torch.zeros_like(h_n).scatter_(
        1,
        sorted_idx.unsqueeze(0).unsqueeze(2).expand(h_n.shape[0], -1, h_n.shape[2]),
        h_n)  # Revert sorting
    c_n = torch.zeros_like(c_n).scatter_(
        1,
        sorted_idx.unsqueeze(0).unsqueeze(2).expand(c_n.shape[0], -1, c_n.shape[2]),
        c_n)  # Revert sorting
    h = self.dropout_output(h)
    h_n = (h_n[0, :, :] + h_n[1, :, :]).unsqueeze(0)  # (1, bsz, hidden_size)
    c_n = (c_n[0, :, :] + c_n[1, :, :]).unsqueeze(0)  # (1, bsz, hidden_size)
    mask = indices_padded == 0  # (bsz, max_lemma_len)
    return h, mask, (h_n, c_n)
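# The encoder above manually sorts the batch by length for pack_padded_sequence and then
# undoes the sort with scatter_. On PyTorch >= 1.1 the same effect is available via
# enforce_sorted=False; the helper below is a hedged sketch of that alternative (it assumes
# a batch_first bidirectional nn.LSTM and is not part of the original code).
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


def encode_unsorted(lstm, embeddings_padded, lengths, padding_value=0):
    """Sketch: pack/unpack without manual sorting; outputs come back in batch order."""
    packed = pack_padded_sequence(embeddings_padded, lengths.cpu(),
                                  batch_first=True, enforce_sorted=False)
    h, (h_n, c_n) = lstm(packed)
    h, _ = pad_packed_sequence(h, batch_first=True, padding_value=padding_value)
    return h, (h_n, c_n)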
def forward(self, p_gens, p_gens_true):
    bsz = p_gens.shape[0]
    max_decoder_len = p_gens.shape[1]
    score = torch.zeros(bsz, max_decoder_len, 2, device=device)
    score[:, :, 0] = 1 - p_gens
    score[:, :, 1] = p_gens
    target = pad_lists(p_gens_true,
                       -1,
                       pad_len=max_decoder_len,
                       dtype=torch.long,
                       device=device)
    loss = self.criterion(
        torch.log(score + 1e-6).view(-1, 2), target.view(-1))
    return loss
def forward(self, p_ws, inflected_forms_indices):
    max_decoder_len = p_ws.shape[1]
    tgt_classes = p_ws.shape[2]
    inflected_forms_indices = [
        seq_indices + [self.vocab.char_to_index(self.vocab.STOP_CHAR)]
        for seq_indices in inflected_forms_indices
    ]
    p_ws_target = pad_lists(inflected_forms_indices,
                            -1,
                            pad_len=max_decoder_len,
                            dtype=torch.long,
                            device=device)
    loss = self.criterion(p_ws.view(-1, tgt_classes), p_ws_target.view(-1))
    return loss
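# The loss forward() methods above (alignment, p_gen, and word losses) all pad their
# targets with -1 and pass log-probabilities to self.criterion, which is consistent with
# an NLLLoss(ignore_index=-1)-style criterion. The snippet below only illustrates that
# assumed mechanism with dummy tensors; the criterion actually used is not shown here.
import torch
import torch.nn as nn

criterion = nn.NLLLoss(ignore_index=-1)  # assumed criterion, matching the -1 target padding
log_probs = torch.log_softmax(torch.randn(4, 6), dim=-1)  # (positions, classes)
targets = torch.tensor([2, 5, -1, -1])  # the two -1 positions are padding and are ignored
loss = criterion(log_probs, targets)  # averaged over the two real positions only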
def prune(self, interval, keep_end=False):
    """
    Keep only the times (and their associated values) that are at least
    `interval` distance apart.

    Parameters
    ----------
    interval : numeric, required
        the minimum distance between times to be preserved
    keep_end : bool, optional
        keep the last time and value of the timeseries, even if it is less
        than `interval` distance from the prior time
    """
    self._times, self._values = pad_lists(interval,
                                          self._times,
                                          self._values,
                                          keep_end=keep_end)
def _new_slice(self, times, values, key):
    """ slicing functionality for timeseries """
    try:
        start, stop, step = key.start, key.stop, key.step
        if all(x is None for x in [start, stop, step]):
            # [:] slice, return everything
            return times, values
    except AttributeError:
        start, stop, step = key, False, None
    if start is not None and start < times[0] and self.first_val is not False:
        # add default beginning value to front of list
        times = [start] + times
        values = [self.first_val] + values
    start_idx = index_of(start, times, begin=True)
    if stop is False:
        # slice only wants one value
        if self.interpolate:
            return start, self._interpolate(start, times, values)
        return start, values[start_idx]
    times, values = times[start_idx:], values[start_idx:]
    slice_times, slice_values = [x for x in times], [x for x in values]
    if start > slice_times[0]:
        # reset first time in slice_times
        slice_times[0] = start
    if step:
        slice_times, slice_values = pad_lists(step,
                                              slice_times,
                                              slice_values,
                                              keep_dist=True)
    stop_idx = index_of(stop, slice_times)
    if not stop or stop > slice_times[stop_idx]:
        # hack to include the last value if stop is past the end of list
        stop_idx += 1
    if self.interpolate:
        return slice_times[:stop_idx], self._interpolate(
            slice_times[:stop_idx], times, values)
    return slice_times[:stop_idx], slice_values[:stop_idx]
def Evaluate(sess, model, dataset, transition_params_trained, parameters,
             epoch_num):
    start = time.time()
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    word_count = 0
    while dataset.has_next_sent('test'):
        sent = dataset.get_next_sent('test')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_token_character_indices: utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        gold_labels = sent.ner_ids
        words = sent.word_ids
        word_count += len(words)
        accs += [a == b for (a, b) in zip(gold_labels, predictions)]
        lab_chunks = set(utils.get_chunks(gold_labels, dataset.ner_map))
        lab_pred_chunks = set(utils.get_chunks(predictions, dataset.ner_map))
        #logging.info(sent.ner_ids)
        #logging.info(predictions)
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    test_time = time.time() - start
    dataset.reset_index('test')
    logging.info("epoch: %d, f1 score: %.2f", epoch_num, f1 * 100.0)
    return test_time
def pad(self, interval, keep_end=False):
    """
    Pad the timeseries so that there is a time (and value) at every interval.
    If `interpolate` is set, the values will be interpolated; otherwise the
    previous value will be repeated.

    Parameters
    ----------
    interval : numeric, required
        the minimum distance between times to be preserved
    keep_end : bool, optional
        keep the last time and value of the timeseries, even if it is less
        than `interval` distance from the prior time
    """
    new_times, new_values = pad_lists(interval,
                                      self._times,
                                      self._values,
                                      keep_end=keep_end)
    if self.interpolate:
        new_values = self._interpolate(new_times, self._times, self._values)
    self._times, self._values = new_times, new_values
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    log_output = codecs.open("ner_training_log1", 'w')
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'sgd'
    parameters['learning_rate'] = 0.01
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10
    parameters['use_tag_embedding'] = True
    parameters['pos_embedding_dimension'] = 16
    loading_time = time.time()
    train_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.train'
    dev_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.testa'
    test_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.testb'
    logging.info("loading data and precomputing features...")
    dataset = Dataset(train_data_path, dev_data_path, test_data_path)
    dataset.load_dataset()
    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(dataset, parameters)
        sess.run(tf.global_variables_initializer())
        # load glove token embeddings
        load_pretrained_token_embeddings(sess, model, dataset, parameters)
        epoch_num = 0
        start = time.time()
        best = 0.0
        while True:
            step = 0
            epoch_num += 1
            cost_sum = 0
            while dataset.has_next_sent('train'):
                sent = dataset.get_next_sent('train')
                step += 1
                feed_dict = {
                    model.input_token_indices: sent.word_ids,
                    model.input_label_indices: sent.ner_ids,
                    model.input_pos_indices: sent.pos_ids,
                    model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                    model.input_token_lengths: sent.word_lengths,
                    model.dropout_keep_prob: 1 - parameters['dropout_rate']
                }
                _, _, loss, transition_params_trained = sess.run([
                    model.train_op, model.global_step, model.loss,
                    model.transition_parameters
                ], feed_dict)
                cost_sum += loss
                if step % 1000 == 0:
                    current = Evaluate(sess, model, dataset,
                                       transition_params_trained, parameters)
                    log_output.write('EPOCH %d, loss is %.2f' %
                                     (epoch_num, cost_sum / 1000))
                    if current > best:
                        logging.info("saving the model...")
                        model_saver = tf.train.Saver(
                            max_to_keep=parameters['maximum_number_of_epochs'])
                        model_saver.save(
                            sess,
                            OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
                        best = current
                    cost_sum = 0
            dataset.reset_index('train')
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break
        log_output.close()
        logging.info("finished training, time is %.2f", time.time() - start)
        total_token_num = 0
        correct_token_num = 0
        start = time.time()
        out_file = open("ner_out", "w")
        while dataset.has_next_sent('test'):
            sent = dataset.get_next_sent('test')
            feed_dict = {
                model.input_token_indices: sent.word_ids,
                model.input_label_indices: sent.ner_ids,
                model.input_pos_indices: sent.pos_ids,
                model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                model.input_token_lengths: sent.word_lengths,
                model.dropout_keep_prob: 1.0  # disable dropout at test time
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            total_token_num += len(predictions)
            gold_labels = sent.ner_ids
            words = sent.get_word_list()
            pos = sent.get_pos_list()
            for idx, p in enumerate(predictions):
                tag = gold_labels[idx]
                if p == tag:
                    correct_token_num += 1
                out_file.write("%s %s %s %s\n" %
                               (words[idx], pos[idx], dataset.ner_map[tag],
                                dataset.ner_map[p]))
            out_file.write("\n")
        out_file.close()
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    log_output = codecs.open("batch_train_out", 'w')
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'adam'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10
    loading_time = time.time()
    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
    logging.info("loading data and precomputing features...")
    train_data = Dataset(train_data_path)
    train_data.load_dataset()
    test_data = Dataset(dev_data_path)
    test_data.load_dataset(train_data.word_map, train_data.tag_map,
                           train_data.char_map)
    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(train_data, parameters)
        sess.run(tf.global_variables_initializer())
        # load glove token embeddings
        load_pretrained_token_embeddings(sess, model, train_data, parameters)
        epoch_num = 0
        start = time.time()
        best = 0.0
        while True:
            step = 0
            epoch_num += 1
            cost_sum = 0
            while train_data.has_next_sent():
                sent = train_data.get_next_sent()
                step += 1
                feed_dict = {
                    model.input_token_indices: sent.word_ids,
                    model.input_label_indices: sent.tag_ids,
                    model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                    model.input_token_lengths: sent.word_lengths,
                    model.dropout_keep_prob: 1 - parameters['dropout_rate']
                }
                _, _, loss, transition_params_trained = sess.run(
                    [model.train_op, model.global_step, model.loss,
                     model.transition_parameters], feed_dict)
                cost_sum += loss
                if step % 1000 == 0:
                    current = Evaluate(sess, model, test_data,
                                       transition_params_trained, parameters)
                    log_output.write('EPOCH %d, loss is %.2f, accuracy is %.2f\n' %
                                     (epoch_num, cost_sum / 1000, current))
                    cost_sum = 0
                    if current > best:
                        logging.info("saving the model...")
                        model_saver = tf.train.Saver(
                            max_to_keep=parameters['maximum_number_of_epochs'])
                        model_saver.save(
                            sess,
                            OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
                        best = current
            train_data.reset_index()
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break
        log_output.close()
        logging.info("finished training, time is %.2f", time.time() - start)
        total_token_num = 0
        correct_token_num = 0
        start = time.time()
        while test_data.has_next_sent():
            sent = test_data.get_next_sent()
            feed_dict = {
                model.input_token_indices: sent.word_ids,
                model.input_label_indices: sent.tag_ids,
                model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                model.input_token_lengths: sent.word_lengths,
                model.dropout_keep_prob: 1.0  # disable dropout at test time
            }
            logging.info("Train...")
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            gold_labels = sent.tag_ids
            assert (len(predictions) == len(gold_labels))
            total_token_num += len(predictions)
            for idx, p in enumerate(predictions):
                if p == gold_labels[idx]:
                    correct_token_num += 1
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_crf'] = True
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = '/cs/natlang-user/vivian/NeuroNER/data/word_vectors/glove.6B.100d.txt'
    #parameters['token_pretrained_embedding_filepath'] = ''
    parameters['character_lstm_hidden_state_dimension'] = 50
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['optimizer'] = 'sgd'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 0
    parameters['dropout_rate'] = 0.5
    parameters['maximum_number_of_epochs'] = 50
    parameters['freeze_token_embeddings'] = False
    loading_time = time.time()
    train_data_path = '/cs/natlang-user/vivian/engonto.train'
    dev_data_path = '/cs/natlang-user/vivian/engonto.testa'
    test_data_path = '/cs/natlang-user/vivian/engonto.testb'
    logging.info("loading data and precomputing features...")
    dataset = Dataset(train_data_path, dev_data_path, test_data_path,
                      use_char=True)
    dataset.load_dataset()
    logging.info(dataset.ner_map)
    logging.info(dataset.ner_index)
    logging.info(time.time() - loading_time)
    total_time = 0.0
    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(dataset, parameters)
        sess.run(tf.global_variables_initializer())
        # load glove token embeddings
        model.load_pretrained_token_embeddings(sess, dataset, parameters)
        epoch_num = 0
        start = time.time()
        best = 0.0
        while True:
            step = 0
            epoch_num += 1
            cost_sum = 0
            while dataset.has_next_sent('train'):
                sent = dataset.get_next_sent('train')
                step += 1
                feed_dict = {
                    model.input_token_indices: sent.word_ids,
                    model.input_label_indices: sent.ner_ids,
                    model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                    model.input_token_lengths: sent.word_lengths,
                    model.dropout_keep_prob: 1 - parameters['dropout_rate']
                }
                if parameters['use_crf']:
                    _, loss, transition_params_trained = sess.run([
                        model.train_op, model.loss, model.transition_parameters
                    ], feed_dict)
                else:
                    _, loss = sess.run([model.train_op, model.loss], feed_dict)
                    transition_params_trained = None
                '''
                cost_sum += loss
                if step % 1000 == 0:
                    current = Evaluate(sess, model, dataset, transition_params_trained, parameters)
                    log_output.write('EPOCH %d, loss is %.2f' % (epoch_num, cost_sum / 1000))
                    if current > best:
                        logging.info("saving the model...")
                        model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])
                        model_saver.save(sess, OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
                        best = current
                    cost_sum = 0
                '''
            current = Evaluate(sess, model, dataset, transition_params_trained,
                               parameters, epoch_num)
            dataset.reset_index('train')
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break
        model_saver = tf.train.Saver(
            max_to_keep=parameters['maximum_number_of_epochs'])
        model_saver.save(sess, OutputPath('char_model'))
        total_time += Evaluate(sess, model, dataset, transition_params_trained,
                               parameters, epoch_num)
        logging.info("done")
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['pretrained_model_checkpoint_filepath'] = OutputPath(
        'char_model_{0:05d}.ckpt'.format(2))
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'adam'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10
    loading_time = time.time()
    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    wordMapPath = 'word_map'
    tagMapPath = 'tag_map'
    charMapPath = 'char_map'
    word_map = readMap(wordMapPath)
    tag_map = readMap(tagMapPath)
    char_map = readMap(charMapPath)
    test_data = Dataset(test_data_path)
    test_data.load_dataset(word_map, tag_map, char_map)
    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(test_data, parameters)
        sess.run(tf.global_variables_initializer())
        model_saver = tf.train.Saver(
            max_to_keep=parameters['maximum_number_of_epochs'])
        model_saver.restore(sess,
                            parameters['pretrained_model_checkpoint_filepath'])
        total_token_num = 0
        correct_token_num = 0
        start = time.time()
        transition_params_trained = sess.run(model.transition_parameters)
        start = time.time()
        while test_data.has_next_sent():
            sent = test_data.get_next_sent()
            feed_dict = {
                model.input_token_indices: sent.word_ids,
                model.input_label_indices: sent.tag_ids,
                model.input_token_character_indices: utils.pad_lists(sent.char_lists),
                model.input_token_lengths: sent.word_lengths,
                model.dropout_keep_prob: 1.0  # disable dropout at test time
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            gold_labels = sent.tag_ids
            total_token_num += len(predictions)
            for idx, p in enumerate(predictions):
                if p == gold_labels[idx]:
                    correct_token_num += 1
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_character_lstm'] = False
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['freeze_token_embeddings'] = False
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'adam'
    parameters['learning_rate'] = 0.002
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.4
    parameters['maximum_number_of_epochs'] = 10
    parameters['batch_size'] = 32
    loading_time = time.time()
    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    logging.info("loading data and precomputing features...")
    train_data = Dataset(train_data_path)
    train_data.load_dataset()
    test_data = Dataset(dev_data_path)
    test_data.load_dataset(train_data.word_map, train_data.tag_map,
                           train_data.char_map)
    sess = tf.Session()
    epoch_num = 0
    with sess.as_default():
        model = EntityLSTM(train_data, parameters)
        sess.run(tf.global_variables_initializer())
        start = time.time()
        best = 0.0
        while True:
            step = 0
            sentences = []
            tags = []
            sentence_lengths = []
            word_lengths = []
            while len(sentences) < parameters['batch_size']:
                sent, epoch_num = advance_sent(epoch_num, train_data)
                sentences.append(sent.word_ids)
                tags.append(sent.tag_ids)
                sentence_lengths.append(sent.get_sent_len())
            feed_dict = {
                model.input_token_indices: utils.pad_lists(sentences),
                model.input_sent_lengths: sentence_lengths,
                model.input_label_indices: utils.pad_lists(tags),
                model.dropout_keep_prob: 1 - parameters['dropout_rate']
            }
            _, _, loss, accuracy, transition_params_trained = sess.run(
                [model.train_op, model.global_step, model.loss,
                 model.accuracy, model.transition_parameters], feed_dict)
            step += 1
            '''
            if step % 10 == 0:
                current = Evaluate(sess, model, test_data, transition_params_trained, parameters)
                if current > best:
                    model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])
                    model_saver.save(sess, OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
                    best = current
                logging.info('EPOCH %d, Training %.2f%% done', epoch_num, (100.0 * step / train_data.get_sent_num()))
                logging.info('best accuracy is %.2f%%', best)
            '''
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break
        best = Evaluate(sess, model, test_data, transition_params_trained,
                        parameters)
        logging.info("finished training, time is %.2f", time.time() - start)