def evaluate_tagger(model, batched_dev_data, dev_dep_trees, dev_elmo_hdf5, evaluator, writer, global_step):
    predictions = []
    dev_loss = 0
    total_correct, total_prop = 0, 0
    model.eval()
    for i, batched_tensor in enumerate(batched_dev_data):
        x, y, lengths, weights = batched_tensor
        word_inputs_seqs, predicate_inputs_seqs, syn_label_ids, pes, sentences_ids, answers, input_lengths, masks, padding_answers = \
            batch_data_variable(dev_dep_trees, x, y, lengths, weights)
        elmo_representations = dev_elmo_hdf5.forward(
            sentences_ids, word_inputs_seqs.size()[-1], [len(ans) for ans in answers])
        if args.gpu:
            word_inputs_seqs, predicate_inputs_seqs, syn_label_ids, input_lengths, masks, padding_answers = \
                word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_ids.cuda(), \
                input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
            elmo_representations = elmo_representations.cuda()

        output = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_ids, pes,
                               elmo_representations, input_lengths)
        loss = model.compute_loss(output, padding_answers, masks)
        dev_loss += float(loss.data)  # accumulate the dev loss

        # Strip the padding positions before decoding: keep the first `actual_length`
        # outputs of every sentence and flatten the batch into one (tokens, labels) matrix.
        output = torch.cat([output[idx][:actual_length].view(actual_length, -1)
                            for idx, actual_length in enumerate(input_lengths)], dim=0)
        if args.gpu:  # move back to host memory before converting to numpy
            p = output.data.cpu().numpy()
        else:
            p = output.data.numpy()
        p = numpy.argmax(p, axis=1)

        batch_tokens_size = sum(lengths)
        assert p.shape[0] == batch_tokens_size
        np_answer = numpy.concatenate(answers)
        correct = numpy.equal(p, np_answer).sum()
        denominator = batch_tokens_size
        total_correct += int(correct)
        total_prop += int(denominator)

        # Split the flat prediction array back into per-sentence arrays, preserving batch order.
        last_index, batch_p = 0, []
        for length in input_lengths:
            batch_p.append(p[last_index:last_index + length])
            last_index += length
        predictions.extend(batch_p)

    print('Dev loss={:.6f}'.format(dev_loss))
    evaluator.evaluate(predictions)
    print('{} / {} = {:.2f}'.format(total_correct, total_prop, 100.0 * total_correct / total_prop))
    if evaluator.accuracy > evaluator.best_accuracy:
        evaluator.best_accuracy = evaluator.accuracy
    writer.write('{}\t{}\t{:.6f}\t{:.2f}\t{:.2f}\n'.format(
        global_step, time.strftime("%Y-%m-%d %H:%M:%S"), float(dev_loss),
        float(evaluator.accuracy), float(evaluator.best_accuracy)))
    writer.flush()
    if evaluator.has_best:
        model.save(os.path.join(args.model, 'model'))
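# `batch_data_variable` is called throughout this file but defined elsewhere. The
# sketch below shows only the padding/masking step it is assumed to perform; the
# names `pad_batch` and `PAD_ID` are hypothetical, and the real helper additionally
# builds the predicate, syntactic-label and position-encoding inputs.
import torch

PAD_ID = 0  # hypothetical id of the padding token


def pad_batch(seqs):
    """Pad variable-length id sequences to the batch maximum and build a 0/1 mask."""
    lengths = torch.LongTensor([len(s) for s in seqs])
    max_len = int(lengths.max())
    padded = torch.LongTensor(len(seqs), max_len).fill_(PAD_ID)
    mask = torch.zeros(len(seqs), max_len)
    for row, seq in enumerate(seqs):
        padded[row, :len(seq)] = torch.LongTensor(seq)
        mask[row, :len(seq)] = 1.0
    return padded, lengths, mask

# Example: pad_batch([[4, 8, 15], [16, 23]]) yields a (2, 3) id matrix,
# lengths [3, 2], and a mask whose second row ends in 0.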
def get_scores(config, task, model_path, word_dict_path, label_dict_path, syntactic_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load the word, label and syntactic-label dictionaries.
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
        label_dict, syntactic_dict = Dictionary(), Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        syntactic_dict.load(syntactic_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.syntactic_dict = syntactic_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)
        # The syntactic label dict was generated on the training corpus; freeze it here.
        data.syntactic_dict.accept_new = False
        test_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_dep_trees, None, x, y, lengths, weights)
            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                    input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)
    return scores, data, test_sentences, test_data
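# `SyntacticCONLL.read_from_file` is defined elsewhere; it is assumed to parse one
# token per line in the 10-column CoNLL-X dependency format. A minimal sketch of
# such a parser is given below under that assumption; `read_conll_trees` is a
# hypothetical name used for illustration, not the repo's API.
def read_conll_trees(path):
    """Yield one tree per sentence as a list of (id, form, head, deprel) tuples."""
    tree = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:  # a blank line terminates the current sentence
                if tree:
                    yield tree
                tree = []
                continue
            cols = line.split('\t')
            # CoNLL-X columns: 0=id, 1=form, 6=head, 7=deprel
            tree.append((int(cols[0]), cols[1], int(cols[6]), cols[7]))
    if tree:
        yield tree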
def get_scores(config, task, model_path, word_dict_path, label_dict_path, tpf_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load the word, label and tree-based position feature (TPF) dictionaries.
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
        label_dict = Dictionary()
        tpf_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        tpf_dict.load(tpf_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.tpf2_dict = tpf_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer("Get test sentences dict"):
        # Map each sentence id to its space-joined token string.
        test_sentences_w_id = []
        for sen in get_srl_sentences(args.input):
            test_sentences_w_id.append(' '.join(sen[1]))
        test_sentences_ids = [int(sen[0][0]) for sen in test_sentences]
        assert len(test_sentences_w_id) == len(test_sentences_ids)
        temp = {}
        for idx, sen in zip(test_sentences_ids, test_sentences_w_id):
            temp[idx] = sen
        test_sentences_w_id = temp

    with Timer("Loading ELMO"):
        test_elmo_hdf5 = hdf5_reader()
        test_elmo_hdf5.read_from_file(args.input_elmo, test_sentences_w_id)

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)

    with Timer("TPF2 generating..."):
        # Generate the tree-based position features according to the dependency trees.
        data.tpf2_dict.accept_new = False
        test_tpf2 = test_dep_trees.get_tpf2_dict(data.test_tensors, data.tpf2_dict)
        print("Extract {} test TPF2 features".format(len(test_tpf2)))
        assert len(test_tpf2) == len(data.test_tensors)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, tpf_ids, sentences_ids, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_tpf2, x, y, lengths, weights)
            elmo_representations = test_elmo_hdf5.forward(
                sentences_ids, word_inputs_seqs.size()[-1], [len(ans) for ans in answers])
            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, tpf_ids, input_lengths, masks, padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), tpf_ids.cuda(), \
                    input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
                elmo_representations = elmo_representations.cuda()
            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, tpf_ids,
                               elmo_representations, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)
    return scores, data, test_sentences, test_data
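# `hdf5_reader` is also defined elsewhere. The sketch below shows one plausible
# implementation, assuming the HDF5 file maps each sentence key to a (length, dim)
# array of precomputed ELMo vectors; the class name `Hdf5ElmoReader` and its
# attribute layout are illustrative assumptions, not the repo's actual code.
import h5py
import torch


class Hdf5ElmoReader(object):
    def read_from_file(self, path, sentences_w_id):
        # Cache one (length, dim) tensor per sentence id.
        self.data = {}
        with h5py.File(path, 'r') as f:
            for sen_id in sentences_w_id:
                self.data[sen_id] = torch.from_numpy(f[str(sen_id)][()])

    def forward(self, sentence_ids, max_len, lengths):
        """Stack per-sentence ELMo tensors into one zero-padded (batch, max_len, dim) tensor."""
        dim = next(iter(self.data.values())).size(-1)
        batch = torch.zeros(len(sentence_ids), max_len, dim)
        for row, (sen_id, n) in enumerate(zip(sentence_ids, lengths)):
            batch[row, :n] = self.data[sen_id][:n]
        return batch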
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.set_printoptions(precision=20)

    # GPU setup
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\t GPU ID: {}".format(gpu, args.gpu))
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            print("Not implemented yet!")
            exit()
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        train_dep_trees = SyntacticCONLL()
        dev_dep_trees = SyntacticCONLL()
        train_dep_trees.read_from_file(args.train_dep_trees)
        dev_dep_trees.read_from_file(args.dev_dep_trees)
        # Generate the syntactic label dict on the training corpus, then freeze it for dev.
        data.syntactic_dict = train_dep_trees.get_syntactic_label_dict()
        data.syntactic_dict.accept_new = False
        dev_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        elif len(os.listdir(args.model)) > 0:
            print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                  .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dicts to the model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        data.syntactic_dict.save(os.path.join(args.model, 'syn_label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print("BiLSTMTaggerModel initialize with Cuda!")
            if not torch.cuda.is_available():
                raise Exception("No GPU Found!")
            model = model.cuda()
        for param in model.parameters():
            print(param.size())

    # Initialize the optimizer once, outside the epoch loop.
    optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.95)
    # 0: root, 1..n tokens, n+1: padding | max length 200
    batch_position_encoding = position_encoding_init(200, 100)

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, lengths, weights = batched_tensor
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, _, input_lengths, masks, padding_answers = \
                    batch_data_variable(train_dep_trees, batch_position_encoding, x, y, lengths, weights)
                if args.gpu:
                    word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, padding_answers = \
                        word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                        input_lengths.cuda(), masks.cuda(), padding_answers.cuda()

                optimizer.zero_grad()
                output = model.forward(word_inputs_seqs, predicate_inputs_seqs,
                                       syn_label_inputs_seqs, pes, input_lengths)
                loss = model.compute_loss(output, padding_answers, masks)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
                optimizer.step()

                # Accumulate the raw tensor, not the Variable, so the graph is freed.
                train_loss += loss.data
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
        i = 0
        epoch += 1
        train_loss = 0.0
        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batch_position_encoding, batched_dev_data, dev_dep_trees,
                                evaluator, writer, global_step)

    # Done. :)
    writer.close()
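# `position_encoding_init` is called above but not defined in this file. A common
# choice is the fixed sinusoidal table of Vaswani et al. (2017); the sketch below
# follows that convention and zeroes row 0 so the root/padding slot contributes
# nothing. This is an assumption about the helper, not the repo's actual code.
import numpy
import torch


def position_encoding_init(n_position, d_pos_vec):
    """Return an (n_position, d_pos_vec) sinusoidal position-encoding table."""
    position_enc = numpy.array([
        [pos / numpy.power(10000.0, 2.0 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)]
        if pos != 0 else numpy.zeros(d_pos_vec)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = numpy.sin(position_enc[1:, 0::2])  # even dimensions
    position_enc[1:, 1::2] = numpy.cos(position_enc[1:, 1::2])  # odd dimensions
    return torch.from_numpy(position_enc).type(torch.FloatTensor)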