def get_scores(config, task, model_path, word_dict_path, label_dict_path, syntactic_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionaries.
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)  # word tokens to Dict
        label_dict, syntactic_dict = Dictionary(), Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        syntactic_dict.load(syntactic_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.syntactic_dict = syntactic_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        # if allow_new_words:
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits

        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)
        # Map the test trees onto the syntactic label dict built on the training corpus.
        data.syntactic_dict.accept_new = False
        test_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_dep_trees, None, x, y, lengths, weights)
            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                    input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)
    return scores, data, test_sentences, test_data
def get_scores(config, task, model_path, word_dict_path, label_dict_path, tpf_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionaries.
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)  # word tokens to Dict
        label_dict = Dictionary()
        tpf_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        tpf_dict.load(tpf_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.tpf2_dict = tpf_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict, allow_new_words)
        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        # if allow_new_words:
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits

        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer("Get test sentences dict"):
        test_sentences_w_id = []
        for sen in get_srl_sentences(args.input):
            test_sentences_w_id.append(' '.join(sen[1]))
        test_sentences_ids = [int(sen[0][0]) for sen in test_sentences]
        temp = {}
        assert len(test_sentences_w_id) == len(test_sentences_ids)
        for idx, sen in zip(test_sentences_ids, test_sentences_w_id):
            temp[idx] = sen
        test_sentences_w_id = temp

    with Timer("Loading ELMO"):
        test_elmo_hdf5 = hdf5_reader()
        test_elmo_hdf5.read_from_file(args.input_elmo, test_sentences_w_id)

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)

    with Timer("TPF2 generating..."):
        # Generate the tree-based position features (TPF2) according to the dependency tree.
        data.tpf2_dict.accept_new = False
        test_tpf2 = test_dep_trees.get_tpf2_dict(data.test_tensors, data.tpf2_dict)
        print("Extract {} test TPF2 features".format(len(test_tpf2)))
        assert len(test_tpf2) == len(data.test_tensors)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, tpf_ids, sentences_ids, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_tpf2, x, y, lengths, weights)
            elmo_representations = test_elmo_hdf5.forward(sentences_ids, word_inputs_seqs.size()[-1],
                                                          [len(ans) for ans in answers])
            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, tpf_ids, input_lengths, masks, padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), tpf_ids.cuda(), \
                    input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
                elmo_representations = elmo_representations.cuda()
            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, tpf_ids, elmo_representations, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)
    return scores, data, test_sentences, test_data
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.set_printoptions(precision=20)

    ### gpu
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\tGPU ID: {}".format(gpu, args.gpu))
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            print("Not implemented yet!")
            exit()
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        train_dep_trees = SyntacticCONLL()
        dev_dep_trees = SyntacticCONLL()
        train_dep_trees.read_from_file(args.train_dep_trees)
        dev_dep_trees.read_from_file(args.dev_dep_trees)
        # Generate the syntactic label dict from the training corpus, then freeze it for dev.
        data.syntactic_dict = train_dep_trees.get_syntactic_label_dict()
        data.syntactic_dict.accept_new = False
        dev_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dicts to the model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        data.syntactic_dict.save(os.path.join(args.model, 'syn_label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print("BiLSTMTaggerModel initialize with Cuda!")
            model = model.cuda()
        if args.gpu != "" and not torch.cuda.is_available():
            raise Exception("No GPU Found!")
            exit()
        for param in model.parameters():
            print(param.size())

    optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.95)  # initialize the optimizer outside the epoch loop
    batch_position_encoding = position_encoding_init(200, 100)  # 0: root, 1, ..., n, n+1: padding | max length 200

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, lengths, weights = batched_tensor
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, _, input_lengths, masks, padding_answers = \
                    batch_data_variable(train_dep_trees, batch_position_encoding, x, y, lengths, weights)
                if args.gpu:
                    word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, padding_answers = \
                        word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                        input_lengths.cuda(), masks.cuda(), padding_answers.cuda()

                optimizer.zero_grad()
                output = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, input_lengths)
                loss = model.compute_loss(output, padding_answers, masks)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
                optimizer.step()

                train_loss += loss.data  # accumulate a tensor, not a Variable, so the graph is not retained
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
            i = 0
            epoch += 1
            train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batch_position_encoding, batched_dev_data, dev_dep_trees,
                                evaluator, writer, global_step)

    # Done. :)
    writer.close()
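# --- Hedged sketch (not from the original project) -----------------------------------
# `position_encoding_init(200, 100)` above is assumed to build a standard sinusoidal
# position-encoding table (one row per position, 100 dimensions, with row 0 reserved
# for the root/padding case, as the call-site comment suggests). The function below is
# only an illustration of that assumption, not the repository's actual implementation.
import numpy
import torch


def position_encoding_init(n_position, d_pos_vec):
    """Return an (n_position, d_pos_vec) sinusoidal position-encoding table."""
    position_enc = numpy.array([
        [pos / numpy.power(10000.0, 2.0 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)]
        if pos != 0 else numpy.zeros(d_pos_vec)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = numpy.sin(position_enc[1:, 0::2])  # even dimensions
    position_enc[1:, 1::2] = numpy.cos(position_enc[1:, 1::2])  # odd dimensions
    return torch.from_numpy(position_enc).float()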
                                                                 data.char_dict, data.label_dict, allow_new_words)
        eval_data = load_eval_data(args.input)
        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        # if allow_new_words:
        data.word_embeddings = emb[0]
        data.head_embeddings = emb[1]
        data.word_embedding_shapes = emb_shapes[0]
        data.head_embedding_shapes = emb_shapes[1]

        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        dev_loss = 0.0
        srl_predictions = []
        # with torch.no_grad():  # evaluation does not need gradients
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            sent_ids, sent_lengths, \
def train_tagger(args):
    # Parse the configuration.
    config = configuration.get_config(args.config)
    config.span_based = args.span == "span"
    # Set the random seeds of numpy and torch.
    numpy.random.seed(666)
    torch.manual_seed(666)
    # Set pytorch print precision.
    torch.set_printoptions(precision=20)
    # Set the default number of threads.
    torch.set_num_threads(4)

    # GPU setup for pytorch.
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\tGPU ID: {}".format(gpu, args.gpu))
        # Set pytorch.cuda's random seed.
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        assert args.task == 'SRL'
        # Data for SRL.
        data = TaggerData(
            config,
            *reader.get_srl_data(config, args.train, args.dep_trees, args.dev, vocab_path, label_path))
        # Generate the SRL evaluator for the dev data.
        # Note: this evaluator has effectively been abandoned; its only remaining role is to store the highest accuracy.
        evaluator = SRLEvaluator(data.get_development_data(),
                                 data.label_dict,
                                 gold_props_file=gold_props_path,
                                 pred_props_file=None,
                                 word_dict=data.word_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        # Data for the dependency treebanks.
        train_dep_paths = args.train_dep_trees.split(';')
        dev_dep_paths = args.dev_dep_trees.split(';')
        dep_data_path_set = list(zip(train_dep_paths, dev_dep_paths))
        dep_treebanks_num = len(train_dep_paths)

        hete_deps = []
        for i in range(dep_treebanks_num):
            train_path, dev_path = dep_data_path_set[i]
            train_dep_trees, dev_dep_trees = SyntacticCONLL(), SyntacticCONLL()
            train_dep_trees.read_from_file(train_path)
            dev_dep_trees.read_from_file(dev_path)
            # Generate the syntactic label dict from the training corpus.
            train_dep_trees.get_syntactic_label_dict(data.dep_label_dicts[i])
            dev_dep_trees.get_syntactic_label_dict(data.dep_label_dicts[i])
            hete_deps.append((train_dep_trees, dev_dep_trees))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dicts to the model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.head_dict.save(os.path.join(args.model, 'head_dict'))
        data.char_dict.save(os.path.join(args.model, 'char_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        for i in range(len(data.dep_label_dicts)):
            data.dep_label_dicts[i].save(os.path.join(args.model, 'dep_label_dict' + str(i)))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building NN model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print("BiLSTMTaggerModel initialize with GPU!")
            model = model.to(device)  # NOTE: `device` is expected to be defined elsewhere (e.g. torch.device('cuda'))
        if args.gpu != "" and not torch.cuda.is_available():
            raise Exception("No GPU Found!")
            exit()
        for name, param in model.named_parameters():  # print pytorch model parameters and the corresponding names
            print(name, param.size())

    i, global_step, epoch, train_loss = 0, 0, 0, 0.0
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    last_lr = 0.001
    no_more_better_performance = 0
    optimizer = torch.optim.Adam(parameters, lr=last_lr)  # initialize the optimizer over the trainable parameters
    max_steps = int(config.max_steps)

    while global_step <= max_steps:  # epoch < config.max_epochs
        initial_time = time.time()
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            dep_train_data = data.get_dep_training_data(include_last_batch=True)
            train_data = data.get_training_data(include_last_batch=True)
            mixed_data = data.mix_training_data(train_data, dep_train_data)

            for batched_tensor, batched_dep_tensor in mixed_data:  # for each batch in the training corpus
                sent_ids, sent_lengths, \
                    word_indexes, head_indexes, char_indexes, \
                    predicate_indexes, arg_starts, arg_ends, arg_labels, srl_lens, \
                    gold_predicates, num_gold_predicates = batched_tensor

                hete_dep_trees = get_hete_dep_trees_info(hete_deps, sent_ids, sent_lengths)

                if args.gpu:
                    word_indexes, head_indexes, char_indexes, \
                        predicate_indexes, arg_starts, arg_ends, arg_labels, srl_lens = \
                        word_indexes.cuda(), head_indexes.cuda(), char_indexes.cuda(), predicate_indexes.cuda(), \
                        arg_starts.cuda(), arg_ends.cuda(), arg_labels.cuda(), srl_lens.cuda()
                    # gold_predicates.cuda(), num_gold_predicates.cuda()

                optimizer.zero_grad()
                predicated_dict, srl_loss = model.forward(
                    sent_lengths, word_indexes, head_indexes, char_indexes,
                    (predicate_indexes, arg_starts, arg_ends, arg_labels, srl_lens),
                    (gold_predicates, num_gold_predicates),
                    tree_gru_input=hete_dep_trees)
                srl_loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                # dep forward
                dep_losses = []
                for ith, a_batched_dep_tensor in enumerate(batched_dep_tensor):
                    word_indexes, char_indexes, mask, lengths, heads, labels = a_batched_dep_tensor
                    if args.gpu:
                        word_indexes, char_indexes = word_indexes.cuda(), char_indexes.cuda()
                    dep_loss = model.forward(lengths, word_indexes, None, char_indexes,
                                             None, None, None, (ith, heads, labels))
                    dep_losses.append(dep_loss.detach())
                    optimizer.zero_grad()
                    dep_loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()

                loss = srl_loss.detach() + sum(dep_losses)
                train_loss += float(loss.detach())  # accumulate a plain float so the computation graph is not retained
                i += 1
                global_step += 1

                if global_step % 100 == 0:
                    last_lr = adjust_learning_rate(optimizer, last_lr)

                if i % 250 == 0:
                    total_time = time.time() - initial_time
                    timer.tick("{} training steps, loss={:.3f}, steps/s={:.2f}".format(
                        global_step, float(train_loss / i), float(global_step / total_time)))
                    train_loss = 0.0
                    i = 0

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
            i = 0
            epoch += 1
            train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batched_dev_data, hete_deps, data.eval_data, data.label_dict,
                                config, evaluator, writer, global_step)
                if evaluator.has_best is True:
                    no_more_better_performance = 0
                else:
                    no_more_better_performance += 1
                if no_more_better_performance >= 200:
                    print("No better dev performance for the past 200 epochs!")
                    exit()

    # Done. :)
    writer.close()
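# --- Hedged sketch (not from the original project) -----------------------------------
# Every listing relies on a `Timer` context manager (`with Timer('...') as timer:` plus
# `timer.tick(...)` for intermediate progress reports). A minimal stand-in consistent
# with that usage could look like the class below; the project's real utility may differ.
import time


class Timer(object):
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = time.time()
        print('{} starts.'.format(self.name))
        return self

    def tick(self, message):
        # Report progress together with the elapsed time of this block.
        print('[{}] {} ({:.2f}s elapsed)'.format(self.name, message, time.time() - self.start))

    def __exit__(self, exc_type, exc_value, traceback):
        print('{} finished in {:.2f}s.'.format(self.name, time.time() - self.start))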
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.cuda.manual_seed(666)

    ### gpu
    gpu = torch.cuda.is_available()
    print("GPU available: ", gpu)

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(config,
                              *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            # Data and evaluator for PropId.
            data = TaggerData(config,
                              *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dicts to the model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print("Use Cuda!")
            model = model.cuda()
        if args.gpu != "" and not torch.cuda.is_available():
            raise Exception("No GPU Found!")
            exit()
        for param in model.parameters():
            print(param.size())
        """for param in model.params:
            print param, param.name, param.shape.eval()
        loss_function = model.get_loss_function()
        eval_function = model.get_eval_function()"""

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            train_data = data.get_training_data(include_last_batch=True)
            model.bilstm.dropout = 0.1

            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, _, weights = batched_tensor
                batch_input_lengths = [sentence_x.shape[0] for sentence_x in x]
                max_length = max(batch_input_lengths)

                # padding
                # input = [numpy.pad(sentence_x, (0, max_length - sentence_x.shape[0]), 'constant') for sentence_x in x]
                word_input = [numpy.pad(sentence_x[:, 0], (0, max_length - sentence_x.shape[0]), 'constant')
                              for sentence_x in x]  # padding
                predicate_input = [numpy.pad(sentence_x[:, 1], (0, max_length - sentence_x.shape[0]), 'constant')
                                   for sentence_x in x]  # padding
                word_input, predicate_input = numpy.vstack(word_input), numpy.vstack(predicate_input)

                # numpy batch input to Variable
                word_input_seqs = torch.autograd.Variable(torch.from_numpy(word_input.astype('int64')).long())
                predicate_input_seqs = torch.autograd.Variable(torch.from_numpy(predicate_input.astype('int64')).long())

                # First: order the batch by decreasing sequence length.
                input_lengths = torch.LongTensor(batch_input_lengths)
                input_lengths, perm_idx = input_lengths.sort(0, descending=True)
                word_input_seqs = word_input_seqs[perm_idx]
                predicate_input_seqs = predicate_input_seqs[perm_idx]

                # Re-sort the answers according to the permuted input.
                answer = [None] * len(x)
                count = 0
                list_y = list(y)
                for (idx, ans) in zip(perm_idx, list_y):
                    answer[count] = list_y[idx]
                    count += 1
                answer = numpy.concatenate(answer)
                answer = torch.autograd.Variable(torch.from_numpy(answer).type(torch.LongTensor))
                answer = answer.view(-1)
                # print answer, answer.size()

                # Then pack the sequences
                # packed_input = torch.nn.utils.rnn.pack_padded_sequence(input_seqs, input_lengths.numpy(), batch_first=True)
                # packed_input = packed_input.cuda() if args.gpu else packed_input
                if args.gpu:
                    word_input_seqs, predicate_input_seqs, input_lengths, perm_idx = \
                        word_input_seqs.cuda(), predicate_input_seqs.cuda(), input_lengths.cuda(), perm_idx.cuda()
                    answer = answer.cuda()

                model.zero_grad()
                output = model.forward(word_input_seqs, predicate_input_seqs, input_lengths, perm_idx, len(x))  # (batch input, batch size)
                loss = model.loss(output, answer)
                loss.backward()
                # gradient clipping
                # torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
                # NOTE: the optimizer is re-created on every batch here; the later versions above construct it once before the training loop.
                model.optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95)
                model.optimizer.step()

                train_loss += loss
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
            i = 0
            epoch += 1
            train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batched_dev_data, evaluator, writer, global_step)

    # Done. :)
    writer.close()
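# --- Hedged sketch (not from the original project) -----------------------------------
# A hypothetical command-line driver showing how a training entry point like the ones
# above is typically wired up. The flag names simply mirror the attributes read from
# `args` in the listings (config, train, dev, dependency-tree paths, vocab, labels,
# gold, model, task, gpu); the repository's actual scripts may define them differently.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train the BiLSTM SRL tagger.')
    parser.add_argument('--config', required=True, help='path to the experiment configuration file')
    parser.add_argument('--train', required=True, help='path to the SRL training data')
    parser.add_argument('--dev', required=True, help='path to the SRL development data')
    parser.add_argument('--train_dep_trees', default='', help='path(s) to training dependency trees')
    parser.add_argument('--dev_dep_trees', default='', help='path(s) to development dependency trees')
    parser.add_argument('--vocab', default='', help='optional vocabulary path')
    parser.add_argument('--labels', default='', help='optional label dictionary path')
    parser.add_argument('--gold', default='', help='optional gold props file for evaluation')
    parser.add_argument('--model', required=True, help='directory for checkpoints and dictionaries')
    parser.add_argument('--task', default='srl', help="task name, e.g. 'srl'")
    parser.add_argument('--gpu', default='', help='CUDA device id; leave empty for CPU')
    args = parser.parse_args()
    train_tagger(args)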