Example #1
                                                         args.task,
                                                         model_path,
                                                         gemb_model_path,
                                                         word_dict_path,
                                                         label_dict_path,
                                                         args.input)
    ensemble_scores = numpy.add(ensemble_scores, scores) if i > 0 else scores

  # Getting evaluator
  gold_props_file = args.gold if args.gold != '' else None
  pred_props_file = args.inputprops if args.inputprops != '' else None

  if args.task == 'srl':
    evaluator = SRLEvaluator(data.get_test_data(test_sentences, batch_size=None),
                             data.label_dict,
                             gold_props_file,
                             use_se_marker=config.use_se_marker,
                             pred_props_file=pred_props_file,
                             word_dict=data.word_dict)
  else:
    evaluator = PropIdEvaluator(data.get_test_data(test_sentences, batch_size=None),
                                data.label_dict) 

  if args.proto != '':
    print('Writing to proto {}'.format(args.proto))
    pb_file = open(args.proto, 'wb')
  else:
    pb_file = None

  with Timer("Decoding"):
    transition_params = get_transition_params(data.label_dict.idx2str)
    num_tokens = None
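The snippet above breaks off just as decoding begins; `get_transition_params` suggests the per-token label scores are decoded with label-transition constraints. Below is a minimal, generic sketch of Viterbi decoding with a transition matrix, shown only to illustrate how such transition parameters are typically used; it is not the original decoder, and the score shapes are assumptions.

import numpy


def viterbi_decode(score, transition_params):
    """score: [seq_len, num_tags] emission scores; transition_params: [num_tags, num_tags]."""
    trellis = numpy.zeros_like(score)
    backpointers = numpy.zeros_like(score, dtype=numpy.int32)
    trellis[0] = score[0]
    for t in range(1, score.shape[0]):
        # score of every (previous tag -> current tag) move
        v = numpy.expand_dims(trellis[t - 1], 1) + transition_params
        trellis[t] = score[t] + numpy.max(v, axis=0)
        backpointers[t] = numpy.argmax(v, axis=0)

    # follow the backpointers from the best final tag
    best_path = [int(numpy.argmax(trellis[-1]))]
    for bp in reversed(backpointers[1:]):
        best_path.append(int(bp[best_path[-1]]))
    best_path.reverse()
    return best_path, float(numpy.max(trellis[-1]))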
Example #2
def train_tagger(args):
    config = configuration.get_config(args.config)
    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path,
                                     label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev,
                                        vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(),
                                        data.label_dict)

        batched_dev_data = data.get_development_data(
            batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(
                args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print(
                    '[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                    .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write(
            'step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config)
        for param in model.params:  # params: the learnable weights and biases the model trains
            print(param, param.name, param.shape.eval())
        loss_function = model.get_loss_function()
        eval_function = model.get_eval_function()

    # After each epoch, reset the counters for the next pass and evaluate what was trained.
    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:
                x, y, _, weights = batched_tensor
                # compute the loss and apply an optimization update
                loss = loss_function(x, weights, y)
                train_loss += loss
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(
                        i, train_loss / i))

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, train_loss))
        # reset counters for the next epoch
        i = 0
        epoch += 1
        train_loss = 0.0
        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, eval_function, batched_dev_data,
                                evaluator, writer, global_step)

    # Done. :)
    writer.close()
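`evaluate_tagger` is called above but not defined in this example. The following is a minimal sketch of what such a checkpoint routine could look like, assuming `eval_function(x, weights, y)` returns a loss and batch predictions, and that the evaluator exposes an `evaluate` method and a `best_accuracy` attribute; these interface details are assumptions, not part of the original code.

import datetime


def evaluate_tagger(model, eval_function, batched_dev_data, evaluator, writer, global_step):
    dev_loss = 0.0
    predictions = []
    for x, y, _, weights in batched_dev_data:
        loss, batch_pred = eval_function(x, weights, y)  # hypothetical return signature
        dev_loss += loss
        predictions.extend(batch_pred)

    dev_loss /= max(len(batched_dev_data), 1)
    accuracy = evaluator.evaluate(predictions)  # hypothetical call
    evaluator.best_accuracy = max(accuracy, getattr(evaluator, 'best_accuracy', 0.0))

    # One row per checkpoint, matching the checkpoints.tsv header written in train_tagger.
    writer.write('{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\n'.format(
        global_step, datetime.datetime.now(), dev_loss, accuracy, evaluator.best_accuracy))
    writer.flush()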
Example #3
def train_tagger(args):
    # get the parse configuration
    config = configuration.get_config(args.config)
    # set random seeds of numpy and torch
    numpy.random.seed(666)
    torch.manual_seed(666)
    # set pytorch print precision
    torch.set_printoptions(precision=20)
    # set the default number of threads
    torch.set_num_threads(4)
    # GPU of pytorch
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available? {}\t and GPU ID is : {}".format(gpu, args.gpu))
        # set pytorch.cuda's random seed
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        chars_path = args.chars if args.chars != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task is: {}'.format(args.task))
        assert args.task == 'SRL'
        # Data for SRL.
        data = TaggerData(
            config,
            *reader.get_srl_data(config, args.train, args.dep_trees, args.dev,
                                 vocab_path, chars_path, label_path))
        # Generate SRL evaluator for Dev data
        """Actually, this evaluator has been abandoned, and the only function is to store the highest accuracy."""
        evaluator = SRLEvaluator(data.get_development_data(),
                                 data.label_dict,
                                 gold_props_file=gold_props_path,
                                 pred_props_file=None,
                                 word_dict=data.word_dict)
        batched_dev_data = data.get_development_data(
            batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(
                args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print(
                    '[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                    .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.head_dict.save(os.path.join(args.model, 'head_dict'))
        data.char_dict.save(os.path.join(args.model, 'char_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        data.dep_label_dict.save(os.path.join(args.model, 'dep_label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write(
            'step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building NN model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            if not torch.cuda.is_available():
                raise Exception("No GPU found!")
            print("BiLSTMTaggerModel initialized with GPU!")
            model = model.to(device)
        # print the model parameters and their names
        for name, param in model.named_parameters():
            print(name, param.size())

    i, global_step, epoch, train_loss = 0, 0, 0, 0.0
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    last_lr = 0.001
    optimizer = torch.optim.Adam(
        parameters, lr=last_lr)  # initialize the model parameter optimizer
    max_steps = int(config.max_steps)
    while global_step <= max_steps:  # epoch < config.max_epochs
        initial_time = time.time()
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            dep_train_data = data.get_dep_training_data(
                include_last_batch=True)
            train_data = data.get_training_data(include_last_batch=True)
            mixed_data = data.mix_training_data(train_data, dep_train_data)
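            # Each step below performs two updates on the shared model: one on an SRL
            # batch and one on a dependency-parsing batch (multi-task training).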
            for batched_tensor, batched_dep_tensor in mixed_data:  # for each batch in the training corpus
                # SRL forward
                sent_ids, sent_lengths, \
                word_indexes, head_indexes, char_indexes, \
                predicate_indexes, arg_starts, arg_ends, arg_labels, srl_lens,\
                gold_predicates, num_gold_predicates = batched_tensor

                if args.gpu:
                    word_indexes, head_indexes, char_indexes,\
                        predicate_indexes, arg_starts, arg_ends, arg_labels, srl_lens = \
                        word_indexes.cuda(), head_indexes.cuda(), char_indexes.cuda(), predicate_indexes.cuda(), arg_starts.cuda(), \
                        arg_ends.cuda(), arg_labels.cuda(), srl_lens.cuda()  # gold_predicates.cuda(), num_gold_predicates.cuda()

                predicated_dict, srl_loss = model.forward(
                    sent_lengths, word_indexes, head_indexes, char_indexes,
                    (predicate_indexes, arg_starts, arg_ends, arg_labels,
                     srl_lens), (gold_predicates, num_gold_predicates))
                optimizer.zero_grad()
                srl_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                # dep forward
                word_indexes, char_indexes, mask, lengths, heads, labels = batched_dep_tensor
                if args.gpu:
                    word_indexes, char_indexes = word_indexes.cuda(), char_indexes.cuda()
                dep_loss = model.forward(lengths, word_indexes, None,
                                         char_indexes, None, None,
                                         (heads, labels))

                optimizer.zero_grad()
                dep_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                loss = srl_loss + dep_loss
                # accumulate a plain float so the computation graph is not kept alive
                train_loss += float(loss.detach())

                i += 1
                global_step += 1
                if global_step % 100 == 0:
                    last_lr = adjust_learning_rate(optimizer, last_lr)
                if i % 250 == 0:
                    total_time = time.time() - initial_time
                    timer.tick(
                        "{} training steps, loss={:.3f}, steps/s={:.2f}".
                        format(global_step, float(train_loss / i),
                               float(global_step / total_time)))
                    train_loss = 0.0
                    i = 0

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(
                epoch, i, float(train_loss)))

            i = 0
            epoch += 1
            train_loss = 0.0
            if epoch % config.checkpoint_every_x_epochs == 0:
                with Timer('Evaluation'):
                    evaluate_tagger(model, batched_dev_data, data.eval_data,
                                    data.label_dict, config, evaluator, writer,
                                    global_step)

    # Done. :)
    writer.close()
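`adjust_learning_rate` is called every 100 steps above but is not defined in this example. Below is a minimal sketch of one common choice, an exponential decay applied to every parameter group; the decay factor and the floor are illustrative assumptions, not values from the original code.

def adjust_learning_rate(optimizer, last_lr, decay=0.999, min_lr=1e-5):
    # Decay the learning rate and write it back into every parameter group.
    new_lr = max(last_lr * decay, min_lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr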
Example #4
def train_tagger(args):
  config = configuration.get_config(args.config)
  numpy.random.seed(666)
  torch.manual_seed(666)
  torch.cuda.manual_seed(666)
  ### gpu
  gpu = torch.cuda.is_available()
  print("GPU available: ", gpu)

  i = 0
  global_step = 0
  epoch = 0
  train_loss = 0.0
  
  with Timer('Data loading'):
    vocab_path = args.vocab if args.vocab != '' else None
    label_path = args.labels if args.labels != '' else None
    gold_props_path = args.gold if args.gold != '' else None

    print ('Task: {}'.format(args.task))
    if args.task == 'srl':
      # Data and evaluator for SRL.
      data = TaggerData(config,
                        *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
      evaluator = SRLEvaluator(data.get_development_data(),
                               data.label_dict,
                               gold_props_file=gold_props_path,
                               use_se_marker=config.use_se_marker,
                               pred_props_file=None,
                               word_dict=data.word_dict)
    else:
      # Data and evaluator for PropId.
      data = TaggerData(config,
                        *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
      evaluator = PropIdEvaluator(data.get_development_data(),
                                  data.label_dict)

    batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
    print ('Dev data has {} batches.'.format(len(batched_dev_data)))
  
  with Timer('Preparation'):
    if not os.path.isdir(args.model):
      print ('Directory {} does not exist. Creating new.'.format(args.model))
      os.makedirs(args.model)
    else:
      if len(os.listdir(args.model)) > 0:
        print ('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
             .format(args.model))
    shutil.copyfile(args.config, os.path.join(args.model, 'config'))
    # Save word and label dict to model directory.
    data.word_dict.save(os.path.join(args.model, 'word_dict'))
    data.label_dict.save(os.path.join(args.model, 'label_dict'))
    writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
    writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

  with Timer('Building model'):
    model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
    if args.gpu:
        if not torch.cuda.is_available():
            raise Exception("No GPU found!")
        print("Using CUDA!")
        model = model.cuda()
    for param in model.parameters():
        print(param.size())
    """for param in model.params:
      print param, param.name, param.shape.eval()
    loss_function = model.get_loss_function()
    eval_function = model.get_eval_function()"""

  # Create the optimizer once, outside the training loop, so Adadelta's state persists
  # across updates instead of being reset on every batch.
  model.optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95)
  while epoch < config.max_epochs:
    with Timer("Epoch%d" % epoch) as timer:
      train_data = data.get_training_data(include_last_batch=True)
      model.bilstm.dropout = 0.1
      for batched_tensor in train_data:  # for each batch in the training corpus
        x, y, _, weights = batched_tensor

        batch_input_lengths = ([sentence_x.shape[0] for sentence_x in x])
        max_length = max(batch_input_lengths)
        # padding
        # input = [numpy.pad(sentence_x, (0, max_length - sentence_x.shape[0]), 'constant') for sentence_x in x]
        word_input = [numpy.pad(sentence_x[:, 0], (0, max_length - sentence_x.shape[0]), 'constant') \
                 for sentence_x in x]  # padding
        predicate_input = [numpy.pad(sentence_x[:, 1], (0, max_length - sentence_x.shape[0]), 'constant') \
                      for sentence_x in x]  # padding
        word_input, predicate_input = numpy.vstack(word_input), numpy.vstack(predicate_input)

        # numpy batch input to Variable
        word_input_seqs = torch.autograd.Variable(torch.from_numpy(word_input.astype('int64')).long())
        predicate_input_seqs = torch.autograd.Variable(torch.from_numpy(predicate_input.astype('int64')).long())

        # First: order the batch by decreasing sequence length
        input_lengths = torch.LongTensor(batch_input_lengths)
        input_lengths, perm_idx = input_lengths.sort(0, descending=True)
        word_input_seqs = word_input_seqs[perm_idx]
        predicate_input_seqs = predicate_input_seqs[perm_idx]
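        # Sorting by decreasing length is what PyTorch's packed-sequence utilities expect;
        # perm_idx records the permutation applied to the inputs.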
        # Reorder the labels with perm_idx so they match the sorted inputs; avoid a loop
        # variable named `i`, which would shadow the outer training-step counter.
        list_y = list(y)
        answer = [list_y[int(idx)] for idx in perm_idx]
        answer = numpy.concatenate(answer)
        answer = torch.autograd.Variable(torch.from_numpy(answer).type(torch.LongTensor))
        answer = answer.view(-1)

        # print answer, answer.size()
        # Then pack the sequences
        # packed_input = torch.nn.utils.rnn.pack_padded_sequence(input_seqs, input_lengths.numpy(), batch_first=True)
        # packed_input = packed_input.cuda() if args.gpu else packed_input
        if args.gpu:
            word_input_seqs, predicate_input_seqs, input_lengths, perm_idx =\
                word_input_seqs.cuda(), predicate_input_seqs.cuda(), input_lengths.cuda(), perm_idx.cuda()
            answer = answer.cuda()
        model.zero_grad()
        output = model.forward(word_input_seqs, predicate_input_seqs, input_lengths, perm_idx, len(x))  # (batch input, batch size)
        loss = model.loss(output, answer)
        loss.backward()
        # gradient clipping
        # torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
        model.optimizer.step()

        train_loss += float(loss.detach())  # keep a plain float so the graph is not retained
        i += 1
        global_step += 1

        if i % 400 == 0:
          timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))
        
    train_loss = train_loss / i
    print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
    i = 0
    epoch += 1
    train_loss = 0.0
    if epoch % config.checkpoint_every_x_epochs == 0:
      with Timer('Evaluation'):
        evaluate_tagger(model, batched_dev_data, evaluator, writer, global_step)

  # Done. :)
  writer.close()        
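Example #4 pads each batch, sorts it by decreasing length, and passes perm_idx and the lengths into the model, but the packing step itself is left commented out. The sketch below shows how a padded, length-sorted batch is typically packed before a bidirectional LSTM and unpacked afterwards; the function and variable names are illustrative, not taken from BiLSTMTaggerModel.

import torch


def run_bilstm_on_sorted_batch(bilstm, embedded_inputs, input_lengths):
    # embedded_inputs: [batch, max_len, dim], already sorted by decreasing length.
    packed = torch.nn.utils.rnn.pack_padded_sequence(
        embedded_inputs, input_lengths.tolist(), batch_first=True)
    packed_output, _ = bilstm(packed)
    # Pad back to [batch, max_len, hidden]; padded positions come back as zeros.
    output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
    return output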
Example #5
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.set_printoptions(precision=20)
    ### gpu
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\t GPU ID: {}".format(gpu, args.gpu))
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path,
                                     label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            print "Not implemented yet!"
            exit()
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev,
                                        vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(),
                                        data.label_dict)

        batched_dev_data = data.get_development_data(
            batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer("Get training and devlel sentences dict"):
        training_sentences = []
        for sen in get_srl_sentences(args.train):
            if len(sen[1]) <= config.max_train_length:
                training_sentences.append(' '.join(sen[1]))
        training_ids = [int(sen[0][0]) for sen in data.train_sents]
        temp = {}
        assert len(training_sentences) == len(training_ids)
        for idx, sen in zip(training_ids, training_sentences):
            temp[idx] = sen
        training_sentences = temp

        devel_sentences = [
            ' '.join(sen[1]) for sen in get_srl_sentences(args.dev)
        ]
        devel_ids = [int(sen[0][0]) for sen in data.dev_sents]
        temp = {}
        assert len(devel_sentences) == len(devel_ids)
        for idx, sen in zip(devel_ids, devel_sentences):
            temp[idx] = sen
        devel_sentences = temp

    with Timer('Syntactic information extraction'):  # extract the syntactic information from file
        train_dep_trees = SyntacticCONLL()
        dev_dep_trees = SyntacticCONLL()
        train_dep_trees.read_from_file(args.train_dep_trees)
        dev_dep_trees.read_from_file(args.dev_dep_trees)
        # generate the syntactic label dict in training corpus
        data.syntactic_dict = train_dep_trees.get_syntactic_label_dict()
        data.syntactic_dict.accept_new = False
        dev_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer("Pattern generating..."):
        # generate the tree-based position features according to the dependency tree.
        train_pattern, data.pattern_dict = train_dep_trees.get_tpf2_dict(
            data.train_tensors)
        print("Extract {} training pattern features".format(
            len(train_pattern)))
        assert len(train_pattern) == len(data.train_tensors)
        data.pattern_dict.accept_new = False
        dev_pattern = dev_dep_trees.get_tpf2_dict(data.dev_tensors,
                                                  data.pattern_dict)
        print("Extract {} dev pattern features".format(len(dev_pattern)))
        assert len(dev_pattern) == len(data.dev_tensors)

    with Timer("Loading ELMO"):
        train_elmo_hdf5 = hdf5_reader()
        train_elmo_hdf5.read_from_file(args.train_elmo, training_sentences)
        dev_elmo_hdf5 = hdf5_reader()
        dev_elmo_hdf5.read_from_file(args.dev_elmo, devel_sentences)

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(
                args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print ('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten' \
                    .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        data.syntactic_dict.save(os.path.join(args.model, 'syn_label_dict'))
        data.pattern_dict.save(os.path.join(args.model, 'pattern_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write(
            'step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            if not torch.cuda.is_available():
                raise Exception("No GPU found!")
            print("BiLSTMTaggerModel initialized with CUDA!")
            model = model.cuda()
        for param in model.parameters():
            print(param.size())

    optimizer = torch.optim.Adadelta(
        model.parameters(), lr=1.0,
        rho=0.95)  # initialize the optimizer once, outside the epoch loop
    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, lengths, weights = batched_tensor
                word_inputs_seqs, predicate_inputs_seqs, pattern_inputs_seqs, pattern_li_seqs, pattern_la_seqs, pattern_lp_seqs, \
                    sentences_ids, answers, input_lengths, masks, padding_answers = \
                    batch_data_variable(train_pattern, x, y, lengths, weights)
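                # Look up the precomputed ELMo vectors for this batch by sentence id,
                # padded to the batch's maximum sentence length.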
                elmo_representations = train_elmo_hdf5.forward(
                    sentences_ids,
                    word_inputs_seqs.size()[-1], [len(ans) for ans in answers])

                if args.gpu:
                    word_inputs_seqs, predicate_inputs_seqs, pattern_inputs_seqs, pattern_li_seqs, pattern_la_seqs, pattern_lp_seqs, \
                        input_lengths, masks, padding_answers = \
                        word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), pattern_inputs_seqs.cuda(), pattern_li_seqs.cuda(),\
                        pattern_la_seqs.cuda(), pattern_lp_seqs.cuda(), input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
                    elmo_representations = elmo_representations.cuda()

                optimizer.zero_grad()
                output = model.forward(word_inputs_seqs, predicate_inputs_seqs, pattern_inputs_seqs, \
                                       pattern_li_seqs, pattern_la_seqs, pattern_lp_seqs, elmo_representations, input_lengths)
                loss = model.compute_loss(output, padding_answers, masks)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                train_loss += loss.data  # accumulate the tensor, not the Variable, so the graph does not build up

                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(
                        i, float(train_loss / i)))
                    sys.stdout.flush()

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(
                epoch, i, float(train_loss)))
            i = 0
            epoch += 1
            train_loss = 0.0
            if epoch % config.checkpoint_every_x_epochs == 0:
                with Timer('Evaluation'):
                    evaluate_tagger(model, batched_dev_data, dev_pattern,
                                    dev_elmo_hdf5, evaluator, writer,
                                    global_step)

    # Done. :)
    writer.close()
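`hdf5_reader`, used above to load precomputed ELMo representations, is not shown in this example. Below is a minimal sketch using h5py, assuming the HDF5 file stores one [sentence_length, elmo_dim] array per sentence id and that `forward` pads each sentence to the batch's maximum length; the file layout and the padding behaviour are assumptions, not part of the original code.

import h5py
import numpy
import torch


class hdf5_reader(object):
    def __init__(self):
        self.data = {}

    def read_from_file(self, path, sentences):
        # `sentences` maps sentence id -> sentence text; only the needed ids are loaded.
        with h5py.File(path, 'r') as f:
            for sent_id in sentences:
                # assumed layout: one dataset per sentence id, shaped [sentence_length, elmo_dim]
                self.data[sent_id] = numpy.asarray(f[str(sent_id)])

    def forward(self, sentence_ids, max_length, lengths):
        # Pad every sentence's representation to max_length and stack into one batch tensor.
        dim = next(iter(self.data.values())).shape[-1]
        batch = numpy.zeros((len(sentence_ids), max_length, dim), dtype='float32')
        for row, (sent_id, length) in enumerate(zip(sentence_ids, lengths)):
            batch[row, :length] = self.data[sent_id][:length]
        return torch.from_numpy(batch)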