# Ejemplo n.º 1 (score: 0) — scraped example-site header and vote count,
# kept as a comment so this file remains valid Python.
def get_scores(config, task, model_path, gemb_model_path, word_dict_path, label_dict_path, input_path):
    """Score the sentences in *input_path* with a GEMB-augmented BiLSTM tagger.

    Loads the word/label dictionaries, reads the task-specific test data
    ('srl' vs. POS tagging), restores the tagger and its GEMB component,
    then runs the context-embedding and distribution functions over every
    batch (batch size 1 is assumed for the OOV-position handling).

    Returns:
      (scores, data, test_sentences, test_data) where *scores* is the
      concatenation of the per-batch distribution scores (None if the
      test set produced no batches).
    """
    with Timer('Data loading'):
        print ('Task: {}'.format(task))
        allow_new_words = True
        print ('Allow new words in test data: {}'.format(allow_new_words))

        # Vocabulary and label inventory come from pre-built dictionary files.
        word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
        label_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)

        # Pick the task-specific reader, then load the test set.
        load_test = (reader.get_srl_test_data if task == 'srl'
                     else reader.get_postag_test_data)
        test_sentences, emb_inits, emb_shapes = load_test(
            input_path, config, data.word_dict, data.label_dict, allow_new_words)

        print ('Read {} sentences.'.format(len(test_sentences)))

        # Attach pre-trained embeddings (covers new words in the test data).
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits

        # Batching for the GEMB evaluation path.
        test_data = data.get_gemb_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
        model.load(model_path)
        model.add_gemb()
        model.gemb.load(gemb_model_path)

        ctx_emb_function = model.get_ctx_emb_function()
        dist_function = model.get_distribution_by_ctx_emb_function()

    with Timer('Running model'):
        scores = None
        for batch_index, batch in enumerate(test_data):
            x, _, oov_pos, num_tokens, weights = batch  # weights is the mask
            oov_pos = oov_pos[0]  # batch must be 1
            gembedding, inputs_0 = ctx_emb_function(x, weights, oov_pos)
            patched_inputs = replace_with_gemb(inputs_0, gembedding, oov_pos)
            _, batch_scores = dist_function(patched_inputs, weights)
            if batch_index == 0:
                scores = batch_scores
            else:
                scores = numpy.concatenate((scores, batch_scores), axis=0)

    return scores, data, test_sentences, test_data
def evaluate_tagger(model, ctx_emb_function, gemb_eval_function,
                    batched_dev_data, evaluator, writer, global_step,
                    gemb_save_dir=None):
    """Evaluate the GEMB model on the dev set and checkpoint on a new best.

    Args:
      model: tagger exposing ``model.gemb.save`` for checkpointing.
      ctx_emb_function: callable (x, weights, oov_pos) -> (gembedding, inputs_0).
      gemb_eval_function: callable (inputs, weights, y) -> (predictions, loss).
      batched_dev_data: iterable of batched tensors; batch size must be 1,
        since only the first OOV-position entry of each batch is used.
      evaluator: accumulates accuracy; exposes ``evaluate``, ``accuracy``,
        ``best_accuracy`` and ``has_best``.
      writer: open file-like object receiving one tab-separated log line.
      global_step: current training step, recorded in the log line.
      gemb_save_dir: directory in which to save the GEMB checkpoint when a
        new best accuracy is reached. Defaults to the module-level
        ``args.gemb_model`` for backward compatibility with existing callers.
    """
    predictions = None
    dev_loss = 0
    for i, batched_tensor in enumerate(batched_dev_data):
        x, y, oov_pos, _, weights = batched_tensor  # weights is mask
        oov_pos = oov_pos[0]  # batch size must be 1
        gembedding, inputs_0 = ctx_emb_function(x, weights, oov_pos)
        inputs_0_new = replace_with_gemb(inputs_0, gembedding, oov_pos)

        p, loss = gemb_eval_function(inputs_0_new, weights, y)
        # Accumulate per-batch predictions along the batch axis.
        predictions = p if predictions is None else numpy.concatenate(
            (predictions, p), axis=0)
        dev_loss += loss

    print('Dev loss={:.6f}'.format(dev_loss))
    evaluator.evaluate(predictions)
    writer.write('{}\t{}\t{:.6f}\t{:.2f}\t{:.2f}\n'.format(
        global_step, time.strftime("%Y-%m-%d %H:%M:%S"), dev_loss,
        evaluator.accuracy, evaluator.best_accuracy))
    writer.flush()
    if evaluator.has_best:
        # Fall back to the module-level `args` global only when the caller
        # did not supply a directory (legacy behavior).
        save_dir = gemb_save_dir if gemb_save_dir is not None else args.gemb_model
        model.gemb.save(os.path.join(save_dir, 'gemb_model'))