Example #1
def get_scores(config, task, model_path, word_dict_path, label_dict_path, input_path,
               lower_case=True, allow_new_words=True, replace_vocab=True):
  with Timer('Data loading'):
    print('Task: {}'.format(task))
    print('Allow new words in test data: {}. Lower case words: {}'.format(allow_new_words, lower_case))
  
    # Load word and tag dictionary
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    label_dict = Dictionary()
    word_dict.load(word_dict_path)
    label_dict.load(label_dict_path)
    data = TaggerData(config, [], [], word_dict, label_dict, None, None)

    # Load test data.
    if task == 'srl':
      test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                                                    input_path,
                                                    config,
                                                    data.word_dict,
                                                    data.label_dict,
                                                    lower_case,
                                                    allow_new_words)
    else:
      test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                                                    input_path,
                                                    config,
                                                    data.word_dict,
                                                    data.label_dict,
                                                    lower_case,
                                                    allow_new_words)
    
    print('Read {} sentences.'.format(len(test_sentences)))
  
    # Add pre-trained embeddings for new words in the test data.
    #if allow_new_words:
    data.embedding_shapes = emb_shapes
    data.embeddings = emb_inits

    # Batching.
    test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)
      
  with Timer('Model building and loading'):
    model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
    model.load(model_path, word_dict, replace_vocab)
    dist_function = model.get_distribution_function()
     
  with Timer('Running model'):
    scores = None
    for i, batched_tensor in enumerate(test_data):
      x, _, num_tokens, weights = batched_tensor
      p, sc = dist_function(x, weights)
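      # Concatenate per-batch score matrices along axis 0.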
      scores = numpy.concatenate((scores, sc), axis=0) if i > 0 else sc
   
  return scores, data, test_sentences, test_data
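
A minimal sketch of how get_scores might be invoked (module-level imports such as numpy, reader, Timer, Dictionary, TaggerData and BiLSTMTaggerModel are assumed from the surrounding repository; every path below is a placeholder):

# Hypothetical paths; any model directory laid out as in load_model (Example #3) works.
config = configuration.get_config('srl_model/config')
scores, data, sentences, batches = get_scores(
    config, 'srl',
    model_path='srl_model/model.npz',
    word_dict_path='srl_model/word_dict',
    label_dict_path='srl_model/label_dict',
    input_path='test.txt')
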
def get_scores_ctx(config, task, model_path, gemb_model_path, word_dict_path,
                   label_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = False
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionary
        word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
        label_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data_gemb(
                input_path, config, data.word_dict, data.label_dict,
                allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict,
                allow_new_words)

        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        #if allow_new_words:
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits

        # Batching.
        test_data = data.get_ctx_gemb_test_data(
            test_sentences, batch_size=config.dev_batch_size)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
        model.load(model_path)
        model.add_ctx_gemb()
        model.gemb.load(gemb_model_path)

        ctx_emb_function = model.get_ctx_emb_function()
        dist_function = model.get_distribution_by_gemb_function()

    with Timer('Running model'):
        scores = None
        for i, batched_tensor in enumerate(test_data):
            x, _, oov_pos, num_tokens, weights = batched_tensor  # weights is mask
            oov_pos = oov_pos[0]  # assumes batch size is 1
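            # Predict a contextual (GEMB) embedding for the OOV token and splice it into the inputs.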
            gembedding, inputs_0 = ctx_emb_function(x, weights, oov_pos)
            inputs_0_new = replace_with_gemb(inputs_0, gembedding, oov_pos)
            p, sc = dist_function(inputs_0_new, weights)
            scores = numpy.concatenate((scores, sc), axis=0) if i > 0 else sc

    return scores, data, test_sentences, test_data
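
The contextual-GEMB variant takes one extra path for the GEMB weights. Since oov_pos = oov_pos[0] above reads only the first element, this code path assumes config.dev_batch_size is 1. A hedged usage sketch with placeholder paths:

# Hypothetical paths; config.dev_batch_size must be 1 for this code path.
config = configuration.get_config('srl_model/config')
scores, data, sentences, batches = get_scores_ctx(
    config, 'srl',
    model_path='srl_model/model.npz',
    gemb_model_path='srl_model/gemb.npz',
    word_dict_path='srl_model/word_dict',
    label_dict_path='srl_model/label_dict',
    input_path='test.txt')
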
Example #3
def load_model(model_path, model_type):
    config = configuration.get_config(os.path.join(model_path, 'config'))
    # Load word and tag dictionary
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    label_dict = Dictionary()
    word_dict.load(os.path.join(model_path, 'word_dict'))
    label_dict.load(os.path.join(model_path, 'label_dict'))
    data = TaggerData(config, [], [], word_dict, label_dict, None, None)

    if model_type == 'srl':
        test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
            None, config, data.word_dict, data.label_dict, False)
    else:
        test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
            None, config, data.word_dict, data.label_dict, False)

    data.embedding_shapes = emb_shapes
    data.embeddings = emb_inits
    model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
    model.load(os.path.join(model_path, 'model.npz'))
    return model, data
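
load_model bundles the same dictionary, data, and model loading behind one call. A hypothetical use, assuming 'srl_model' is a directory containing config, word_dict, label_dict and model.npz as saved by a training run:

model, data = load_model('srl_model', 'srl')
dist_function = model.get_distribution_function()
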
def train_tagger(args):
    config = configuration.get_config(args.config)
    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path,
                                     label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev,
                                        vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(),
                                        data.label_dict)

        batched_dev_data = data.get_development_data(
            batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(
                args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print(
                    '[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                    .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write(
            'step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config)
        for param in model.params:
            print(param, param.name, param.shape.eval())
        loss_function = model.get_loss_function()
        eval_function = model.get_eval_function()


    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:
                x, y, _, weights = batched_tensor
                loss = loss_function(x, weights, y)
                train_loss += loss
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(
                        i, train_loss / i))

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, train_loss))
        i = 0
        epoch += 1
        train_loss = 0.0
        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, eval_function, batched_dev_data,
                                evaluator, writer, global_step)

    # Done. :)
    writer.close()
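
train_tagger reads everything from an args namespace. A minimal hypothetical driver, with field names taken from the attribute accesses above (the choice string for the non-SRL PropId task is an assumption):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config', required=True)  # path passed to configuration.get_config
parser.add_argument('--task', default='srl')    # 'srl' or the PropId task (name assumed)
parser.add_argument('--train', required=True)   # training data path
parser.add_argument('--dev', required=True)     # development data path
parser.add_argument('--vocab', default='')      # optional vocab path ('' means None)
parser.add_argument('--labels', default='')     # optional label path ('' means None)
parser.add_argument('--gold', default='')       # optional gold props path ('' means None)
parser.add_argument('--model', required=True)   # output/checkpoint directory
train_tagger(parser.parse_args())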