Beispiel #1
0
def get_scores(config, task, model_path, word_dict_path, label_dict_path, input_path,
               lower_case=True, allow_new_words=True, replace_vocab=True):
    """Run a saved BiLSTM tagger over a test file and collect its scores.

    Args:
        config: Configuration object. ``config.dev_batch_size`` is used as
            the test batch size; the object is also forwarded to the reader,
            ``TaggerData`` and the model.
        task: ``'srl'`` reads the input with ``reader.get_srl_test_data``;
            any other value uses ``reader.get_postag_test_data``.
        model_path: Forwarded to ``model.load``.
        word_dict_path: File loaded into the word dictionary.
        label_dict_path: File loaded into the label dictionary.
        input_path: Test data file handed to the reader.
        lower_case: Forwarded to the reader.
        allow_new_words: Forwarded to the reader.
        replace_vocab: Forwarded to ``model.load``.

    Returns:
        Tuple ``(scores, data, test_sentences, test_data)``. ``scores`` is
        the per-batch score output joined along axis 0, or ``None`` when
        ``test_data`` yields no batches.
    """
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        print('Allow new words in test data: {}. Lower case words: {}'.format(allow_new_words, lower_case))

        # Load word and tag dictionary.
        word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
        label_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)

        # Both readers take the same arguments, so only the function differs.
        if task == 'srl':
            read_test_data = reader.get_srl_test_data
        else:
            read_test_data = reader.get_postag_test_data
        test_sentences, emb_inits, emb_shapes = read_test_data(
            input_path,
            config,
            data.word_dict,
            data.label_dict,
            lower_case,
            allow_new_words)

        print('Read {} sentences.'.format(len(test_sentences)))

        # The embeddings returned by the reader are installed unconditionally,
        # regardless of allow_new_words.
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits

        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
        model.load(model_path, word_dict, replace_vocab)
        dist_function = model.get_distribution_function()

    with Timer('Running model'):
        # Gather every batch first and join once at the end: concatenating
        # onto a growing array inside the loop re-copies all earlier scores
        # on every iteration.
        batch_scores = []
        for x, _, _, weights in test_data:
            _, sc = dist_function(x, weights)
            batch_scores.append(sc)

        if not batch_scores:
            scores = None
        elif len(batch_scores) == 1:
            # A single batch is returned as-is, without an extra copy.
            scores = batch_scores[0]
        else:
            scores = numpy.concatenate(batch_scores, axis=0)

    return scores, data, test_sentences, test_data
Beispiel #2
0
def get_scores(config, task, model_path, word_dict_path, label_dict_path, syntactic_dict_path, input_path):
    """Score a test file with a saved syntax-aware BiLSTM tagger.

    Besides its parameters, this function reads the module-level ``args``
    object: ``args.input_dep_trees`` (dependency-tree file) and ``args.gpu``
    (whether the model and its inputs are moved to the GPU).

    Args:
        config: Configuration object. ``config.dev_batch_size`` is used as
            the test batch size; the object is also forwarded to the reader,
            ``TaggerData`` and the model.
        task: ``'srl'`` reads the input with ``reader.get_srl_test_data``;
            any other value uses ``reader.get_postag_test_data``.
        model_path: Forwarded to ``model.load``.
        word_dict_path: File loaded into the word dictionary.
        label_dict_path: File loaded into the label dictionary.
        syntactic_dict_path: File loaded into the syntactic label dictionary.
        input_path: Test data file handed to the reader.

    Returns:
        Tuple ``(scores, data, test_sentences, test_data)``. ``scores`` is a
        list holding one entry per first-axis slice of every batch's model
        output, in batch order.
    """
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionary.
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
        label_dict, syntactic_dict = Dictionary(), Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        syntactic_dict.load(syntactic_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.syntactic_dict = syntactic_dict

        # Both readers take the same arguments, so only the function differs.
        if task == 'srl':
            read_test_data = reader.get_srl_test_data
        else:
            read_test_data = reader.get_postag_test_data
        test_sentences, emb_inits, emb_shapes = read_test_data(
            input_path,
            config,
            data.word_dict,
            data.label_dict,
            allow_new_words)

        print('Read {} sentences.'.format(len(test_sentences)))

        # The embeddings returned by the reader are installed unconditionally.
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Syntactic Information Extracting'):
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)
        # Freeze the loaded syntactic label dictionary before the test trees
        # are mapped onto it.
        data.syntactic_dict.accept_new = False
        test_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            # Call form of print: valid on Python 3 and prints the same
            # single value on Python 2.
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for x, y, lengths, weights in test_data:
            # Only the outputs consumed by model.forward are kept; the
            # answers, masks and padded answers are not needed for scoring.
            (word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes,
             _, input_lengths, _, _) = batch_data_variable(
                 test_dep_trees, None, x, y, lengths, weights)

            if args.gpu:
                word_inputs_seqs = word_inputs_seqs.cuda()
                predicate_inputs_seqs = predicate_inputs_seqs.cuda()
                syn_label_inputs_seqs = syn_label_inputs_seqs.cuda()
                input_lengths = input_lengths.cuda()

            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            # Iterating the array yields its first-axis slices in order.
            scores.extend(sc)

    return scores, data, test_sentences, test_data
Beispiel #3
0
def get_scores(config, task, model_path, word_dict_path, label_dict_path,
               tpf_dict_path, input_path):
    """Score a test file with a saved BiLSTM tagger using TPF2 and ELMo inputs.

    Besides its parameters, this function reads the module-level ``args``
    object: ``args.input``, ``args.input_elmo``, ``args.input_dep_trees``
    and ``args.gpu``.

    Args:
        config: Configuration object. ``config.dev_batch_size`` is used as
            the test batch size; the object is also forwarded to the reader,
            ``TaggerData`` and the model.
        task: ``'srl'`` reads the input with ``reader.get_srl_test_data``;
            any other value uses ``reader.get_postag_test_data``.
        model_path: Forwarded to ``model.load``.
        word_dict_path: File loaded into the word dictionary.
        label_dict_path: File loaded into the label dictionary.
        tpf_dict_path: File loaded into the dictionary stored as
            ``data.tpf2_dict``.
        input_path: Test data file handed to the reader.

    Returns:
        Tuple ``(scores, data, test_sentences, test_data)``. ``scores`` is a
        list holding one entry per first-axis slice of every batch's model
        output, in batch order.
    """
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Build the three dictionaries, then populate each from its file.
        vocab = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
        labels = Dictionary()
        tpf_vocab = Dictionary()
        vocab.load(word_dict_path)
        labels.load(label_dict_path)
        tpf_vocab.load(tpf_dict_path)

        data = TaggerData(config, [], [], vocab, labels, None, None)
        data.tpf2_dict = tpf_vocab

        # Pick the reader for the task; both take the same arguments.
        load_test = (reader.get_srl_test_data if task == 'srl'
                     else reader.get_postag_test_data)
        test_sentences, emb_inits, emb_shapes = load_test(
            input_path, config, data.word_dict, data.label_dict,
            allow_new_words)

        print('Read {} sentences.'.format(len(test_sentences)))

        # Install the embeddings returned by the reader, then batch.
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        test_data = data.get_test_data(test_sentences,
                                       batch_size=config.dev_batch_size)

    with Timer("Get test sentences dict"):
        # Pair int(sentence[0][0]) of every loaded test sentence with the
        # space-joined second field of the matching get_srl_sentences entry.
        # NOTE(review): presumably sentence id -> raw text; confirm upstream.
        joined_texts = [' '.join(entry[1])
                        for entry in get_srl_sentences(args.input)]
        sentence_keys = [int(entry[0][0]) for entry in test_sentences]
        assert len(joined_texts) == len(sentence_keys)
        test_sentences_w_id = dict(zip(sentence_keys, joined_texts))

    with Timer("Loading ELMO"):
        test_elmo_hdf5 = hdf5_reader()
        test_elmo_hdf5.read_from_file(args.input_elmo, test_sentences_w_id)

    with Timer('Syntactic Information Extracting'):
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)

    with Timer("TPF2 generating..."):
        # Freeze the loaded TPF2 dictionary before deriving the test
        # features from the dependency trees.
        data.tpf2_dict.accept_new = False
        test_tpf2 = test_dep_trees.get_tpf2_dict(data.test_tensors,
                                                 data.tpf2_dict)
        print("Extract {} test TPF2 features".format(len(test_tpf2)))
        assert len(test_tpf2) == len(data.test_tensors)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for weight in model.parameters():
            print(weight.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for batch in test_data:
            x, y, lengths, weights = batch
            (word_inputs_seqs, predicate_inputs_seqs, tpf_ids, sentences_ids,
             answers, input_lengths, masks, padding_answers) = \
                batch_data_variable(test_tpf2, x, y, lengths, weights)

            padded_width = word_inputs_seqs.size()[-1]
            answer_lengths = [len(ans) for ans in answers]
            elmo_representations = test_elmo_hdf5.forward(
                sentences_ids, padded_width, answer_lengths)

            if args.gpu:
                word_inputs_seqs = word_inputs_seqs.cuda()
                predicate_inputs_seqs = predicate_inputs_seqs.cuda()
                tpf_ids = tpf_ids.cuda()
                input_lengths = input_lengths.cuda()
                masks = masks.cuda()
                padding_answers = padding_answers.cuda()
                elmo_representations = elmo_representations.cuda()

            output = model.forward(word_inputs_seqs, predicate_inputs_seqs,
                                   tpf_ids, elmo_representations,
                                   input_lengths)
            if args.gpu:
                batch_array = output.data.cpu().numpy()
            else:
                batch_array = output.data.numpy()

            # Append the output one first-axis slice at a time.
            scores.extend(batch_array[row]
                          for row in range(batch_array.shape[0]))

    return scores, data, test_sentences, test_data
Beispiel #4
0
            # Load test data.
            if task == 'srl':
                test_sentences, emb, emb_shapes = reader.get_srl_test_data(
                    args.input, config, data.word_dict, data.head_dict,
                    data.char_dict, data.label_dict, allow_new_words)
                eval_data = load_eval_data(args.input)

            print('Read {} sentences.'.format(len(test_sentences)))
            # Add pre-trained embeddings for new words in the test data.
            # if allow_new_words:
            data.word_embeddings = emb[0]
            data.head_embeddings = emb[1]
            data.word_embedding_shapes = emb_shapes[0]
            data.head_embedding_shapes = emb_shapes[1]
            # Batching.
            test_data = data.get_test_data(test_sentences,
                                           batch_size=config.dev_batch_size)

        with Timer('Model building and loading'):
            model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
            model.load(model_path)
            for param in model.parameters():
                print param.size()
            if args.gpu:
                print("Initialize the model with GPU!")
                model = model.cuda()

        with Timer('Running model'):
            dev_loss = 0.0
            srl_predictions = []

            # with torch.no_grad():  # Eval don't need the grad