    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                                },
                        },
                "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
                }
        token_embedder(inputs)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "words": {
                        "type": "embedding",
                        "num_embeddings": 20,
                        "embedding_dim": 2,
                        },
                "characters": {
                        "type": "character_encoding",
                        "embedding": {
                                "embedding_dim": 4,
                                "num_embeddings": 15,
                                },
                        "encoder": {
                                "type": "cnn",
                                "embedding_dim": 4,
                                "num_filters": 10,
                                "ngram_filter_sizes": [3],
                                },
                        }
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(),
                }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })
        self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        self.inputs = {
                "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
                "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
                "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
                }
    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() #pylint: disable=protected-access

        assert new_embedder(self.inputs).size() == (1, 4, 10)
Example #6
def main():
    token_indexer = SingleIdTokenIndexer()
    reader = JigsawDatasetReader(
        tokenizer=custom_tokenizer(),
        token_indexers={"tokens": token_indexer},
    )

    # Kaggle's multi-label "Toxic Comment Classification Challenge"
    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(
        dataset_root / fname) for fname in ["train.csv", "test_proced.csv"])

    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from the datasets
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Build the network: an LSTM encoder followed by a linear layer
    embedding_dim = 300
    hidden_dim = 128
    token_embedding = Embedding(num_embeddings=vocab_dim,
                                embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(embedding_dim,
                      hidden_dim,
                      bidirectional=True,
                      batch_first=True))
    model = MultiLabelClassifier(word_embeddings, 0.5, encoder, 0.2,
                                 len(label_cols), vocab)

    # AllenNLP does not currently seem to support single-machine multi-GPU training, or supports it poorly
    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1: model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # --------------------- forward demo ----------------------
    # generator = iter(iterator(train_dataset, shuffle=True))
    # for _ in range(5):
    #     batch = next(generator)
    #     print('---\nbatch ', batch.keys(), batch['tokens'].keys(), batch['tokens']['tokens'].shape, batch['label'].shape) # [batch, sentence_len, token_len]
    #     batch = move_to_device(batch, gpu_id)
    #     tokens = batch['tokens']
    #
    #     # option1. forward one step by one
    #     mask = get_text_field_mask(tokens)
    #     embeddings = model.word_embeddings(tokens)
    #     print("embeddings: ", embeddings.shape)
    #     state = model.encoder(embeddings, mask)
    #     class_logits = model.linear(state)
    #
    #     print("lstm state: ", state.shape, class_logits.shape)
    #
    #     # option2. do forward on the model
    #     y = model(**batch)
    #     metric = model.get_metrics()
    #     print("model out: ", y, '\n', metric)

    # --------------------- train ---------------------
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-4,
                                 weight_decay=1e-5)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        # serialization_dir="./models/",
        cuda_device=gpu_id,
        patience=10,
        num_epochs=20)
    trainer.train()
Example #7
def EnhancedRCNN_train():

    print("enter train")
    with open(model_config.glove_file_path) as fp:
        text = fp.readlines()

    # TODO: find a more elegant way to build this initial counter
    glove_lines = len(text)
    token_counts = {"tokens": dict([(line.split(' ')[0], glove_lines - idx + 2) for idx, line in enumerate(text)])}
    #print(list(token_counts.items())[:10])
    vocab = Vocabulary(counter=token_counts,
                        min_count={"tokens": 1},
                        #non_padded_namespaces=['tokens'],
                        pretrained_files={'tokens': model_config.glove_file_path},
                        only_include_pretrained_words=True)

    EMBEDDING_DIM = 300
    token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({ 'trainable': False,
                        'pretrained_file': model_config.glove_file_path,
                        'embedding_dim': EMBEDDING_DIM,
                        'vocab_namespace': "tokens"})
    )

    print("GloVe loaded")
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    model = EnhancedRCNNModel(word_embeddings, model_config.num_class, vocab=vocab)

    if torch.cuda.is_available():
        cuda_device = list(range(torch.cuda.device_count()))

        model = model.cuda(cuda_device[0])
    else:
        cuda_device = -1
    print("cuda device : {}".format(cuda_device))

    reader = ListWiseDatasetReader(vocab=vocab)
    train_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_train.jsonl"))
    dev_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_dev.jsonl"))
    test_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_test.jsonl"))

    #fc_lr = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=model_config.learning_rate, momentum=0.9)
    '''
    optimizer = torch.optim.SGD([{'params': model.embedder.parameters()},
                                 {'params': model.fc1.parameters(), 'lr': fc_lr},
                                 {'params': model.fc2.parameters(), 'lr': fc_lr},
                                 {'params': model.proj_1.parameters(), 'lr': fc_lr},
                                 {'params': model.proj_2.parameters(), 'lr': fc_lr},
                                 {'params': model.bert_prediction.parameters(), 'lr': fc_lr},
                                 ], lr=model_config.learning_rate, momentum=0.9)
    '''
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    iterator_train = BucketIterator(batch_size=model_config.batch_size,
                                    sorting_keys=[("left_input_tokens_field", "num_tokens"),
                                                  ("right_input_tokens_field", "num_tokens")])
    iterator_train.index_with(vocab)

    model.train()
    trainer = Trainer(model = model,
                      optimizer = optimizer,
                      iterator = iterator_train,
                      train_dataset = train_dataset,
                      validation_dataset = dev_dataset,
                      patience = model_config.patience,
                      num_epochs = model_config.epochs,
                      cuda_device = cuda_device,
                      shuffle=True
                      )
    train_start_time = time.time()
    trainer.train()
    train_end_time = time.time()

    # test
    model.eval()

    preds = []
    gd = []
    gd_pos = []

    with torch.no_grad():
        iterator_test = BucketIterator(batch_size = model_config.batch_size,
                                       sorting_keys=[("left_input_tokens_field", "num_tokens"),
                                                  ("right_input_tokens_field", "num_tokens")])
        iterator_test.index_with(vocab)
        generator_test = iterator_test(test_dataset, 1, False)
        test_start_time = time.time()
        for batch in generator_test:
            batch = move_to_device(batch, cuda_device[0])
            gd.extend(batch['label'].squeeze(-1).long().cpu().numpy().tolist())
            out_dict = model(batch['left_input_tokens_field'], batch['right_input_tokens_field'],
                             batch['label'])
            batch_pred = torch.argmax(out_dict['logits'], -1).cpu().numpy()
            preds.extend(batch_pred.tolist())

            sorted_batch, sorted_idx = torch.sort(out_dict['logits'], dim=-1, descending=True)
            label_mat = batch['label'].repeat(1, out_dict['logits'].shape[-1]).long().cuda()
            pos_mat = label_mat.eq(sorted_idx.cuda())
            pos_tensor = pos_mat.nonzero()[:, 1].cpu().numpy().tolist()

            gd_pos.extend(pos_tensor)
        test_end_time = time.time()

    print("p@1 : ", (np.sum(np.equal(gd, preds))) / len(gd))
    print("[train time] : {}".format(train_end_time - train_start_time))
    print("[test time] : {}".format(test_end_time - test_start_time))
    # Check whether the file already exists: write it if it does not, otherwise skip
    save_path = os.path.join(root_path, model_config.save_path)
    if os.path.exists(save_path):
        print("save path already exists")
    else:
        pd = pandas.DataFrame({'gd': gd, 'preds': preds})
        pd.to_csv(save_path, index=False)
        print("save to path : {}".format(save_path))
Example #8
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })
    train_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])
Example #9
def train_main():
    config = Config(
        testing=True,
        seed=1,
        batch_size=64,
        lr=3e-4,
        epochs=2,
        hidden_sz=64,
        max_seq_len=100,  # necessary to limit memory usage
        max_vocab_size=100000,
    )
    token_indexer = ELMoTokenCharactersIndexer()
    # Target labels: toxic, severe toxic, obscene, threat, insult and identity hate
    # label_cols = ["toxic", "severe_toxic", "obscene",
    #               "threat", "insult", "identity_hate"]
    # reader = JigsawDatasetReader(tokenizer=tokenizer,
    #                              token_indexers={"tokens": token_indexer},
    #                              label_cols=label_cols)

    # Kaggle's multi-label "Toxic Comment Classification Challenge"
    # dataset_root = Path('/home/lirui/nlp/learning_allenNLP/data/jigsaw')
    # train_dataset, dev_dataset = (reader.read(dataset_root/ fname) for fname in ["train.csv", "test_proced.csv"])

    # Stanford sentiment classification dataset (SST-5)
    reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': token_indexer})
    train_dataset = reader.read('~/nlp/dataset/sst/trees/train.txt')
    dev_dataset = reader.read('~/nlp/dataset/sst/trees/test.txt')

    print(f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}")

    # Build the vocabulary from the datasets
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

    # pretrained elmo LM model, transformed from bilm-tf with dump_weights in bin/training.py
    options_file = '../models/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    weight_file = '../models/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

    token_embedding = ElmoTokenEmbedder(options_file, weight_file,
                                        requires_grad=True,
                                        # do_layer_norm=True
                                        )

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    elmo_embedding_dim = word_embeddings.get_output_dim()
    hidden_dim = 256
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, bidirectional=True, batch_first=True))

    model = SSTClassifier(word_embeddings,
                          encoder,
                          out_dim=vocab.get_vocab_size("labels"),
                          vocab=vocab)

    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1:  model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # -------- forward demo ---------
    # generator = iter(iterator(train_dataset, shuffle=True))
    # for _ in range(5):
    #     batch = next(generator) # [batch, sentence_len, token_len]
    #     print('---\nbatch ', batch.keys(), batch['tokens'].keys(), batch['tokens']['tokens'].shape, batch['label'].shape)
    #     batch = nn_util.move_to_device(batch, 0 if use_gpu else -1)
    #
    #     tokens = batch['tokens']
    #     mask = get_text_field_mask(tokens)
    #     embeddings = model.word_embeddings(tokens)
    #     print("embeddings: ", embeddings.shape)
    #     state = model.encoder(embeddings, mask)
    #     class_logits = model.linear(state)
    #
    #     print("lstm state: ", state.shape, class_logits.shape)
    #
    #     y = model(**batch)
    #     print("model out: ", y)
    #
    # print("\nparams ")
    # for n, p in model.named_parameters():
    #     print(n, p.size())

    # --------- train ------------
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      # serialization_dir="./models/",
                      cuda_device=gpu_id,
                      patience=10,
                      num_epochs=20)

    trainer.train()
# We are going to embed both words and characters, so we create
# embeddings with respect to the vocabulary size of each of the relevant namespaces
# in the vocabulary.
word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"), embedding_dim=10)
char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"), embedding_dim=5)
character_cnn = CnnEncoder(embedding_dim=5, num_filters=2, output_dim=8)

# This is going to embed an integer character tensor of shape (batch_size, max_sentence_length, max_word_length)
# into a 4D tensor with an additional embedding dimension, representing the vector for each character,
# and then apply the character_cnn we defined above over the word dimension, resulting in a tensor of
# shape (batch_size, max_sentence_length, output_dim). Here that last dimension is 8, because the
# CnnEncoder projects its pooled num_filters * len(ngram_filter_sizes) features down to output_dim.
token_character_encoder = TokenCharactersEncoder(embedding=char_embedding, encoder=character_cnn)
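
# A quick shape check (a sketch added for illustration, not part of the original snippet;
# it assumes torch is imported and uses the vocab and encoders defined above). Feeding a
# random character-id tensor through the TokenCharactersEncoder should yield one vector of
# size output_dim=8 per token.
fake_char_ids = torch.randint(1, vocab.get_vocab_size("token_characters"), (2, 7, 10))
print("Character encoder output shape: ", token_character_encoder(fake_char_ids).shape)
# expected: torch.Size([2, 7, 8])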

# Notice that these keys match the keys of the TokenIndexers we used when we created our TextField.
# This is how the text_field_embedder knows which function to apply to which array. 
# There should be a 1-1 mapping between TokenIndexers and TokenEmbedders in your model.
text_field_embedder = BasicTextFieldEmbedder({"tokens": word_embedding, "characters": token_character_encoder})

# Convert the indexed dataset into Pytorch Variables. 
batch = Batch(instances)
tensors = batch.as_tensor_dict(batch.get_padding_lengths())
print("Torch tensors for passing to a model: \n\n", tensors)
print("\n\n")
# tensors is a nested dictionary, first keyed by the
# field name we used (here "sentence"; in most cases you'd have more
# than one field in an instance) and then by the key of each
# token indexer we passed to the TextField.

# This will contain two tensors: one representing each
# word as an index and one representing each _character_
# in each word as an index.
text_field_variables = tensors["sentence"]
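
# As a sketch of the next step (not part of the original excerpt): this nested dict can be
# passed directly to the BasicTextFieldEmbedder defined above, which embeds each array under
# its matching key and concatenates the results along the last dimension (10 word dims + 8
# character dims = 18 here).
embedded_sentence = text_field_embedder(text_field_variables)
print("Embedded sentence shape: ", embedded_sentence.shape)
# expected: (batch_size, max_sentence_length, 18)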
def build_model(args, vocab, pretrained_embs, tasks):
    '''Build model according to arguments.

    args:
        - args (TODO): object whose attributes configure the model
        - vocab (Vocab): vocabulary to build embedders against
        - pretrained_embs (TODO): word embeddings to use
        - tasks: tasks to build classifiers for

    returns:
        - model (MultiTaskModel)
    '''
    d_word, n_layers_highway = args.d_word, args.n_layers_highway

    # Build embedding layers
    if args.glove:
        word_embs = pretrained_embs
        train_embs = bool(args.train_words)
    else:
        log.info("\tLearning embeddings from scratch!")
        word_embs = None
        train_embs = True
    word_embedder = Embedding(
        vocab.get_vocab_size('tokens'),
        d_word,
        weight=word_embs,
        trainable=train_embs,
        padding_index=vocab.get_token_index('@@PADDING@@'))
    d_inp_phrase = 0

    # Handle elmo and cove
    token_embedder = {}
    if args.elmo:
        log.info("\tUsing ELMo embeddings!")
        if args.deep_elmo:
            n_reps = 2
            log.info("\tUsing deep ELMo embeddings!")
        else:
            n_reps = 1
        if args.elmo_no_glove:
            log.info("\tNOT using GLoVe embeddings!")
        else:
            token_embedder = {"words": word_embedder}
            log.info("\tUsing GLoVe embeddings!")
            d_inp_phrase += d_word
        elmo = Elmo(options_file=ELMO_OPT_PATH,
                    weight_file=ELMO_WEIGHTS_PATH,
                    num_output_representations=n_reps)
        d_inp_phrase += 1024
    else:
        elmo = None
        token_embedder = {"words": word_embedder}
        d_inp_phrase += d_word
    text_field_embedder = BasicTextFieldEmbedder(token_embedder) if "words" in token_embedder \
                            else None
    d_hid_phrase = args.d_hid if args.pair_enc != 'bow' else d_inp_phrase

    if args.cove:
        cove_layer = cove_lstm(n_vocab=vocab.get_vocab_size('tokens'),
                               vectors=word_embedder.weight.data)
        d_inp_phrase += 600
        log.info("\tUsing CoVe embeddings!")
    else:
        cove_layer = None

    # Build encoders
    phrase_layer = s2s_e.by_name('lstm').from_params(
        Params({
            'input_size': d_inp_phrase,
            'hidden_size': d_hid_phrase,
            'num_layers': args.n_layers_enc,
            'bidirectional': True
        }))
    if args.pair_enc == 'bow':
        sent_encoder = BoWSentEncoder(
            vocab, text_field_embedder)  # maybe should take in CoVe/ELMO?
        pair_encoder = None  # model will just run sent_encoder on both inputs
    else:  # output will be 2 x d_hid_phrase (+ deep elmo)
        sent_encoder = HeadlessSentEncoder(vocab,
                                           text_field_embedder,
                                           n_layers_highway,
                                           phrase_layer,
                                           dropout=args.dropout,
                                           cove_layer=cove_layer,
                                           elmo_layer=elmo)
    d_single = 2 * d_hid_phrase + (args.elmo and args.deep_elmo) * 1024
    if args.pair_enc == 'simple':  # output will be 4 x [2 x d_hid_phrase (+ deep elmo)]
        pair_encoder = HeadlessPairEncoder(vocab,
                                           text_field_embedder,
                                           n_layers_highway,
                                           phrase_layer,
                                           cove_layer=cove_layer,
                                           elmo_layer=elmo,
                                           dropout=args.dropout)
        d_pair = d_single
    elif args.pair_enc == 'attn':
        log.info("\tUsing attention!")
        d_inp_model = 4 * d_hid_phrase + (args.elmo and args.deep_elmo) * 1024
        d_hid_model = d_hid_phrase  # make it as large as the original sentence encoding
        modeling_layer = s2s_e.by_name('lstm').from_params(
            Params({
                'input_size': d_inp_model,
                'hidden_size': d_hid_model,
                'num_layers': 1,
                'bidirectional': True
            }))
        pair_encoder = HeadlessPairAttnEncoder(vocab,
                                               text_field_embedder,
                                               n_layers_highway,
                                               phrase_layer,
                                               DotProductSimilarity(),
                                               modeling_layer,
                                               cove_layer=cove_layer,
                                               elmo_layer=elmo,
                                               deep_elmo=args.deep_elmo,
                                               dropout=args.dropout)
        d_pair = 2 * d_hid_phrase
        # output will be 4 x [2 x d_hid_model], where d_hid_model = 2 x d_hid_phrase
        #                = 4 x [2 x 2 x d_hid_phrase]

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, pair_encoder)
    build_classifiers(tasks, model, d_pair, d_single)
    if args.cuda >= 0:
        model = model.cuda()
    return model
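
# Illustrative only (not from the original source): build_model expects an argparse-style
# namespace. The attribute names below are exactly the ones the function reads; the values
# are made up for the sketch.
from types import SimpleNamespace

example_args = SimpleNamespace(
    d_word=300, n_layers_highway=2,          # word embedding size / highway layers
    glove=1, train_words=0,                  # use pretrained GloVe, keep it frozen
    elmo=0, deep_elmo=0, elmo_no_glove=0,    # ELMo switches
    cove=0,                                  # CoVe switch
    d_hid=512, n_layers_enc=1,               # encoder size / depth
    pair_enc='simple', dropout=0.2,          # pair encoder type and dropout
    cuda=-1,                                 # CPU
)
# model = build_model(example_args, vocab, pretrained_embs, tasks)  # vocab/embs/tasks as in the caller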
Example #12
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=8,
                        help='batch size (default: 8)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='attention',
                        help='model name (default: attention)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    processed_annotations = pickle.load(open('data/data/p_annotations.p', 'rb'))

    prompts = pd.read_csv('data/data/prompts_merged.csv')

    prompts_dictionary = {}
    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for article_key in processed_annotations:
        for article_item in processed_annotations[article_key]:
            article_item += prompts_dictionary[article_item[-1]]

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    reader = EIDatasetReader(bert_token_indexer, processed_annotations)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Baseline(word_embeddings, vocab)

    global cuda_device
    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        logger.info('Running on GPU')
        model = model.cuda(cuda_device)
    else:
        logger.info('Running on CPU')
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_fields')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #13
        return {"accuracy": self.accuracy.get_metric(reset)}


reader = PosDatasetReader()
train_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp'
                '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp'
                '/master/tutorials/tagger/validation.txt'))
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2,
                          sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)
trainer.train()
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')},
    lazy=True)

vocab = Vocabulary.from_files('/home/earendil/NLP/neural_machine_translation/checkpoint_vocab_epoch_13')

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)

encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128,
                                      feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

attention = DotProductAttention()

max_decoding_steps = 300
model_pred = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                           target_embedding_dim=ZH_EMBEDDING_DIM,
                           target_namespace='target_tokens',
                           attention=attention,
                           beam_size=8,
                           use_bleu=True)

# Reload the trained model.
with open('/home/earendil/NLP/neural_machine_translation/checkpoint_model_epoch_13', 'rb') as f:
    model_pred.load_state_dict(torch.load(f, map_location=torch.device('cpu')))
    model_pred.eval()
Example #15
    def __init__(self):
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

        vocab = Vocabulary.from_instances(self.train_dataset +
                                          self.valid_dataset,
                                          min_count={
                                              'tokens': 3,
                                              'target_tokens': 3
                                          })

        src_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        self.model = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=20,
            target_embedding_dim=trg_embedding_dim,
            target_namespace='target_tokens',
            attention=attention,  # pass attention
            use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
        # The iterator needs the vocab so it can index the data during training
        iterator.index_with(vocab)

        self.model.cuda(cuda_device)

        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=10,
                               validation_metric="+accuracy",
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               num_epochs=1,
                               cuda_device=cuda_device)
class TransformerQA(Model):
    """
    Registered as `"transformer_qa"`, this class implements a reading comprehension model patterned
    after the proposed model in [Devlin et al](git@github.com:huggingface/transformers.git),
    with improvements borrowed from the SQuAD model in the transformers project.

    It predicts start tokens and end tokens with a linear layer on top of word piece embeddings.

    If you want to use this model on SQuAD datasets, you can use it with the
    [`TransformerSquadReader`](../../dataset_readers/transformer_squad#transformersquadreader)
    dataset reader, registered as `"transformer_squad"`.

    Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could
    be more than one instance per question, these metrics are not the official numbers on either SQuAD task.

    To get official numbers for SQuAD v1.1, for example, you can run

    ```
    python -m allennlp_models.rc.tools.transformer_qa_eval
    ```

    # Parameters

    vocab : `Vocabulary`

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This model chooses the embedder according to this setting. You probably want to make sure this is set to
        the same thing as the reader.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 transformer_model_name: str = "bert-base-cased",
                 **kwargs) -> None:
        super().__init__(vocab, **kwargs)
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": PretrainedTransformerEmbedder(transformer_model_name)})
        self._linear_layer = nn.Linear(
            self._text_field_embedder.get_output_dim(), 2)

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._per_instance_metrics = SquadEmAndF1()

    def forward(  # type: ignore
        self,
        question_with_context: Dict[str, Dict[str, torch.LongTensor]],
        context_span: torch.IntTensor,
        cls_index: torch.LongTensor = None,
        answer_span: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters

        question_with_context : `Dict[str, torch.LongTensor]`
            From a `TextField`. The model assumes that this text field contains the context followed by the
            question. It further assumes that the tokens have type ids set such that any token that can be part of
            the answer (i.e., tokens from the context) has type id 0, and any other token (including
            `[CLS]` and `[SEP]`) has type id 1.

        context_span : `torch.IntTensor`
            From a `SpanField`. This marks the span of word pieces in `question` from which answers can come.

        cls_index : `torch.LongTensor`, optional
            A tensor of shape `(batch_size,)` that provides the index of the `[CLS]` token
            in the `question_with_context` for each instance.

            This is needed because the `[CLS]` token is used to indicate that the question
            is impossible.

            If this is `None`, it's assumed that the `[CLS]` token is at index 0 for each instance
            in the batch.

        answer_span : `torch.IntTensor`, optional
            From a `SpanField`. This is the thing we are trying to predict - the span of text that marks the
            answer. If given, we compute a loss that gets included in the output dictionary.

        metadata : `List[Dict[str, Any]]`, optional
            If present, this should contain the question id, and the original texts of context, question, tokenized
            version of both, and a list of possible answers. The length of the `metadata` list should be the
            batch size, and each dictionary should have the keys `id`, `question`, `context`,
            `question_tokens`, `context_tokens`, and `answers`.

        # Returns

        `Dict[str, torch.Tensor]` :
            An output dictionary with the following fields:

            - span_start_logits (`torch.FloatTensor`) :
              A tensor of shape `(batch_size, passage_length)` representing unnormalized log
              probabilities of the span start position.
            - span_end_logits (`torch.FloatTensor`) :
              A tensor of shape `(batch_size, passage_length)` representing unnormalized log
              probabilities of the span end position (inclusive).
            - best_span_scores (`torch.FloatTensor`) :
              The score for each of the best spans.
            - loss (`torch.FloatTensor`, optional) :
              A scalar loss to be optimised, evaluated against `answer_span`.
            - best_span (`torch.IntTensor`, optional) :
              Provided when not in train mode and sufficient metadata given for the instance.
              The result of a constrained inference over `span_start_logits` and
              `span_end_logits` to find the most probable span.  Shape is `(batch_size, 2)`
              and each offset is a token index, unless the best span for an instance
              was predicted to be the `[CLS]` token, in which case the span will be (-1, -1).
            - best_span_str (`List[str]`, optional) :
              Provided when not in train mode and sufficient metadata given for the instance.
              This is the string from the original passage that the model thinks is the best answer
              to the question.

        """
        embedded_question = self._text_field_embedder(question_with_context)
        # shape: (batch_size, sequence_length, 2)
        logits = self._linear_layer(embedded_question)
        # shape: (batch_size, sequence_length, 1)
        span_start_logits, span_end_logits = logits.split(1, dim=-1)
        # shape: (batch_size, sequence_length)
        span_start_logits = span_start_logits.squeeze(-1)
        # shape: (batch_size, sequence_length)
        span_end_logits = span_end_logits.squeeze(-1)

        # Create a mask for `question_with_context` to mask out tokens that are not part
        # of the context.
        # shape: (batch_size, sequence_length)
        possible_answer_mask = torch.zeros_like(
            get_token_ids_from_text_field_tensors(question_with_context),
            dtype=torch.bool)
        for i, (start, end) in enumerate(context_span):
            possible_answer_mask[i, start:end + 1] = True
            # Also unmask the [CLS] token since that token is used to indicate that
            # the question is impossible.
            possible_answer_mask[
                i, 0 if cls_index is None else cls_index[i]] = True

        # Replace the masked values with a very negative constant since we're in log-space.
        # shape: (batch_size, sequence_length)
        span_start_logits = replace_masked_values_with_big_negative_number(
            span_start_logits, possible_answer_mask)
        # shape: (batch_size, sequence_length)
        span_end_logits = replace_masked_values_with_big_negative_number(
            span_end_logits, possible_answer_mask)

        # Now calculate the best span.
        # shape: (batch_size, 2)
        best_spans = get_best_span(span_start_logits, span_end_logits)

        # Sum the span start score with the span end score to get an overall score for the span.
        # shape: (batch_size,)
        best_span_scores = torch.gather(
            span_start_logits, 1,
            best_spans[:, 0].unsqueeze(1)) + torch.gather(
                span_end_logits, 1, best_spans[:, 1].unsqueeze(1))
        best_span_scores = best_span_scores.squeeze(1)

        output_dict = {
            "span_start_logits": span_start_logits,
            "span_end_logits": span_end_logits,
            "best_span_scores": best_span_scores,
        }

        # Compute the loss.
        if answer_span is not None:
            output_dict["loss"] = self._evaluate_span(best_spans,
                                                      span_start_logits,
                                                      span_end_logits,
                                                      answer_span)

        # Gather the string of the best span and compute the EM and F1 against the gold span,
        # if given.
        if not self.training and metadata is not None:
            (
                output_dict["best_span_str"],
                output_dict["best_span"],
            ) = self._collect_best_span_strings(best_spans, context_span,
                                                metadata, cls_index)

        return output_dict

    def _evaluate_span(
        self,
        best_spans: torch.Tensor,
        span_start_logits: torch.Tensor,
        span_end_logits: torch.Tensor,
        answer_span: torch.Tensor,
    ) -> torch.Tensor:
        """
        Calculate the loss against the `answer_span` and also update the span metrics.
        """
        span_start = answer_span[:, 0]
        span_end = answer_span[:, 1]
        self._span_accuracy(best_spans, answer_span)

        start_loss = cross_entropy(span_start_logits,
                                   span_start,
                                   ignore_index=-1)
        big_constant = min(torch.finfo(start_loss.dtype).max, 1e9)
        assert not torch.any(start_loss > big_constant), "Start loss too high"

        end_loss = cross_entropy(span_end_logits, span_end, ignore_index=-1)
        assert not torch.any(end_loss > big_constant), "End loss too high"

        self._span_start_accuracy(span_start_logits, span_start)
        self._span_end_accuracy(span_end_logits, span_end)

        return (start_loss + end_loss) / 2

    def _collect_best_span_strings(
        self,
        best_spans: torch.Tensor,
        context_span: torch.IntTensor,
        metadata: List[Dict[str, Any]],
        cls_index: Optional[torch.LongTensor],
    ) -> Tuple[List[str], torch.Tensor]:
        """
        Collect the string of the best predicted span from the context metadata and
        update `self._per_instance_metrics`, which in the case of SQuAD v1.1 / v2.0
        includes the EM and F1 score.

        This returns a `Tuple[List[str], torch.Tensor]`, where the `List[str]` is the
        predicted answer for each instance in the batch, and the tensor is just the input
        tensor `best_spans` after adjustments so that each answer span corresponds to the
        context tokens only, and not the question tokens. Spans that correspond to the
        `[CLS]` token, i.e. the question was predicted to be impossible, will be set
        to `(-1, -1)`.
        """
        _best_spans = best_spans.detach().cpu().numpy()

        best_span_strings: List[str] = []
        best_span_strings_for_metric: List[str] = []
        answer_strings_for_metric: List[List[str]] = []

        for (metadata_entry, best_span, cspan, cls_ind) in zip(
                metadata,
                _best_spans,
                context_span,
                cls_index or (0 for _ in range(len(metadata))),
        ):
            context_tokens_for_question = metadata_entry["context_tokens"]

            if best_span[0] == cls_ind:
                # Predicting [CLS] is interpreted as predicting the question as unanswerable.
                best_span_string = ""
                # NOTE: even though we've "detached" 'best_spans' above, this still
                # modifies the original tensor in-place.
                best_span[0], best_span[1] = -1, -1
            else:
                best_span -= int(cspan[0])
                assert np.all(best_span >= 0)

                predicted_start, predicted_end = tuple(best_span)

                while (predicted_start >= 0
                       and context_tokens_for_question[predicted_start].idx is
                       None):
                    predicted_start -= 1
                if predicted_start < 0:
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[0]].text}' at index "
                        f"'{best_span[0]}' to an offset in the original text.")
                    character_start = 0
                else:
                    character_start = context_tokens_for_question[
                        predicted_start].idx

                while (predicted_end < len(context_tokens_for_question) and
                       context_tokens_for_question[predicted_end].idx is None):
                    predicted_end += 1
                if predicted_end >= len(context_tokens_for_question):
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[1]].text}' at index "
                        f"'{best_span[1]}' to an offset in the original text.")
                    character_end = len(metadata_entry["context"])
                else:
                    end_token = context_tokens_for_question[predicted_end]
                    character_end = end_token.idx + len(
                        sanitize_wordpiece(end_token.text))

                best_span_string = metadata_entry["context"][
                    character_start:character_end]

            best_span_strings.append(best_span_string)
            answers = metadata_entry.get("answers")
            if answers:
                best_span_strings_for_metric.append(best_span_string)
                answer_strings_for_metric.append(answers)

        if answer_strings_for_metric:
            self._per_instance_metrics(best_span_strings_for_metric,
                                       answer_strings_for_metric)

        return best_span_strings, best_spans

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        output = {
            "start_acc": self._span_start_accuracy.get_metric(reset),
            "end_acc": self._span_end_accuracy.get_metric(reset),
            "span_acc": self._span_accuracy.get_metric(reset),
        }
        if not self.training:
            exact_match, f1_score = self._per_instance_metrics.get_metric(
                reset)
            output["per_instance_em"] = exact_match
            output["per_instance_f1"] = f1_score
        return output

    default_predictor = "transformer_qa"
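
# Usage sketch (added for illustration; not part of the class above). With allennlp-models
# installed and a trained "transformer_qa" archive on disk, inference goes through the
# registered predictor. The archive path below is a placeholder.
import allennlp_models.rc  # noqa: F401  (registers the model and the "transformer_qa" predictor)
from allennlp.predictors.predictor import Predictor

qa_predictor = Predictor.from_path("/path/to/transformer-qa-model.tar.gz")  # placeholder path
prediction = qa_predictor.predict(
    question="Who proposed the model this implementation follows?",
    passage="The reading comprehension model is patterned after the model proposed by Devlin et al.",
)
print(prediction["best_span_str"])  # key name as produced in the model's output_dict above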
Example #17
    #                             vocab_namespace="source_char_tokens",
    #                             vocab=vocab)
    #  src_char_encoder = TokenCharactersEncoder(embedding=src_char_embedding,
    #                                            encoder=GruSeq2VecEncoder(input_size=args.emb_dim,
    #                                                                      hidden_size=args.hid_dim))
    tgt_embedding = Embedding(embedding_dim=args.emb_dim,
                              vocab_namespace="target_tokens",
                              vocab=vocab)
    #  tgt_char_embedding = Embedding(embedding_dim=args.emb_dim,
    #                             vocab_namespace="target_char_tokens",
    #                             vocab=vocab)
    #  tgt_char_encoder = TokenCharactersEncoder(embedding=tgt_char_embedding,
    #                                            encoder=GruSeq2VecEncoder(input_size=args.emb_dim,
    #                                                                      hidden_size=args.hid_dim))
    src_embedders = BasicTextFieldEmbedder({
        "tokens": src_embedding,
        #  "character_tokens": src_char_encoder
    })
    # tgt_embedders = BasicTextFieldEmbedder({
    #     "tokens": tgt_embedding,
    #  "character_tokens": tgt_char_encoder
    # })

    train_loader = SimpleDataLoader.from_dataset_reader(
        reader=dataset_reader,
        data_path=args.train_file,
        batch_size=args.bs,
        shuffle=True)
    train_loader.index_with(vocab)
    val_loader = SimpleDataLoader.from_dataset_reader(
        reader=dataset_reader, data_path=args.valid_file, batch_size=args.bs)
    val_loader.index_with(vocab)
Example #18
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNerEmdRelation, self).__init__(vocab=vocab,
                                                  regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # NER Stuffs
        ############
        ner_params = params.pop("ner")

        # Encoder
        encoder_ner_params = ner_params.pop("encoder")
        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
        self._encoder_ner = encoder_ner

        # Tagger NER - CRF Tagger
        tagger_ner_params = ner_params.pop("tagger")
        tagger_ner = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_ner,
            label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
            constraint_type=tagger_ner_params.pop("constraint_type", None),
            dropout=tagger_ner_params.pop("dropout", None),
            regularizer=regularizer,
        )
        self._tagger_ner = tagger_ner

        ############
        # EMD Stuffs
        ############
        emd_params = params.pop("emd")

        # Encoder
        encoder_emd_params = emd_params.pop("encoder")
        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
        self._encoder_emd = encoder_emd

        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner])
        self._shortcut_text_field_embedder = shortcut_text_field_embedder

        # Tagger: EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        tagger_emd = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder,
            encoder=self._encoder_emd,
            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
            constraint_type=tagger_emd_params.pop("constraint_type", None),
            dropout=tagger_emd_params.pop("dropout", None),
            regularizer=regularizer,
        )
        self._tagger_emd = tagger_emd

        ############################
        # Relation Extraction Stuff
        ############################
        relation_params = params.pop("relation")

        # Encoder
        encoder_relation_params = relation_params.pop("encoder")
        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
        self._encoder_relation = encoder_relation

        shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner, self._encoder_emd])
        self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation

        # Tagger: Relation
        tagger_relation_params = relation_params.pop("tagger")
        tagger_relation = RelationExtractor(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_relation,
            context_layer=self._encoder_relation,
            d=tagger_relation_params.pop_int("d"),
            l=tagger_relation_params.pop_int("l"),
            n_classes=tagger_relation_params.pop("n_classes"),
            activation=tagger_relation_params.pop("activation"),
        )
        self._tagger_relation = tagger_relation

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #19
                    '/master/tutorials/tagger/validation.txt'))

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6

    model_params = Params({
        'type': 'lstm',
        'input_size': EMBEDDING_DIM,
        'hidden_size': HIDDEN_DIM
    })

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding})
    lstm = Seq2SeqEncoder.from_params(model_params)

    model = POSTagger(word_embedding, lstm, vocab)

    optimizer = optim.SGD(model.parameters(), lr=0.1)

    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
Example #20
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNerEmdCoref, self).__init__(vocab=vocab,
                                               regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # NER Stuff
        ############
        ner_params = params.pop("ner")

        # Encoder
        encoder_ner_params = ner_params.pop("encoder")
        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
        self._encoder_ner = encoder_ner

        # Tagger NER - CRF Tagger
        tagger_ner_params = ner_params.pop("tagger")
        tagger_ner = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_ner,
            label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
            constraint_type=tagger_ner_params.pop("constraint_type", None),
            dropout=tagger_ner_params.pop("dropout", None),
            regularizer=regularizer,
        )
        self._tagger_ner = tagger_ner

        ############
        # EMD Stuff
        ############
        emd_params = params.pop("emd")

        # Encoder
        encoder_emd_params = emd_params.pop("encoder")
        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
        self._encoder_emd = encoder_emd

        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner])
        self._shortcut_text_field_embedder = shortcut_text_field_embedder

        # Tagger: EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        tagger_emd = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder,
            encoder=self._encoder_emd,
            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
            constraint_type=tagger_emd_params.pop("constraint_type", None),
            dropout=tagger_emd_params.pop("dropout", None),
            regularizer=regularizer,
        )
        self._tagger_emd = tagger_emd

        ##############
        # Coref Stuff
        ##############
        coref_params = params.pop("coref")

        # Encoder
        encoder_coref_params = coref_params.pop("encoder")
        encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
        self._encoder_coref = encoder_coref

        shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner, self._encoder_emd])
        self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref

        # Tagger: Coreference
        tagger_coref_params = coref_params.pop("tagger")
        eval_on_gold_mentions = tagger_coref_params.pop_bool(
            "eval_on_gold_mentions", False)
        init_params = tagger_coref_params.pop("initializer", None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        tagger_coref = CoreferenceCustom(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_coref,
            context_layer=self._encoder_coref,
            mention_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("mention_feedforward")),
            antecedent_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("antecedent_feedforward")),
            feature_size=tagger_coref_params.pop_int("feature_size"),
            max_span_width=tagger_coref_params.pop_int("max_span_width"),
            spans_per_word=tagger_coref_params.pop_float("spans_per_word"),
            max_antecedents=tagger_coref_params.pop_int("max_antecedents"),
            lexical_dropout=tagger_coref_params.pop_float(
                "lexical_dropout", 0.2),
            initializer=initializer,
            regularizer=regularizer,
            eval_on_gold_mentions=eval_on_gold_mentions,
        )
        self._tagger_coref = tagger_coref
        if eval_on_gold_mentions:
            self._tagger_coref._eval_on_gold_mentions = True

        logger.info("Multi-Task Learning Model has been instantiated.")
class TransformerQA(Model):
    """
    This class implements a reading comprehension model patterned after the model proposed in
    https://arxiv.org/abs/1810.04805 (Devlin et al.), with improvements borrowed from the SQuAD model in the
    transformers project.

    It predicts start tokens and end tokens with a linear layer on top of word piece embeddings.

    Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could
    be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get
    official numbers, run the script in scripts/transformer_qa_eval.py.

    Parameters
    ----------
    vocab : ``Vocabulary``
    transformer_model_name : ``str``, optional (default=``bert-base-cased``)
        This model chooses the embedder according to this setting. You probably want to make sure this is set to
        the same thing as the reader.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 transformer_model_name: str = "bert-base-cased",
                 hidden_size=768,
                 **kwargs) -> None:
        super().__init__(vocab, **kwargs)
        self._text_field_embedder = BasicTextFieldEmbedder({
            "tokens":
            PretrainedTransformerEmbedder(transformer_model_name,
                                          hidden_size=hidden_size,
                                          task="QA")
        })
        self._linear_layer = nn.Linear(
            self._text_field_embedder.get_output_dim(), 2)

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._per_instance_metrics = SquadEmAndF1()

    def forward(  # type: ignore
        self,
        question_with_context: Dict[str, Dict[str, torch.LongTensor]],
        context_span: torch.IntTensor,
        answer_span: Optional[torch.IntTensor] = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        question_with_context : ``Dict[str, Dict[str, torch.LongTensor]]``
            From a ``TextField``. The model assumes that this text field contains the question followed by the
            context. It further assumes that the tokens have type ids set such that any token that can be part
            of the answer (i.e., tokens from the context) has type id 0, and any other token (including [CLS]
            and [SEP]) has type id 1.
        context_span : ``torch.IntTensor``
            From a ``SpanField``. This marks the span of word pieces in ``question_with_context`` from which
            answers can come.
        answer_span : ``torch.IntTensor``, optional
            From a ``SpanField``. This is the thing we are trying to predict - the span of text that marks the
            answer. If given, we compute a loss that gets included in the output dictionary.
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question id, the original texts of the context and the question,
            the tokenized versions of both, and a list of possible answers. The length of the ``metadata`` list
            should be the batch size, and each dictionary should have the keys ``id``, ``question``, ``context``,
            ``question_tokens``, ``context_tokens``, and ``answers``.

        Returns
        -------
        An output dictionary consisting of:
        span_start_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span start position.
        span_start_probs : torch.FloatTensor
            The result of ``softmax(span_start_logits)``.
        span_end_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span end position (inclusive).
        span_end_probs : torch.FloatTensor
            The result of ``softmax(span_end_logits)``.
        best_span : torch.IntTensor
            The result of a constrained inference over ``span_start_logits`` and
            ``span_end_logits`` to find the most probable span.  Shape is ``(batch_size, 2)``
            and each offset is a token index.
        best_span_scores : torch.FloatTensor
            The score for each of the best spans.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        best_span_str : List[str]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        """
        embedded_question = self._text_field_embedder(question_with_context)
        logits = self._linear_layer(embedded_question)
        span_start_logits, span_end_logits = logits.split(1, dim=-1)
        span_start_logits = span_start_logits.squeeze(-1)
        span_end_logits = span_end_logits.squeeze(-1)

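        # Only word pieces inside the context span are valid answer positions; mask
        # out everything else before computing probabilities and picking the best span.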
        possible_answer_mask = torch.zeros_like(
            get_token_ids_from_text_field_tensors(question_with_context),
            dtype=torch.bool)
        for i, (start, end) in enumerate(context_span):
            possible_answer_mask[i, start:end + 1] = True

        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       possible_answer_mask,
                                                       -1e32)
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     possible_answer_mask,
                                                     -1e32)
        span_start_probs = torch.nn.functional.softmax(span_start_logits,
                                                       dim=-1)
        span_end_probs = torch.nn.functional.softmax(span_end_logits, dim=-1)
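        # Constrained decoding: find the highest-scoring span with start <= end, and
        # score it as the sum of its start and end logits.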
        best_spans = get_best_span(span_start_logits, span_end_logits)
        best_span_scores = torch.gather(
            span_start_logits, 1,
            best_spans[:, 0].unsqueeze(1)) + torch.gather(
                span_end_logits, 1, best_spans[:, 1].unsqueeze(1))
        best_span_scores = best_span_scores.squeeze(1)

        output_dict = {
            "span_start_logits": span_start_logits,
            "span_start_probs": span_start_probs,
            "span_end_logits": span_end_logits,
            "span_end_probs": span_end_probs,
            "best_span": best_spans,
            "best_span_scores": best_span_scores,
        }

        # Compute the loss for training.
        if answer_span is not None:
            span_start = answer_span[:, 0]
            span_end = answer_span[:, 1]
            span_mask = span_start != -1
            self._span_accuracy(best_spans, answer_span,
                                span_mask.unsqueeze(-1).expand_as(best_spans))

            start_loss = cross_entropy(span_start_logits,
                                       span_start,
                                       ignore_index=-1)
            if torch.any(start_loss > 1e9):
                logger.critical("Start loss too high (%r)", start_loss)
                logger.critical("span_start_logits: %r", span_start_logits)
                logger.critical("span_start: %r", span_start)
                assert False

            end_loss = cross_entropy(span_end_logits,
                                     span_end,
                                     ignore_index=-1)
            if torch.any(end_loss > 1e9):
                logger.critical("End loss too high (%r)", end_loss)
                logger.critical("span_end_logits: %r", span_end_logits)
                logger.critical("span_end: %r", span_end)
                assert False

            loss = (start_loss + end_loss) / 2

            self._span_start_accuracy(span_start_logits, span_start, span_mask)
            self._span_end_accuracy(span_end_logits, span_end, span_mask)

            output_dict["loss"] = loss

        # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
        if metadata is not None:
            best_spans = best_spans.detach().cpu().numpy()

            output_dict["best_span_str"] = []
            context_tokens = []
            for metadata_entry, best_span in zip(metadata, best_spans):
                context_tokens_for_question = metadata_entry["context_tokens"]
                context_tokens.append(context_tokens_for_question)

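                # best_span indexes into the full question-with-context sequence; shift it
                # past the leading special token, the question tokens, and the two
                # separators so that it indexes into context_tokens instead.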
                best_span -= 1 + len(metadata_entry["question_tokens"]) + 2
                assert np.all(best_span >= 0)

                predicted_start, predicted_end = tuple(best_span)

                while (predicted_start >= 0
                       and context_tokens_for_question[predicted_start].idx is
                       None):
                    predicted_start -= 1
                if predicted_start < 0:
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[0]].text}' at index "
                        f"'{best_span[0]}' to an offset in the original text.")
                    character_start = 0
                else:
                    character_start = context_tokens_for_question[
                        predicted_start].idx

                while (predicted_end < len(context_tokens_for_question) and
                       context_tokens_for_question[predicted_end].idx is None):
                    predicted_end += 1
                if predicted_end >= len(context_tokens_for_question):
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[1]].text}' at index "
                        f"'{best_span[1]}' to an offset in the original text.")
                    character_end = len(metadata_entry["context"])
                else:
                    end_token = context_tokens_for_question[predicted_end]
                    character_end = end_token.idx + len(
                        sanitize_wordpiece(end_token.text))

                best_span_string = metadata_entry["context"][
                    character_start:character_end]
                output_dict["best_span_str"].append(best_span_string)

                answers = metadata_entry.get("answers")
                if answers:
                    self._per_instance_metrics(best_span_string, answers)
            output_dict["context_tokens"] = context_tokens
        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        exact_match, f1_score = self._per_instance_metrics.get_metric(reset)
        return {
            "start_acc": self._span_start_accuracy.get_metric(reset),
            "end_acc": self._span_end_accuracy.get_metric(reset),
            "span_acc": self._span_accuracy.get_metric(reset),
            "per_instance_em": exact_match,
            "per_instance_f1": f1_score,
        }
Example #22
from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
import torch

# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# see the exercises above.
token_tensor = {'tokens': {'tokens': torch.LongTensor([1, 3, 2, 1, 4, 3])}}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')

glove_file = 'https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

# This is for embedding each token.
embedding = Embedding(vocab=vocab,
                      vocab_namespace='token_vocab',
                      embedding_dim=50,
                      pretrained_file=glove_file)

embedder = BasicTextFieldEmbedder(token_embedders={'tokens': embedding})

embedded_tokens = embedder(token_tensor)
print(embedded_tokens.size())
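# With six token ids and a 50-dimensional GloVe embedding, this should print
# torch.Size([6, 50]).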
Example #23
        config["validation_cont"]["candidate_set_path"],
        config["validation_cont"]["candidate_set_from_to"][1])

    # embedding layer (use pre-trained, but make it trainable as well)
    if config["token_embedder_type"] == "embedding":
        vocab = Vocabulary.from_files(config["vocab_directory"])
        tokens_embedder = Embedding.from_params(
            vocab,
            Params({
                "pretrained_file": config["pre_trained_embedding"],
                "embedding_dim": config["pre_trained_embedding_dim"],
                "trainable": config["train_embedding"],
                "padding_index": 0,
                "sparse": config["sparse_gradient_embedding"]
            }))
        word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

    elif config["token_embedder_type"] == "fasttext":
        vocab = None  #FastTextVocab(config["fasttext_vocab_mapping"])
        tokens_embedder = FastTextEmbeddingBag(
            numpy.load(config["fasttext_weights"]),
            sparse=True,
            requires_grad=config["train_embedding"],
            mode=config["fasttext_merge_mode"])
        word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder},
                                               allow_unmatched_keys=True,
                                               embedder_to_indexer_map={
                                                   "tokens": {
                                                       "tokens": "tokens",
                                                       "offsets": "offsets"
                                                   }
Example #24
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder

bert_embedder = PretrainedBertEmbedder(
    pretrained_model="./biobert_v1.1_pubmed/weights.tar.gz",
    top_layer_only=True,
    requires_grad=False
)

#print('Bert Model:', bert_embedder.bert_model.encoder.layer[11])
for param in bert_embedder.bert_model.encoder.layer[8:].parameters():
    param.requires_grad = True


word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
    token_embedders={"tokens": bert_embedder},
    allow_unmatched_keys=True)

# %%
BERT_DIM = word_embeddings.get_output_dim()
print('Bert dim:', BERT_DIM)

class BertSentencePooler(Seq2VecEncoder):
    def __init__(self, vocab):
        super().__init__(vocab)

    def forward(self, embs: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        bert_out = embs[:, :, 0]
        return bert_out
    
    def get_output_dim(self) -> int:
Example #25
# options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
# weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

# Use the 'Small' pre-trained model
# options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
#                '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
# weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
#               '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                  min_count={'tokens': 3})

# Pass in the ElmoTokenEmbedder instance instead
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

# The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
elmo_embedding_dim = 1024
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, lstm, vocab)
optimizer = optim.AdamW(model.parameters())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

iterator = BucketIterator(batch_size=32,
                          sorting_keys=[("tokens", "num_tokens")])
def main():
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using a pre-trained file.
    # This takes a while if you run this script for the first time

    # Original
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Medium
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
    elmo_embedding_dim = 256
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# Note that we added the batch dimension at the front.  You choose the 'indexer1'
# name when you configure your data processing code.
token_tensor = {'indexer1': {'tokens': torch.LongTensor([[1, 3, 2, 9, 4, 3]])}}

# You would typically get the number of embeddings here from the vocabulary;
# if you use `allennlp train`, there is a separate process for instantiating the
# Embedding object using the vocabulary that you don't need to worry about for
# now.
embedding = Embedding(num_embeddings=10, embedding_dim=3)

# This 'indexer1' key must match the 'indexer1' key in the `token_tensor` above.
# We use these names to align the TokenIndexers used in the data code with the
# TokenEmbedders that do the work on the model side.
embedder = BasicTextFieldEmbedder(token_embedders={'indexer1': embedding})

embedded_tokens = embedder(token_tensor)
print("Using the TextFieldEmbedder:", embedded_tokens)
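# With embedding_dim=3 and a (1, 6) id tensor, embedded_tokens has shape (1, 6, 3):
# one batch entry, six tokens, three dimensions each.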

# As we've said a few times, what's going on inside is that we match keys between
# the token tensor and the token embedders, then pass the inner dictionary to the
# token embedder.  The above lines perform the following logic:
embedded_tokens = embedding(**token_tensor['indexer1'])
print("Using the Embedding directly:", embedded_tokens)

# This is what gets created by TextField.as_tensor with a TokenCharactersIndexer
# Note that we added the batch dimension at the front. Don't worry too much
# about the magic 'token_characters' key - that is hard-coded to be produced
# by the TokenCharactersIndexer, and accepted by TokenCharactersEncoder;
# you don't have to produce those yourself in normal settings, it's done for you.
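# A minimal sketch of how such a tensor is consumed, assuming the usual
# (batch_size, num_tokens, num_chars) shape from a TokenCharactersIndexer and an
# illustrative 'indexer2' key; the ids and dimensions below are toy values.
import torch
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder

token_tensor = {
    'indexer2': {
        'token_characters': torch.tensor(
            [[[1, 3, 0], [4, 2, 3], [1, 9, 5], [6, 0, 0]]])
    }
}

# Embed each character id, then pool each token's character embeddings with a
# small CNN so that every token ends up as a single vector.
character_embedding = Embedding(num_embeddings=10, embedding_dim=3)
cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=(3,))
token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

# As before, the 'indexer2' key has to match between the token tensor and the
# token embedders.
embedder = BasicTextFieldEmbedder(token_embedders={'indexer2': token_encoder})
embedded_tokens = embedder(token_tensor)
print("Using the TokenCharactersEncoder:", embedded_tokens.size())  # (1, 4, 4)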
Example #28
def main(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(
        args.dataset_paths_file, 
        args.dataset_path_prefix
    )
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name] for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES]
    )
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths, readers, data_split="train")
        validation_dataset = read_datasets(dataset_paths, readers, data_split="dev")

        vocab = create_vocab([train_dataset, validation_dataset])

        # Special case for CCG
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
    else:
        vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(
            partition_key="dataset", batch_size=BATCH_SIZE, cache_instances=True
        )
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values())
        )

        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"), "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"] for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(
        partition_key="dataset", batch_size=BATCH_SIZE * 2
    )
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(
        model, readers, test_iterator, test_filepaths, cuda_device=CUDA_DEVICE
    )
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)