Example 1
def _get_reader(config,
                skip_labels=False,
                bert_max_length=None,
                reader_max_length=150,
                read_first=None):
    # Build one token indexer per configured embedder, keyed by embedder name.
    indexers = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            indexers[embedder_config.name] = ELMoTokenCharactersIndexer()
        elif embedder_config.name.endswith('bert'):
            bert_path = os.path.join(config.data.pretrained_models_dir,
                                     embedder_config.name)
            indexers[
                embedder_config.name] = PretrainedTransformerMismatchedIndexer(
                    model_name=bert_path,
                    tokenizer_kwargs={'do_lower_case': False},
                    max_length=bert_max_length)
        elif embedder_config.name == 'char_bilstm':
            indexers[embedder_config.name] = TokenCharactersIndexer()
        else:
            raise ValueError('Unknown embedder {}'.format(embedder_config.name))

    return UDDatasetReader(indexers,
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)
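
The function assumes a nested config object exposing embedder.models (each entry carrying a name attribute) and data.pretrained_models_dir. Below is a minimal sketch of a call, using types.SimpleNamespace as a hypothetical stand-in for the project's real config class and made-up file names:

from types import SimpleNamespace

# Hypothetical config mirroring the attributes _get_reader reads.
config = SimpleNamespace(
    embedder=SimpleNamespace(models=[SimpleNamespace(name='elmo'),
                                     SimpleNamespace(name='ru_bert')]),
    data=SimpleNamespace(pretrained_models_dir='pretrained_models'))

reader = _get_reader(config, bert_max_length=512, read_first=100)
instances = reader.read('train.conllu')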
Example 2
def _get_reader(config,
                skip_labels=False,
                bert_max_length=None,
                reader_max_length=150,
                read_first=None):
    # A single configured embedder, or 'both' for ELMo and RuBERT together.
    indexer = None
    if config.embedder.name == 'elmo':
        indexer = ELMoTokenCharactersIndexer()
    elif config.embedder.name.endswith('bert'):
        bert_path = os.path.join(config.data.pretrained_models_dir,
                                 config.embedder.name)
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=bert_path,
            tokenizer_kwargs={'do_lower_case': False},
            max_length=bert_max_length)
    elif config.embedder.name == 'both':
        elmo_indexer = ELMoTokenCharactersIndexer()

        bert_path = os.path.join(config.data.pretrained_models_dir, 'ru_bert')
        bert_indexer = PretrainedTransformerMismatchedIndexer(
            model_name=bert_path,
            tokenizer_kwargs={'do_lower_case': False},
            max_length=bert_max_length)

        return UDDatasetReader({
            'elmo': elmo_indexer,
            'ru_bert': bert_indexer
        },
                               skip_labels=skip_labels,
                               max_length=reader_max_length,
                               read_first=read_first)
    else:
        raise ValueError('Unknown embedder {}'.format(config.embedder.name))

    return UDDatasetReader({config.embedder.name: indexer},
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)
Example 3
def load_model(model, params, model_file, gpuid):
    # select a bert specific indexer
    if params["with_bert"]:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=params["bert_name"], max_length=params["bert_max_len"])
    # separate by spaces
    else:
        from allennlp.data.token_indexers import SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()
    # Select a device
    if gpuid >= 0 and torch.cuda.is_available():
        cuda_device = gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
        model = model.cpu()
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model.load_state_dict(
        torch.load(model_file, map_location='cpu' if cuda_device < 0 else None))
    model.eval()
    return model, indexer
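
The returned indexer can then be dropped into a dataset reader so that evaluation data is indexed the same way as during training. A hedged sketch, reusing the TaggerDatasetReader shown in Example 4 with hypothetical file and model names:

params = {"with_bert": True, "bert_name": "bert-base-cased", "bert_max_len": 512}
# model built beforehand, e.g. via Tagger.build(params, vocab) as in Example 4
model, indexer = load_model(model, params, "model_dir/save_1.save", gpuid=-1)
reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
test_dataset = reader.read("test.txt")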
Example 4
def main():

    opts = options()

    # select a bert specific indexer
    if opts.with_bert:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=opts.bert_name, max_length=opts.bert_max_len)
    # separate by spaces
    else:
        from allennlp.data.token_indexers import SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()

    reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
    train_dataset = reader.read(opts.train_file)
    valid_dataset = reader.read(opts.valid_file)
    params = Tagger.opts2params(opts)

    with open(opts.model_dir + "/params.pkl", mode='wb') as f:
        pickle.dump(params, f)

    vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                      min_count={'tokens': opts.min_freq})
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    train_data_loader = PyTorchDataLoader(train_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              train_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))
    valid_data_loader = PyTorchDataLoader(valid_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              valid_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))

    model = Tagger.build(params, vocab)
    if opts.gpuid >= 0 and torch.cuda.is_available():
        cuda_device = opts.gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # select an optimizer for fine-tuning
    if opts.with_bert:
        from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = HuggingfaceAdamWOptimizer(model_parameters=parameters,
                                              lr=0.0003,
                                              parameter_groups=[
                                                  ([".*transformer.*"], {
                                                      "lr": 1e-05
                                                  })
                                              ])
    # optimizer for random initialization
    else:
        import torch.optim as optim
        optimizer = optim.Adam(model.parameters(), lr=0.001)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=valid_data_loader,
        num_epochs=1,
        use_amp=opts.use_amp,
        num_gradient_accumulation_steps=opts.num_gradient_accumulation_steps,
        cuda_device=cuda_device)

    vocab.save_to_files(opts.model_dir + "/vocab")

    # Train one epoch at a time (num_epochs=1 above) so a checkpoint can be
    # written whenever the validation accuracy improves.
    best_accuracy = 0.0
    for i in range(opts.epochs):
        epoch = i + 1
        print('Epoch: {}'.format(epoch))
        info = trainer.train()
        print(info)
        if info["validation_accuracy"] > best_accuracy:
            best_accuracy = info["validation_accuracy"]
            with open(opts.model_dir + "/save_" + str(epoch) + ".save",
                      'wb') as f_model:
                torch.save(model.state_dict(), f_model)
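
After training, the pickled params, the saved vocabulary, and the best checkpoint can be reloaded to rebuild the model for inference. A minimal sketch under the assumptions above; the model_dir path and the epoch-1 checkpoint name are hypothetical:

import pickle
import torch
from allennlp.data.vocabulary import Vocabulary

with open("model_dir/params.pkl", "rb") as f:
    params = pickle.load(f)
vocab = Vocabulary.from_files("model_dir/vocab")
model = Tagger.build(params, vocab)
model.load_state_dict(torch.load("model_dir/save_1.save", map_location="cpu"))
model.eval()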