Example #1
def train(args,
          model,
          dataset_reader,
          train_loader,
          device=0,
          val_loader=None,
          test_data=None,
          num_epochs=10,
          patience=None,
          serialization_dir=None):
    optimizer = AdamOptimizer(model.named_parameters(),
                              lr=args.lr,
                              weight_decay=args.l2)
    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        validation_data_loader=val_loader,
        cuda_device=device,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        patience=patience,
        grad_clipping=args.clip,
    )
    trainer.train()

    if test_data is not None:
        predictor = Seq2SeqPredictor(model, dataset_reader)
        for instance in itertools.islice(test_data, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])
            print('-' * 50)
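This snippet assumes its imports come from elsewhere in the module; a minimal sketch of what they likely are (Seq2SeqPredictor ships with allennlp-models in AllenNLP 1.x, or may instead be a project-local predictor):

import itertools

from allennlp.training import GradientDescentTrainer
from allennlp.training.optimizers import AdamOptimizer
# Assumption: the predictor comes from allennlp-models; adjust if the
# surrounding project defines its own Seq2SeqPredictor.
from allennlp_models.generation import Seq2SeqPredictor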
Example #2
def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')

    vocab = Vocabulary.from_instances(text8,
                                      min_count={
                                          'token_in': 5,
                                          'token_out': 5
                                      })

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')
    text8.index_with(vocab)

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    data_loader = DataLoader(text8, batch_size=BATCH_SIZE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=data_loader,
                                     num_epochs=5,
                                     cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('SimLex-999 Spearman correlation: {}'.format(rho))
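get_synonyms and evaluate_embeddings are helpers defined elsewhere in this project; a rough sketch of a cosine-similarity get_synonyms, assuming the 'token_in' namespace used above:

import torch

def get_synonyms(token, embedding, vocab, num_synonyms=10):
    # Sketch: rank every word in the 'token_in' namespace by cosine
    # similarity to the query token's embedding vector.
    token_id = vocab.get_token_index(token, 'token_in')
    emb = embedding.weight
    token_vec = emb[token_id]
    sims = (emb @ token_vec) / (emb.norm(dim=-1) * token_vec.norm() + 1e-10)
    top_ids = sims.argsort(descending=True).tolist()
    return [vocab.get_token_from_index(i, 'token_in')
            for i in top_ids if i != token_id][:num_synonyms]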
Example #3
def train_model(model, lr, wd, train_loader, validation_loader, patience,
                epochs, cuda_device, serialization_dir):
    """Train an initialized model"""

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    trainer = GradientDescentTrainer(model=model,
                                     data_loader=train_loader,
                                     validation_data_loader=validation_loader,
                                     optimizer=optimizer,
                                     patience=patience,
                                     num_epochs=epochs,
                                     cuda_device=cuda_device,
                                     serialization_dir=serialization_dir)

    fold_metrics = trainer.train()
    # Save embedding weights for visualization
    # n = word_embeddings.token_embedder_tokens.weight.item()
    # pd.DataFrame(n).to_csv(os.path.join(TENSORBOARD_DIR, run_name, 'model_weights.tsv'),
    #                       header=None, index=None, sep='\t')

    return fold_metrics, model
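The commented-out weight export above uses .weight.item(), which only works for scalar tensors; a hedged sketch of an equivalent export for TensorBoard's embedding projector, reusing the names from that comment (word_embeddings, TENSORBOARD_DIR, run_name):

import os

import pandas as pd

# Sketch: dump the token embedding matrix to a tab-separated file that the
# TensorBoard embedding projector can load.
weights = word_embeddings.token_embedder_tokens.weight.detach().cpu().numpy()
pd.DataFrame(weights).to_csv(
    os.path.join(TENSORBOARD_DIR, run_name, 'model_weights.tsv'),
    header=False, index=False, sep='\t')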
Example #4
    # Train the model

    with tempfile.TemporaryDirectory() as serialization_dir:
        parameters = [
            [n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters)
        trainer = GradientDescentTrainer(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_loader,
            validation_data_loader=valid_loader,
            num_epochs=20,
            optimizer=optimizer,
            cuda_device=0)

        trainer.train()

    # Run the model

    predictor = IntentEstimatorPredictor(model, dataset_reader)

    text = dataset_reader.tokenizer('東京駅から富山駅まで行きたいです')
    output = predictor.predict(text)
    print(text)
    print([(vocab.get_token_from_index(label_id, 'labels'), prob)
           for label_id, prob in enumerate(output['probs'])])

    text = dataset_reader.tokenizer('富山の天気はどうかな')
    output = predictor.predict(text)
    print(text)
    print([(vocab.get_token_from_index(label_id, 'labels'), prob)
           for label_id, prob in enumerate(output['probs'])])
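IntentEstimatorPredictor is defined in the surrounding project; a guess at a minimal Predictor wrapper with the same call shape, assuming the dataset reader exposes text_to_instance(tokens):

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors import Predictor

class IntentEstimatorPredictor(Predictor):
    # Sketch only: wraps a pre-tokenized sentence into an Instance and runs
    # the model forward pass; the real class may differ.
    def predict(self, tokens) -> JsonDict:
        return self.predict_json({"tokens": tokens})

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(json_dict["tokens"])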
Example #5
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1


# don't put a slash after this?
serialization_dir = f"tmp-{expname}/"

if os.path.exists(serialization_dir):
    print("serialization directory exists, removing...")
    shutil.rmtree(serialization_dir)

batch_size = 32
validation_batch_size = 64

data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=allennlp_collate)
validation_data_loader = DataLoader(validation_dataset, batch_size=validation_batch_size, collate_fn=allennlp_collate)

trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=data_loader,
                                 validation_data_loader=validation_data_loader,
                                 patience=10,
                                 num_epochs=75,
                                 validation_metric="+f1-measure-overall",
                                 cuda_device=cuda_device,
                                 serialization_dir=serialization_dir)

metrics = trainer.train()
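trainer.train() returns a dict of final training and validation metrics; a small sketch of persisting it next to the checkpoints the trainer writes into serialization_dir (the metric key names depend on the model):

import json
import os

# Sketch: save the returned metrics dict alongside the serialized model.
with open(os.path.join(serialization_dir, "final_metrics.json"), "w") as fp:
    json.dump(metrics, fp, indent=2)
print(metrics.get("best_validation_f1-measure-overall"))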
Example #6
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pretrained word vectors (the URL below points to fastText crawl vectors)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it's been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=["tokens"])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=["tokens"])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)  # TODO don't drop last
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)
    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
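For context, attacks.hotflip_attack implements the first-order (HotFlip-style) token replacement used in the universal-triggers work; a simplified sketch of the candidate scoring it is based on, not the exact library code:

import torch

def hotflip_candidates(averaged_grad, embedding_matrix, num_candidates=40):
    # Sketch: score every vocabulary embedding at each trigger position by its
    # dot product with the averaged loss gradient, then keep the top-k per position.
    # averaged_grad: (trigger_len, emb_dim); embedding_matrix: (vocab_size, emb_dim)
    scores = torch.einsum("td,vd->tv", averaged_grad, embedding_matrix)
    return torch.topk(scores, num_candidates, dim=1).indices  # (trigger_len, k)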
Example #7
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    # train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    # dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')
    train_dataset = reader.read('Treebank_train.txt')
    print(type(train_dataset))
    print(train_dataset)

    dev_dataset = reader.read('Treebank_dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    train_data_loader = DataLoader(train_dataset,
                                   batch_sampler=BucketBatchSampler(
                                       train_dataset,
                                       batch_size=32,
                                       sorting_keys=["tokens"]))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_sampler=BucketBatchSampler(
                                     dev_dataset,
                                     batch_size=32,
                                     sorting_keys=["tokens"]))

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # logits = predictor.predict('This is the best movie ever!')['logits']
    logits = predictor.predict('''On August 28, Mustafa varank, Turkey's minister of industry and technology, said Turkey plans to become a production center for automotive batteries by investing in cells, battery modules and battery packs. The country also hopes to become Europe's largest and the world's top five electric and autopilot auto makers by 2030. In order to achieve this goal, varank said Turkey would support the investment of electronic and electrical companies in the automotive industry. Varank points out that modern Turkish plants will cover half of the world's I20 capacity, 90% of which is expected to be exported abroad. "It took 27 months to build this line, with a total investment of $194 million. The productivity of I20 in Turkey will exceed 60%, which will increase gradually. In the past year, Turkey has developed EMUs, SUVs, tractors and excavators equipped with electric engines, and now plans to develop electric vehicle technology. Varank said Turkey would build an ecosystem to produce key components for electric vehicles, such as electric engines, inverters, charging equipment and compressors. He stressed that the automobile industry is the "locomotive" of Turkey's industrial sector, which also provides advantages for other industries. In May and June this year, Turkey's industrial production increased by double-digit compared with the same period last year. In the first half of 2020, Turkey issued 1200 investment award certificates worth US $108 billion (about US $16.7 billion) and created 163000 new jobs. On August 28, Turkey released its economic confidence index for August, and varank said: "the positive trend continues, and our citizens have more positive expectations for the post epidemic period." Choi Hong GHI, South Korea's ambassador to Ankara, said that Hyundai Motor, one of the world's top five auto manufacturers, established its first overseas factory in Turkey 23 years ago. "Hyundai's zmit factory is a symbol of economic cooperation between the two countries, which directly promotes employment and exports in Turkey." Eckkyun Oh, chief executive of Hyundai assan, said the company has produced more than two million cars in Turkey, most of which are exported to countries in Europe, the Middle East and North Africa. "We will produce 100000 new I20 cars here," he said.''')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #8
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    train_data_loader = DataLoader(train_dataset,
                                   batch_sampler=BucketBatchSampler(
                                       train_dataset,
                                       batch_size=32,
                                       sorting_keys=["tokens"]))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_sampler=BucketBatchSampler(
                                     dev_dataset,
                                     batch_size=32,
                                     sorting_keys=["tokens"]))

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #9
def main():

    opts = options()

    # select a BERT-specific indexer
    if opts.with_bert:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=opts.bert_name, max_length=opts.bert_max_len)
    # otherwise fall back to a plain single-id indexer (whitespace-separated tokens)
    else:
        from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()

    reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
    train_dataset = reader.read(opts.train_file)
    valid_dataset = reader.read(opts.valid_file)
    params = Tagger.opts2params(opts)

    with open(opts.model_dir + "/params.pkl", mode='wb') as f:
        pickle.dump(params, f)

    vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                      min_count={'tokens': opts.min_freq})
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    train_data_loader = PyTorchDataLoader(train_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              train_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))
    valid_data_loader = PyTorchDataLoader(valid_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              valid_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))

    model = Tagger.build(params, vocab)
    if torch.cuda.is_available():
        cuda_device = opts.gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # select an optimizer for fine-tuning
    if opts.with_bert:
        from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = HuggingfaceAdamWOptimizer(model_parameters=parameters,
                                              lr=0.0003,
                                              parameter_groups=[
                                                  ([".*transformer.*"], {
                                                      "lr": 1e-05
                                                  })
                                              ])
    # optimizer for random initialization
    else:
        import torch.optim as optim
        optimizer = optim.Adam(model.parameters(), lr=0.001)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=valid_data_loader,
        num_epochs=1,
        use_amp=opts.use_amp,
        num_gradient_accumulation_steps=opts.num_gradient_accumulation_steps,
        cuda_device=cuda_device)

    vocab.save_to_files(opts.model_dir + "/vocab")

    best_metric = 0.0
    for i in range(opts.epochs):
        epoch = i + 1
        print('Epoch: {}'.format(epoch))
        info = trainer.train()
        print(info)
        if info["validation_accuracy"] > best_f1:
            best_f1 = info["validation_accuracy"]
            with open(opts.model_dir + "/save_" + str(epoch) + ".save",
                      'wb') as f_model:
                torch.save(model.state_dict(), f_model)
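A hedged sketch of restoring one of the checkpoints written by this loop; the epoch number is illustrative, and Tagger.build, params, and opts mirror the names used above:

# Sketch: rebuild the tagger from the saved vocabulary/params and load a checkpoint.
vocab = Vocabulary.from_files(opts.model_dir + "/vocab")
model = Tagger.build(params, vocab)
with open(opts.model_dir + "/save_1.save", "rb") as f_model:
    model.load_state_dict(torch.load(f_model, map_location="cpu"))
model.eval()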