Example #1
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    # The small pre-trained ELMo model outputs 256-dimensional vectors (2 x 128).
    elmo_embedding_dim = 256
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
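
Example #1 only trains the model. As a usage note, here is a minimal sketch of running a prediction afterwards, assuming the same SentenceClassifierPredictor and numpy import used in the later examples:

    # Sketch (assumption): reuse the prediction pattern from Examples #2 and #3.
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)
    print(model.vocab.get_token_from_index(label_id, 'labels'))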
Example #2
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #3
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #4
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('train.txt')
    dev_dataset = reader.read('dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=20,
                      num_epochs=1000)
    trainer.train()
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    day = 12
    while day <= 30:
        # 0,1,2,3,text,source,6,7,8,9,10,favourites_count,12,13,14,15,followers_count,friends_count,18,19,20,lang
        total = 0
        res = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
        with open(f'2020-03-{day} Coronavirus Tweets.CSV', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                lang = row[-1]
                if lang != 'en':
                    continue
                source = row[5]
                if source == 'Twitter for Advertisers':
                    continue
                followers_count = row[16]
                friends_count = row[17]
                try:
                    followers_count = int(followers_count)
                    friends_count = int(friends_count)
                    if friends_count > followers_count * 80:
                        continue
                except Exception:
                    print("Cannot get friends and follower")
                content = clean_tweets(row[4])
                if not content:
                    continue
                try:
                    if content.count('#') >= 5:
                        continue
                except Exception:
                    print("Cannot get hash tag")
                total += 1
                fav = 0
                try:
                    fav = int(row[11])
                except Exception:
                    print("Cannot get favourites count")
                try:
                    logits = predictor.predict(content)['logits']
                    label_id = np.argmax(logits)
                    lab = model.vocab.get_token_from_index(label_id, 'labels')
                    res[lab] += 1
                    total += fav
                    res[lab] += fav
                except Exception:
                    print(f"Error in {row[4]}")
        print(f"Day {day}: Total: {total} tweets")
        print(
            f"Day {day}: Strongly negative: {int((res['0']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Weakly   negative: {int((res['1']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Neutral          : {int((res['2']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Weakly   positive: {int((res['3']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Strongly positive: {int((res['4']/total)*1000)/100}% ",
            end='')
        with open('tweets.log', 'w+') as log:
            log.write(f"Day {day}: Total: {total} tweets")
            log.write(
                f"Day {day}: Strongly negative: {int((res['0']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Weakly   negative: {int((res['1']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Neutral          : {int((res['2']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Weakly   positive: {int((res['3']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Strongly positive: {int((res['4']/total)*1000)/100}%"
            )
        day += 1
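
Example #4 calls a clean_tweets helper that is not shown. A hypothetical sketch of what such a helper might do (an assumption, not the author's implementation):

import re

def clean_tweets(text):
    # Strip URLs, @mentions, and surplus whitespace from the raw tweet text.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    return re.sub(r'\s+', ' ', text).strip()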
Example #5
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import TextClassifierPredictor

from sentiment_simple_lstm import LstmClassifier

if __name__ == "__main__":
    simple_lstm = False
    elmo_lstm = True

    # Simple LSTM
    if simple_lstm:
        EMBEDDING_DIM = 128
        HIDDEN_DIM = 128
        reader = StanfordSentimentTreeBankDatasetReader()
        train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
        dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
        test_dataset = reader.read('data/stanfordSentimentTreebank/trees/test.txt')
        vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
        lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
        model = LstmClassifier(word_embeddings, lstm, vocab)
        with open("models/simple_LSTM_sentiment_classifier.th", 'rb') as f:
            model.load_state_dict(torch.load(f))
        predictor = TextClassifierPredictor(model, dataset_reader=reader)
        test_results = predictor.predict_batch_instance(test_dataset)

    # ELMo LSTM
    if elmo_lstm:
Example #6
def model_train():
    reader = StanfordSentimentTreeBankDatasetReader()

    # train_dataset = reader.read('/Users/geor/git/comp5222-tools/allenNLP/realworldnlp/data/tatoeba/sentences.top10langs.train.tsv')
    # dev_dataset = reader.read('data/tatoeba/sentences.top10langs.dev.tsv')
    # train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    # dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
    train_dataset = reader.read(
        'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    )
    dev_dataset = reader.read(
        'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
    )

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    # Move the model to the GPU if one is available.
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        patience=10,
        num_epochs=20,
        cuda_device=cuda_device)

    model_file_name = 'trained_model_stanford.pth'  # the name of the model
    if os.path.isfile(model_file_name):
        print("Model exists. Loading it now")
        model = torch.load(model_file_name)
    else:
        print("Model does not exist. Training it now")
        trainer.train()
        torch.save(model, model_file_name)

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

    # "test"
    # logits = predictor.predict('I like this comment so much!')['logits']
    # print(logits)  # test
    # label_id = np.argmax(logits)
    # print(model.vocab.get_token_from_index(label_id, 'labels'))
    # "test"
    return predictor, model
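
A minimal sketch of how the returned predictor and model might be used, mirroring the commented-out test block above (an assumption, not part of the original script):

import numpy as np

predictor, model = model_train()
logits = predictor.predict('I like this comment so much!')['logits']
label_id = np.argmax(logits)
print(model.vocab.get_token_from_index(label_id, 'labels'))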
Example #7
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class",
        token_indexers={"tokens": single_id_indexer},
        use_subtrees=True)
    train_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pretrained word vectors (fastText crawl vectors, keyed as "w2v" here)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=300,
            weight=weight,
            trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embedding_dim,
                      hidden_size=512,
                      num_layers=2,
                      batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data,
                                         num_epochs=5,
                                         shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model, batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
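
Example #7 relies on attacks.hotflip_attack to propose replacement trigger tokens. Below is a minimal sketch of the first-order HotFlip candidate scoring it is based on (simplified signature; an assumption, not the actual utils/attacks code):

import torch

def hotflip_candidates(averaged_grad, embedding_matrix, num_candidates=40):
    # averaged_grad: (num_trigger_tokens, embedding_dim) gradient of the loss
    # w.r.t. each trigger embedding; embedding_matrix: (vocab_size, embedding_dim).
    # The dot product is a first-order estimate of how much the loss would
    # increase if a trigger token were swapped for each vocabulary entry.
    scores = torch.matmul(averaged_grad, embedding_matrix.t())  # (triggers, vocab)
    return torch.topk(scores, num_candidates, dim=1).indices    # candidate token ids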
Example #8
def main():
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using a pre-trained file.
    # This takes a while the first time you run this script

    # Original
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Medium
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
    elmo_embedding_dim = 256
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #9
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output
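
The lines above are the tail of the model's forward() method. The LstmClassifier used throughout these examples is not shown in full; below is a minimal sketch consistent with the fragment (the attribute names come from the fragment, everything else is an assumption):

import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy


class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # Project the encoded sentence vector onto the label space.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label=None):
        # Mask out padding before encoding; the rest matches the fragment above.
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)
        return output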


if __name__ == "__main__":
    reader = StanfordSentimentTreeBankDatasetReader()
    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    EMBEDDING_DIM = 128
    HIDDEN_DIM = 128
    EPOCHS = 100
    BATCH_SIZE = 32
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
Example #10
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    # train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    # dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')
    train_dataset = reader.read('Treebank_train.txt')
    print(type(train_dataset))
    print(train_dataset)

    dev_dataset = reader.read('Treebank_dev.txt')



    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    train_data_loader = DataLoader(train_dataset,
                                   batch_sampler=BucketBatchSampler(
                                       train_dataset,
                                       batch_size=32,
                                       sorting_keys=["tokens"]))
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_sampler=BucketBatchSampler(
                                     dev_dataset,
                                     batch_size=32,
                                     sorting_keys=["tokens"]))

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # logits = predictor.predict('This is the best movie ever!')['logits']
    logits = predictor.predict('''On August 28, Mustafa varank, Turkey's minister of industry and technology, said Turkey plans to become a production center for automotive batteries by investing in cells, battery modules and battery packs. The country also hopes to become Europe's largest and the world's top five electric and autopilot auto makers by 2030. In order to achieve this goal, varank said Turkey would support the investment of electronic and electrical companies in the automotive industry. Varank points out that modern Turkish plants will cover half of the world's I20 capacity, 90% of which is expected to be exported abroad. "It took 27 months to build this line, with a total investment of $194 million. The productivity of I20 in Turkey will exceed 60%, which will increase gradually. In the past year, Turkey has developed EMUs, SUVs, tractors and excavators equipped with electric engines, and now plans to develop electric vehicle technology. Varank said Turkey would build an ecosystem to produce key components for electric vehicles, such as electric engines, inverters, charging equipment and compressors. He stressed that the automobile industry is the "locomotive" of Turkey's industrial sector, which also provides advantages for other industries. In May and June this year, Turkey's industrial production increased by double-digit compared with the same period last year. In the first half of 2020, Turkey issued 1200 investment award certificates worth US $108 billion (about US $16.7 billion) and created 163000 new jobs. On August 28, Turkey released its economic confidence index for August, and varank said: "the positive trend continues, and our citizens have more positive expectations for the post epidemic period." Choi Hong GHI, South Korea's ambassador to Ankara, said that Hyundai Motor, one of the world's top five auto manufacturers, established its first overseas factory in Turkey 23 years ago. "Hyundai's zmit factory is a symbol of economic cooperation between the two countries, which directly promotes employment and exports in Turkey." Eckkyun Oh, chief executive of Hyundai assan, said the company has produced more than two million cars in Turkey, most of which are exported to countries in Europe, the Middle East and North Africa. "We will produce 100000 new I20 cars here," he said.''')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #11
    embedding_weight = get_embedding_weight(model)
    model.train().cuda()

    # Arrange the ARAE word embeddings to be consistent with the SST model's vocabulary.
    ARAE_weight_embedding = []
    for num in range(len(ARAE_idx2word)):
        ARAE_weight_embedding.append(
            embedding_weight[sst_vocab.get_token_index(
                ARAE_idx2word[num])].numpy())
    ARAE_weight_embedding = torch.from_numpy(
        np.array(ARAE_weight_embedding)).cuda()

    ### collect positive/negative sentences
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')

    # For sentiment analysis, load the positive and negative sentiment word lists so they can be filtered out.
    pos_path = os.path.join(args.sentiment_path, 'positive_words.txt')
    neg_path = os.path.join(args.sentiment_path, 'negative_words.txt')

    pos_words = list()
    with open(cached_path(pos_path), "r") as data_file:
        for line in data_file.readlines():
            if line[0] != ';':
                line = line.strip("\n")
                if not line:
                    continue
                else:
Example #12
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.training.trainer import Trainer

from examples.sentiment.sst_classifier import LstmClassifier
from realworldnlp.predictors import SentenceClassifierPredictor

# Model definition and training

HIDDEN_DIM = 512
CUDA_DEVICE = 0

# In order to use ELMo, each word in a sentence needs to be indexed with
# an array of character IDs.
elmo_token_indexer = ELMoTokenCharactersIndexer()
reader = StanfordSentimentTreeBankDatasetReader(
    token_indexers={'tokens': elmo_token_indexer})

train_dataset = reader.read(
    'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
)
dev_dataset = reader.read(
    'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
)

# Initialize the ELMo-based token embedder using a pre-trained file.
# This takes a while the first time you run this script

# Original
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"