# Shared imports assumed by the examples on this page. This first example uses
# the AllenNLP 2.x API; LstmClassifier and SentenceClassifierPredictor come
# from the accompanying realworldnlp project code (paths assumed from that repo).
import csv
import os
from itertools import chain

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

from examples.sentiment.sst_classifier import LstmClassifier
from realworldnlp.predictors import SentenceClassifierPredictor

EMBEDDING_DIM = 128  # 128 matches the realworldnlp examples (an assumption here)
HIDDEN_DIM = 128


def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader,
                                               train_path,
                                               batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader,
                                             dev_path,
                                             batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than
    # three times will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
                                            dev_data_loader.iter_instances()),
                                      min_count={'tokens': 3})
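    # A quick sanity check on what min_count keeps (illustrative only, not from
    # the source):
    #     print(vocab.get_vocab_size('tokens'))  # shrinks as min_count grows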
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict mapping index names to token embedders -
    # we need an embedding only for tokens, not for labels, which are used as-is
    # as the "answer" of the sentence classification task.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
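    # The dict can hold several namespaces at once. A hedged sketch (not in the
    # original) that would add a character-level CNN alongside the word
    # embeddings, assuming the reader also emitted a 'token_characters' field:
    #
    #     from allennlp.modules.seq2vec_encoders import CnnEncoder
    #     from allennlp.modules.token_embedders import TokenCharactersEncoder
    #     char_embedding = Embedding(num_embeddings=vocab.get_vocab_size('token_characters'),
    #                                embedding_dim=8)
    #     char_encoder = TokenCharactersEncoder(char_embedding,
    #                                           CnnEncoder(embedding_dim=8, num_filters=16))
    #     word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
    #                                               "token_characters": char_encoder})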

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
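    # Other Seq2VecEncoders can be swapped in without touching the rest of the
    # pipeline. A sketch of the CNN and averaging options mentioned above, both
    # of which ship with AllenNLP:
    #
    #     from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder, CnnEncoder
    #     encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=8)
    #     encoder = BagOfEmbeddingsEncoder(embedding_dim=EMBEDDING_DIM, averaged=True)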

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=train_data_loader,
                                     validation_data_loader=dev_data_loader,
                                     patience=10,
                                     num_epochs=20,
                                     cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)
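    # Raw logits can be turned into class probabilities with a numerically
    # stable NumPy softmax (a convenience sketch, not part of the source script):
    #     probs = np.exp(logits - np.max(logits))
    #     probs = probs / probs.sum()
    #     print(f"confidence: {probs[label_id]:.2f}")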

    print(model.vocab.get_token_from_index(label_id, 'labels'))

# The remaining examples use the older, pre-1.0 AllenNLP API, in which
# reader.read() returns a list of instances and batching/training run through:
#
#     from allennlp.data.iterators import BucketIterator
#     from allennlp.training.trainer import Trainer
#
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than
    # three times will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict mapping index names to token embedders -
    # we need an embedding only for tokens, not for labels, which are used as-is
    # as the "answer" of the sentence classification task.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    # SentenceClassifierPredictor.predict() expects a raw sentence string,
    # which the predictor tokenizes itself.
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #3
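# `clean_tweets` is used below but not shown in the source; a minimal sketch of
# what it plausibly does (assumptions: strip URLs and @mentions, collapse
# whitespace, return '' when nothing is left):
import re

def clean_tweets(text):
    text = re.sub(r'https?://\S+', '', text)  # drop URLs
    text = re.sub(r'@\w+', '', text)          # drop @mentions
    return ' '.join(text.split())             # collapse whitespace
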
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('train.txt')
    dev_dataset = reader.read('dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than
    # three times will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict mapping index names to token embedders -
    # we need an embedding only for tokens, not for labels, which are used as-is
    # as the "answer" of the sentence classification task.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=20,
                      num_epochs=1000)
    trainer.train()
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # Analyze each day's tweet dump for March 12-30, 2020.
    for day in range(12, 31):
        # Relevant CSV columns (0-indexed): 4=text, 5=source, 11=favourites_count,
        # 16=followers_count, 17=friends_count, 21=lang (last column)
        total = 0
        res = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
        with open(f'2020-03-{day} Coronavirus Tweets.CSV', 'r') as csvfile:
            # Use a distinct name so the dataset `reader` above isn't shadowed.
            csv_reader = csv.reader(csvfile)
            for row in csv_reader:
                lang = row[-1]
                if lang != 'en':
                    continue
                source = row[5]
                if source == 'Twitter for Advertisers':
                    continue
                followers_count = row[16]
                friends_count = row[17]
                try:
                    followers_count = int(followers_count)
                    friends_count = int(friends_count)
                    # Skip accounts that follow far more users than follow them
                    # back (a crude bot/spam heuristic).
                    if friends_count > followers_count * 80:
                        continue
                except ValueError:
                    print("Cannot parse followers/friends counts")
                content = clean_tweets(row[4])
                if not content:
                    continue
                # str.count() on a string cannot raise, so no try/except is needed.
                if content.count('#') >= 5:
                    continue
                total += 1
                # Default to 0 so a malformed value cannot crash the tally below.
                fav = 0
                try:
                    fav = int(row[11])
                except ValueError:
                    print("Cannot parse favourites count")
                try:
                    # Classify the tweet and weight it by 1 + its favourites count.
                    logits = predictor.predict(content)['logits']
                    label_id = np.argmax(logits)
                    lab = model.vocab.get_token_from_index(label_id, 'labels')
                    res[lab] += 1
                    total += fav
                    res[lab] += fav
                except Exception:
                    print(f"Error in {row[4]}")
        print(f"Day {day}: Total: {total} tweets")
        print(
            f"Day {day}: Strongly negative: {int((res['0']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Weakly   negative: {int((res['1']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Neutral          : {int((res['2']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Weakly   positive: {int((res['3']/total)*1000)/100}% ",
            end='')
        print(
            f"Day {day}: Strongly positive: {int((res['4']/total)*1000)/100}% ",
            end='')
        with open('tweets.log', 'w+') as log:
            log.write(f"Day {day}: Total: {total} tweets")
            log.write(
                f"Day {day}: Strongly negative: {int((res['0']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Weakly   negative: {int((res['1']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Neutral          : {int((res['2']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Weakly   positive: {int((res['3']/total)*1000)/100}%"
            )
            log.write(
                f"Day {day}: Strongly positive: {int((res['4']/total)*1000)/100}%"
            )
Example #4
def model_train():
    reader = StanfordSentimentTreeBankDatasetReader()

    # Local copies of the dataset can be used instead of the S3 URLs below,
    # e.g. 'data/stanfordSentimentTreebank/trees/train.txt'.
    train_dataset = reader.read(
        'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    )
    dev_dataset = reader.read(
        'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'
    )

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than
    # three times will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict mapping index names to token embedders -
    # we need an embedding only for tokens, not for labels, which are used as-is
    # as the "answer" of the sentence classification task.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    # Move the model to the GPU when one is available; cuda_device=-1 keeps
    # everything on the CPU.
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        patience=10,
        num_epochs=20,
        cuda_device=cuda_device)

    model_file_name = 'trained_model_stanford.pth'
    if os.path.isfile(model_file_name):
        print("Model file exists; loading it")
        # torch.save() below stores the whole model object, so torch.load()
        # returns it and must be assigned back to `model`.
        model = torch.load(model_file_name)
    else:
        print("Model file does not exist; training now")
        trainer.train()
        torch.save(model, model_file_name)
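    # A more portable pattern is to save only the weights and load them into a
    # freshly constructed model (a sketch using the standard PyTorch API):
    #     torch.save(model.state_dict(), model_file_name)
    #     model.load_state_dict(torch.load(model_file_name))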

    # Build a predictor for downstream evaluation. A quick smoke test:
    #     logits = predictor.predict('I like this comment so much!')['logits']
    #     label_id = np.argmax(logits)
    #     print(model.vocab.get_token_from_index(label_id, 'labels'))
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

    return predictor, model
Example #5
# ELMo-specific imports used in this example (legacy API):
#     from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
#     from allennlp.modules.token_embedders import ElmoTokenEmbedder
def main():
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})
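    # What the character indexer produces can be previewed with batch_to_ids,
    # which maps each token to a fixed-length array of 50 character IDs (sketch):
    #
    #     from allennlp.modules.elmo import batch_to_ids
    #     char_ids = batch_to_ids([['This', 'is', 'a', 'sentence']])
    #     print(char_ids.shape)  # torch.Size([1, 4, 50])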

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using pre-trained files.
    # This takes a while the first time the script runs, since the files
    # must be downloaded and cached.

    # Original
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Medium
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The ELMo embedding dimension is 2 x the ELMo LSTM hidden size
    # (128 for the 'Small' model above, hence 256).
    elmo_embedding_dim = 256
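    # The dimensionality can be verified against the embedder itself (sketch):
    #     assert elmo_embedder.get_output_dim() == elmo_embedding_dim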
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # The predictor expects a raw sentence string and handles tokenization itself.
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #6
# Standalone inference snippet. It assumes HIDDEN_DIM and the imports from the
# examples above; elmo_embedding_dim = 1024 corresponds to the 'Original' ELMo
# model (2 x 512 hidden units), whose files are used here:
options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                '/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json')
weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
               '/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5')

elmo_embedding_dim = 1024
vocab = Vocabulary.from_files("/.../sentiment_vocab1")

lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
model = LstmClassifier(word_embeddings, lstm, vocab)

# Reload the trained model.
with open("/.../sentiment_model1", 'rb') as f:
    model.load_state_dict(torch.load(f))
    model.eval()

print(model)

# Perform predictions

elmo_token_indexer = ELMoTokenCharactersIndexer()
reader = StanfordSentimentTreeBankDatasetReader(
    token_indexers={'tokens': elmo_token_indexer})

sentence = ('Severe acute respiratory syndrome SARS emerged in 2003 as a new '
            'epidemic form of life-threatening situations')
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict(sentence)['logits']
label_id = np.argmax(logits)

print(model.vocab.get_token_from_index(label_id, 'labels'))
