def main():
    # First rebuild the model exactly as it was trained, using the training data's sizes and vocabulary
    training_data = DataReader(training_data_filepath)

    vocab = training_data.vocab

    # Read the list of words from the corpus
    words = training_data.get_words()

    # Get the pretrained word vectors
    word_to_index, embed_dict = get_pretrained_word_indexes(pretrained_filepath)

    # Update word_to_index and vocabulary
    word_to_index, vocab = update_word_indexes_vocab(word_to_index, vocab)

    # Get the numpy matrix containing the pretrained word vectors,
    # with randomly initialized vectors for corpus words missing from the pretrained set
    word_embeddings = get_embeddings_matrix(word_to_index, embed_dict, WORD_EMBEDDINGS_DIMENSION)

    model = NGramLanguageModeler(len(vocab), WORD_EMBEDDINGS_DIMENSION, CONTEXT_SIZE, word_embeddings)
    model.load_state_dict(torch.load("AWS_model.pt"))

    test_data = DataReader(test_data_filepath, read_limit=READ_LIMIT)

    evaluate_model(model, test_data, word_to_index)
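
# evaluate_model is defined elsewhere in this project; a minimal sketch of one plausible
# implementation, assuming it extracts n-grams from the test data the same way training
# does and reports the average negative log-likelihood plus perplexity (an assumption,
# not the author's code):
import math

def evaluate_model_sketch(model, test_data, word_to_index):
    trigrams = extract_list_of_ngrams(test_data.get_words(), CONTEXT_SIZE + 1)
    loss_function = nn.NLLLoss()
    total_loss = 0.0
    model.eval()
    with torch.no_grad():
        for context, target in trigrams:
            context_idxs = torch.tensor([word_to_index[w] for w in context], dtype=torch.long)
            log_probs = model(context_idxs)
            total_loss += loss_function(log_probs, torch.tensor([word_to_index[target]])).item()
    avg_nll = total_loss / len(trigrams)
    print("Average NLL: %.4f, perplexity: %.2f" % (avg_nll, math.exp(avg_nll)))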
Example n. 2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, word_embeddings):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight.data.copy_(torch.from_numpy(word_embeddings))
        self.linear1 = nn.Linear(context_size * embedding_dim, vocab_size)
        self.embeddings.weight.requires_grad = False  # Do not train the pre-calculated embeddings

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = torch.tanh(self.linear1(embeds))
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
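
# Quick shape check (illustrative only, not part of the original example; the toy sizes
# below are assumptions): view((1, -1)) flattens the context embeddings into a single
# row so that linear1, which expects context_size * embedding_dim inputs, can consume it.
import numpy as np

_demo_model = NGramLanguageModeler(vocab_size=10, embedding_dim=50, context_size=2,
                                   word_embeddings=np.zeros((10, 50), dtype=np.float32))
_demo_out = _demo_model(torch.tensor([1, 2], dtype=torch.long))
print(_demo_out.shape)  # torch.Size([1, 10]) -- one row of log-probabilities over the vocabulary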

# Read corpus and compile the vocabulary
training_data = DataReader(training_data_filepath, read_limit=READ_LIMIT)
vocab = training_data.vocab
# Build a list of trigrams
words = training_data.get_words()
trigrams = extract_list_of_ngrams(words, CONTEXT_SIZE + 1)
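
# extract_list_of_ngrams is defined elsewhere in this project; a plausible sketch of its
# behavior, assuming it returns (context, target) pairs where the context holds the
# n-1 preceding words:
def extract_list_of_ngrams_sketch(words, n):
    # For ["the", "quick", "brown", "fox"] and n == 3 this yields
    # (["the", "quick"], "brown") and (["quick", "brown"], "fox").
    return [(words[i:i + n - 1], words[i + n - 1]) for i in range(len(words) - n + 1)]
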
# Get the pretrained word vectors
word_to_ix, embed_dict = get_pretrained_word_indexes(pretrained_filepath)
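
# get_pretrained_word_indexes is also project code; a sketch under the assumption that
# the pretrained file is in GloVe-style text format, one "word v1 v2 ... vN" per line:
def get_pretrained_word_indexes_sketch(filepath):
    word_to_ix, embed_dict = {}, {}
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            word, *values = line.rstrip().split(" ")
            word_to_ix[word] = i
            embed_dict[word] = np.array(values, dtype=np.float32)
    return word_to_ix, embed_dict
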
# Update word_to_ix and vocabulary
word_to_ix, vocab = update_word_indexes_vocab(word_to_ix, vocab)
# Get the numpy matrix containing the pretrained word vectors
# with randomly initialized unknown words from the corpus
word_embeddings = get_embeddings_matrix(word_to_ix, embed_dict, EMBEDDING_DIM)
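
# get_embeddings_matrix is project code as well; a sketch matching the comment above:
# pretrained vectors where available, small random vectors for corpus words missing
# from the pretrained set.
def get_embeddings_matrix_sketch(word_to_ix, embed_dict, embedding_dim):
    matrix = np.random.uniform(-0.1, 0.1, (len(word_to_ix), embedding_dim)).astype(np.float32)
    for word, ix in word_to_ix.items():
        if word in embed_dict:
            matrix[ix] = embed_dict[word]
    return matrix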

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, word_embeddings)
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

epoch_times = []
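
# The training loop itself is not part of this excerpt; a sketch of how the pieces set up
# above are typically wired together (NUM_EPOCHS and the per-epoch timing are assumptions,
# not the author's code):
import time

for epoch in range(NUM_EPOCHS):
    start = time.time()
    total_loss = 0.0
    for context, target in trigrams:
        # Look up the indexes of the context words and run a forward pass
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
    epoch_times.append(time.time() - start)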