Code Example #1
def main():
    # "http://mattmahoney.net/dc/text8.zip" download first
    data_dir = 'data/word2vec/text8/text8'

    # 1. build vocab from file
    vocab = build_vocab(data_dir)

    # 2. build reader
    reader = SimpleSkipGramReader(
        window_size=WIN_SIZE)  # or SkipGramReader(vocab=vocab)
    text8 = reader.read(data_dir)

    embedding_in = Embedding(
        num_embeddings=vocab.get_vocab_size('token_target'),
        embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(
        num_embeddings=vocab.get_vocab_size('token_context'),
        embedding_dim=EMBEDDING_DIM)

    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)  # important: maps tokens to indices

    model = SkipGramNegativeSamplingModel(vocab,
                                          embedding_in,
                                          embedding_out,
                                          neg_samples=10,
                                          cuda_device=CUDA_DEVICE)
    #
    # model = SkipGramModel(vocab=vocab,
    #                       embedding_in=embedding_in,
    #                       cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))
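
The `get_synonyms` helper used above is not shown in these examples. A minimal sketch, assuming the `weight` matrix of AllenNLP's `Embedding` and that the namespace argument matches the one the vocabulary was built with ('token_in' here; example #1 would pass 'token_target'), ranks tokens by cosine similarity:

import torch

def get_synonyms(token, embedding, vocab, num_synonyms=10, namespace='token_in'):
    # sketch: the namespace default is an assumption and must match the vocabulary
    token_id = vocab.get_token_index(token, namespace)
    token_vec = embedding.weight[token_id]
    # cosine similarity between this token's vector and every row of the weight matrix
    cosine = torch.nn.CosineSimilarity(dim=-1)
    sims = cosine(token_vec, embedding.weight)
    sims = [(sim.item(), index) for index, sim in enumerate(sims)]
    sims.sort(reverse=True)
    return [(vocab.get_token_from_index(index, namespace), sim)
            for sim, index in sims[:num_synonyms]]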
Code Example #2
def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')

    vocab = Vocabulary.from_instances(text8,
                                      min_count={
                                          'token_in': 5,
                                          'token_out': 5
                                      })

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))
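
`evaluate_embeddings` is likewise defined elsewhere. A minimal sketch, assuming a local copy of SimLex-999 at a hypothetical path and the standard column layout of SimLex-999.txt (word1, word2, POS, SimLex999 score), computes the Spearman correlation between the human scores and the embeddings' cosine similarities:

import torch
from scipy.stats import spearmanr

def evaluate_embeddings(embedding, vocab, namespace='token_in',
                        simlex_path='data/SimLex-999/SimLex-999.txt'):  # hypothetical path
    cosine = torch.nn.CosineSimilarity(dim=0)
    human_scores, model_scores = [], []
    with open(simlex_path) as f:
        next(f)  # skip the header line
        for line in f:
            fields = line.strip().split('\t')
            word1, word2, score = fields[0], fields[1], float(fields[3])
            vec1 = embedding.weight[vocab.get_token_index(word1, namespace)]
            vec2 = embedding.weight[vocab.get_token_index(word2, namespace)]
            human_scores.append(score)
            model_scores.append(cosine(vec1, vec2).item())
    rho, _ = spearmanr(human_scores, model_scores)
    return rho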
Code Example #3
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")
    vocab = Vocabulary.from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)

    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")
    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    
    
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
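
The saved state dict can later be loaded back into an `Embedding` of the same shape. A sketch, assuming the vocabulary is rebuilt or reloaded exactly as it was at save time so that `get_vocab_size('token_in')` matches:

vocab = Vocabulary.from_files("data/vocabulary")  # must yield the same 'token_in' size as above
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)
embedding_in.load_state_dict(torch.load("saved_models/word2vec.th"))
print(get_synonyms('C', embedding_in, vocab))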
Code Example #4
    def __init__(self, vocab: Vocabulary, cuda_device=-1) -> None:
        super().__init__(vocab)
        self.cuda_device = cuda_device

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_SIZE)
        if cuda_device > -1:
            token_embedding = token_embedding.to(cuda_device)
        self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        self.rnn = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))

        self.hidden2out = torch.nn.Linear(
            in_features=self.rnn.get_output_dim(),
            out_features=vocab.get_vocab_size('tokens'))
        if cuda_device > -1:
            self.hidden2out = self.hidden2out.to(cuda_device)
            self.rnn = self.rnn.to(cuda_device)
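
The corresponding forward pass is not shown here. A rough sketch of how these layers are typically composed for next-token prediction (assuming input and output TextFields named `input_tokens` and `output_tokens`, and the AllenNLP helpers `get_text_field_mask` and `sequence_cross_entropy_with_logits` from `allennlp.nn.util`; the actual model may differ):

    def forward(self, input_tokens, output_tokens):
        # (sketch) mask out padding, embed, run the LSTM, and project to the vocabulary
        mask = get_text_field_mask(input_tokens)
        embeddings = self.embedder(input_tokens)
        rnn_hidden = self.rnn(embeddings, mask)
        out_logits = self.hidden2out(rnn_hidden)
        loss = sequence_cross_entropy_with_logits(out_logits, output_tokens['tokens'], mask)
        return {'loss': loss}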
Code Example #5
    def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim,
                     CNN_num_filters, CNN_encoder_dim):
        # The word embedding transforms every word into a Word_embedding_dim real-valued vector,
        # giving a tensor of shape (batch_size, max_sentence_length, Word_embedding_dim)

        indexers_dict = dict()
        if Word_embedding_dim > 0:
            word_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_ids"),
                embedding_dim=Word_embedding_dim)

            word_embedding = word_embedding.to(device=self.cf_a.device,
                                               dtype=self.cf_a.dtype)
            indexers_dict["tokens"] = word_embedding
        if CNN_encoder_dim > 0:
            # The char embedding transforms every character into a char_embeddedng_dim real-valued vector,
            # giving a tensor of shape (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
            char_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_chars"),
                embedding_dim=char_embeddedng_dim)
            # The encoder applies the CNN over the max_word_length dimension,
            # giving a tensor of shape (batch_size, max_sentence_length, num_filters * ngram_filter_sizes)
            character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                       embedding_dim=char_embeddedng_dim,
                                       num_filters=CNN_num_filters,
                                       output_dim=CNN_encoder_dim)

            # We compose the char embedding and the CNN encoder
            token_character_encoder = TokenCharactersEncoder(
                embedding=char_embedding, encoder=character_cnn)

            token_character_encoder = token_character_encoder.to(
                device=self.cf_a.device, dtype=self.cf_a.dtype)
            indexers_dict["chars"] = token_character_encoder
        ### Finally, we create the text field embedder, indicating which token keys it embeds
        text_field_embedder = BasicTextFieldEmbedder(indexers_dict)

        return text_field_embedder
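
For the resulting embedder to find the "tokens" and "chars" entries it expects, the dataset reader must use matching token indexers. A minimal sketch of that pairing (the dictionary keys and namespaces below mirror the assumptions made in `get_embedder`) could be:

from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

# keys must match indexers_dict ("tokens", "chars"); namespaces must match
# the ones queried with vocab.get_vocab_size() above
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_ids'),
    'chars': TokenCharactersIndexer(namespace='token_chars'),
}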
Code Example #6
#building the Vocabulary, keeping only tokens that appear at least twice
vocab = Vocabulary.from_instances(text8,
                                  min_count={
                                      'token_in': 2,
                                      'token_out': 2
                                  })
del text8
# re-read the dataset with the Vocabulary so that frequent words are sub-sampled
reader = SkipGramReader(vocab=vocab)
text8 = reader.read('data/text8')

BATCH_SIZE = 256  # number of instances per batch
iterator = BasicIterator(batch_size=BATCH_SIZE)
iterator.index_with(vocab)
EMBEDDING_DIM = 300
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)
if CUDA_DEVICE > -1:
    embedding_in = embedding_in.to(CUDA_DEVICE)  # move the embedding to the GPU

#---------------------------Defining the skip-gram Model-----------------------------------#


#1 we implement the Skip-gram model
class SkipGramModel(Model):
    def __init__(self, vocab, embedding_in, cuda_device=-1):
        super().__init__(vocab)
        #2 Embedding object is passed from outside rather than defined inside
        self.embedding_in = embedding_in
        #3 this creates a linear layer (we don't need biases)
        self.linear = torch.nn.Linear(
            in_features=EMBEDDING_DIM,  # size of the input vector
            out_features=vocab.get_vocab_size(
                'token_out'),  # size of the output vector