def main(): # "http://mattmahoney.net/dc/text8.zip" download first data_dir = 'data/word2vec/text8/text8' # 1. build vocab from file vocab = build_vocab(data_dir) # 2. build reader reader = SimpleSkipGramReader( window_size=WIN_SIZE) # or SkipGramReader(vocab=vocab) text8 = reader.read(data_dir) embedding_in = Embedding( num_embeddings=vocab.get_vocab_size('token_target'), embedding_dim=EMBEDDING_DIM) embedding_out = Embedding( num_embeddings=vocab.get_vocab_size('token_context'), embedding_dim=EMBEDDING_DIM) if CUDA_DEVICE > -1: embedding_in = embedding_in.to(CUDA_DEVICE) embedding_out = embedding_out.to(CUDA_DEVICE) iterator = BasicIterator(batch_size=BATCH_SIZE) iterator.index_with(vocab) # important, transform token to index model = SkipGramNegativeSamplingModel(vocab, embedding_in, embedding_out, neg_samples=10, cuda_device=CUDA_DEVICE) # # model = SkipGramModel(vocab=vocab, # embedding_in=embedding_in, # cuda_device=CUDA_DEVICE) optimizer = optim.Adam(model.parameters()) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=text8, num_epochs=5, cuda_device=CUDA_DEVICE) trainer.train() # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab) print(get_synonyms('one', embedding_in, vocab)) print(get_synonyms('december', embedding_in, vocab)) print(get_synonyms('flower', embedding_in, vocab)) print(get_synonyms('design', embedding_in, vocab)) print(get_synonyms('snow', embedding_in, vocab)) rho = evaluate_embeddings(embedding_in, vocab) print('simlex999 speareman correlation: {}'.format(rho))
def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')

    vocab = Vocabulary.from_instances(
        text8, min_count={'token_in': 5, 'token_out': 5})

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)

    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 Spearman correlation: {}'.format(rho))
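# evaluate_embeddings() is called above but not defined in these snippets. Below is a
# minimal sketch, not the original implementation: it assumes a local SimLex-999 copy at
# a hypothetical path ('data/simlex999.txt') and the 'token_in' namespace used in this
# main(), and it skips word pairs that are out of vocabulary.
from scipy.stats import spearmanr
from torch.nn.functional import cosine_similarity


def evaluate_embeddings(embedding, vocab, simlex_path='data/simlex999.txt'):
    """Spearman correlation between embedding cosine similarities and SimLex-999 scores."""
    token_to_index = vocab.get_token_to_index_vocabulary('token_in')
    gold_scores, model_scores = [], []
    with open(simlex_path) as f:
        next(f)  # skip the header line
        for line in f:
            word1, word2, _, score = line.strip().split('\t')[:4]
            if word1 not in token_to_index or word2 not in token_to_index:
                continue  # skip out-of-vocabulary pairs
            vec1 = embedding.weight[token_to_index[word1]]
            vec2 = embedding.weight[token_to_index[word2]]
            gold_scores.append(float(score))
            model_scores.append(cosine_similarity(vec1, vec2, dim=0).item())
    rho, _ = spearmanr(gold_scores, model_scores)
    return rho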
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")

    vocab = Vocabulary().from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)

    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)
    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
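# get_synonyms() is also not defined in these snippets. A minimal sketch (an assumption,
# not the original code): rank every vocabulary entry by cosine similarity to the query
# token. The namespace defaults to 'token_in' as in the two main() variants directly
# above; the first variant would pass 'token_target' instead.
from collections import Counter

from torch.nn.functional import cosine_similarity


def get_synonyms(token, embedding, vocab, num_synonyms=10, namespace='token_in'):
    """Return the num_synonyms tokens whose vectors are closest to `token` by cosine similarity."""
    token_id = vocab.get_token_index(token, namespace)
    token_vec = embedding.weight[token_id]
    sims = Counter()
    for index, other in vocab.get_index_to_token_vocabulary(namespace).items():
        sims[other] = cosine_similarity(token_vec, embedding.weight[index], dim=0).item()
    return sims.most_common(num_synonyms)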
def __init__(self, vocab: Vocabulary, cuda_device=-1) -> None:
    super().__init__(vocab)
    self.cuda_device = cuda_device

    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_SIZE)
    if cuda_device > -1:
        token_embedding = token_embedding.to(cuda_device)
    self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    self.rnn = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))

    self.hidden2out = torch.nn.Linear(
        in_features=self.rnn.get_output_dim(),
        out_features=vocab.get_vocab_size('tokens'))

    if cuda_device > -1:
        self.hidden2out = self.hidden2out.to(cuda_device)
        self.rnn = self.rnn.to(cuda_device)
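# The forward pass of this model is not shown. A plausible sketch under the assumption
# that instances carry 'input_tokens' and 'output_tokens' text fields (hypothetical
# names): embed the inputs, run the LSTM, project each hidden state onto the vocabulary,
# and compute a masked sequence cross-entropy loss.
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits


def forward(self, input_tokens, output_tokens):  # method of the model defined above
    mask = get_text_field_mask(input_tokens)      # (batch_size, seq_len)
    embeddings = self.embedder(input_tokens)      # (batch_size, seq_len, EMBEDDING_SIZE)
    rnn_hidden = self.rnn(embeddings, mask)       # (batch_size, seq_len, HIDDEN_SIZE)
    out_logits = self.hidden2out(rnn_hidden)      # (batch_size, seq_len, vocab_size)
    loss = sequence_cross_entropy_with_logits(out_logits, output_tokens['tokens'], mask)
    return {'loss': loss}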
def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim,
                 CNN_num_filters, CNN_encoder_dim):
    # The word embedding transforms every word into a "Word_embedding_dim" real-valued
    # vector, giving a tensor (batch_size, max_sentence_length, Word_embedding_dim).
    indexers_dict = dict()
    if Word_embedding_dim > 0:
        word_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_ids"),
            embedding_dim=Word_embedding_dim)
        word_embedding = word_embedding.to(device=self.cf_a.device,
                                           dtype=self.cf_a.dtype)
        indexers_dict["tokens"] = word_embedding

    if CNN_encoder_dim > 0:
        # The char embedding transforms every character into a "char_embeddedng_dim"
        # real-valued vector, giving a tensor
        # (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim).
        char_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_chars"),
            embedding_dim=char_embeddedng_dim)

        # The encoder applies the CNN over the max_word_length dimension, giving a
        # tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes).
        character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                   embedding_dim=char_embeddedng_dim,
                                   num_filters=CNN_num_filters,
                                   output_dim=CNN_encoder_dim)

        # Concatenate the char embedding and the encoder.
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_cnn)
        token_character_encoder = token_character_encoder.to(
            device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["chars"] = token_character_encoder

    # Finally, create the embedder, indicating which token ids it embeds.
    text_field_embedder = BasicTextFieldEmbedder(indexers_dict)
    return text_field_embedder
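# For the embedder above to receive the "tokens" and "chars" keys it builds, the dataset
# reader needs token indexers whose keys and namespaces line up with it. A usage sketch
# (the dimensions below are placeholders, not values from the original code):
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

token_indexers = {
    "tokens": SingleIdTokenIndexer(namespace="token_ids"),
    "chars": TokenCharactersIndexer(namespace="token_chars"),
}

# After a Vocabulary has been built from instances indexed this way:
# embedder = model.get_embedder(vocab, Word_embedding_dim=100, char_embeddedng_dim=16,
#                               CNN_num_filters=32, CNN_encoder_dim=64)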
        'token_in': 2,
        'token_out': 2})
del text8

# Re-read the dataset with the Vocabulary so that frequent words are sub-sampled.
reader = SkipGramReader(vocab=vocab)
text8 = reader.read('data/text8')

BATCH_SIZE = 256  # batch_size is the number of instances in a batch
iterator = BasicIterator(batch_size=BATCH_SIZE)
iterator.index_with(vocab)

EMBEDDING_DIM = 300
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)
if CUDA_DEVICE > -1:
    embedding_in = embedding_in.to(CUDA_DEVICE)  # we can now use GPU 0

# --------------------------- Defining the skip-gram model --------------------------- #
# 1. Implement the skip-gram model.
class SkipGramModel(Model):
    def __init__(self, vocab, embedding_in, cuda_device=-1):
        super().__init__(vocab)
        # 2. The Embedding object is passed in from outside rather than defined inside.
        self.embedding_in = embedding_in
        # 3. Create a linear layer (we don't need biases).
        self.linear = torch.nn.Linear(
            in_features=EMBEDDING_DIM,                       # size of the input vector
            out_features=vocab.get_vocab_size('token_out'),  # size of the output vector
            bias=False)
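    # The snippet cuts off after the linear layer. A minimal sketch of the rest of the
    # class (an assumption, not the original code): a forward pass that computes a
    # full-softmax cross-entropy loss over the 'token_out' vocabulary.
    def forward(self, token_in, token_out):
        embedded_in = self.embedding_in(token_in)  # (batch_size, EMBEDDING_DIM)
        logits = self.linear(embedded_in)          # (batch_size, vocab_size of 'token_out')
        loss = torch.nn.functional.cross_entropy(logits, token_out)
        return {'loss': loss}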