def load_embeddings(file_path, word_dim, vocab_size, num_copy_tokens):
    """Load base word embeddings and extend them with special tokens and copy tokens.

    Args:
        file_path (str): path to the word vector file
        word_dim (int): word embedding dimension
        vocab_size (int): number of base tokens to load
        num_copy_tokens (int): number of copy tokens to append

    Returns:
        SimpleEmbeddings
    """
    special_tokens = CasedWordVocab.SPECIAL_TOKENS
    base_embeds = SimpleEmbeddings.from_file(file_path, word_dim, vocab_size)
    _, embed_dim = base_embeds.array.shape

    def sample_embeds(num_embeds, seed):
        shape = (num_embeds, embed_dim)
        return emulate_distribution(shape, base_embeds.array, seed=seed)

    special_tokens_array = sample_embeds(len(special_tokens), seed=0)
    copy_tokens_array = sample_embeds(num_copy_tokens, seed=1)  # different seed to get different values

    # copy tokens are appended at the end
    new_array = np.concatenate((special_tokens_array, base_embeds.array, copy_tokens_array), axis=0)
    new_vocab = HardCopyVocab(base_embeds.vocab.tokens, num_copy_tokens)

    # check that tokens come in the order that we assumed
    correct_tokens = list(special_tokens)  # special tokens first
    correct_tokens.extend(base_embeds.vocab.tokens)  # then base tokens
    correct_tokens.extend('<copy{}>'.format(i) for i in xrange(num_copy_tokens))  # copy tokens last
    assert new_vocab.tokens == correct_tokens

    return SimpleEmbeddings(new_array, new_vocab)
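# A minimal usage sketch for load_embeddings (the file name, dimensions, and counts below
# are illustrative assumptions, not values taken from any config in this repo):
#
#   embeds_path = join(data.workspace.word_vectors, 'glove.6B.300d.txt')
#   embeddings = load_embeddings(embeds_path, word_dim=300, vocab_size=10000, num_copy_tokens=10)
#   # resulting vocab order: special tokens, then the 10000 base tokens, then <copy0> ... <copy9>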
def _build_editor(cls, config, num_iter, eps, momentum):
    """Build Editor.

    Args:
        config (Config): Editor config

    Returns:
        Editor
    """
    file_path = join(data.workspace.word_vectors, config.wvec_path)
    word_embeddings = SimpleEmbeddings.from_file(file_path, config.word_dim,
                                                 vocab_size=config.vocab_size)
    word_embeddings = word_embeddings.with_special_tokens()

    source_token_embedder = TokenEmbedder(word_embeddings)
    target_token_embedder = TokenEmbedder(word_embeddings)

    if config.decoder_cell == 'SimpleDecoderCell':
        decoder_cell = SimpleDecoderCell(target_token_embedder, config.hidden_dim,
                                         config.word_dim, config.agenda_dim)
    elif config.decoder_cell == 'AttentionDecoderCell':
        decoder_cell = AttentionDecoderCell(target_token_embedder,
                                            config.agenda_dim,
                                            config.hidden_dim, config.hidden_dim,
                                            config.attention_dim,
                                            config.no_insert_delete_attn,
                                            num_layers=config.decoder_layers)
    else:
        raise ValueError('{} not implemented'.format(config.decoder_cell))

    editor = Editor(source_token_embedder, config.hidden_dim, config.agenda_dim,
                    config.edit_dim, config.lamb_reg, config.norm_eps, config.norm_max,
                    config.kill_edit, decoder_cell, config.encoder_layers,
                    num_iter, eps, momentum)
    editor = try_gpu(editor)
    return editor
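# A hedged sketch of the config fields _build_editor reads (field names come from the
# code above; the example values are illustrative assumptions only):
#
#   config.wvec_path      -> word vector file under data.workspace.word_vectors, e.g. 'glove.6B.300d.txt'
#   config.decoder_cell   -> 'SimpleDecoderCell' or 'AttentionDecoderCell'
#   config.word_dim, config.vocab_size, config.hidden_dim, config.agenda_dim, config.edit_dim,
#   config.lamb_reg, config.norm_eps, config.norm_max, config.kill_edit, config.encoder_layers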
def _build_model(cls, config):
    file_path = join(data.workspace.word_vectors, config.model.wvec_path)
    word_embeddings = SimpleEmbeddings.from_file(file_path, config.model.word_dim,
                                                 vocab_size=config.model.vocab_size)
    word_embeddings = word_embeddings.with_special_tokens()
    token_embedder = TokenEmbedder(word_embeddings)

    model = None
    if config.model.type == 0:  # regular language model
        model = LanguageModel(token_embedder, config.model.hidden_dim,
                              config.model.agenda_dim, config.model.num_layers,
                              cls._make_logger())
    elif config.model.type == 1:  # SVAE
        model = NoisyLanguageModel(token_embedder, config.model.hidden_dim,
                                   config.model.agenda_dim, config.model.num_layers,
                                   config.model.kl_weight_steps, config.model.kl_weight_rate,
                                   config.model.kl_weight_cap, config.model.dci_keep_rate,
                                   cls._make_logger())
    assert model is not None

    model = try_gpu(model)
    optimizer = optim.Adam(model.parameters(), lr=config.optim.learning_rate)
    return model, optimizer
def _build_model(config, training_examples):
    # build scorer
    model_config = config.retriever
    embeds_path = join(data.workspace.word_vectors,
                       'glove.6B.{}d.txt'.format(model_config.word_dim))
    word_embeds = SimpleEmbeddings.from_file(embeds_path, model_config.word_dim,
                                             model_config.vocab_size)
    word_embeds = word_embeds.with_special_tokens()

    def seq_embedder(trainable):
        sent_dim = model_config.sent_dim
        token_embedder = TokenEmbedder(word_embeds, trainable)
        if trainable:
            # if trainable, also add a linear transform
            transform = Linear(token_embedder.embed_dim, sent_dim)
        else:
            transform = lambda x: x
        return BOWSequenceEmbedder(token_embedder, embed_dim=sent_dim,
                                   pool=model_config.pool_method,
                                   transform=transform)

    neg_sampler = UniformNegativeSampler(training_examples)
    input_embedder = seq_embedder(trainable=model_config.train_input)
    output_embedder = seq_embedder(trainable=model_config.train_output)
    scorer = Seq2SeqScorer(input_embedder, output_embedder, neg_sampler,
                           score_method=model_config.score_method,
                           loss_method=model_config.loss_method)
    scorer = try_gpu(scorer)

    # build optimizer
    optimizer = optim.Adam(scorer.parameters(), lr=config.optim.learning_rate)

    return scorer, optimizer
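# Note (a sketch of an assumed call): the GloVe file is selected purely by word_dim, so a
# config with retriever.word_dim = 100 would load 'glove.6B.100d.txt' from
# data.workspace.word_vectors, following the standard glove.6B naming used above.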
def model(self):
    array = np.array([
        [1, 2, 3],
        [2, 4, 6],
        [3, 5, 7],
    ], dtype=np.float32)
    vocab = SimpleVocab(u'a b c'.split())
    embeddings = SimpleEmbeddings(array, vocab)
    return TokenEmbedder(embeddings, 'token_embeds')
def base_pred_embeddings(self):
    array = np.array([
        [0, 0, 0, 0],
        [1, 2, 3, 4],
        [0, 2, 0, 8],
    ], dtype=np.float32)
    vocab = SimpleVocab(u'<unk> b0 b1'.split())
    return SimpleEmbeddings(array, vocab)
def embeddings(self):
    array = np.array([
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [9, 10, 11],
        [12, 13, 14],
        [15, 16, 17],
    ], dtype=np.float32)
    vocab = SimpleVocab(['<pad>', 'a', 'b', 'c', 'd', 'e'])
    return SimpleEmbeddings(array, vocab)
def token_embedder(self, base_vocab, embeds_array, dynamic_vocabs):
    word_embeds = SimpleEmbeddings(embeds_array, base_vocab)
    base_embedder = TokenEmbedder(word_embeds)
    return DynamicMultiVocabTokenEmbedder(base_embedder, dynamic_vocabs, base_vocab)
def embeds(vocab):
    # one-hot embeddings: identity matrix with one row per vocab token
    array = np.eye(len(vocab))
    return SimpleEmbeddings(array, vocab)
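# A minimal sketch of how this one-hot helper can be used (assumed usage, mirroring the
# fixtures above):
#
#   vocab = SimpleVocab(['<pad>', 'a', 'b'])
#   embeddings = embeds(vocab)   # 3x3 identity: one one-hot row per token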