def test_cache_xlu_embeds(self):
    embeddings_ref = PretrainedEmbedding()
    dialects = ["en_US", "en_UK", "es_XX"]
    for dialect in dialects:
        embeddings_ref.load_pretrained_embeddings(
            EMBED_RAW_PATH, append=True, dialect=dialect
        )
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".{}".format("cached")
    ) as cached_path:
        embeddings_ref.cache_pretrained_embeddings(cached_path.name)
        embeddings_cached = PretrainedEmbedding()
        embeddings_cached.load_cached_embeddings(cached_path.name)
    np.testing.assert_array_equal(
        sorted(embeddings_cached.stoi.keys()), sorted(embeddings_ref.stoi.keys())
    )
    np.testing.assert_array_equal(
        embeddings_cached.embed_vocab, embeddings_ref.embed_vocab
    )
    np.testing.assert_array_equal(
        sorted(embeddings_cached.stoi.values()),
        sorted(embeddings_ref.stoi.values()),
    )
    for word_idx in embeddings_ref.stoi.values():
        np.testing.assert_array_almost_equal(
            embeddings_cached.embedding_vectors[word_idx],
            embeddings_ref.embedding_vectors[word_idx],
        )
def from_config(
    cls,
    config: WordFeatConfig,
    metadata: Optional[FieldMeta] = None,
    tensorizer: Optional[Tensorizer] = None,
    init_from_saved_state: Optional[bool] = False,
):
    """Factory method to construct an instance of WordEmbedding from
    the module's config object and the field's metadata object.

    Args:
        config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
        metadata (FieldMeta): Object containing this field's metadata.

    Returns:
        type: An instance of WordEmbedding.
    """
    if tensorizer is not None:
        if config.vocab_from_pretrained_embeddings:
            raise ValueError(
                "In new data design, to add tokens from a pretrained embeddings "
                "file to the vocab, specify `vocab_file` in the token tensorizer."
            )

        embeddings_weight = None
        # We don't need to load pretrained embeddings if we know the
        # embedding weights are going to be loaded from a snapshot.
        if config.pretrained_embeddings_path and not init_from_saved_state:
            pretrained_embedding = PretrainedEmbedding(
                config.pretrained_embeddings_path,  # doesn't support fbpkg
                lowercase_tokens=config.lowercase_tokens,
                skip_header=config.skip_header,
            )
            embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                tensorizer.vocab.idx,
                tensorizer.vocab.unk_token,
                config.embed_dim,
                config.embedding_init_strategy,
            )
        num_embeddings = len(tensorizer.vocab)
        unk_token_idx = tensorizer.vocab.get_unk_index()
        vocab = tensorizer.vocab
    else:
        # This else condition should go away after metadata goes away.
        num_embeddings = metadata.vocab_size
        embeddings_weight = metadata.pretrained_embeds_weight
        unk_token_idx = metadata.unk_token_idx
        vocab = metadata.vocab

    return cls(
        num_embeddings=num_embeddings,
        embedding_dim=config.embed_dim,
        embeddings_weight=embeddings_weight,
        init_range=config.embedding_init_range,
        unk_token_idx=unk_token_idx,
        mlp_layer_dims=config.mlp_layer_dims,
        padding_idx=config.padding_idx,
        vocab=vocab,
    )
def from_config(
    cls,
    config: WordFeatConfig,
    metadata: Optional[FieldMeta] = None,
    tensorizer: Optional[Tensorizer] = None,
):
    """Factory method to construct an instance of WordEmbedding from
    the module's config object and the field's metadata object.

    Args:
        config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
        metadata (FieldMeta): Object containing this field's metadata.

    Returns:
        type: An instance of WordEmbedding.
    """
    if tensorizer is not None:
        embeddings_weight = None
        if config.pretrained_embeddings_path:
            pretrained_embedding = PretrainedEmbedding(
                config.pretrained_embeddings_path,  # doesn't support fbpkg
                lowercase_tokens=tensorizer.tokenizer.lowercase,
            )
            if config.vocab_from_pretrained_embeddings:
                if not config.vocab_from_train_data:
                    # Reset token counter.
                    tensorizer.vocab_builder._counter = collections.Counter()
                tensorizer.vocab_builder.add_all(pretrained_embedding.embed_vocab)
                tensorizer.vocab = tensorizer.vocab_builder.make_vocab()
            embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                tensorizer.vocab.idx,
                UNK,
                config.embed_dim,
                config.embedding_init_strategy,
            )
        num_embeddings = len(tensorizer.vocab)
        unk_token_idx = tensorizer.vocab.idx[UNK]
    else:
        # This else condition should go away after metadata goes away.
        num_embeddings = metadata.vocab_size
        embeddings_weight = metadata.pretrained_embeds_weight
        unk_token_idx = metadata.unk_token_idx

    return cls(
        num_embeddings=num_embeddings,
        embedding_dim=config.embed_dim,
        embeddings_weight=embeddings_weight,
        init_range=config.embedding_init_range,
        unk_token_idx=unk_token_idx,
        mlp_layer_dims=config.mlp_layer_dims,
    )
def __init__(
    self,
    pretrained_embeddings_path: str,
    embedding_dim: int,
    mlp_layer_dims: Optional[Sequence[int]] = None,
    lowercase_tokens: bool = False,
    skip_header: bool = True,
    delimiter: str = " ",
    vocab: ScriptVocabulary = None,
) -> None:
    super().__init__()
    vocab = vocab or build_vocab(pretrained_embeddings_path)
    pretrained_embedding = PretrainedEmbedding(
        pretrained_embeddings_path,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
        delimiter=delimiter,
    )
    embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
        vocab.idx,  # tensorizer.vocab.idx,
        vocab.unk_token,  # tensorizer.vocab.unk_token,
        embedding_dim,
        EmbedInitStrategy.RANDOM,
    )
    num_embeddings = len(vocab.idx)
    self.embedding = nn.Embedding(
        num_embeddings,
        embedding_dim,
        _weight=embeddings_weight,
        padding_idx=vocab.get_pad_index(),
    )

    # Initialize unk embedding with zeros
    # to guard the model against randomized decisions based on unknown words
    unk_token_idx = vocab.get_unk_index()
    if unk_token_idx >= 0:
        self.embedding.weight.data[unk_token_idx].fill_(0.0)

    # Create MLP layers
    if mlp_layer_dims is None:
        mlp_layer_dims = []
    self.mlp = nn.Sequential(
        *(
            nn.Sequential(nn.Linear(m, n), nn.ReLU())
            for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
        )
    )
    self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
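# --- Illustrative sketch (standalone, not part of the module above) ---
# A minimal, self-contained example of the initialization pattern used in
# __init__ above: copy pretrained vectors into an nn.Embedding weight matrix by
# vocab index, then zero the UNK row so unknown tokens carry no random signal.
# All tokens, vectors, and dimensions below are invented for illustration.
import torch
import torch.nn as nn

embed_dim = 5
vocab_idx = {"__UNK__": 0, "__PAD__": 1, "the": 2, "aloha": 3}
pretrained_vectors = {"the": torch.randn(embed_dim), "aloha": torch.randn(embed_dim)}

# Random init for tokens without a pretrained vector (as with EmbedInitStrategy.RANDOM).
weight = torch.empty(len(vocab_idx), embed_dim).uniform_(-0.25, 0.25)
for token, idx in vocab_idx.items():
    if token in pretrained_vectors:
        weight[idx] = pretrained_vectors[token]

embedding = nn.Embedding(
    len(vocab_idx), embed_dim, _weight=weight, padding_idx=vocab_idx["__PAD__"]
)
embedding.weight.data[vocab_idx["__UNK__"]].fill_(0.0)  # zero out the UNK row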
def from_config(
    cls,
    config: Config,
    tensorizer: Tensorizer = None,
    init_from_saved_state: Optional[bool] = False,
):
    """Factory method to construct an instance of WordSeqEmbedding from
    the module's config object.

    Args:
        config (WordSeqEmbedding.Config): Configuration object specifying all
            the parameters of WordSeqEmbedding.

    Returns:
        type: An instance of WordSeqEmbedding.
    """
    embeddings_weight = None
    # We don't need to load pretrained embeddings if we know the
    # embedding weights are going to be loaded from a snapshot.
    if config.pretrained_embeddings_path and not init_from_saved_state:
        pretrained_embedding = PretrainedEmbedding(
            config.pretrained_embeddings_path,  # doesn't support fbpkg
            lowercase_tokens=config.lowercase_tokens,
            skip_header=config.skip_header,
            delimiter=config.delimiter,
        )
        embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
            tensorizer.vocab.idx,
            tensorizer.vocab.unk_token,
            config.word_embed_dim,
            config.embedding_init_strategy,
        )
    num_embeddings = len(tensorizer.vocab)
    unk_token_idx = tensorizer.vocab.get_unk_index()
    vocab = tensorizer.vocab
    vocab_pad_idx = vocab.get_pad_index(value=-1)
    if vocab_pad_idx == -1:
        vocab_pad_idx = None

    return cls(
        lstm_config=config.lstm,
        num_embeddings=num_embeddings,
        word_embed_dim=config.word_embed_dim,
        embeddings_weight=embeddings_weight,
        init_range=config.embedding_init_range,
        init_std=config.embeddding_init_std,
        unk_token_idx=unk_token_idx,
        padding_idx=config.padding_idx or vocab_pad_idx,
        vocab=vocab,
    )
def test_assign_pretrained_weights(self):
    embeddings_ref = PretrainedEmbedding()
    embeddings_ref.load_cached_embeddings(EMBED_CACHED_PATH)
    VOCAB = ["UNK", "aloha", "the"]
    embed_vocab_to_idx = {tok: i for i, tok in enumerate(VOCAB)}
    pretrained_embeds = embeddings_ref.initialize_embeddings_weights(
        embed_vocab_to_idx, "UNK", EMBED_DIM, EmbedInitStrategy.RANDOM
    )
    assert pretrained_embeds.shape[0] == len(VOCAB)
    assert pretrained_embeds.shape[1] == EMBED_DIM
    np.testing.assert_array_almost_equal(
        pretrained_embeds[1].numpy(),
        [-0.43124, 0.014934, -0.50635, 0.60506, 0.56051],
    )  # embedding vector for 'aloha'
    np.testing.assert_array_almost_equal(
        pretrained_embeds[2].numpy(),
        [-0.39153, -0.19803, 0.2573, -0.18617, 0.25551],
    )  # embedding vector for 'the'
def test_load_pretrained_embeddings(self):
    pretrained_emb = PretrainedEmbedding(EMBED_RAW_PATH)
    self.assertEqual(len(pretrained_emb.embed_vocab), VOCAB_SIZE)
    self.assertEqual(pretrained_emb.embed_vocab[0], "</s>")
    self.assertEqual(pretrained_emb.embed_vocab[2], "to")
    self.assertEqual(len(pretrained_emb.stoi), VOCAB_SIZE)
    self.assertEqual(pretrained_emb.stoi["</s>"], 0)
    self.assertEqual(pretrained_emb.stoi["to"], 2)
    self.assertEqual(pretrained_emb.embedding_vectors.size(0), VOCAB_SIZE)
    self.assertEqual(pretrained_emb.embedding_vectors.size(1), EMBED_DIM)
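# --- Illustrative sketch (assumed file format, not the PretrainedEmbedding parser) ---
# The test above is consistent with a word2vec-style text file: an optional header
# line ("<vocab_size> <dim>") followed by one token per line with its vector
# components, matching the skip_header and delimiter options seen elsewhere in
# this section. Tokens and values here are invented.
import torch

raw_lines = [
    "4 5",  # header: vocab size, embedding dim (skipped when skip_header=True)
    "</s> 0.1 0.2 0.3 0.4 0.5",
    "a 0.0 0.1 0.0 0.1 0.0",
    "to -0.3 0.2 0.1 0.0 0.4",
    "news 0.05 -0.2 0.3 0.0 0.1",
]

stoi, vectors = {}, []
for line in raw_lines[1:]:  # skip the header line
    token, *values = line.rstrip().split(" ")  # delimiter=" "
    stoi[token] = len(vectors)
    vectors.append([float(v) for v in values])

embedding_vectors = torch.tensor(vectors)
assert stoi["to"] == 2
assert embedding_vectors.size() == (4, 5)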
def from_config(
    cls,
    config: WordFeatConfig,
    metadata: Optional[FieldMeta] = None,
    tensorizer: Optional[Tensorizer] = None,
    init_from_saved_state: Optional[bool] = False,
):
    """Factory method to construct an instance of WordEmbedding from
    the module's config object and the field's metadata object.

    Args:
        config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
        metadata (FieldMeta): Object containing this field's metadata.

    Returns:
        type: An instance of WordEmbedding.
    """
    if tensorizer is not None:
        embeddings_weight = None
        if config.pretrained_embeddings_path and (
            # We don't need to load pretrained embeddings if we know the
            # embedding weights are going to be loaded from a snapshot. The
            # exception is if we rely on the pretrained embeddings to give us
            # the vocab, in which case we have to load it regardless.
            config.vocab_from_pretrained_embeddings
            or not init_from_saved_state
        ):
            pretrained_embedding = PretrainedEmbedding(
                config.pretrained_embeddings_path,  # doesn't support fbpkg
                lowercase_tokens=config.lowercase_tokens,
            )
            if config.vocab_from_pretrained_embeddings:
                # Pretrained embedding tokens will get a freq count of 1.
                assert config.min_freq == 1, (
                    "If `vocab_from_pretrained_embeddings` is set, the vocab's "
                    "`min_freq` must be 1"
                )
                if not config.vocab_from_train_data:
                    # Reset token counter.
                    tensorizer.vocab_builder._counter = collections.Counter()
                pretrained_vocab = pretrained_embedding.embed_vocab
                if config.vocab_size:
                    pretrained_vocab = pretrained_vocab[: config.vocab_size]
                tensorizer.vocab_builder.add_all(pretrained_vocab)
                tensorizer.vocab = tensorizer.vocab_builder.make_vocab()
            embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                tensorizer.vocab.idx,
                UNK,
                config.embed_dim,
                config.embedding_init_strategy,
            )
        num_embeddings = len(tensorizer.vocab)
        unk_token_idx = tensorizer.vocab.idx[UNK]
    else:
        # This else condition should go away after metadata goes away.
        num_embeddings = metadata.vocab_size
        embeddings_weight = metadata.pretrained_embeds_weight
        unk_token_idx = metadata.unk_token_idx

    return cls(
        num_embeddings=num_embeddings,
        embedding_dim=config.embed_dim,
        embeddings_weight=embeddings_weight,
        init_range=config.embedding_init_range,
        unk_token_idx=unk_token_idx,
        mlp_layer_dims=config.mlp_layer_dims,
    )
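# --- Illustrative sketch (standalone, not the PyText vocab builder API) ---
# The vocab_from_pretrained_embeddings branch above merges tokens from training
# data and/or from the pretrained embedding file through a frequency counter
# before the vocab is finalized; with vocab_from_train_data disabled the counter
# is reset first. The tokens and flags below are invented for illustration.
import collections

train_tokens = ["the", "quick", "fox", "the"]
pretrained_tokens = ["the", "to", "aloha"]
vocab_from_train_data = False

counter = collections.Counter(train_tokens)
if not vocab_from_train_data:
    counter = collections.Counter()  # mirror the counter reset above
counter.update(pretrained_tokens)  # each pretrained token gets a count of 1,
                                   # which is why the code above asserts min_freq == 1

vocab = ["__UNK__"] + sorted(counter)
print(vocab)  # ['__UNK__', 'aloha', 'the', 'to']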