# TensorFlow implementation.
def embedding_matrix(
        vocab,
        embedding_size,
        representation='dense',
        embeddings_trainable=True,
        pretrained_embeddings=None,
        force_embedding_size=False,
        embedding_initializer=None,
):
    vocab_size = len(vocab)
    if representation == 'dense':
        if pretrained_embeddings is not None and pretrained_embeddings is not False:
            # Load pretrained embeddings and check that their width matches
            # the requested embedding_size.
            embeddings_matrix = load_pretrained_embeddings(
                pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                raise ValueError(
                    'The size of the pretrained embeddings is {}, '
                    'but the specified embedding_size is {}. '
                    'Please change the embedding_size accordingly.'.format(
                        embeddings_matrix.shape[-1], embedding_size))
            embedding_initializer_obj = tf.constant(embeddings_matrix,
                                                    dtype=tf.float32)
        else:
            # Cap the embedding size at the vocabulary size unless explicitly
            # forced to a larger value.
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    ' embedding_size ({}) is greater than vocab_size ({}). '
                    'Setting embedding size to be equal to vocab_size.'.format(
                        embedding_size, vocab_size))
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(
                    embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({
                    TYPE: 'uniform',
                    'minval': -1.0,
                    'maxval': 1.0
                })
            embedding_initializer_obj = embedding_initializer_obj_ref(
                [vocab_size, embedding_size])

        embeddings = tf.Variable(embedding_initializer_obj,
                                 trainable=embeddings_trainable,
                                 name='embeddings')

    elif representation == 'sparse':
        # Sparse representation: a frozen identity matrix, i.e. one-hot
        # encodings of the vocabulary.
        embedding_size = vocab_size
        embeddings = tf.Variable(
            get_initializer('identity')([vocab_size, embedding_size]),
            trainable=False,
            name='embeddings')

    else:
        raise Exception('Embedding representation {} not supported.'.format(
            representation))

    return embeddings, embedding_size
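# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of calling the TensorFlow variant above. It assumes the
# surrounding module provides load_pretrained_embeddings, get_initializer,
# TYPE, and logger; the vocabulary and embedding_size below are hypothetical.
import tensorflow as tf

vocab = ['<PAD>', '<UNK>', 'hello', 'world']

# Dense, trainable table with the default uniform initializer.
embeddings, size = embedding_matrix(vocab, embedding_size=3)

# Look up vectors for a batch of token ids.
token_ids = tf.constant([2, 3])
vectors = tf.nn.embedding_lookup(embeddings, token_ids)  # shape: (2, size)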
# PyTorch implementation (returns a torch.nn.Embedding).
def embedding_matrix(
    vocab: List[str],
    embedding_size: int,
    representation: str = "dense",
    embeddings_trainable: bool = True,
    pretrained_embeddings: Optional[str] = None,
    force_embedding_size: bool = False,
    embedding_initializer: Optional[Union[str, Dict]] = None,
) -> Tuple[nn.Module, int]:
    """Returns an initialized torch.nn.Embedding module and the embedding size."""
    vocab_size = len(vocab)
    if representation == "dense":
        if pretrained_embeddings:
            # Load pretrained embeddings; adapt embedding_size to their width
            # unless force_embedding_size is set.
            embeddings_matrix = load_pretrained_embeddings(pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                if not force_embedding_size:
                    embedding_size = embeddings_matrix.shape[-1]
                    logger.info(f"Setting embedding size to be equal to {embeddings_matrix.shape[-1]}.")
                else:
                    raise ValueError(
                        f"The size of the pretrained embeddings is "
                        f"{embeddings_matrix.shape[-1]}, but the specified "
                        f"embedding_size is {embedding_size}. Please change "
                        f"the embedding_size accordingly."
                    )
            embedding_initializer_obj = torch.tensor(embeddings_matrix, dtype=torch.float32)
        else:
            # Cap the embedding size at the vocabulary size unless explicitly forced.
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    f" embedding_size ({embedding_size}) is greater than "
                    f"vocab_size ({vocab_size}). Setting embedding size to be "
                    f"equal to vocab_size."
                )
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({TYPE: "uniform", "a": -1.0, "b": 1.0})
            embedding_initializer_obj = embedding_initializer_obj_ref([vocab_size, embedding_size])

        embeddings = embedding_initializer_obj

    elif representation == "sparse":
        # Sparse representation: one-hot encodings built from an identity matrix.
        embedding_size = vocab_size
        embeddings = get_initializer("identity")([vocab_size, embedding_size])
        embeddings.requires_grad = False
    else:
        raise Exception(f"Embedding representation {representation} not supported.")

    # Wrap the weight matrix in an nn.Embedding; freeze it when the embeddings
    # are not trainable.
    embeddings = nn.Embedding.from_pretrained(embeddings, freeze=not embeddings_trainable)

    return embeddings, embedding_size
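# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of calling the PyTorch variant above. It assumes the
# surrounding module provides load_pretrained_embeddings, get_initializer,
# TYPE, and logger; the vocabulary and embedding_size below are hypothetical.
import torch

vocab = ["<PAD>", "<UNK>", "hello", "world"]

# Dense, trainable nn.Embedding with the default uniform initializer.
embeddings, size = embedding_matrix(vocab, embedding_size=3)

# nn.Embedding performs the lookup directly on a tensor of token ids.
token_ids = torch.tensor([2, 3])
vectors = embeddings(token_ids)  # shape: (2, size)

# Sparse one-hot representation: the returned size equals vocab_size.
onehot, onehot_size = embedding_matrix(vocab, embedding_size=3, representation="sparse")
assert onehot_size == len(vocab)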