Example #1
def embedding_matrix(
    vocab,
    embedding_size,
    representation='dense',
    embeddings_trainable=True,
    pretrained_embeddings=None,
    force_embedding_size=False,
    embedding_initializer=None,
):
    vocab_size = len(vocab)
    if representation == 'dense':
        if pretrained_embeddings is not None and pretrained_embeddings is not False:
            embeddings_matrix = load_pretrained_embeddings(
                pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                raise ValueError(
                    'The size of the pretrained embeddings is {}, '
                    'but the specified embedding_size is {}. '
                    'Please change the embedding_size accordingly.'.format(
                        embeddings_matrix.shape[-1], embedding_size))
            embedding_initializer_obj = tf.constant(embeddings_matrix,
                                                    dtype=tf.float32)

        else:
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    '  embedding_size ({}) is greater than vocab_size ({}). '
                    'Setting embedding size to be equal to vocab_size.'.format(
                        embedding_size, vocab_size))
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(
                    embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({
                    TYPE: 'uniform',
                    'minval': -1.0,
                    'maxval': 1.0
                })
            embedding_initializer_obj = embedding_initializer_obj_ref(
                [vocab_size, embedding_size])

        embeddings = tf.Variable(embedding_initializer_obj,
                                 trainable=embeddings_trainable,
                                 name='embeddings')

    elif representation == 'sparse':
        embedding_size = vocab_size
        embeddings = tf.Variable(
            get_initializer('identity')([vocab_size, embedding_size]),
            trainable=False,
            name='embeddings')

    else:
        raise Exception('Embedding representation {} not supported.'.format(
            representation))

    return embeddings, embedding_size
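The helpers used above (load_pretrained_embeddings, get_initializer, logger, TYPE) come from the surrounding module, so the function is not runnable in isolation. Below is a minimal, self-contained sketch of what the two branches produce, assuming plain TensorFlow 2 rather than the project's helpers: a trainable dense table drawn uniformly from [-1, 1], and a frozen identity matrix for the sparse (one-hot) case.

import tensorflow as tf

vocab = ["<UNK>", "red", "green", "blue"]
vocab_size, embedding_size = len(vocab), 3

# Dense branch: trainable table initialized uniformly in [-1, 1],
# mirroring the default 'uniform' initializer above.
dense_init = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)
dense_embeddings = tf.Variable(dense_init([vocab_size, embedding_size]),
                               trainable=True, name="embeddings")

# Sparse branch: frozen identity matrix (one-hot rows);
# embedding_size collapses to vocab_size.
sparse_embeddings = tf.Variable(tf.eye(vocab_size), trainable=False, name="embeddings")

print(dense_embeddings.shape)   # (4, 3)
print(sparse_embeddings.shape)  # (4, 4)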
Example #2
def embedding_matrix(
    vocab: List[str],
    embedding_size: int,
    representation: str = "dense",
    embeddings_trainable: bool = True,
    pretrained_embeddings: Optional[str] = None,
    force_embedding_size: bool = False,
    embedding_initializer: Optional[Union[str, Dict]] = None,
) -> Tuple[nn.Module, int]:
    """Returns initialized torch.nn.Embedding module and embedding size."""

    vocab_size = len(vocab)
    if representation == "dense":
        if pretrained_embeddings:
            embeddings_matrix = load_pretrained_embeddings(pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                if not force_embedding_size:
                    embedding_size = embeddings_matrix.shape[-1]
                    logger.info(f"Setting embedding size to be equal to {embeddings_matrix.shape[-1]}.")
                else:
                    raise ValueError(
                        f"The size of the pretrained embeddings is "
                        f"{embeddings_matrix.shape[-1]}, but the specified "
                        f"embedding_size is {embedding_size}. Please change "
                        f"the embedding_size accordingly."
                    )
            embedding_initializer_obj = torch.tensor(embeddings_matrix, dtype=torch.float32)

        else:
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    f"  embedding_size ({embedding_size}) is greater than "
                    f"vocab_size ({vocab_size}). Setting embedding size to be "
                    f"equal to vocab_size."
                )
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({TYPE: "uniform", "a": -1.0, "b": 1.0})
            embedding_initializer_obj = embedding_initializer_obj_ref([vocab_size, embedding_size])

        embeddings = embedding_initializer_obj

    elif representation == "sparse":
        embedding_size = vocab_size
        embeddings = get_initializer("identity")([vocab_size, embedding_size])
        embeddings.requires_grad = False
    else:
        raise Exception(f"Embedding representation {representation} not supported.")

    embeddings = nn.Embedding.from_pretrained(embeddings, freeze=not embeddings_trainable)
    return embeddings, embedding_size
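The PyTorch port above ends by wrapping whatever weight tensor it built in nn.Embedding.from_pretrained, using freeze to control trainability. A minimal sketch of just that pattern, with a uniformly initialized stand-in for the initializer object (Ludwig's get_initializer and load_pretrained_embeddings are not needed here):

import torch
import torch.nn as nn

vocab_size, embedding_size = 4, 3

# Weight matrix drawn uniformly from [-1, 1], standing in for embedding_initializer_obj.
weights = torch.empty(vocab_size, embedding_size).uniform_(-1.0, 1.0)

# freeze=False keeps the table trainable, mirroring embeddings_trainable=True.
embeddings = nn.Embedding.from_pretrained(weights, freeze=False)

token_ids = torch.tensor([0, 2, 3])
print(embeddings(token_ids).shape)  # torch.Size([3, 3])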
Example #3
def test_get_initializer():
    """Currently only checks for when the parameters are None."""
    tensor_size = (2, 3)

    # Test for when the parameters are None
    torch.random.manual_seed(0)
    initialized_tensor = get_initializer(None)(*tensor_size, device=DEVICE)

    # Check that the tensor using the expected initialization and the same seed is identical
    default_initializer = nn.init.xavier_uniform_
    torch.random.manual_seed(0)
    default_tensor = default_initializer(
        torch.empty(*tensor_size, device=DEVICE))
    assert torch.equal(initialized_tensor, default_tensor)
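The test relies on one property of PyTorch's in-place initializers: re-seeding the global RNG makes their draws deterministic. A standalone illustration of that property, without get_initializer:

import torch
import torch.nn as nn

torch.random.manual_seed(0)
a = nn.init.xavier_uniform_(torch.empty(2, 3))

torch.random.manual_seed(0)
b = nn.init.xavier_uniform_(torch.empty(2, 3))

# Both draws start from the same seed, so the tensors are identical.
assert torch.equal(a, b)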
Example #4
    def __init__(
            self,
            embedding_size=10,
            embeddings_on_cpu=False,
            should_softmax=False,
            fc_layers=None,
            num_fc_layers=0,
            fc_size=10,
            use_bias=True,
            weights_initializer='glorot_uniform',
            bias_initializer='zeros',
            weights_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            # weights_constraint=None,
            # bias_constraint=None,
            norm=None,
            norm_params=None,
            activation='relu',
            dropout=0,
            **kwargs):
        """
            :param embedding_size: it is the maximum embedding size, the actual
                   size will be `min(vocabulary_size, embedding_size)`
                   for `dense` representations and exactly `vocabulary_size`
                   for the `sparse` encoding, where `vocabulary_size` is
                   the number of different strings appearing in the training set
                   in the column the feature is named after (plus 1 for `<UNK>`).
            :type embedding_size: Integer
            :param embeddings_on_cpu: by default embeddings matrices are stored
                   on GPU memory if a GPU is used, as it allows
                   for faster access, but in some cases the embedding matrix
                   may be really big and this parameter forces the placement
                   of the embedding matrix in regular memory and the CPU is used
                   to resolve them, slightly slowing down the process
                   as a result of data transfer between CPU and GPU memory.
            :param dropout: dropout rate to apply before returning
                   the encoder output.
            :type dropout: Float
            :param initializer: the initializer to use. If `None`, the default
                   initializer of each variable is used (`glorot_uniform`
                   in most cases). Options are: `constant`, `identity`, `zeros`,
                    `ones`, `orthogonal`, `normal`, `uniform`,
                    `truncated_normal`, `variance_scaling`, `glorot_normal`,
                    `glorot_uniform`, `xavier_normal`, `xavier_uniform`,
                    `he_normal`, `he_uniform`, `lecun_normal`, `lecun_uniform`.
                    Alternatively it is possible to specify a dictionary with
                    a key `type` that identifies the type of initializer and
                    other keys for its parameters, e.g.
                    `{type: normal, mean: 0, stddev: 0}`.
                    To know the parameters of each initializer, please refer to
                    TensorFlow's documentation.
            :type initializer: str
            :param regularize: if `True` the embedding weights are added to
                   the set of weights that get regularized by a regularization
                   loss (if the `regularization_lambda` in `training`
                   is greater than 0).
            :type regularize: Boolean
        """
        super(H3WeightedSum, self).__init__()
        logger.debug(' {}'.format(self.name))

        self.should_softmax = should_softmax
        self.reduce_sequence = SequenceReducer(reduce_mode='sum')

        self.h3_embed = H3Embed(
            embedding_size,
            embeddings_on_cpu=embeddings_on_cpu,
            dropout=dropout,
            weights_initializer=weights_initializer,
            bias_initializer=bias_initializer,
            weights_regularizer=weights_regularizer,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            # weights_constraint=weights_constraint,
            # bias_constraint=bias_constraint,
            reduce_output=None)

        self.aggregation_weights = tf.Variable(
            get_initializer(weights_initializer)([19, 1]))

        logger.debug('  FCStack')
        self.fc_stack = FCStack(
            layers=fc_layers,
            num_layers=num_fc_layers,
            default_fc_size=fc_size,
            default_use_bias=use_bias,
            default_weights_initializer=weights_initializer,
            default_bias_initializer=bias_initializer,
            default_weights_regularizer=weights_regularizer,
            default_bias_regularizer=bias_regularizer,
            default_activity_regularizer=activity_regularizer,
            # default_weights_constraint=weights_constraint,
            # default_bias_constraint=bias_constraint,
            default_norm=norm,
            default_norm_params=norm_params,
            default_activation=activation,
            default_dropout=dropout,
        )
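The constructor only sets up the pieces; judging from the class name, the should_softmax flag, the [19, 1] aggregation_weights, and SequenceReducer(reduce_mode='sum'), the forward pass presumably softmaxes the per-component weights, scales the 19 embedded H3 components, and sums over them. A hedged TensorFlow sketch of that aggregation with made-up shapes, not the actual H3Embed/FCStack forward code:

import tensorflow as tf

batch_size, num_components, embedding_size = 2, 19, 10

# Stand-ins for the H3Embed output and the learned [19, 1] aggregation weights.
embedded = tf.random.normal([batch_size, num_components, embedding_size])
aggregation_weights = tf.Variable(tf.random.uniform([num_components, 1]))

# should_softmax=True path: normalize the weights into a convex combination.
weights = tf.nn.softmax(aggregation_weights, axis=0)

# Weight each component's embedding, then reduce with a sum.
weighted = embedded * weights              # [19, 1] broadcasts over the batch
output = tf.reduce_sum(weighted, axis=1)   # [batch_size, embedding_size]
print(output.shape)                        # (2, 10)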
Example #5
    def __init__(
        self,
        embedding_size: int = 10,
        embeddings_on_cpu: bool = False,
        should_softmax: bool = False,
        fc_layers: Optional[List] = None,
        num_fc_layers: int = 0,
        output_size: int = 10,
        use_bias: bool = True,
        weights_initializer: str = "xavier_uniform",
        bias_initializer: str = "zeros",
        norm: Optional[str] = None,
        norm_params: Optional[Dict] = None,
        activation: str = "relu",
        dropout: float = 0,
        **kwargs,
    ):
        """
        :param embedding_size: it is the maximum embedding size, the actual
               size will be `min(vocabulary_size, embedding_size)`
               for `dense` representations and exactly `vocabulary_size`
               for the `sparse` encoding, where `vocabulary_size` is
               the number of different strings appearing in the training set
               in the column the feature is named after (plus 1 for
               `<UNK>`).
        :type embedding_size: Integer
        :param embeddings_on_cpu: by default embeddings matrices are stored
               on GPU memory if a GPU is used, as it allows
               for faster access, but in some cases the embedding matrix
               may be really big and this parameter forces the placement
               of the embedding matrix in regular memory and the CPU is used
               to resolve them, slightly slowing down the process
               as a result of data transfer between CPU and GPU memory.
        :param dropout: dropout rate to apply before returning
               the encoder output.
        :type dropout: Float
        """
        super().__init__()
        logger.debug(f" {self.name}")

        self.should_softmax = should_softmax
        self.sum_sequence_reducer = SequenceReducer(reduce_mode="sum")

        self.h3_embed = H3Embed(
            embedding_size,
            embeddings_on_cpu=embeddings_on_cpu,
            dropout=dropout,
            weights_initializer=weights_initializer,
            bias_initializer=bias_initializer,
            reduce_output="None",
        )

        self.register_buffer(
            "aggregation_weights",
            torch.Tensor(
                get_initializer(weights_initializer)([H3_INPUT_SIZE, 1])))

        logger.debug("  FCStack")
        self.fc_stack = FCStack(
            first_layer_input_size=self.h3_embed.output_shape[0],
            layers=fc_layers,
            num_layers=num_fc_layers,
            default_output_size=output_size,
            default_use_bias=use_bias,
            default_weights_initializer=weights_initializer,
            default_bias_initializer=bias_initializer,
            default_norm=norm,
            default_norm_params=norm_params,
            default_activation=activation,
            default_dropout=dropout,
        )
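The same aggregation written against the PyTorch port's names; note that aggregation_weights is registered as a buffer above, so it is module state rather than a trainable parameter. This is a shape-level sketch only, not the module's actual forward method:

import torch

batch_size, h3_input_size, embedding_size = 2, 19, 10

embedded = torch.randn(batch_size, h3_input_size, embedding_size)
aggregation_weights = torch.rand(h3_input_size, 1)  # stand-in for the registered buffer

weights = torch.softmax(aggregation_weights, dim=0)  # should_softmax=True path
output = (embedded * weights).sum(dim=1)             # sum reduction over the 19 components
print(output.shape)                                  # torch.Size([2, 10])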