Example #1
    def __init__(self, cfg, name=None):
        """
        Fundamental pretrained Ernie model
        """
        log.debug('init ErnieModel with config: %s' % repr(cfg))
        D.Layer.__init__(self)
        d_model = cfg['hidden_size']
        d_emb = cfg.get('emb_size', cfg['hidden_size'])
        d_vocab = cfg['vocab_size']
        d_pos = cfg['max_position_embeddings']
        d_sent = cfg.get("sent_type_vocab_size") or cfg['type_vocab_size']
        self.n_head = cfg['num_attention_heads']
        self.return_additional_info = cfg.get('return_additional_info', False)
        initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range'])

        self.ln = _build_ln(d_model, name=append_name(name, 'pre_encoder'))
        self.word_emb = D.Embedding([d_vocab, d_emb], param_attr=F.ParamAttr(name=append_name(name, 'word_embedding'), initializer=initializer))
        self.pos_emb = D.Embedding([d_pos, d_emb], param_attr=F.ParamAttr(name=append_name(name, 'pos_embedding'), initializer=initializer))
        self.sent_emb = D.Embedding([d_sent, d_emb], param_attr=F.ParamAttr(name=append_name(name, 'sent_embedding'), initializer=initializer))
        prob = cfg['hidden_dropout_prob']
        self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i

        self.encoder_stack = ErnieEncoderStack(cfg, append_name(name, 'encoder'))
        if cfg.get('has_pooler', True):
            self.pooler = _build_linear(cfg['hidden_size'], cfg['hidden_size'], append_name(name, 'pooled_fc'), initializer, act='tanh')
        else:
            self.pooler = None
        self.train()
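
A minimal instantiation sketch for the model above. The config keys are only the ones this __init__ reads; the ErnieEncoderStack it builds will read further keys of its own, and every value here is an illustrative placeholder, not a real checkpoint:

# Hypothetical config covering only the keys read above.
cfg = {
    'hidden_size': 768,
    'vocab_size': 18000,
    'max_position_embeddings': 512,
    'type_vocab_size': 2,
    'num_attention_heads': 12,
    'initializer_range': 0.02,
    'hidden_dropout_prob': 0.1,
    'has_pooler': True,
}
model = ErnieModel(cfg, name='ernie')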
Example #2
    def __init__(self,
                 n_src_vocab,
                 len_max_seq,
                 n_layers,
                 n_head,
                 d_k,
                 d_q,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """Encoder layer of FastSpeech.

        Args:
            n_src_vocab (int): the number of source vocabulary.
            len_max_seq (int): the max mel len of sequence.
            n_layers (int): the layers number of FFTBlock.
            n_head (int): the head number of multihead attention.
            d_k (int): the dim of key in multihead attention.
            d_q (int): the dim of query in multihead attention.
            d_model (int): the dim of hidden layer in multihead attention.
            d_inner (int): the dim of hidden layer in ffn.
            fft_conv1d_kernel (int): the conv kernel size in FFTBlock.
            fft_conv1d_padding (int): the conv padding size in FFTBlock.
            dropout (float, optional): dropout probability of FFTBlock. Defaults to 0.1.
        """
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1
        self.n_head = n_head

        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model],
                                         padding_idx=0,
                                         param_attr=fluid.initializer.Normal(
                                             loc=0.0, scale=1.0))
        self.pos_inp = get_sinusoid_encoding_table(n_position,
                                                   d_model,
                                                   padding_idx=0)
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(d_model,
                     d_inner,
                     n_head,
                     d_k,
                     d_q,
                     fft_conv1d_kernel,
                     fft_conv1d_padding,
                     dropout=dropout) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
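
get_sinusoid_encoding_table is not shown in this snippet; below is a plausible numpy sketch of the standard fixed sinusoidal table it presumably returns. The exact signature and the zeroed padding row are assumptions inferred from how it is called above:

import numpy as np

def get_sinusoid_encoding_table(n_position, d_model, padding_idx=None):
    # classic transformer sinusoids: sin on even dims, cos on odd dims
    pos = np.arange(n_position)[:, None]
    dim = np.arange(d_model)[None, :]
    angle = pos / np.power(10000.0, 2 * (dim // 2) / d_model)
    table = np.zeros((n_position, d_model), dtype="float32")
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    if padding_idx is not None:
        table[padding_idx] = 0.0  # the padding position carries no signal
    return table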
Example #3
    def __init__(self, args):
        super(TranEmbed, self).__init__(args)

        # Initialize the feat embedding; feat can be char or pos
        if args.feat == "char":
            self.feat_embed = CharTransformer(
                n_chars=args.n_feats,
                n_out=args.n_tran_feat_embed,
                pad_index=args.feat_pad_index,
                nums_heads=args.n_tran_feat_head,
                num_layers=args.n_tran_feat_layer,
            )
            feat_embed_size = args.n_tran_feat_embed

        else:
            self.feat_embed = dygraph.Embedding(size=(args.n_feats,
                                                      args.n_feat_embed))
            feat_embed_size = args.n_feat_embed

        self.transformer = Transformer(
            hidden_size=args.n_embed + feat_embed_size,
            vocab_size=args.n_words,
            name="word_transformer",
            num_heads=args.n_tran_word_head,
            num_layers=args.n_tran_word_layer,
        )
        self.mlp_input_size = args.n_embed + feat_embed_size
Example #4
    def __init__(self, args):
        super(LSTMEmbed, self).__init__(args)

        # Initialize the feat embedding; feat can be char or pos
        if args.feat == "char":
            self.feat_embed = CharLSTM(
                n_chars=args.n_feats,
                n_embed=args.n_char_embed,
                n_out=args.n_lstm_feat_embed,
                pad_index=args.feat_pad_index,
            )
            feat_embed_size = args.n_lstm_feat_embed

        else:
            self.feat_embed = dygraph.Embedding(size=(args.n_feats,
                                                      args.n_feat_embed))
            feat_embed_size = args.n_feat_embed

        # lstm layer
        self.lstm = BiLSTM(
            input_size=args.n_embed + feat_embed_size,
            hidden_size=args.n_lstm_hidden,
            num_layers=args.n_lstm_layers,
            dropout=args.lstm_dropout,
        )
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)
        self.mlp_input_size = args.n_lstm_hidden * 2
Example #5
    def __init__(self,
            num_class,
            vocab_size,
            emb_dim=128,
            gru_dim=256,
            fc_hid_dim=256,
            is_sparse=True,
            bi_direction=True,
            ):
        super(GRU, self).__init__()

        self.bi_direction = bi_direction

        self.embedding = D.Embedding(
            size=[vocab_size, emb_dim],
            dtype='float32',
            #param_attr=F.ParamAttr(learning_rate=30),
            is_sparse=is_sparse)

        self._hid_fc1 = D.Linear(input_dim=emb_dim, output_dim=gru_dim * 3)

        self._gru_forward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=False)

        if bi_direction:
            self._gru_backward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=True)
            self._hid_fc2 = D.Linear(input_dim=gru_dim * 2, output_dim=fc_hid_dim, act="tanh")
        else:
            self._hid_fc2 = D.Linear(input_dim=gru_dim, output_dim=fc_hid_dim, act="tanh")

        self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
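
The emb_dim -> gru_dim * 3 projection feeds the GRU the concatenated pre-activations of its update, reset and candidate gates, which is what fluid's GRU unit expects. An instantiation sketch with placeholder sizes:

# Illustrative only; all dimensions are placeholders.
model = GRU(num_class=2, vocab_size=10000,
            emb_dim=128, gru_dim=256, bi_direction=True)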
Example #6
    def __init__(self,
                 backbone,
                 transformer,
                 num_classes,
                 num_queries,
                 aux_loss=False):
        """
        Initializes the model.

        Parameters:
            backbone: See backbone.py
            transformer: See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, i.e. the number of detection slots. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = dg.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = dg.Embedding((num_queries, hidden_dim))
        self.input_proj = dg.Conv2D(backbone.num_channels,
                                    hidden_dim,
                                    filter_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss
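
A hypothetical wiring of the detection head above, assuming it is the DETR class and that the backbone and transformer objects (placeholders here) expose the attributes read in __init__ (num_channels and d_model respectively):

# Placeholder components; 91 classes and 100 queries follow the COCO
# recommendation from the docstring.
model = DETR(backbone=backbone, transformer=transformer,
             num_classes=91, num_queries=100, aux_loss=True)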
Example #7
    def __init__(self,
                 n_class=1000,
                 chn=96,
                 blocks_with_attention="B2",
                 resolution=256):
        super().__init__()

        def DBlock(in_channel,
                   out_channel,
                   downsample=True,
                   use_attention=False,
                   skip_proj=None):
            return ResBlock(in_channel,
                            out_channel,
                            conditional=False,
                            upsample=False,
                            downsample=downsample,
                            use_attention=use_attention,
                            skip_proj=skip_proj)

        self.chn = chn
        self.colors = 3
        self.resolution = resolution
        self.blocks_with_attention = set(blocks_with_attention.split(","))
        self.blocks_with_attention.discard('')

        dblock = []
        in_channels, out_channels = self.get_in_out_channels()

        self.sa_ids = [
            int(s.split('B')[-1]) for s in self.blocks_with_attention
        ]

        for i, (nc_in,
                nc_out) in enumerate(zip(in_channels[:-1], out_channels[:-1])):
            dblock.append(
                DBlock(nc_in,
                       nc_out,
                       downsample=True,
                       use_attention=(i + 1) in self.sa_ids,
                       skip_proj=nc_in == nc_out))
        dblock.append(
            DBlock(in_channels[-1],
                   out_channels[-1],
                   downsample=False,
                   use_attention=len(out_channels) in self.sa_ids,
                   skip_proj=in_channels[-1] == out_channels[-1]))
        self.blocks = dg.LayerList(dblock)

        self.final_fc = SpectralNorm(dg.Linear(16 * chn, 1))

        self.embed_y = dg.Embedding(size=[n_class, 16 * chn],
                                    is_sparse=False,
                                    param_attr=Uniform(-0.1, 0.1))
        self.embed_y = SpectralNorm(self.embed_y)
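
How the blocks_with_attention spec above turns into block indices (pure Python, values illustrative):

# "B2,B5" means: apply self-attention in blocks 2 and 5.
spec = set("B2,B5".split(","))                 # {'B2', 'B5'}
sa_ids = [int(s.split('B')[-1]) for s in spec]
assert sorted(sa_ids) == [2, 5]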
Example #8
    def __init__(self, n_chars, n_embed, n_out, pad_index=0):
        super(CharLSTM, self).__init__()
        self.n_chars = n_chars
        self.n_embed = n_embed
        self.n_out = n_out
        self.pad_index = pad_index

        # the embedding layer
        self.embed = dygraph.Embedding(size=(n_chars, n_embed))
        # the lstm layer
        self.lstm = BiLSTM(input_size=n_embed, hidden_size=n_out // 2)
Example #9
 def __init__(self, vocab_size, emb_dim, is_sparse=True,
         dtype="float32", name="emb", padding_idx=None):
     """初始
     """
     super(EmbeddingLayer, self).__init__()
     self.emb_layer = D.Embedding(
         size=[vocab_size, emb_dim],
         dtype=dtype,
         is_sparse=is_sparse,
         padding_idx=padding_idx,
         param_attr=F.ParamAttr(
             name=name, initializer=F.initializer.Xavier()))
Example #10
def create_model(config):
    char_embedding = dg.Embedding((en.n_vocab, config["char_dim"]), param_attr=I.Normal(scale=0.1))
    multi_speaker = config["n_speakers"] > 1
    speaker_embedding = dg.Embedding((config["n_speakers"], config["speaker_dim"]), param_attr=I.Normal(scale=0.1)) \
        if multi_speaker else None
    encoder = Encoder(config["encoder_layers"], config["char_dim"], 
                      config["encoder_dim"], config["kernel_size"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    decoder = Decoder(config["n_mels"], config["reduction_factor"], 
                      list(config["prenet_sizes"]) + [config["char_dim"]], 
                      config["decoder_layers"], config["kernel_size"], 
                      config["attention_dim"],
                      position_encoding_weight=config["position_weight"], 
                      omega=config["position_rate"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    postnet = PostNet(config["postnet_layers"], config["char_dim"], 
                      config["postnet_dim"], config["kernel_size"], 
                      config["n_mels"], config["reduction_factor"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    spectranet = SpectraNet(char_embedding, speaker_embedding, encoder, decoder, postnet)
    return spectranet
Example #11
def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    # param attrs
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    layer = dg.Embedding(name_scope, (num_embeddings, embed_dim),
                         padding_idx=padding_idx,
                         param_attr=weight_attr,
                         dtype=dtype)
    return layer
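
An illustrative call to the factory above; the name scope and sizes are placeholders:

emb = Embedding("encoder", num_embeddings=10000, embed_dim=256,
                padding_idx=0, std=0.02)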
Example #12
    def __init__(self,
            num_class,
            vocab_size,
            emb_dim=32,
            num_filters=10,
            fc_hid_dim=32,
            num_channels=1,
            win_size_list=None,
            is_sparse=True,
            use_cudnn=True,
            ):
        super(TextCNN, self).__init__()

        self.embedding = D.Embedding(
            size=[vocab_size, emb_dim],
            dtype='float32',
            is_sparse=is_sparse)

        logging.info("num_class    = {}".format(num_class))
        logging.info("vocab size   = {}".format(vocab_size))
        logging.info("emb_dim      = {}".format(emb_dim))
        logging.info("num filters  = {}".format(num_filters))
        logging.info("fc_hid_dim   = {}".format(fc_hid_dim))
        logging.info("num channels = {}".format(num_channels))
        logging.info("windows size = {}".format(win_size_list))
        logging.info("is sparse    = {}".format(is_sparse))
        logging.info("use cudnn    = {}".format(use_cudnn))

        win_size_list = [3] if win_size_list is None else win_size_list
        def gen_conv_pool(win_size):
            """生成指定窗口的卷积池化层
            """
            return ConvPool(
                    num_channels,
                    num_filters,
                    [win_size, emb_dim],
                    padding=[1, 0],
                    use_cudnn=use_cudnn,
                    )

        self.conv_pool_list = D.LayerList([gen_conv_pool(win_size) for win_size in win_size_list])

        self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list), output_dim=fc_hid_dim, act="tanh")
        self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
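
A usage sketch with several window sizes (placeholder dimensions). Each window size gets its own ConvPool branch, and the _hid_fc input size shows that their pooled outputs are concatenated:

model = TextCNN(num_class=2, vocab_size=5000, emb_dim=32,
                num_filters=10, win_size_list=[2, 3, 4])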
Example #13
    def __init__(self, vocab_size, emb_dim=32, is_sparse=True, hidden_dropout_prob=0.1, triplet_margin=1.0, *args, **kwargs):
        super(TextCNNSiameseNet, self).__init__()
        self.triplet_margin = triplet_margin
        logging.info("triplet_margin: {}".format(triplet_margin))

        self.embedding = D.Embedding(
            size=[vocab_size, emb_dim],
            dtype='float32',
            is_sparse=is_sparse,
            )

        self.textcnn = TextCNN(emb_dim, *args, **kwargs)

        logging.info("feature dropout: {}".format(hidden_dropout_prob))

        self.dropout = lambda i: L.dropout(i,
                dropout_prob=hidden_dropout_prob,
                dropout_implementation="upscale_in_train",
                ) if self.training else i
Example #14
    def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
        """Encoder layer of TransformerTTS.

        Args:
            embedding_size (int): the size of position embedding.
            num_hidden (int): the size of hidden layer in network.
            num_head (int, optional): the head number of multihead attention. Defaults to 4.
            n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
        """
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=1.0))
        self.alpha = self.create_parameter(
            shape=(1, ), attr=param, dtype='float32')
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.encoder_prenet = EncoderPrenet(
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=True)
        self.layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden,
                num_hidden * num_head,
                filter_size=1,
                use_cudnn=True) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
Example #15
    def __init__(self, n_loop, n_layer, residual_channels, output_dim,
                 condition_dim, filter_size, loss_type, log_scale_min):
        """Wavenet that transform upsampled mel spectrogram into waveform.

        Args:
            n_loop (int): n_loop for the internal ResidualNet.
            n_layer (int): n_loop for the internal ResidualNet.
            residual_channels (int): the channel of the input.
            output_dim (int): the channel of the output distribution. 
            condition_dim (int): the channel of the condition.
            filter_size (int): the filter size of the internal ResidualNet.
            loss_type (str): loss type of the wavenet. Possible values are 'softmax' and 'mog'. If `loss_type` is 'softmax', the output is the logits of the catrgotical(multinomial) distribution, `output_dim` means the number of classes of the categorical distribution. If `loss_type` is mog(mixture of gaussians), the output is the parameters of a mixture of gaussians, which consists of weight(in the form of logit) of each gaussian distribution and its mean and log standard deviaton. So when `loss_type` is 'mog', `output_dim` should be perfectly divided by 3.
            log_scale_min (int): the minimum value of log standard deviation of the output gaussian distributions. Note that this value is only used for computing loss if `loss_type` is 'mog', values less than `log_scale_min` is clipped when computing loss.
        """
        super(WaveNet, self).__init__()
        if loss_type not in ["softmax", "mog"]:
            raise ValueError("loss_type {} is not supported".format(loss_type))
        if loss_type == "softmax":
            self.embed = dg.Embedding((output_dim, residual_channels))
        else:
            assert output_dim % 3 == 0, "with MoG output, the output dim must be divisible by 3"
            self.embed = Linear(1, residual_channels)

        self.resnet = ResidualNet(n_loop, n_layer, residual_channels,
                                  condition_dim, filter_size)
        self.context_size = self.resnet.context_size

        skip_channels = residual_channels  # assume skip channels equal residual channels
        self.proj1 = Linear(skip_channels, skip_channels)
        self.proj2 = Linear(skip_channels, skip_channels)
        # if loss_type is softmax, output_dim is the number of quantized waveform levels.
        # if loss_type is mog, output_dim is 3 * n_mixtures (weight, mean and stddev per component)
        self.proj3 = Linear(skip_channels, output_dim)

        self.loss_type = loss_type
        self.output_dim = output_dim
        self.input_dim = 1
        self.skip_channels = skip_channels
        self.log_scale_min = log_scale_min
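
With the mixture-of-gaussians head, output_dim is 3 * n_mixtures: a logit weight, a mean and a log standard deviation per component. An illustrative instantiation for a 10-component mixture; the other sizes are placeholders:

wavenet = WaveNet(n_loop=10, n_layer=3, residual_channels=128,
                  output_dim=3 * 10, condition_dim=80, filter_size=2,
                  loss_type="mog", log_scale_min=-9.0)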
Example #16
    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
        self.embed = dg.Embedding(self.full_name(),
                                  size=(n_position, d_pos_vec),
                                  is_sparse=is_sparse,
                                  is_distributed=is_distributed,
                                  padding_idx=None,
                                  param_attr=param_attr,
                                  dtype=dtype)
        self.set_weight(
            position_encoding_init(n_position,
                                   d_pos_vec,
                                   position_rate=position_rate,
                                   sinusoidal=False).astype(dtype))

        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        self._padding_idx = (-1 if padding_idx is None else
                             padding_idx if padding_idx >= 0 else
                             (n_position + padding_idx))
        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype
Example #17
 def __init__(self, args):
     super(PretraEmbedding, self).__init__()
     self.args = args
     # the embedding layer
     self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed))
     self.embed_dropout = IndependentDropout(p=args.embed_dropout)
Example #18
 def __init__(self, num_pos_feats=256):
     super().__init__()
     self.row_embed = dg.Embedding(size=(50, num_pos_feats))
     self.col_embed = dg.Embedding(size=(50, num_pos_feats))
     self.reset_parameters()
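
This is a learned 2D position embedding in the DETR style: one 50-entry table per axis caps feature maps at 50x50 positions. A rough numpy illustration of how the row and column vectors are typically broadcast and concatenated into a per-pixel encoding (the actual forward pass is not shown in the snippet):

import numpy as np

h, w, d = 32, 32, 256                   # feature-map size, per-axis dim
col_table = np.random.randn(50, d)      # stand-ins for the learned weights
row_table = np.random.randn(50, d)
col = np.broadcast_to(col_table[:w][None, :, :], (h, w, d))  # x-coordinate
row = np.broadcast_to(row_table[:h][:, None, :], (h, w, d))  # y-coordinate
pos = np.concatenate([col, row], axis=-1)                    # (h, w, 2 * d)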
Example #19
    def __init__(self, num_hidden, config, num_head=4, n_layers=3):
        """Decoder layer of TransformerTTS.

        Args:
            num_hidden (int): the number of source vocabulary.
            config: the yaml configs used in decoder.
            n_layers (int, optional): the layers number of multihead attention. Defaults to 4.
            num_head (int, optional): the head number of multihead attention. Defaults to 3.
        """
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        param = fluid.ParamAttr()
        self.alpha = self.create_parameter(
            shape=(1, ),
            attr=param,
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        self.pos_inp = get_sinusoid_encoding_table(1024,
                                                   self.num_hidden,
                                                   padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.decoder_prenet = PreNet(input_size=config['audio']['num_mels'],
                                     hidden_size=num_hidden * 2,
                                     output_size=num_hidden,
                                     dropout_rate=0.2)
        k = math.sqrt(1.0 / num_hidden)
        self.linear = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-k, high=k)))

        self.selfattn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(num_hidden,
                                    num_hidden * num_head,
                                    filter_size=1) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        self.mel_linear = dg.Linear(
            num_hidden,
            config['audio']['num_mels'] * config['audio']['outputs_per_step'],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-k, high=k)))
        self.stop_linear = dg.Linear(
            num_hidden,
            1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-k, high=k)))

        self.postconvnet = PostConvNet(
            config['audio']['num_mels'],
            config['hidden_size'],
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=config['audio']['outputs_per_step'],
            use_cudnn=True)
Example #20
def make_model(config):
    c = config["model"]
    # speaker embedding
    n_speakers = c["n_speakers"]
    speaker_dim = c["speaker_embed_dim"]
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=c["speaker_embedding_weight_std"]))
    else:
        speaker_embed = None

    # encoder
    h = c["encoder_channels"]
    k = c["kernel_size"]
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    encoder = Encoder(n_vocab=en.n_vocab,
                      embed_dim=c["text_embed_dim"],
                      n_speakers=n_speakers,
                      speaker_dim=speaker_dim,
                      embedding_weight_std=c["embedding_weight_std"],
                      convolutions=encoder_convolutions,
                      dropout=c["dropout"])
    if c["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder
    h = c["decoder_channels"]
    k = c["kernel_size"]
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    window = WindowRange(c["window_backward"], c["window_ahead"])
    decoder = Decoder(n_speakers,
                      speaker_dim,
                      embed_dim=c["text_embed_dim"],
                      mel_dim=config["transform"]["n_mels"],
                      r=c["outputs_per_step"],
                      max_positions=c["max_positions"],
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=c["dropout"],
                      use_memory_mask=c["use_memory_mask"],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=c["query_position_rate"],
                      key_position_rate=c["key_position_rate"],
                      window_range=window,
                      key_projection=c["key_projection"],
                      value_projection=c["value_projection"])
    if not c["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter(postnet)
    linear_dim = 1 + config["transform"]["n_fft"] // 2
    h = c["converter_channels"]
    k = c["kernel_size"]
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    use_decoder_states = c["use_decoder_state_for_postnet_input"]
    converter = Converter(n_speakers,
                          speaker_dim,
                          in_channels=decoder.state_dim if use_decoder_states
                          else config["transform"]["n_mels"],
                          linear_dim=linear_dim,
                          time_upsampling=c["downsample_factor"],
                          convolutions=postnet_convolutions,
                          dropout=c["dropout"])

    model = DeepVoice3(encoder,
                       decoder,
                       converter,
                       speaker_embed,
                       use_decoder_states=use_decoder_states)
    return model
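
The repeated (1, 3, 9, 27) dilation cycle grows the receptive field geometrically: each conv with kernel size k and dilation d adds (k - 1) * d steps of context, so one cycle covers 1 + (k - 1) * (1 + 3 + 9 + 27) frames:

# Receptive-field arithmetic for a single dilation cycle (illustrative):
k = 5                                        # a typical kernel_size
rf = 1 + sum((k - 1) * d for d in (1, 3, 9, 27))
assert rf == 161                             # one cycle spans 161 steps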
Example #21
 def __init__(self):
     super().__init__()
     self.emb = D.Embedding([30002, 128], padding_idx=0)
     self.cnn = D.Conv2D(128, 128, (1, 3), padding=(0, 1), act='relu')
     self.pool = D.Pool2D((1, 3), pool_padding=(0, 1))
     self.fc = D.Linear(128, 2)
Example #22
 def __init__(self):
     super().__init__()
     self.emb = D.Embedding([len(student_vocab), 128], padding_idx=0)
     self.fc = D.Linear(128, 2)
Example #23
    def __init__(self, args, pretrained_embed=None):
        super(Model, self).__init__()
        self.args = args
        # the embedding layer
        self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed))

        if args.pretrained_embed_shape is not None:
            if pretrained_embed is not None:
                pre_param_attrs = fluid.ParamAttr(
                    name="pretrained_emb",
                    initializer=initializer.NumpyArrayInitializer(
                        pretrained_embed),
                    trainable=True)
                self.pretrained = dygraph.Embedding(
                    size=args.pretrained_embed_shape,
                    param_attr=pre_param_attrs)
                self.word_embed.weight = layers.create_parameter(
                    shape=(self.args.n_words, self.args.n_embed),
                    dtype='float32',
                    default_initializer=initializer.Constant(value=0.0))
            else:
                self.pretrained = dygraph.Embedding(
                    size=args.pretrained_embed_shape)
        # Initialize the feat embedding; feat can be char or pos
        if args.feat == 'char':
            self.feat_embed = CharLSTM(n_chars=args.n_feats,
                                       n_embed=args.n_char_embed,
                                       n_out=args.n_feat_embed,
                                       pad_index=args.feat_pad_index)
        else:
            self.feat_embed = dygraph.Embedding(size=(args.n_feats,
                                                      args.n_feat_embed))
        self.embed_dropout = IndependentDropout(p=args.embed_dropout)

        # lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed + args.n_feat_embed,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

        # mlp layer
        self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_rel,
                             dropout=args.mlp_dropout)
        self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_rel,
                             dropout=args.mlp_dropout)

        # biaffine layers
        self.arc_attn = Biaffine(n_in=args.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                                 n_out=args.n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.pad_index = args.pad_index
        self.unk_index = args.unk_index
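
The Biaffine layers score head/dependent pairs in the usual deep-biaffine fashion, s[i, j] = x_i^T W y_j, with a ones column appended to x and/or y when bias_x / bias_y are set. A numpy sketch of the arc scorer (shapes illustrative):

import numpy as np

n, d = 10, 500                          # sentence length, n_mlp_arc
x = np.concatenate([np.random.randn(n, d), np.ones((n, 1))], axis=1)
y = np.random.randn(n, d)               # bias_x=True, bias_y=False as above
W = np.random.randn(d + 1, d)
arc_scores = x @ W @ y.T                # (n, n): score of head j for word i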
Example #24
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection, value_projection,
               downsample_factor, linear_dim, use_decoder_states,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding((n_speakers, speaker_dim),
                           param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None

    h = encoder_channels
    k = filter_size
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    enc = Encoder(n_vocab,
                  embed_dim,
                  n_speakers,
                  speaker_dim,
                  padding_idx=None,
                  embedding_weight_std=embedding_std,
                  convolutions=encoder_convolutions,
                  dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)

    h = decoder_channels
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(n_speakers,
                  speaker_dim,
                  embed_dim,
                  mel_dim,
                  r=r,
                  max_positions=max_positions,
                  preattention=prenet_convolutions,
                  convolutions=attentive_convolutions,
                  attention=attention,
                  dropout=dropout,
                  use_memory_mask=use_memory_mask,
                  force_monotonic_attention=force_monotonic_attention,
                  query_position_rate=query_position_rate,
                  key_position_rate=key_position_rate,
                  window_range=WindowRange(window_behind, window_ahead),
                  key_projection=key_projection,
                  value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)

    h = converter_channels
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    cvt = Converter(n_speakers,
                    speaker_dim,
                    dec.state_dim if use_decoder_states else mel_dim,
                    linear_dim,
                    time_upsampling=downsample_factor,
                    convolutions=postnet_convolutions,
                    dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
Example #25
 def init_ernie_model(self, args):
     self.word_embed = dygraph.Embedding(size=(args.ernie_vocabs_size,
                                               args.lstm_by_wp_embed_size))
Example #26
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_dim,
                 padding_idx=None,
                 embedding_weight_std=0.1,
                 convolutions=(ConvSpec(64, 5, 1), ) * 7,
                 dropout=0.):
        """Encoder of Deep Voice 3.

        Args:
            n_vocab (int): vocabulary size of the text embedding.
            embed_dim (int): embedding size of the text embedding.
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            padding_idx (int, optional): padding index of text embedding. Defaults to None.
            embedding_weight_std (float, optional): standard deviation of the embedding weights when initialized. Defaults to 0.1.
            convolutions (Iterable[ConvSpec], optional): specifications of the convolutional layers. ConvSpec is a namedtuple of output channels, filter_size and dilation. Defaults to (ConvSpec(64, 5, 1), ) * 7.
            dropout (float, optional): dropout probability. Defaults to 0.
        """
        super(Encoder, self).__init__()
        self.embedding_weight_std = embedding_weight_std
        self.embed = dg.Embedding(
            (n_vocab, embed_dim),
            padding_idx=padding_idx,
            param_attr=I.Normal(scale=embedding_weight_std))

        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        self.convolutions = dg.LayerList()
        in_channels = embed_dim
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in convolutions:
            # 1 * 1 convolution & relu
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0

            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=False,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
Example #27
    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
        """ Encoder prenet layer of TransformerTTS.

        Args:
            embedding_size (int): the size of embedding.
            num_hidden (int): the size of hidden layer in network.
            use_cudnn (bool, optional): use cudnn or not. Defaults to True.
        """
        super(EncoderPrenet, self).__init__()
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.embedding = dg.Embedding(
            size=[len(symbols), embedding_size],
            padding_idx=0,
            param_attr=fluid.initializer.Normal(
                loc=0.0, scale=1.0))
        self.conv_list = []
        k = math.sqrt(1.0 / embedding_size)
        self.conv_list.append(
            Conv1D(
                num_channels=embedding_size,
                num_filters=num_hidden,
                filter_size=5,
                padding=int(np.floor(5 / 2)),
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))
        k = math.sqrt(1.0 / num_hidden)
        for _ in range(2):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=5,
                    padding=int(np.floor(5 / 2)),
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(3)
        ]

        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

        k = math.sqrt(1.0 / num_hidden)
        self.projection = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
Example #28
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-1780000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")
        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = ruamel.yaml.safe_load(f)

        with fluid.dygraph.guard(fluid.CPUPlace()):
            char_embedding = dg.Embedding(
                (en.n_vocab, self.tts_config["char_dim"]))
            multi_speaker = self.tts_config["n_speakers"] > 1
            speaker_embedding = dg.Embedding((self.tts_config["n_speakers"], self.tts_config["speaker_dim"])) \
                if multi_speaker else None
            encoder = Encoder(self.tts_config["encoder_layers"],
                              self.tts_config["char_dim"],
                              self.tts_config["encoder_dim"],
                              self.tts_config["kernel_size"],
                              has_bias=multi_speaker,
                              bias_dim=self.tts_config["speaker_dim"],
                              keep_prob=1.0 - self.tts_config["dropout"])
            decoder = Decoder(
                self.tts_config["n_mels"],
                self.tts_config["reduction_factor"],
                list(self.tts_config["prenet_sizes"]) +
                [self.tts_config["char_dim"]],
                self.tts_config["decoder_layers"],
                self.tts_config["kernel_size"],
                self.tts_config["attention_dim"],
                position_encoding_weight=self.tts_config["position_weight"],
                omega=self.tts_config["position_rate"],
                has_bias=multi_speaker,
                bias_dim=self.tts_config["speaker_dim"],
                keep_prob=1.0 - self.tts_config["dropout"])
            postnet = PostNet(self.tts_config["postnet_layers"],
                              self.tts_config["char_dim"],
                              self.tts_config["postnet_dim"],
                              self.tts_config["kernel_size"],
                              self.tts_config["n_mels"],
                              self.tts_config["reduction_factor"],
                              has_bias=multi_speaker,
                              bias_dim=self.tts_config["speaker_dim"],
                              keep_prob=1.0 - self.tts_config["dropout"])
            self.tts_model = SpectraNet(char_embedding, speaker_embedding,
                                        encoder, decoder, postnet)
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)
            for name, layer in self.tts_model.named_sublayers():
                try:
                    remove_weight_norm(layer)
                except ValueError:
                    # this layer has no weight norm hook
                    pass

            self.waveflow = WaveflowVocoder(
                config_path=self.waveflow_config_path,
                checkpoint_path=self.waveflow_checkpoint_path)
            self.griffin = GriffinLimVocoder(
                sharpening_factor=self.tts_config["sharpening_factor"],
                sample_rate=self.tts_config["sample_rate"],
                n_fft=self.tts_config["n_fft"],
                win_length=self.tts_config["win_length"],
                hop_length=self.tts_config["hop_length"])