def __init__(self, cfg, name=None):
    """
    Fundamental pretrained Ernie model
    """
    log.debug('init ErnieModel with config: %s' % repr(cfg))
    D.Layer.__init__(self)
    d_model = cfg['hidden_size']
    d_emb = cfg.get('emb_size', cfg['hidden_size'])
    d_vocab = cfg['vocab_size']
    d_pos = cfg['max_position_embeddings']
    d_sent = cfg.get("sent_type_vocab_size") or cfg['type_vocab_size']
    self.n_head = cfg['num_attention_heads']
    self.return_additional_info = cfg.get('return_additional_info', False)
    initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range'])

    self.ln = _build_ln(d_model, name=append_name(name, 'pre_encoder'))
    self.word_emb = D.Embedding([d_vocab, d_emb],
                                param_attr=F.ParamAttr(
                                    name=append_name(name, 'word_embedding'),
                                    initializer=initializer))
    self.pos_emb = D.Embedding([d_pos, d_emb],
                               param_attr=F.ParamAttr(
                                   name=append_name(name, 'pos_embedding'),
                                   initializer=initializer))
    self.sent_emb = D.Embedding([d_sent, d_emb],
                                param_attr=F.ParamAttr(
                                    name=append_name(name, 'sent_embedding'),
                                    initializer=initializer))
    prob = cfg['hidden_dropout_prob']
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i

    self.encoder_stack = ErnieEncoderStack(cfg, append_name(name, 'encoder'))
    if cfg.get('has_pooler', True):
        self.pooler = _build_linear(cfg['hidden_size'],
                                    cfg['hidden_size'],
                                    append_name(name, 'pooled_fc'),
                                    initializer,
                                    act='tanh')
    else:
        self.pooler = None
    self.train()
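# A minimal, hypothetical `cfg` for the constructor above. The key names are
# exactly the ones the code reads; the values are illustrative only and are
# not taken from any released ERNIE checkpoint.
_example_cfg = {
    'hidden_size': 768,
    'emb_size': 768,                  # optional; falls back to hidden_size
    'vocab_size': 18000,
    'max_position_embeddings': 513,
    'type_vocab_size': 2,             # used when sent_type_vocab_size is absent
    'num_attention_heads': 12,
    'initializer_range': 0.02,
    'hidden_dropout_prob': 0.1,
    'has_pooler': True,
}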
def __init__(self,
             n_src_vocab,
             len_max_seq,
             n_layers,
             n_head,
             d_k,
             d_q,
             d_model,
             d_inner,
             fft_conv1d_kernel,
             fft_conv1d_padding,
             dropout=0.1):
    """Encoder layer of FastSpeech.

    Args:
        n_src_vocab (int): the size of the source vocabulary.
        len_max_seq (int): the max mel length of a sequence.
        n_layers (int): the number of FFTBlock layers.
        n_head (int): the number of heads in multihead attention.
        d_k (int): the dim of the key in multihead attention.
        d_q (int): the dim of the query in multihead attention.
        d_model (int): the dim of the hidden layer in multihead attention.
        d_inner (int): the dim of the hidden layer in the ffn.
        fft_conv1d_kernel (int): the conv kernel size in FFTBlock.
        fft_conv1d_padding (int): the conv padding size in FFTBlock.
        dropout (float, optional): dropout probability of FFTBlock. Defaults to 0.1.
    """
    super(Encoder, self).__init__()
    n_position = len_max_seq + 1
    self.n_head = n_head

    self.src_word_emb = dg.Embedding(
        size=[n_src_vocab, d_model],
        padding_idx=0,
        param_attr=fluid.initializer.Normal(loc=0.0, scale=1.0))
    self.pos_inp = get_sinusoid_encoding_table(
        n_position, d_model, padding_idx=0)
    self.position_enc = dg.Embedding(
        size=[n_position, d_model],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False))
    self.layer_stack = [
        FFTBlock(d_model, d_inner, n_head, d_k, d_q, fft_conv1d_kernel,
                 fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.layer_stack):
        self.add_sublayer('fft_{}'.format(i), layer)
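# A hedged numpy sketch of what `get_sinusoid_encoding_table` conventionally
# returns (the actual helper is defined elsewhere in this module): row `pos`,
# columns 2i / 2i+1 hold sin / cos of pos / 10000^(2i / d_model), with the
# padding row zeroed. `_sinusoid_table` is an illustrative name, not the
# module's function.
import numpy as np

def _sinusoid_table(n_position, d_model, padding_idx=None):
    pos = np.arange(n_position)[:, None]        # [n_position, 1]
    dim = np.arange(d_model)[None, :]           # [1, d_model]
    angle = pos / np.power(10000, 2 * (dim // 2) / d_model)
    table = np.zeros((n_position, d_model), dtype="float32")
    table[:, 0::2] = np.sin(angle[:, 0::2])     # even dims: sine
    table[:, 1::2] = np.cos(angle[:, 1::2])     # odd dims: cosine
    if padding_idx is not None:
        table[padding_idx] = 0.0                # padding position gets all zeros
    return table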
def __init__(self, args):
    super(TranEmbed, self).__init__(args)
    # Initialize the feat embedding; feat can be char or pos
    if args.feat == "char":
        self.feat_embed = CharTransformer(
            n_chars=args.n_feats,
            n_out=args.n_tran_feat_embed,
            pad_index=args.feat_pad_index,
            nums_heads=args.n_tran_feat_head,
            num_layers=args.n_tran_feat_layer,
        )
        feat_embed_size = args.n_tran_feat_embed
    else:
        self.feat_embed = dygraph.Embedding(size=(args.n_feats, args.n_feat_embed))
        feat_embed_size = args.n_feat_embed

    self.transformer = Transformer(
        hidden_size=args.n_embed + feat_embed_size,
        vocab_size=args.n_words,
        name="word_transformer",
        num_heads=args.n_tran_word_head,
        num_layers=args.n_tran_word_layer,
    )
    self.mlp_input_size = args.n_embed + feat_embed_size
def __init__(self, args):
    super(LSTMEmbed, self).__init__(args)
    # Initialize the feat embedding; feat can be char or pos
    if args.feat == "char":
        self.feat_embed = CharLSTM(
            n_chars=args.n_feats,
            n_embed=args.n_char_embed,
            n_out=args.n_lstm_feat_embed,
            pad_index=args.feat_pad_index,
        )
        feat_embed_size = args.n_lstm_feat_embed
    else:
        self.feat_embed = dygraph.Embedding(size=(args.n_feats, args.n_feat_embed))
        feat_embed_size = args.n_feat_embed

    # lstm layer
    self.lstm = BiLSTM(
        input_size=args.n_embed + feat_embed_size,
        hidden_size=args.n_lstm_hidden,
        num_layers=args.n_lstm_layers,
        dropout=args.lstm_dropout,
    )
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)
    # both LSTM directions are concatenated, hence 2 * n_lstm_hidden
    self.mlp_input_size = args.n_lstm_hidden * 2
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=128,
             gru_dim=256,
             fc_hid_dim=256,
             is_sparse=True,
             bi_direction=True):
    super(GRU, self).__init__()
    self.bi_direction = bi_direction
    self.embedding = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype='float32',
        #param_attr=F.ParamAttr(learning_rate=30),
        is_sparse=is_sparse)
    # the GRU cell consumes 3 * gru_dim features (update, reset and candidate gates)
    self._hid_fc1 = D.Linear(input_dim=emb_dim, output_dim=gru_dim * 3)
    self._gru_forward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=False)
    if bi_direction:
        self._gru_backward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=True)
        # forward and backward states are concatenated, hence 2 * gru_dim
        self._hid_fc2 = D.Linear(input_dim=gru_dim * 2,
                                 output_dim=fc_hid_dim,
                                 act="tanh")
    else:
        self._hid_fc2 = D.Linear(input_dim=gru_dim,
                                 output_dim=fc_hid_dim,
                                 act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim,
                               output_dim=num_class,
                               act=None)
def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
    """ Initializes the model.

    Parameters:
        backbone: See backbone.py
        transformer: See transformer.py
        num_classes: number of object classes
        num_queries: number of object queries, i.e. the detection slots. This is the
            maximal number of objects DETR can detect in a single image. For COCO,
            we recommend 100 queries.
        aux_loss: True if auxiliary decoding losses (loss at each decoder layer)
            are to be used.
    """
    super().__init__()
    self.num_queries = num_queries
    self.transformer = transformer
    hidden_dim = transformer.d_model
    # +1 output for the "no object" class
    self.class_embed = dg.Linear(hidden_dim, num_classes + 1)
    self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
    self.query_embed = dg.Embedding((num_queries, hidden_dim))
    self.input_proj = dg.Conv2D(backbone.num_channels, hidden_dim, filter_size=1)
    self.backbone = backbone
    self.aux_loss = aux_loss
def __init__(self, n_class=1000, chn=96, blocks_with_attention="B2", resolution=256):
    super().__init__()

    def DBlock(in_channel, out_channel, downsample=True, use_attention=False,
               skip_proj=None):
        return ResBlock(in_channel,
                        out_channel,
                        conditional=False,
                        upsample=False,
                        downsample=downsample,
                        use_attention=use_attention,
                        skip_proj=skip_proj)

    self.chn = chn
    self.colors = 3
    self.resolution = resolution
    self.blocks_with_attention = set(blocks_with_attention.split(","))
    self.blocks_with_attention.discard('')

    dblock = []
    in_channels, out_channels = self.get_in_out_channels()
    self.sa_ids = [int(s.split('B')[-1]) for s in self.blocks_with_attention]

    for i, (nc_in, nc_out) in enumerate(zip(in_channels[:-1], out_channels[:-1])):
        dblock.append(
            DBlock(nc_in,
                   nc_out,
                   downsample=True,
                   use_attention=(i + 1) in self.sa_ids,
                   skip_proj=nc_in == nc_out))
    dblock.append(
        DBlock(in_channels[-1],
               out_channels[-1],
               downsample=False,
               use_attention=len(out_channels) in self.sa_ids,
               skip_proj=in_channels[-1] == out_channels[-1]))
    self.blocks = dg.LayerList(dblock)

    self.final_fc = SpectralNorm(dg.Linear(16 * chn, 1))

    self.embed_y = dg.Embedding(size=[n_class, 16 * chn],
                                is_sparse=False,
                                param_attr=Uniform(-0.1, 0.1))
    self.embed_y = SpectralNorm(self.embed_y)
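# A quick plain-Python illustration of how the "B<k>" spec above is parsed,
# mirroring the lines that build `blocks_with_attention` and `sa_ids`:
spec = set("B2,B5".split(","))            # {'B2', 'B5'}
spec.discard('')                          # guards against an empty spec string
sa_ids = [int(s.split('B')[-1]) for s in spec]
# sa_ids contains 2 and 5 (set order may vary): self-attention is enabled
# on blocks 2 and 5 of the stack.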
def __init__(self, n_chars, n_embed, n_out, pad_index=0):
    super(CharLSTM, self).__init__()

    self.n_chars = n_chars
    self.n_embed = n_embed
    self.n_out = n_out
    self.pad_index = pad_index

    # the embedding layer
    self.embed = dygraph.Embedding(size=(n_chars, n_embed))
    # the lstm layer; each direction gets n_out // 2 units so the
    # concatenated bidirectional state has n_out features
    self.lstm = BiLSTM(input_size=n_embed, hidden_size=n_out // 2)
def __init__(self,
             vocab_size,
             emb_dim,
             is_sparse=True,
             dtype="float32",
             name="emb",
             padding_idx=None):
    """Initialization
    """
    super(EmbeddingLayer, self).__init__()
    self.emb_layer = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype=dtype,
        is_sparse=is_sparse,
        padding_idx=padding_idx,
        param_attr=F.ParamAttr(
            name=name,
            initializer=F.initializer.Xavier()))
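# A minimal sketch (assuming the PaddlePaddle 1.x fluid dygraph API used
# throughout these snippets) of what `padding_idx` does: lookups of the
# padding id come back as all-zero vectors.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as D

with fluid.dygraph.guard(fluid.CPUPlace()):
    emb = D.Embedding(size=[10, 4], padding_idx=0)
    ids = fluid.dygraph.to_variable(np.array([[3, 0]], dtype="int64"))
    out = emb(ids)  # shape [1, 2, 4]; out[0, 1] is zeros since id 0 is padding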
def create_model(config):
    char_embedding = dg.Embedding((en.n_vocab, config["char_dim"]),
                                  param_attr=I.Normal(scale=0.1))
    multi_speaker = config["n_speakers"] > 1
    speaker_embedding = (dg.Embedding(
        (config["n_speakers"], config["speaker_dim"]),
        param_attr=I.Normal(scale=0.1)) if multi_speaker else None)
    encoder = Encoder(config["encoder_layers"],
                      config["char_dim"],
                      config["encoder_dim"],
                      config["kernel_size"],
                      has_bias=multi_speaker,
                      bias_dim=config["speaker_dim"],
                      keep_prob=1.0 - config["dropout"])
    decoder = Decoder(config["n_mels"],
                      config["reduction_factor"],
                      list(config["prenet_sizes"]) + [config["char_dim"]],
                      config["decoder_layers"],
                      config["kernel_size"],
                      config["attention_dim"],
                      position_encoding_weight=config["position_weight"],
                      omega=config["position_rate"],
                      has_bias=multi_speaker,
                      bias_dim=config["speaker_dim"],
                      keep_prob=1.0 - config["dropout"])
    postnet = PostNet(config["postnet_layers"],
                      config["char_dim"],
                      config["postnet_dim"],
                      config["kernel_size"],
                      config["n_mels"],
                      config["reduction_factor"],
                      has_bias=multi_speaker,
                      bias_dim=config["speaker_dim"],
                      keep_prob=1.0 - config["dropout"])
    spectranet = SpectraNet(char_embedding, speaker_embedding, encoder,
                            decoder, postnet)
    return spectranet
def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    # param attrs
    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.Normal(scale=std))
    # NOTE: is_sparse / is_distributed are accepted but not forwarded
    # to dg.Embedding here.
    layer = dg.Embedding(name_scope, (num_embeddings, embed_dim),
                         padding_idx=padding_idx,
                         param_attr=weight_attr,
                         dtype=dtype)
    return layer
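# A hypothetical usage sketch. This factory targets an older Paddle 1.x
# release whose dygraph layers still took a `name_scope` first argument;
# on such a release it could be called roughly like this:
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # 100 entries, 16-dim vectors, weights drawn from N(0, 0.01)
    pos_embed = Embedding("pos_embed", num_embeddings=100, embed_dim=16)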
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=32,
             num_filters=10,
             fc_hid_dim=32,
             num_channels=1,
             win_size_list=None,
             is_sparse=True,
             use_cudnn=True):
    super(TextCNN, self).__init__()
    self.embedding = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype='float32',
        is_sparse=is_sparse)

    logging.info("num_class = {}".format(num_class))
    logging.info("vocab size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("num filters = {}".format(num_filters))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("num channels = {}".format(num_channels))
    logging.info("win size list = {}".format(win_size_list))
    logging.info("is sparse = {}".format(is_sparse))
    logging.info("use cudnn = {}".format(use_cudnn))

    win_size_list = [3] if win_size_list is None else win_size_list

    def gen_conv_pool(win_size):
        """Generate a convolution-pooling layer for the given window size."""
        return ConvPool(
            num_channels,
            num_filters,
            [win_size, emb_dim],
            padding=[1, 0],
            use_cudnn=use_cudnn,
        )

    self.conv_pool_list = D.LayerList(
        [gen_conv_pool(win_size) for win_size in win_size_list])

    # one pooled feature vector of num_filters per window size
    self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list),
                            output_dim=fc_hid_dim,
                            act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim,
                               output_dim=num_class,
                               act=None)
def __init__(self,
             vocab_size,
             emb_dim=32,
             is_sparse=True,
             hidden_dropout_prob=0.1,
             triplet_margin=1.0,
             *args,
             **kwargs):
    super(TextCNNSiameseNet, self).__init__()
    self.triplet_margin = triplet_margin
    logging.info("triplet_margin: {}".format(triplet_margin))
    self.embedding = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype='float32',
        is_sparse=is_sparse,
    )
    self.textcnn = TextCNN(emb_dim, *args, **kwargs)
    logging.info("feature dropout: {}".format(hidden_dropout_prob))
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=hidden_dropout_prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i
def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
    """Encoder layer of TransformerTTS.

    Args:
        embedding_size (int): the size of position embedding.
        num_hidden (int): the size of hidden layer in network.
        num_head (int, optional): the head number of multihead attention. Defaults to 4.
        n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
    """
    super(Encoder, self).__init__()
    self.num_hidden = num_hidden
    self.num_head = num_head
    param = fluid.ParamAttr(
        initializer=fluid.initializer.Constant(value=1.0))
    self.alpha = self.create_parameter(
        shape=(1, ), attr=param, dtype='float32')
    self.pos_inp = get_sinusoid_encoding_table(
        1024, self.num_hidden, padding_idx=0)
    self.pos_emb = dg.Embedding(
        size=[1024, num_hidden],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False))
    self.encoder_prenet = EncoderPrenet(
        embedding_size=embedding_size,
        num_hidden=num_hidden,
        use_cudnn=True)
    self.layers = [
        MultiheadAttention(num_hidden, num_hidden // num_head,
                           num_hidden // num_head) for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.layers):
        self.add_sublayer("self_attn_{}".format(i), layer)
    self.ffns = [
        PositionwiseFeedForward(
            num_hidden, num_hidden * num_head, filter_size=1, use_cudnn=True)
        for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.ffns):
        self.add_sublayer("ffns_{}".format(i), layer)
def __init__(self, n_loop, n_layer, residual_channels, output_dim,
             condition_dim, filter_size, loss_type, log_scale_min):
    """WaveNet that transforms an upsampled mel spectrogram into a waveform.

    Args:
        n_loop (int): n_loop for the internal ResidualNet.
        n_layer (int): n_layer for the internal ResidualNet.
        residual_channels (int): the channel of the input.
        output_dim (int): the channel of the output distribution.
        condition_dim (int): the channel of the condition.
        filter_size (int): the filter size of the internal ResidualNet.
        loss_type (str): loss type of the wavenet. Possible values are
            'softmax' and 'mog'. If `loss_type` is 'softmax', the output is
            the logits of a categorical (multinomial) distribution, and
            `output_dim` is the number of classes of that distribution.
            If `loss_type` is 'mog' (mixture of gaussians), the output is
            the parameters of a mixture of gaussians: the weight (in the
            form of a logit) of each gaussian component together with its
            mean and log standard deviation. So when `loss_type` is 'mog',
            `output_dim` must be divisible by 3.
        log_scale_min (int): the minimum value of the log standard deviation
            of the output gaussian distributions. Note that this value is
            only used for computing loss when `loss_type` is 'mog'; values
            less than `log_scale_min` are clipped when computing the loss.
    """
    super(WaveNet, self).__init__()
    if loss_type not in ["softmax", "mog"]:
        raise ValueError("loss_type {} is not supported".format(loss_type))
    if loss_type == "softmax":
        self.embed = dg.Embedding((output_dim, residual_channels))
    else:
        assert output_dim % 3 == 0, "with MoG output, the output dim must be divisible by 3"
        self.embed = Linear(1, residual_channels)

    self.resnet = ResidualNet(n_loop, n_layer, residual_channels,
                              condition_dim, filter_size)
    self.context_size = self.resnet.context_size
    skip_channels = residual_channels  # assume the same channel
    self.proj1 = Linear(skip_channels, skip_channels)
    self.proj2 = Linear(skip_channels, skip_channels)
    # if loss_type is softmax, output_dim is the n_vocab of waveform magnitudes.
    # if loss_type is mog, output_dim is 3 * n_gaussians (weight, mean and stddev).
    self.proj3 = Linear(skip_channels, output_dim)

    self.loss_type = loss_type
    self.output_dim = output_dim
    self.input_dim = 1
    self.skip_channels = skip_channels
    self.log_scale_min = log_scale_min
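# A short numpy sketch of the MoG layout the docstring above describes: the
# network emits 3 * K values per time step, split into K component weights
# (logits), K means and K log standard deviations. All names here are
# illustrative; none come from the model code.
import numpy as np

K = 10                                   # number of gaussian components
raw = np.random.randn(1, 100, 3 * K)     # [batch, time, output_dim]
logit_w, mean, log_std = np.split(raw, 3, axis=-1)   # each is [1, 100, K]
log_std = np.maximum(log_std, -9.0)      # clip at a log_scale_min of -9.0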
def __init__(self,
             name_scope,
             n_position,
             d_pos_vec,
             position_rate=1.0,
             is_sparse=False,
             is_distributed=False,
             param_attr=None,
             max_norm=None,
             padding_idx=None,
             dtype="float32"):
    super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
    self.embed = dg.Embedding(self.full_name(),
                              size=(n_position, d_pos_vec),
                              is_sparse=is_sparse,
                              is_distributed=is_distributed,
                              padding_idx=None,
                              param_attr=param_attr,
                              dtype=dtype)
    self.set_weight(
        position_encoding_init(n_position,
                               d_pos_vec,
                               position_rate=position_rate,
                               sinusoidal=False).astype(dtype))

    self._is_sparse = is_sparse
    self._is_distributed = is_distributed
    self._remote_prefetch = self._is_sparse and (not self._is_distributed)
    if self._remote_prefetch:
        assert self._is_sparse is True and self._is_distributed is False

    self._padding_idx = (-1 if padding_idx is None else padding_idx
                         if padding_idx >= 0 else (n_position + padding_idx))
    self._position_rate = position_rate
    self._max_norm = max_norm
    self._dtype = dtype
def __init__(self, args):
    super(PretraEmbedding, self).__init__()
    self.args = args
    # the embedding layer
    self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed))
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)
def __init__(self, num_pos_feats=256):
    super().__init__()
    self.row_embed = dg.Embedding(size=(50, num_pos_feats))
    self.col_embed = dg.Embedding(size=(50, num_pos_feats))
    self.reset_parameters()
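# A numpy sketch of how these two 50-row tables are typically combined in
# DETR's learned 2D position encoding. The forward pass is not shown in this
# snippet, so this mirrors the reference design rather than this exact class.
import numpy as np

num_pos_feats = 256
row = np.random.randn(50, num_pos_feats)   # stands in for row_embed's weight
col = np.random.randn(50, num_pos_feats)   # stands in for col_embed's weight

H, W = 32, 48                              # feature map size (H, W <= 50)
pos = np.concatenate(
    [
        np.broadcast_to(col[None, :W], (H, W, num_pos_feats)),  # x / column part
        np.broadcast_to(row[:H, None], (H, W, num_pos_feats)),  # y / row part
    ],
    axis=-1)                               # -> [H, W, 2 * num_pos_feats]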
def __init__(self, num_hidden, config, num_head=4, n_layers=3):
    """Decoder layer of TransformerTTS.

    Args:
        num_hidden (int): the size of hidden layer in network.
        config: the yaml configs used in decoder.
        num_head (int, optional): the head number of multihead attention. Defaults to 4.
        n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
    """
    super(Decoder, self).__init__()
    self.num_hidden = num_hidden
    self.num_head = num_head
    param = fluid.ParamAttr()
    self.alpha = self.create_parameter(
        shape=(1, ),
        attr=param,
        dtype='float32',
        default_initializer=fluid.initializer.ConstantInitializer(value=1.0))
    self.pos_inp = get_sinusoid_encoding_table(
        1024, self.num_hidden, padding_idx=0)
    self.pos_emb = dg.Embedding(
        size=[1024, num_hidden],
        padding_idx=0,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False))
    self.decoder_prenet = PreNet(
        input_size=config['audio']['num_mels'],
        hidden_size=num_hidden * 2,
        output_size=num_hidden,
        dropout_rate=0.2)
    k = math.sqrt(1.0 / num_hidden)
    self.linear = dg.Linear(
        num_hidden,
        num_hidden,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))

    self.selfattn_layers = [
        MultiheadAttention(num_hidden, num_hidden // num_head,
                           num_hidden // num_head) for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.selfattn_layers):
        self.add_sublayer("self_attn_{}".format(i), layer)
    self.attn_layers = [
        MultiheadAttention(num_hidden, num_hidden // num_head,
                           num_hidden // num_head) for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.attn_layers):
        self.add_sublayer("attn_{}".format(i), layer)
    self.ffns = [
        PositionwiseFeedForward(
            num_hidden, num_hidden * num_head, filter_size=1)
        for _ in range(n_layers)
    ]
    for i, layer in enumerate(self.ffns):
        self.add_sublayer("ffns_{}".format(i), layer)
    self.mel_linear = dg.Linear(
        num_hidden,
        config['audio']['num_mels'] * config['audio']['outputs_per_step'],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))
    self.stop_linear = dg.Linear(
        num_hidden,
        1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))
    self.postconvnet = PostConvNet(
        config['audio']['num_mels'],
        config['hidden_size'],
        filter_size=5,
        padding=4,
        num_conv=5,
        outputs_per_step=config['audio']['outputs_per_step'],
        use_cudnn=True)
def make_model(config):
    c = config["model"]

    # speaker embedding
    n_speakers = c["n_speakers"]
    speaker_dim = c["speaker_embed_dim"]
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=c["speaker_embedding_weight_std"]))
    else:
        speaker_embed = None

    # encoder
    h = c["encoder_channels"]
    k = c["kernel_size"]
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    encoder = Encoder(n_vocab=en.n_vocab,
                      embed_dim=c["text_embed_dim"],
                      n_speakers=n_speakers,
                      speaker_dim=speaker_dim,
                      embedding_weight_std=c["embedding_weight_std"],
                      convolutions=encoder_convolutions,
                      dropout=c["dropout"])
    if c["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder
    h = c["decoder_channels"]
    k = c["kernel_size"]
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    window = WindowRange(c["window_backward"], c["window_ahead"])
    decoder = Decoder(n_speakers,
                      speaker_dim,
                      embed_dim=c["text_embed_dim"],
                      mel_dim=config["transform"]["n_mels"],
                      r=c["outputs_per_step"],
                      max_positions=c["max_positions"],
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=c["dropout"],
                      use_memory_mask=c["use_memory_mask"],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=c["query_position_rate"],
                      key_position_rate=c["key_position_rate"],
                      window_range=window,
                      key_projection=c["key_projection"],
                      value_projection=c["value_projection"])
    if not c["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter(postnet)
    linear_dim = 1 + config["transform"]["n_fft"] // 2
    h = c["converter_channels"]
    k = c["kernel_size"]
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    use_decoder_states = c["use_decoder_state_for_postnet_input"]
    converter = Converter(n_speakers,
                          speaker_dim,
                          in_channels=decoder.state_dim if use_decoder_states
                          else config["transform"]["n_mels"],
                          linear_dim=linear_dim,
                          time_upsampling=c["downsample_factor"],
                          convolutions=postnet_convolutions,
                          dropout=c["dropout"])

    model = DeepVoice3(encoder, decoder, converter, speaker_embed,
                       use_decoder_states=use_decoder_states)
    return model
def __init__(self):
    super().__init__()
    self.emb = D.Embedding([30002, 128], padding_idx=0)
    self.cnn = D.Conv2D(128, 128, (1, 3), padding=(0, 1), act='relu')
    self.pool = D.Pool2D((1, 3), pool_padding=(0, 1))
    self.fc = D.Linear(128, 2)
def __init__(self):
    super().__init__()
    self.emb = D.Embedding([len(student_vocab), 128], padding_idx=0)
    self.fc = D.Linear(128, 2)
def __init__(self, args, pretrained_embed=None):
    super(Model, self).__init__()

    self.args = args
    # the embedding layer
    self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed))
    if args.pretrained_embed_shape is not None:
        if pretrained_embed is not None:
            pre_param_attrs = fluid.ParamAttr(
                name="pretrained_emb",
                initializer=initializer.NumpyArrayInitializer(pretrained_embed),
                trainable=True)
            self.pretrained = dygraph.Embedding(
                size=args.pretrained_embed_shape,
                param_attr=pre_param_attrs)
            # zero-initialize the word embedding so that, at the start of
            # training, only the pretrained embedding contributes
            self.word_embed.weight = layers.create_parameter(
                shape=(self.args.n_words, self.args.n_embed),
                dtype='float32',
                default_initializer=initializer.Constant(value=0.0))
        else:
            self.pretrained = dygraph.Embedding(size=args.pretrained_embed_shape)

    # Initialize the feat embedding; feat can be char or pos
    if args.feat == 'char':
        self.feat_embed = CharLSTM(n_chars=args.n_feats,
                                   n_embed=args.n_char_embed,
                                   n_out=args.n_feat_embed,
                                   pad_index=args.feat_pad_index)
    else:
        self.feat_embed = dygraph.Embedding(size=(args.n_feats, args.n_feat_embed))
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    # lstm layer
    self.lstm = BiLSTM(input_size=args.n_embed + args.n_feat_embed,
                       hidden_size=args.n_lstm_hidden,
                       num_layers=args.n_lstm_layers,
                       dropout=args.lstm_dropout)
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

    # mlp layer
    self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2,
                         n_out=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2,
                         n_out=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2,
                         n_out=args.n_mlp_rel,
                         dropout=args.mlp_dropout)
    self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2,
                         n_out=args.n_mlp_rel,
                         dropout=args.mlp_dropout)

    # biaffine layers
    self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False)
    self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                             n_out=args.n_rels,
                             bias_x=True,
                             bias_y=True)
    self.pad_index = args.pad_index
    self.unk_index = args.unk_index
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection, value_projection,
               downsample_factor, linear_dim, use_decoder_states,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding((n_speakers, speaker_dim),
                           param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None

    h = encoder_channels
    k = filter_size
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    enc = Encoder(n_vocab,
                  embed_dim,
                  n_speakers,
                  speaker_dim,
                  padding_idx=None,
                  embedding_weight_std=embedding_std,
                  convolutions=encoder_convolutions,
                  dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)

    h = decoder_channels
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(n_speakers,
                  speaker_dim,
                  embed_dim,
                  mel_dim,
                  r=r,
                  max_positions=max_positions,
                  preattention=prenet_convolutions,
                  convolutions=attentive_convolutions,
                  attention=attention,
                  dropout=dropout,
                  use_memory_mask=use_memory_mask,
                  force_monotonic_attention=force_monotonic_attention,
                  query_position_rate=query_position_rate,
                  key_position_rate=key_position_rate,
                  window_range=WindowRange(window_behind, window_ahead),
                  key_projection=key_projection,
                  value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)

    h = converter_channels
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    cvt = Converter(n_speakers,
                    speaker_dim,
                    dec.state_dim if use_decoder_states else mel_dim,
                    linear_dim,
                    time_upsampling=downsample_factor,
                    convolutions=postnet_convolutions,
                    dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
def init_ernie_model(self, args):
    self.word_embed = dygraph.Embedding(
        size=(args.ernie_vocabs_size, args.lstm_by_wp_embed_size))
def __init__(self,
             n_vocab,
             embed_dim,
             n_speakers,
             speaker_dim,
             padding_idx=None,
             embedding_weight_std=0.1,
             convolutions=(ConvSpec(64, 5, 1), ) * 7,
             dropout=0.):
    """Encoder of Deep Voice 3.

    Args:
        n_vocab (int): vocabulary size of the text embedding.
        embed_dim (int): embedding size of the text embedding.
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        padding_idx (int, optional): padding index of text embedding. Defaults to None.
        embedding_weight_std (float, optional): standard deviation of the
            embedding weights when initialized. Defaults to 0.1.
        convolutions (Iterable[ConvSpec], optional): specifications of the
            convolutional layers. ConvSpec is a namedtuple of output channels,
            filter_size and dilation. Defaults to (ConvSpec(64, 5, 1), ) * 7.
        dropout (float, optional): dropout probability. Defaults to 0.
    """
    super(Encoder, self).__init__()
    self.embedding_weight_std = embedding_weight_std
    self.embed = dg.Embedding(
        (n_vocab, embed_dim),
        padding_idx=padding_idx,
        param_attr=I.Normal(scale=embedding_weight_std))
    self.dropout = dropout
    if n_speakers > 1:
        std = np.sqrt((1 - dropout) / speaker_dim)
        self.sp_proj1 = Linear(speaker_dim,
                               embed_dim,
                               act="softsign",
                               param_attr=I.Normal(scale=std))
        self.sp_proj2 = Linear(speaker_dim,
                               embed_dim,
                               act="softsign",
                               param_attr=I.Normal(scale=std))
    self.n_speakers = n_speakers

    self.convolutions = dg.LayerList()
    in_channels = embed_dim
    std_mul = 1.0
    for (out_channels, filter_size, dilation) in convolutions:
        # 1 * 1 convolution & relu
        if in_channels != out_channels:
            std = np.sqrt(std_mul / in_channels)
            self.convolutions.append(
                Conv1D(in_channels,
                       out_channels,
                       1,
                       act="relu",
                       param_attr=I.Normal(scale=std)))
            in_channels = out_channels
            std_mul = 2.0
        self.convolutions.append(
            Conv1DGLU(n_speakers,
                      speaker_dim,
                      in_channels,
                      out_channels,
                      filter_size,
                      dilation,
                      std_mul,
                      dropout,
                      causal=False,
                      residual=True))
        in_channels = out_channels
        std_mul = 4.0
    std = np.sqrt(std_mul * (1 - dropout) / in_channels)
    self.convolutions.append(
        Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
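# The docstring above says ConvSpec is a namedtuple of (output channels,
# filter_size, dilation). A minimal sketch of such a spec; the field names
# here are illustrative, as the real definition lives elsewhere in the repo.
from collections import namedtuple

ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
# e.g. the default stack of seven identical 64-channel, width-5 convolutions:
default_convolutions = (ConvSpec(64, 5, 1), ) * 7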
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
    """Encoder prenet layer of TransformerTTS.

    Args:
        embedding_size (int): the size of embedding.
        num_hidden (int): the size of hidden layer in network.
        use_cudnn (bool, optional): use cudnn or not. Defaults to True.
    """
    super(EncoderPrenet, self).__init__()
    self.embedding_size = embedding_size
    self.num_hidden = num_hidden
    self.use_cudnn = use_cudnn
    self.embedding = dg.Embedding(
        size=[len(symbols), embedding_size],
        padding_idx=0,
        param_attr=fluid.initializer.Normal(loc=0.0, scale=1.0))
    self.conv_list = []
    k = math.sqrt(1.0 / embedding_size)
    self.conv_list.append(
        Conv1D(num_channels=embedding_size,
               num_filters=num_hidden,
               filter_size=5,
               padding=int(np.floor(5 / 2)),
               param_attr=fluid.ParamAttr(
                   initializer=fluid.initializer.XavierInitializer()),
               bias_attr=fluid.ParamAttr(
                   initializer=fluid.initializer.Uniform(low=-k, high=k)),
               use_cudnn=use_cudnn))
    k = math.sqrt(1.0 / num_hidden)
    for _ in range(2):
        self.conv_list.append(
            Conv1D(num_channels=num_hidden,
                   num_filters=num_hidden,
                   filter_size=5,
                   padding=int(np.floor(5 / 2)),
                   param_attr=fluid.ParamAttr(
                       initializer=fluid.initializer.XavierInitializer()),
                   bias_attr=fluid.ParamAttr(
                       initializer=fluid.initializer.Uniform(low=-k, high=k)),
                   use_cudnn=use_cudnn))
    for i, layer in enumerate(self.conv_list):
        self.add_sublayer("conv_list_{}".format(i), layer)

    self.batch_norm_list = [
        dg.BatchNorm(num_hidden, data_layout='NCHW') for _ in range(3)
    ]
    for i, layer in enumerate(self.batch_norm_list):
        self.add_sublayer("batch_norm_list_{}".format(i), layer)

    k = math.sqrt(1.0 / num_hidden)
    self.projection = dg.Linear(
        num_hidden,
        num_hidden,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))
def _initialize(self):
    """
    Initialize with the necessary elements.
    """
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-1780000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    # note: this is the TTS config file, not a checkpoint
    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = ruamel.yaml.safe_load(f)

    with fluid.dygraph.guard(fluid.CPUPlace()):
        char_embedding = dg.Embedding(
            (en.n_vocab, self.tts_config["char_dim"]))
        multi_speaker = self.tts_config["n_speakers"] > 1
        speaker_embedding = (dg.Embedding(
            (self.tts_config["n_speakers"], self.tts_config["speaker_dim"]))
                             if multi_speaker else None)
        encoder = Encoder(self.tts_config["encoder_layers"],
                          self.tts_config["char_dim"],
                          self.tts_config["encoder_dim"],
                          self.tts_config["kernel_size"],
                          has_bias=multi_speaker,
                          bias_dim=self.tts_config["speaker_dim"],
                          keep_prob=1.0 - self.tts_config["dropout"])
        decoder = Decoder(
            self.tts_config["n_mels"],
            self.tts_config["reduction_factor"],
            list(self.tts_config["prenet_sizes"]) +
            [self.tts_config["char_dim"]],
            self.tts_config["decoder_layers"],
            self.tts_config["kernel_size"],
            self.tts_config["attention_dim"],
            position_encoding_weight=self.tts_config["position_weight"],
            omega=self.tts_config["position_rate"],
            has_bias=multi_speaker,
            bias_dim=self.tts_config["speaker_dim"],
            keep_prob=1.0 - self.tts_config["dropout"])
        postnet = PostNet(self.tts_config["postnet_layers"],
                          self.tts_config["char_dim"],
                          self.tts_config["postnet_dim"],
                          self.tts_config["kernel_size"],
                          self.tts_config["n_mels"],
                          self.tts_config["reduction_factor"],
                          has_bias=multi_speaker,
                          bias_dim=self.tts_config["speaker_dim"],
                          keep_prob=1.0 - self.tts_config["dropout"])
        self.tts_model = SpectraNet(char_embedding, speaker_embedding,
                                    encoder, decoder, postnet)
        io.load_parameters(model=self.tts_model,
                           checkpoint_path=self.tts_checkpoint_path)

        for name, layer in self.tts_model.named_sublayers():
            try:
                remove_weight_norm(layer)
            except ValueError:
                # this layer has no weight norm hook
                pass

        self.waveflow = WaveflowVocoder(
            config_path=self.waveflow_config_path,
            checkpoint_path=self.waveflow_checkpoint_path)
        self.griffin = GriffinLimVocoder(
            sharpening_factor=self.tts_config["sharpening_factor"],
            sample_rate=self.tts_config["sample_rate"],
            n_fft=self.tts_config["n_fft"],
            win_length=self.tts_config["win_length"],
            hop_length=self.tts_config["hop_length"])