def __init__(self, vocab_size: int, channel: int, res_channel: int,
             n_res_blocks: int, n_encoders: int, tau: float,
             pad_idx: Union[None, int], input_noise=0.0, embed_dropout=0.0,
             num_vq_embeds: int = 512, vq_embeds_dim: Union[None, int] = None,
             vq_loss_alpha=0.25, vq_decay=0.99, ignore_quant=False):
    """Build a VQ autoencoder over token sequences.

    A stack of ``n_encoders`` Encoder stages feeds a single EMA-updated
    Quantize codebook; a mirrored stack of Decoder stages reconstructs
    per-token logits.

    Args:
        vocab_size: token vocabulary size.
        channel: conv channel width of encoder/decoder stages.
        res_channel: channel width inside residual blocks.
        n_res_blocks: residual blocks per stage.
        n_encoders: number of encoder (and decoder) stages.
        tau: temperature, stored for later use (presumably by forward).
        pad_idx: padding token id, or None when there is no padding token.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size.
        vq_embeds_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        vq_decay: EMA decay of the codebook.
        ignore_quant: stored flag — presumably bypasses quantization in
            forward; confirm there.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer (ramped elsewhere).
    self.vq_blend = 0.0
    self.blend_steps = 5000
    self.blend_step = 0
    self.vq_embeds_dim = vq_embeds_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, channel, padding_idx=pad_idx,
                              max_norm=1.0)
    self.encoder = nn.Sequential(*[
        Encoder(channel, channel, res_channel, n_res_blocks)
        for _ in range(n_encoders)
    ])
    # All decoder stages are identical, so the original's reversed
    # iteration (range(...)[::-1] with an unused index) was redundant.
    self.decoder = nn.Sequential(*[
        Decoder(channel, channel, res_channel, n_res_blocks)
        for _ in range(n_encoders)
    ])
    self.conv_to_quant = nn.Conv1d(channel, vq_embeds_dim, kernel_size=1)
    self.quant_to_conv = nn.Conv1d(vq_embeds_dim, channel, kernel_size=1)
    self.quantize = Quantize(dim=vq_embeds_dim, n_embed=num_vq_embeds,
                             decay=vq_decay)
    self.conv_to_logits = nn.Conv1d(channel, vocab_size, kernel_size=1)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, channel: int, res_channel: int,
             n_res_block, tau: float, pad_idx: Union[None, int],
             input_embed_dim: int, input_noise=0.0, embed_dropout=0.1,
             num_vq_embeds: int = 512, vq_embeds_dim: Union[None, int] = None,
             vq_loss_alpha=1.0, ignore_quant=False, vq_decay=0.90, **kwargs):
    """Build a single-level VQ autoencoder with conv projections.

    Token embeddings are projected to ``channel`` width, encoded, quantized
    by one EMA codebook, and decoded back to vocabulary logits.

    Args:
        vocab_size: token vocabulary size.
        channel: encoder/decoder channel width.
        res_channel: residual-block channel width.
        n_res_block: residual blocks per Encoder/Decoder.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        input_embed_dim: dimensionality of the token embedding table.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size.
        vq_embeds_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        ignore_quant: stored flag — presumably bypasses quantization.
        vq_decay: codebook EMA decay (was hard-coded to 0.90; default keeps
            the old behavior).
        **kwargs: ignored extra config keys.
    """
    super().__init__()
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 10000
    self.blend_step = 0
    self.vq_embeds_dim = vq_embeds_dim
    self.ignore_quant = ignore_quant
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, input_embed_dim, padding_idx=pad_idx)
    self.quantize = Quantize(vq_embeds_dim, num_vq_embeds, decay=vq_decay)
    self.embeds_to_encode = nn.Conv1d(input_embed_dim, channel,
                                      kernel_size=3, padding=1)
    self.encode_to_quants = nn.Conv1d(channel, vq_embeds_dim,
                                      kernel_size=3, padding=1)
    self.quants_to_decode = nn.Conv1d(vq_embeds_dim, channel,
                                      kernel_size=3, padding=1)
    self.encoder = Encoder(channel, res_channel, n_res_block)
    self.decoder = Decoder(channel, vocab_size, res_channel, n_res_block)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, channel: int, n_fold: int, tau: float,
             pad_idx: Union[None, int], input_noise=0.0, embed_dropout=0.1,
             num_vq_embeds: int = 512, vq_embeds_dim: Union[None, int] = None,
             vq_loss_alpha=1.0, d_slice=1, ignore_quant=False):
    """Build a VQ autoencoder with a sliced quantizer bottleneck.

    ``n_fold`` EncoderBlock stages compress the sequence; a SlicedQuantize
    codebook (``d_slice`` slices) quantizes the bottleneck; a fixed-depth
    Decoder reconstructs vocabulary logits.

    Args:
        vocab_size: token vocabulary size.
        channel: encoder channel width (also the embedding dim).
        n_fold: number of EncoderBlock stages.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size.
        vq_embeds_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        d_slice: number of quantizer slices.
        ignore_quant: stored flag — presumably bypasses quantization.
    """
    super().__init__()
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.d_slice = d_slice
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 10000
    self.blend_step = 0
    self.vq_embeds_dim = vq_embeds_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, channel, padding_idx=pad_idx)
    self.quantize = SlicedQuantize(d_slice, dim=vq_embeds_dim,
                                   n_embed=num_vq_embeds, decay=0.99)
    self.encode_to_quants = nn.Conv1d(channel, vq_embeds_dim, kernel_size=1)
    self.encoder = nn.Sequential(
        *[EncoderBlock(channel) for _ in range(n_fold)])
    # Depth spans sequence lengths 256 down to 16:
    # log2(256) - log2(16) + 1 = 5 blocks. NOTE(review): the 256/16 bounds
    # are hard-coded — presumably tied to the expected input length; confirm.
    self.decoder = Decoder(channel, vocab_size, vq_embeds_dim,
                           n_blocks=int(np.log2(256) - np.log2(16) + 1))
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, channel: int, n_fold: int, tau: float,
             pad_idx: Union[None, int], input_noise=0.0, embed_dropout=0.1,
             num_vq_embeds: int = 512, vq_embeds_dim: Union[None, int] = None,
             vq_loss_alpha=1.0, d_slice: int = 1, **kwargs):
    """Build a VQ autoencoder whose codebook quantizes ``d_slice`` slices.

    Args:
        vocab_size: token vocabulary size.
        channel: encoder/decoder channel width (also the embedding dim).
        n_fold: fold count passed to Encoder and Decoder.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size.
        vq_embeds_dim: total quantized dimensionality; must be a multiple of
            ``d_slice``. NOTE(review): the default None would crash the
            ``vq_embeds_dim // d_slice`` division, so callers must supply it.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        d_slice: number of slices the code vector is split into.
        **kwargs: ignored extra config keys.
    """
    super().__init__()
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.d_slice = d_slice
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    self.vq_embeds_dim = vq_embeds_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, channel, padding_idx=pad_idx)
    # Each of the d_slice slices is quantized in a codebook of reduced dim.
    self.quantize = Quantize(vq_embeds_dim // self.d_slice, num_vq_embeds,
                             decay=0.99)
    self.encode_to_quants = nn.Conv1d(channel, vq_embeds_dim,
                                      kernel_size=3, padding=1)
    self.quants_to_decode = nn.Conv1d(vq_embeds_dim, channel,
                                      kernel_size=3, padding=1)
    self.encoder = Encoder(channel, n_fold)
    self.decoder = Decoder(channel, n_fold)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, dim: int, dim_feedforward: int,
             n_fold: int, tau: float, pad_idx: Union[None, int],
             input_noise=0.0, embed_dropout=0.1, num_vq_embeds: int = 512,
             vq_embeds_dim: Union[None, int] = None, vq_loss_alpha=1.0,
             ignore_quant=False):
    """Build a VQ autoencoder from BaseBlock shrink/expand stacks.

    ``n_fold`` shrinking BaseBlocks encode, a Linear projects into the
    quantizer space, and ``n_fold`` expanding BaseBlocks decode.

    Args:
        vocab_size: token vocabulary size.
        dim: model width (also the embedding dim).
        dim_feedforward: feed-forward width inside BaseBlock.
        n_fold: number of shrink (and expand) stages.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size.
        vq_embeds_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        ignore_quant: stored flag — presumably bypasses quantization.
    """
    super().__init__()
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 10000
    self.blend_step = 0
    self.vq_embeds_dim = vq_embeds_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, dim, padding_idx=pad_idx)
    self.quantize = Quantize(vq_embeds_dim, num_vq_embeds, decay=0.95)
    self.encode_to_quants = nn.Linear(dim, vq_embeds_dim)
    self.quants_to_decode = nn.Linear(vq_embeds_dim, dim)
    self.encoder = nn.Sequential(
        *[BaseBlock(dim, dim_feedforward, mode='shrink')
          for _ in range(n_fold)])
    self.decoder = nn.Sequential(
        *[BaseBlock(dim, dim_feedforward, mode='expand')
          for _ in range(n_fold)])
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, embed_dim: int, channel: int,
             res_channel: int, n_res_block: int, tau: float,
             pad_idx: Union[None, int], input_noise=0.0, embed_dropout=0.0,
             num_vq_embeds: int = 512, vq_embed_dim: int = 64,
             vq_loss_alpha=1.0, d_slice=1, ignore_quant=False):
    """Build a two-level VQ autoencoder (coarse ``b`` and fine ``a`` codes).

    enc_a -> enc_b produce two feature levels; each level has its own
    Quantize codebook. The coarse codes are decoded/upsampled and combined
    with the fine codes before the final Decoder emits vocabulary logits.

    Args:
        vocab_size: token vocabulary size.
        embed_dim: token embedding dimensionality.
        channel: encoder/decoder channel width.
        res_channel: residual-block channel width.
        n_res_block: residual blocks per Encoder/Decoder.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size (shared by both levels).
        vq_embed_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        d_slice: stored only (the sliced quantizer variant is not active).
        ignore_quant: stored flag — presumably bypasses quantization.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.d_slice = d_slice
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 5000
    self.blend_step = 0
    self.vq_embed_dim = vq_embed_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx,
                              max_norm=1.0)
    self.enc_a = Encoder(embed_dim, channel, res_channel, n_res_block)
    self.enc_b = Encoder(channel, channel, res_channel, n_res_block)
    self.quantize_conv_b = nn.Conv1d(channel, vq_embed_dim, 1)
    self.quantize_b = Quantize(vq_embed_dim, num_vq_embeds)
    self.dec_b = Decoder(vq_embed_dim, vq_embed_dim, channel, res_channel,
                         n_res_block)
    # Level-a quantizer input is decoded-b codes concatenated with enc-a
    # features, hence vq_embed_dim + channel input channels.
    self.quantize_conv_a = nn.Conv1d(vq_embed_dim + channel, vq_embed_dim, 1)
    self.quantize_a = Quantize(vq_embed_dim, num_vq_embeds)
    # Two stride-2 transposed convs: 4x temporal upsampling of b-codes.
    self.upsample_b = nn.Sequential(
        nn.ConvTranspose1d(vq_embed_dim, channel, 4, stride=2, padding=1),
        nn.ELU(),
        nn.ConvTranspose1d(channel, vq_embed_dim, 4, stride=2, padding=1),
    )
    # Final decoder consumes both code streams (a + upsampled b).
    self.dec = Decoder(vq_embed_dim + vq_embed_dim, vocab_size, channel,
                       res_channel, n_res_block)
    self.noise = Noise(alpha=0.05)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, embed_dim: int, channel: int,
             res_channel: int, n_res_block: int, tau: float,
             pad_idx: Union[None, int], eos_idx: int, input_noise=0.0,
             embed_dropout=0.0, num_vq_embeds: int = 512,
             vq_embed_dim: int = 64, vq_loss_alpha=1.0, d_slice=1,
             ignore_quant=False):
    """Build a three-level VQ encoder with a Transformer token decoder.

    Three Encoder stages (down=2, down=2, down=1) each get their own
    Quantize codebook; the quantized streams are projected to 256 channels
    and consumed by a TransformerDecoder over ``vocab_size + 1`` tokens.

    Args:
        vocab_size: token vocabulary size.
        embed_dim: token embedding dimensionality.
        channel: encoder channel width.
        res_channel: residual-block channel width.
        n_res_block: residual blocks per Encoder.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        eos_idx: end-of-sequence token id.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        num_vq_embeds: codebook size (shared by all three levels).
        vq_embed_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        d_slice: stored only (no sliced quantizer is active here).
        ignore_quant: stored flag — presumably bypasses quantization.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    self.pad_idx = pad_idx
    # Fixes the original `self.eox_idx` typo; the old attribute name is
    # kept as an alias so any existing reader still works.
    self.eos_idx = eos_idx
    self.eox_idx = eos_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.d_slice = d_slice
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 5000
    self.blend_step = 0
    self.vq_embed_dim = vq_embed_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx,
                              max_norm=1.0)
    self.pos_encoder = PositionalEncoding(embed_dim)
    self.enc1 = Encoder(embed_dim, channel, res_channel, n_res_block, down=2)
    self.enc2 = Encoder(channel, channel, res_channel, n_res_block, down=2)
    self.enc3 = Encoder(channel, channel, res_channel, n_res_block, down=1)
    self.quant_conv_1 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)
    self.quant_conv_2 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)
    self.quant_conv_3 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)
    self.quant_1 = Quantize(vq_embed_dim, num_vq_embeds)
    self.quant_2 = Quantize(vq_embed_dim, num_vq_embeds)
    self.quant_3 = Quantize(vq_embed_dim, num_vq_embeds)
    self.conv_quant_1 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)
    self.conv_quant_2 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)
    self.conv_quant_3 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)
    # vocab_size + 1: presumably reserves one extra token id for the
    # decoder (e.g. a start token) — confirm against TransformerDecoder.
    self.decoder = TransformerDecoder(vocab_size + 1, 256, 4, 1024, 6)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)
def __init__(self, vocab_size: int, embed_dim: int, channel: int,
             res_channel: int, n_res_block: int, tau: float,
             pad_idx: Union[None, int], config: List[Tuple[int, int]],
             input_noise=0.0, embed_dropout=0.0, vq_embed_dim: int = 8,
             vq_loss_alpha=1.0, attn=False, d_slice=1, ignore_quant=False):
    """Build a hierarchical VQ autoencoder configured level-by-level.

    Each ``(factor, n_embeds)`` entry in ``config`` adds one level: an
    Encoder downsampling by ``factor``, a Quantize codebook of ``n_embeds``
    entries, and a mirrored Decoder upsampling by ``factor``. Every level
    except the top also receives the (cond-decoded) codes of the level
    above it.

    Args:
        vocab_size: token vocabulary size.
        embed_dim: token embedding dimensionality.
        channel: encoder/decoder channel width.
        res_channel: residual-block channel width.
        n_res_block: NOTE(review): accepted but never used here — confirm
            whether Encoder/Decoder should receive it.
        tau: temperature, stored for later use.
        pad_idx: padding token id, or None when there is no padding token.
        config: per-level (downsample factor, codebook size) pairs, ordered
            bottom (token level) to top.
        input_noise: strength passed to CategoricalNoise input corruption.
        embed_dropout: dropout rate on token embeddings.
        vq_embed_dim: dimensionality of quantized code vectors.
        vq_loss_alpha: weight of the VQ loss term (stored only).
        attn: NOTE(review): accepted but never used here.
        d_slice: stored only (no sliced quantizer is active here).
        ignore_quant: stored flag — presumably bypasses quantization.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.pad_idx = pad_idx
    self.tau = tau
    self.vq_loss_alpha = vq_loss_alpha
    self.d_slice = d_slice
    self.ignore_quant = ignore_quant
    # Running training statistics.
    self.vq_loss = 0
    self.nll_loss = 0
    self.acc = 0
    # Blend-in schedule state for the quantizer.
    self.vq_blend = 0.0
    self.blend_steps = 5000
    self.blend_step = 0
    self.vq_embed_dim = vq_embed_dim
    self.input_noise = CategoricalNoise(vocab_size, input_noise)
    self.embed_dropout = nn.Dropout(embed_dropout)
    self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx,
                              max_norm=1.0)
    self.encoders = nn.ModuleList()
    self.decoders = nn.ModuleList()
    self.cond_decoders = nn.ModuleList()
    self.quant_conv = nn.ModuleList()
    self.quantize = nn.ModuleList()
    for i, (factor, n_embeds) in enumerate(config):
        # Level 0 encodes raw embeddings; higher levels encode features.
        self.encoders.append(
            Encoder(embed_dim if i == 0 else channel, channel, res_channel,
                    down=factor))
        # Non-top levels also take the upper level's codes, doubling the
        # decoder's input channels; level 0 decodes back to the vocabulary.
        dec_in_channels = vq_embed_dim * (1 if i + 1 == len(config) else 2)
        dec_out_channels = vocab_size if i == 0 else vq_embed_dim
        self.decoders.append(
            Decoder(dec_in_channels, dec_out_channels, channel, res_channel,
                    up=factor))
        if i + 1 != len(config):
            # Conditioning path: upsample the upper level's codes by that
            # level's factor before feeding them to this level.
            self.cond_decoders.append(
                Decoder(vq_embed_dim, vq_embed_dim, channel, res_channel,
                        up=config[i + 1][0], n_res_blocks=1))
        else:
            # ModuleList.append accepts None as a placeholder for the top
            # level, which has no level above it.
            self.cond_decoders.append(None)
        self.quant_conv.append(
            nn.Conv1d(channel + (0 if i + 1 == len(config) else vq_embed_dim),
                      vq_embed_dim, kernel_size=1))
        self.quantize.append(Quantize(vq_embed_dim, n_embeds))
    # Two stride-2 transposed convs: 4x temporal upsampling of top codes.
    self.upsample_t = nn.Sequential(
        nn.ConvTranspose1d(vq_embed_dim, channel, 4, stride=2, padding=1),
        nn.ELU(),
        nn.ConvTranspose1d(channel, vq_embed_dim, 4, stride=2, padding=1),
    )
    self.noise = Noise(alpha=0.05)
    # NLLLoss requires an int ignore_index; fall back to PyTorch's default
    # (-100) when no padding token exists instead of crashing on None.
    self.nll = nn.NLLLoss(
        reduction='none',
        ignore_index=self.pad_idx if self.pad_idx is not None else -100)