def build_model(self, wavenet_mel):
    hps = self.hps
    ns = self.hps.ns
    emb_size = self.hps.emb_size
    c = 80 if wavenet_mel else 513
    patch_classify_kernel = (3, 4) if wavenet_mel else (17, 4)
    self.Encoder = cc(Encoder(c_in=c, ns=ns, dp=hps.enc_dp))
    self.Decoder = cc(
        Decoder(c_out=c, ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
    self.Generator = cc(
        Decoder(c_out=c, ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
    self.SpeakerClassifier = cc(
        SpeakerClassifier(ns=ns, n_class=hps.n_speakers, dp=hps.dis_dp))
    self.PatchDiscriminator = cc(
        nn.DataParallel(
            PatchDiscriminator(
                ns=ns,
                n_class=hps.n_speakers,
                classify_kernel_size=patch_classify_kernel)))
    betas = (0.5, 0.9)
    params = list(self.Encoder.parameters()) + list(
        self.Decoder.parameters())
    self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
    self.clf_opt = optim.Adam(self.SpeakerClassifier.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.gen_opt = optim.Adam(self.Generator.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                lr=self.hps.lr,
                                betas=betas)
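# build_model above wires four Adam optimizers with GAN-style betas=(0.5, 0.9)
# around an encoder/decoder pair plus adversarial heads. A minimal,
# self-contained sketch of that optimizer pattern (toy linear layers stand in
# for the real Encoder/Decoder/SpeakerClassifier; all names and sizes here are
# illustrative, not from the original code):
import torch
import torch.nn as nn
import torch.optim as optim

enc, dec = nn.Linear(8, 4), nn.Linear(4, 8)   # stand-ins for Encoder/Decoder
clf = nn.Linear(4, 2)                         # stand-in for SpeakerClassifier
betas = (0.5, 0.9)
# one optimizer spans both autoencoder halves, a separate one drives the head
ae_opt = optim.Adam(list(enc.parameters()) + list(dec.parameters()),
                    lr=1e-4, betas=betas)
clf_opt = optim.Adam(clf.parameters(), lr=1e-4, betas=betas)

x = torch.randn(2, 8)
recon_loss = ((dec(enc(x)) - x) ** 2).mean()  # autoencoder update step
ae_opt.zero_grad()
recon_loss.backward()
ae_opt.step()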
def sequential_generation(self,
                          seed_text,
                          batch_size=10,
                          max_len=15,
                          leed_out_len=15,
                          top_k=0,
                          temperature=None,
                          sample=True):
    """ Generate one word at a time, in L->R order """
    seed_len = len(seed_text) + 1  # +1 to account for CLS
    batch = self.get_init_text(seed_text, max_len, batch_size)

    for ii in range(max_len - seed_len):
        # inp = [sent[:seed_len + ii + leed_out_len] + [self.sep_id] for sent in batch]
        inp = cc(batch, self.no_cuda)
        inp_mask = [
            np.expand_dims(i != self.sep_id, -2).astype(np.int32)
            for i in inp
        ]
        out, break_probs = self.model(inp.long(),
                                      cc(inp_mask, self.no_cuda)[0])
        idxs = self.generate_step(out,
                                  gen_idx=seed_len + ii,
                                  top_k=top_k,
                                  temperature=temperature,
                                  sample=sample)
        for jj in range(batch_size):
            batch[jj][seed_len + ii] = idxs[jj]

    return self.untokenize_batch(batch)
def parallel_generation(self,
                        seed_text,
                        batch_size=10,
                        max_len=15,
                        top_k=0,
                        temperature=None,
                        max_iter=300,
                        sample=True,
                        print_every=10,
                        verbose=True):
    """ Generate for all positions at each time step """
    seed_len = len(seed_text) + 1  # +1 to account for CLS
    batch = self.get_init_text(seed_text, max_len, batch_size)
    inp_mask = []

    for ii in range(max_iter):
        inp = cc(batch, self.no_cuda)
        inp_mask.append(
            np.expand_dims(inp != self.sep_id, -2).astype(np.int32))
        out, break_probs = self.model(inp.long(),
                                      cc(inp_mask, self.no_cuda)[0])
        for kk in range(max_len - seed_len):
            idxs = self.generate_step(out,
                                      gen_idx=seed_len + kk,
                                      top_k=top_k,
                                      temperature=temperature,
                                      sample=sample)
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = idxs[jj]
        if verbose and np.mod(ii, print_every) == 0:
            print("iter", ii + 1, self.data_utils.id2sent(batch[0]))

    return self.untokenize_batch(batch)
def build_model(self):
    hps = self.hps
    ns = self.hps.ns
    emb_size = self.hps.emb_size
    self.Encoder = cc(Encoder(ns=ns, dp=hps.enc_dp))
    self.Decoder = cc(Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
    self.Generator = cc(
        Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
    self.SpeakerClassifier = cc(
        SpeakerClassifier(ns=ns, n_class=hps.n_speakers, dp=hps.dis_dp))
    self.PatchDiscriminator = cc(
        nn.DataParallel(PatchDiscriminator(ns=ns, n_class=hps.n_speakers)))
    betas = (0.5, 0.9)
    params = list(self.Encoder.parameters()) + list(
        self.Decoder.parameters())
    self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
    self.clf_opt = optim.Adam(self.SpeakerClassifier.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.gen_opt = optim.Adam(self.Generator.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                lr=self.hps.lr,
                                betas=betas)
def graphEval(X, truth_spec, true_sp, true_cc, true_dd, true_bc):
    sp_emd_cur = []
    cc_emd_cur = []
    dd_emd_cur = []
    assorts_cur = []
    spec_l2_cur = []
    spec_l2_lin_cur = []
    bc_emd_cur = []
    for j in range(20):
        A = gnp(X)
        G = nx.from_numpy_matrix(A)
        print(nx.is_connected(G))
        if not nx.is_connected(G):
            Gc = max(nx.connected_component_subgraphs(G), key=len)
            print(len(Gc.nodes()))
        sp = utils.sp(A)
        cc = utils.cc(A)
        dd = utils.degree_sequence(A)
        spec_weight_l2 = l2_exp_weight(truth_spec, utils.spectrum(A))
        spec_weight_l2_lin = l2_lin_weight(truth_spec, utils.spectrum(A))
        bc = sorted(nx.betweenness_centrality(G).values())
        sp_emd_cur.append(utils.emd(sp, true_sp))
        cc_emd_cur.append(utils.emd(cc, true_cc))
        dd_emd_cur.append(utils.emd(dd, true_dd))
        assorts_cur.append(nx.degree_assortativity_coefficient(G))
        spec_l2_cur.append(spec_weight_l2)
        spec_l2_lin_cur.append(spec_weight_l2_lin)
        bc_emd_cur.append(utils.emd(bc, true_bc))
    return (np.mean(sp_emd_cur), np.mean(cc_emd_cur), np.mean(dd_emd_cur),
            np.mean(assorts_cur), np.mean(spec_l2_cur), np.mean(bc_emd_cur),
            np.mean(spec_l2_lin_cur))
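# graphEval compares sampled graphs to ground truth with earth mover's
# distance over statistic distributions. A minimal sketch of that comparison,
# assuming utils.emd behaves like the 1-D Wasserstein distance (scipy's
# implementation stands in here; the G(n, p) graphs are toy data):
import networkx as nx
from scipy.stats import wasserstein_distance

G_true = nx.gnp_random_graph(50, 0.2, seed=0)
G_sample = nx.gnp_random_graph(50, 0.25, seed=1)
dd_true = sorted(d for _, d in G_true.degree())
dd_sample = sorted(d for _, d in G_sample.degree())
print(wasserstein_distance(dd_true, dd_sample))  # EMD between degree sequences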
def forward(self, enc_output, enc_len):
    enc_len = enc_len.cpu().numpy().tolist()
    for i, (layer, project_layer) in enumerate(
            zip(self.layers, self.project_layers)):
        total_length = enc_output.size(1)
        xs_pack = pack_padded_sequence(enc_output, enc_len, batch_first=True)
        layer.flatten_parameters()
        xs, (_, _) = layer(xs_pack)
        ys_pad, enc_len = pad_packed_sequence(xs,
                                              batch_first=True,
                                              total_length=total_length)
        enc_len = enc_len.numpy()
        downsub = self.downsample[i]
        if downsub > 1:
            ys_pad = ys_pad.contiguous().view(ys_pad.size(0),
                                              ys_pad.size(1) * 2,
                                              ys_pad.size(2) // 2)
            enc_len = [(length * 2) for length in enc_len]
        ys_pad = F.dropout(ys_pad, 0.1, training=self.training)
        projected = project_layer(ys_pad)
        enc_output = self.activation(projected)
    output_lens = cc(torch.from_numpy(np.array(enc_len, dtype=np.int64)))
    return enc_output, output_lens
def __init__(self, output_dim, embedding_dim, hidden_dim, dropout_rate,
             n_layers, bos, eos, pad, ls_weight, labeldist):
    super(LM, self).__init__()
    self.bos, self.eos, self.pad = bos, eos, pad
    self.embedding = torch.nn.Embedding(output_dim,
                                        embedding_dim,
                                        padding_idx=pad)
    self.LSTM = torch.nn.LSTM(embedding_dim,
                              hidden_dim,
                              num_layers=n_layers,
                              batch_first=True,
                              dropout=dropout_rate if n_layers > 1 else 0)
    # re-init
    weight_init(self.LSTM)
    self.output_layer = torch.nn.Linear(hidden_dim, output_dim)
    self.dropout_layer = torch.nn.Dropout(p=dropout_rate)
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.dropout_rate = dropout_rate
    self.n_layers = n_layers
    # label smoothing hyperparameters
    self.ls_weight = ls_weight
    self.labeldist = labeldist
    if labeldist is not None:
        self.vlabeldist = cc(
            torch.from_numpy(np.array(labeldist, dtype=np.float32)))
def __init__(self, output_dim, embedding_dim, hidden_dim, attention,
             att_odim, dropout_rate, bos, eos, pad, ls_weight=0,
             labeldist=None):
    super(Decoder, self).__init__()
    self.bos, self.eos, self.pad = bos, eos, pad
    self.embedding = torch.nn.Embedding(output_dim,
                                        embedding_dim,
                                        padding_idx=pad)
    self.LSTMCell = torch.nn.LSTMCell(embedding_dim + att_odim, hidden_dim)
    self.output_layer = torch.nn.Linear(hidden_dim, output_dim)
    self.attention = attention
    self.hidden_dim = hidden_dim
    self.att_odim = att_odim
    self.dropout_rate = dropout_rate
    # label smoothing hyperparameters
    self.ls_weight = ls_weight
    self.labeldist = labeldist
    if labeldist is not None:
        self.vlabeldist = cc(
            torch.from_numpy(np.array(labeldist, dtype=np.float32)))
def parallel_sequential_generation(self,
                                   seed_text,
                                   batch_size=10,
                                   max_len=15,
                                   top_k=0,
                                   temperature=None,
                                   max_iter=300,
                                   burnin=200,
                                   print_every=10,
                                   verbose=True):
    """ Generate for one random position at a timestep

    args:
        - burnin: during burn-in period, sample from full distribution;
          afterwards take argmax
    """
    seed_len = len(seed_text) + 1  # +1 to account for CLS
    batch = self.get_init_text(seed_text, max_len, batch_size)
    inp_mask = []

    for ii in range(max_iter):
        kk = np.random.randint(0, max_len - seed_len)
        for jj in range(batch_size):
            batch[jj][seed_len + kk] = self.mask_id
        inp = cc(batch, self.no_cuda)
        inp_mask.append(
            np.expand_dims(inp != self.sep_id, -2).astype(np.int32))
        out, break_probs = self.model(inp.long(),
                                      cc(inp_mask, self.no_cuda)[0])
        topk = top_k if (ii >= burnin) else 0
        idxs = self.generate_step(out,
                                  gen_idx=seed_len + kk,
                                  top_k=topk,
                                  temperature=temperature,
                                  sample=(ii < burnin))
        for jj in range(batch_size):
            batch[jj][seed_len + kk] = idxs[jj]

        if verbose and np.mod(ii + 1, print_every) == 0:
            for_print = self.data_utils.id2sent(batch[0]).split()
            for_print = (for_print[:seed_len + kk + 1] + ['(*)'] +
                         for_print[seed_len + kk + 1:])
            print("iter", ii + 1, " ".join(for_print))

    return self.untokenize_batch(batch)
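# The burn-in schedule above samples from the full softmax early and switches
# to greedy argmax after `burnin` iterations. A self-contained sketch of just
# that switch (toy logits; all values are illustrative):
import torch
from torch.distributions import Categorical

logits = torch.randn(10)                           # toy distribution over 10 tokens
burnin, max_iter = 200, 300
for ii in range(max_iter):
    if ii < burnin:
        idx = Categorical(logits=logits).sample()  # explore: sample
    else:
        idx = torch.argmax(logits)                 # exploit: argmax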
def mask_and_cal_sum(self, log_probs, ys, mask=None):
    if mask is None:
        seq_len = [y.size(0) + 1 + 4 for y in ys]
        mask = cc(_seq_mask(seq_len=seq_len, max_len=log_probs.size(1)))
    else:
        seq_len = [y.size(0) for y in ys]
    # divide by total length
    loss = torch.sum(log_probs * mask) / sum(seq_len)
    return loss
def mask_and_cal_loss(self, log_probs, ys, mask=None):
    # mask is batch x max_len
    # add 1 to EOS
    if mask is None:
        seq_len = [y.size(0) + 1 for y in ys]
        mask = cc(_seq_mask(seq_len=seq_len, max_len=log_probs.size(1)))
    else:
        seq_len = [y.size(0) for y in ys]
    # divide by total length
    loss = -torch.sum(log_probs * mask) / sum(seq_len)
    return loss
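# Both helpers above average token log-probabilities over the real (unpadded)
# length of each sequence. A minimal sketch of that masked mean, with a
# hand-rolled length mask standing in for _seq_mask (all toy values):
import torch

log_probs = torch.randn(2, 5)          # batch x max_len token log-probs
seq_len = [3, 5]                       # true lengths, incl. EOS
mask = torch.zeros(2, 5)
for b, l in enumerate(seq_len):
    mask[b, :l] = 1.0                  # 1 on real tokens, 0 on padding
loss = -torch.sum(log_probs * mask) / sum(seq_len)
print(loss)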
def loss(self, x, training=True):
    with tf.name_scope('loss'):
        z = self._encode(x, training=training)
        x_h = self._decode(z, training=training)
        loss = dict()
        loss['pmse'] = p_mse(x, x_h)
        loss['corr'] = cc(x, x_h)
        # loss['diff'] = l1(x, x_h)
        tf.summary.scalar('pmse', loss['pmse'])
        tf.summary.scalar('corr', loss['corr'])
        # tf.summary.scalar('diff', loss['diff'])
        return loss
def forward(self, xpad, ilens):
    first_out = None
    first_lens = None
    for i, (layer, project_layer) in enumerate(
            zip(self.layers, self.project_layers)):
        total_length = xpad.size(1)
        xs_pack = pack_padded_sequence(xpad, ilens, batch_first=True)
        layer.flatten_parameters()
        xs, (_, _) = layer(xs_pack)
        ys_pad, ilens = pad_packed_sequence(xs,
                                            batch_first=True,
                                            total_length=total_length)
        ys_pad = F.dropout(ys_pad, self.dropout_rate, training=self.training)
        ilens = ilens.numpy()
        sub = self.subsample[i]
        if sub > 1:
            # pad one frame if it's not able to divide into 2 equal lengths
            if ys_pad.size(1) % 2 == 1:
                ys_pad = F.pad(ys_pad.transpose(1, 2), (0, 1),
                               mode='replicate').transpose(1, 2)
            # concat two frames
            ys_pad = ys_pad.contiguous().view(ys_pad.size(0),
                                              ys_pad.size(1) // 2,
                                              ys_pad.size(2) * 2)
            ilens = [(length + 1) // sub for length in ilens]
        projected = project_layer(ys_pad)
        xpad = torch.tanh(projected)
        xpad = F.dropout(xpad, self.dropout_rate, training=self.training)
        if i == 0:
            first_out = xpad
            first_lens = cc(
                torch.from_numpy(np.array(ilens, dtype=np.int64)))
    ilens = cc(torch.from_numpy(np.array(ilens, dtype=np.int64)))
    return xpad, ilens, first_out, first_lens
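# The subsampling step halves the time axis by concatenating adjacent frame
# pairs along the feature axis. A self-contained sketch of that pad-then-fold
# reshape on a toy tensor:
import torch
import torch.nn.functional as F

ys = torch.randn(2, 7, 4)                  # batch x time x feat, odd time
if ys.size(1) % 2 == 1:                    # replicate-pad one frame
    ys = F.pad(ys.transpose(1, 2), (0, 1), mode='replicate').transpose(1, 2)
ys = ys.contiguous().view(ys.size(0), ys.size(1) // 2, ys.size(2) * 2)
print(ys.shape)                            # torch.Size([2, 4, 8])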
def forward(self, enc_pad, enc_len, dec_h, att_prev, scaling=2.0):
    '''
    enc_pad: (batch, enc_length, enc_dim)
    enc_len: (batch) of int
    dec_h: (batch, 1, dec_dim)
    att_prev: (batch, enc_length)
    '''
    batch_size = enc_pad.size(0)
    enc_h = self.mlp_enc(enc_pad)  # batch_size x enc_length x att_dim

    if dec_h is None:
        dec_h = enc_pad.new_zeros(batch_size, self.decoder_dim)
    else:
        dec_h = dec_h.view(batch_size, self.decoder_dim)

    # initialize attention weights to uniform
    if att_prev is None:
        att_prev = pad_list(
            [enc_pad.new(l).fill_(1.0 / l) for l in enc_len], 0)

    att_conv = self.loc_conv(
        att_prev.view(batch_size, 1, 1, enc_pad.size(1)))
    # att_conv: batch_size x channel x 1 x frame -> batch_size x frame x channel
    att_conv = att_conv.squeeze(2).transpose(1, 2)
    # att_conv: batch_size x frame x channel -> batch_size x frame x att_dim
    att_conv = self.mlp_att(att_conv)

    dec_h_tiled = self.mlp_dec(dec_h).view(batch_size, 1, self.att_dim)
    att_state = torch.tanh(enc_h + dec_h_tiled + att_conv)
    e = self.gvec(att_state).squeeze(2)

    if enc_len is not None:
        mask = []
        for b in range(batch_size):
            mask.append([0] * enc_len[b] +
                        [1] * (enc_pad.size(1) - enc_len[b]))
        mask = cc(torch.ByteTensor(mask))
        e = e.masked_fill_(mask, -1e15)

    attn = F.softmax(scaling * e, dim=1)
    w_expanded = attn.unsqueeze(1)  # w_expanded: batch_size x 1 x frame
    # batch x 1 x frame * batch x enc_length x enc_dim => batch x 1 x enc_dim
    c = torch.bmm(w_expanded, enc_pad).squeeze(1)
    c = self.mlp_o(c)  # batch x enc_dim
    return c, attn
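# The masking step above forces attention weights on padded frames to ~0 by
# filling their scores with a large negative value before the softmax. A
# self-contained sketch of that masked softmax (toy scores and lengths):
import torch
import torch.nn.functional as F

e = torch.randn(2, 6)                    # batch x enc_length scores
enc_len = [4, 6]                         # true frame counts per utterance
mask = torch.tensor([[0] * l + [1] * (6 - l) for l in enc_len],
                    dtype=torch.bool)
e = e.masked_fill(mask, -1e15)           # kill padded positions
attn = F.softmax(2.0 * e, dim=1)         # scaling=2.0 as in the module
print(attn.sum(dim=1))                   # rows still sum to 1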
def get_data(root_dir='/storage/feature/LibriSpeech/npy_files/train-clean-100/7402/90848',
             text_index_path='/storage/feature/LibriSpeech/text_bpe/train-clean-100/7402/7402-90848.label.txt'):
    prefix = '7402-90848'
    datas = []
    for i in range(8):
        seg_id = str(i).zfill(4)
        filename = f'{prefix}-{seg_id}.npy'
        path = os.path.join(root_dir, filename)
        data = torch.from_numpy(np.load(path)).type(torch.FloatTensor)
        datas.append(data)
    datas.sort(key=lambda x: x.size(0), reverse=True)
    ilens = np.array([data.size(0) for data in datas], dtype=np.int64)
    datas = pad_sequence(datas, batch_first=True, padding_value=0)
    ys = []
    with open(text_index_path, 'r') as f:
        for line in f:
            utt_id, indexes = line.strip().split(',', maxsplit=1)
            indexes = cc(
                torch.Tensor([int(index) + 3
                              for index in indexes.split()
                              ]).type(torch.LongTensor))
            ys.append(indexes)
    return datas, ilens, ys[:8]
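# get_data pads variable-length feature tensors into one batch after sorting
# by length, descending, as pack_padded_sequence expects. A self-contained
# sketch of that padding step with toy tensors:
import torch
from torch.nn.utils.rnn import pad_sequence

datas = [torch.randn(5, 3), torch.randn(8, 3), torch.randn(2, 3)]
datas.sort(key=lambda x: x.size(0), reverse=True)  # longest first
ilens = [d.size(0) for d in datas]                 # [8, 5, 2]
batch = pad_sequence(datas, batch_first=True, padding_value=0)
print(batch.shape, ilens)                          # torch.Size([3, 8, 3]) [8, 5, 2]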
def decode(self, n_samples=5, sample=False, max_dec_timesteps=500):
    logits, predictions = [], []
    dec_c, dec_z = None, None
    for t in range(max_dec_timesteps):
        if t == 0:
            bos = cc(
                torch.Tensor([self.bos for _ in range(n_samples)
                              ]).type(torch.LongTensor))
            emb = self.embedding(bos).unsqueeze(1)
        else:
            emb = self.embedding(predictions[-1]).unsqueeze(1)
        logit, dec_z, dec_c = self.forward_step(emb, dec_z, dec_c)
        logits.append(logit)
        if not sample:
            predictions.append(torch.argmax(logit, dim=-1))
        else:
            sampled_indices = Categorical(logits=logit).sample()
            predictions.append(sampled_indices)
    logits = torch.stack(logits, dim=1)
    predictions = torch.stack(predictions, dim=1)
    return predictions
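# The decode loop above picks the next token either greedily (argmax) or by
# sampling from the softmax. A self-contained sketch of the two choices on a
# toy batch of logits:
import torch
from torch.distributions import Categorical

logit = torch.randn(5, 100)                    # n_samples x vocab
greedy = torch.argmax(logit, dim=-1)           # deterministic decode
sampled = Categorical(logits=logit).sample()   # stochastic decode
print(greedy.shape, sampled.shape)             # both: torch.Size([5])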
def build_model(self):
    self.Encoder = cc(Encoder())
    self.Decoder = [cc(Decoder()) for _ in range(4)]
    self.ACLayer = cc(ACLayer())
    self.Discriminator = cc(Discriminator())
    self.ASRLayer = cc(ASRLayer())
    self.SpeakerClassifier = cc(SpeakerClassifier())
    ac_betas = (0.5, 0.999)
    vae_betas = (0.9, 0.999)
    ac_lr = 0.00005
    vae_lr = 0.001
    dis_lr = 0.002
    cls_betas = (0.5, 0.999)
    asr_betas = (0.5, 0.999)
    cls_lr = 0.0002
    asr_lr = 0.00001
    self.list_decoder = []
    for i in range(4):
        self.list_decoder += list(self.Decoder[i].parameters())
    self.vae_params = list(self.Encoder.parameters()) + self.list_decoder
    self.ac_optimizer = optim.Adam(self.ACLayer.parameters(),
                                   lr=ac_lr,
                                   betas=ac_betas)
    self.vae_optimizer = optim.Adam(self.vae_params,
                                    lr=vae_lr,
                                    betas=vae_betas)
    self.dis_optimizer = optim.Adam(self.Discriminator.parameters(),
                                    lr=dis_lr,
                                    betas=ac_betas)
    self.asr_optimizer = optim.Adam(self.ASRLayer.parameters(),
                                    lr=asr_lr,
                                    betas=asr_betas)
    self.cls_optimizer = optim.Adam(self.SpeakerClassifier.parameters(),
                                    lr=cls_lr,
                                    betas=cls_betas)
def build_model(self):
    hps = self.hps
    ns = self.hps.ns
    emb_size = self.hps.emb_size
    betas = (0.5, 0.9)
    #---stage one---#
    self.Encoder = cc(
        Encoder(ns=ns,
                dp=hps.enc_dp,
                emb_size=emb_size,
                seg_len=hps.seg_len,
                one_hot=self.one_hot,
                binary_output=self.binary_output,
                binary_ver=self.binary_ver))
    self.Decoder = cc(
        Decoder(ns=ns,
                c_in=emb_size,
                c_h=emb_size,
                c_a=hps.n_speakers,
                seg_len=hps.seg_len,
                inp_emb=self.one_hot or self.binary_output))
    self.SpeakerClassifier = cc(
        SpeakerClassifier(
            ns=ns,
            c_in=emb_size if not self.binary_output else emb_size * emb_size,
            c_h=emb_size,
            n_class=hps.n_speakers,
            dp=hps.dis_dp,
            seg_len=hps.seg_len))
    #---stage one opts---#
    params = list(self.Encoder.parameters()) + \
        list(self.Decoder.parameters())
    self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
    self.clf_opt = optim.Adam(self.SpeakerClassifier.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    #---stage two---#
    self.Generator = cc(
        Decoder(ns=ns,
                c_in=emb_size,
                c_h=emb_size,
                c_a=hps.n_speakers
                if not self.targeted_G else hps.n_target_speakers))
    self.PatchDiscriminator = cc(
        nn.DataParallel(
            PatchDiscriminator(ns=ns,
                               n_class=hps.n_speakers if not self.targeted_G
                               else hps.n_target_speakers,
                               seg_len=hps.seg_len)))
    #---stage two opts---#
    self.gen_opt = optim.Adam(self.Generator.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                lr=self.hps.lr,
                                betas=betas)
N = A_full.shape[0]
truth_spec = utils.spectrum(A_full)
step = 400
k = maxIter // step  # integer division so the range() below gets an int
sp_emds = []
cc_emds = []
dd_emds = []
assorts = []
spectrum_weighted_distances = []
bc_emds = []
true_sp = utils.sp(A_full)
true_cc = utils.cc(A_full)
true_dd = utils.degree_sequence(A_full)
G_true = nx.from_numpy_matrix(A_full)
true_assort = nx.degree_assortativity_coefficient(G_true)
# take the centrality values, not the node ids, to match graphEval
true_bc = sorted(nx.betweenness_centrality(G_true).values())
true_assorts = []

# initialize all params
for i in range(start, k + 1):
    iterNum = i * step
    X = np.loadtxt(path + '/samples_{}.txt'.format(iterNum))
    X = genExpected_fromWalks(X, A_full.sum())
L = nx.normalized_laplacian_matrix(G).todense()
eig_vals, eig_vecs = linalg.eig(L)
# sort eigenpairs by eigenvalue (zip is lazy in Python 3, so materialize it)
eig_list = sorted(zip(eig_vals, np.transpose(eig_vecs)), key=lambda x: x[0])
u = np.asarray([u_i.real for u_i in eig_list[-2][1]])[0][0]
truth = utils.compute_graph_statistics(np.asarray(A_matrix))
f = open('plots/truth.txt', "w")
f.write(str(truth))
f.close()
truth_spec = utils.specGap(A_full)
train_spec = utils.specGap(A_matrix)
truth_cc = utils.cc(A_full)
cc_emd_combo = []
cc_emd_reg = []
cc_emd_fmm = []
cc_emd_combo_std = []
cc_emd_reg_std = []
cc_emd_fmm_std = []
k = 11
for i in range(1, k):
    print(i)
    X_c = np.loadtxt(
        'plots/barbell_sameDensity/barbell_combo_mixed/trainingIteration_{}_expectedGraph.txt'
        .format(i * 100))
    X_f = np.loadtxt(
        'plots/barbell_sameDensity/barbell_fmm/trainingIteration_{}_expectedGraph.txt'
        .format(i * 100))
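# The snippet above extracts the eigenvector of the second-largest eigenvalue
# of the normalized Laplacian. A self-contained sketch of the same step using
# numpy's symmetric solver (eigh returns eigenvalues already sorted ascending;
# the path graph here is toy data):
import networkx as nx
import numpy as np

G = nx.path_graph(6)
L = nx.normalized_laplacian_matrix(G).todense()
eig_vals, eig_vecs = np.linalg.eigh(L)   # ascending eigenvalues
u = np.asarray(eig_vecs[:, -2]).ravel()  # eigenvector of 2nd-largest eigenvalue
print(eig_vals[-2], u)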
print(f"Ordered top predicted tokens: {top_tokens}")
print(f"Ordered top predicted values: {probs[sorted_indexes]}")


if __name__ == '__main__':
    args = parse()
    sent_gen = SentenceGenerator(args)
    data_utils = data_utils(args)
    sent = ["there is no [MASK] in our products now"]
    vecs = [data_utils.text2id(txt, 10) for txt in sent]
    # tok_sent = ['[CLS]']
    # tok_sent.extend(sent_gen.data_utils.tokenizer.tokenize(sent))
    # tok_sent.append('[SEP]')
    # inp = cc([sent_gen.data_utils.tokenizer.encode(sent, add_special_tokens=True)], args.no_cuda)
    inp = cc(vecs, args.no_cuda)
    mask_sent = np.expand_dims(inp != 0, -2).astype(np.int32)
    print(vecs)
    print(inp)
    print(mask_sent)
    predictions, break_probs = sent_gen.model.forward(
        inp.long(), cc(mask_sent, sent_gen.no_cuda)[0])
    sm = torch.nn.Softmax(dim=0)  # used to convert logits to probs
    for pos in range(1, inp.shape[1]):
        if vecs[0][pos] != 0:
            # print(f"Prediction for word: {tok_sent[pos]}")
            print(f"Prediction for word: {data_utils.index2word[vecs[0][pos]]}")
            probs = sm(predictions[0, pos])
def forward(self,
            enc_pad,
            enc_len,
            ys=None,
            tf_rate=1.0,
            max_dec_timesteps=500,
            sample=False,
            smooth=False,
            scaling=1.0,
            label_smoothing=True):
    batch_size = enc_pad.size(0)
    if ys is not None:
        # prepare input and output sequences
        bos = ys[0].data.new([self.bos])
        eos = ys[0].data.new([self.eos])
        ys_in = [torch.cat([bos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        pad_ys_in = pad_list(ys_in, pad_value=self.eos)
        pad_ys_out = pad_list(ys_out, pad_value=self.eos)
        # get length info
        batch_size, olength = pad_ys_out.size(0), pad_ys_out.size(1)
        # map idx to embedding
        eys = self.embedding(pad_ys_in)

    # initialization
    dec_c = self.zero_state(enc_pad)
    dec_z = self.zero_state(enc_pad)
    c = self.zero_state(enc_pad, dim=self.att_odim)
    w = None
    logits, prediction, ws = [], [], []
    # reset the attention module
    self.attention.reset()

    # loop for each timestep
    olength = max_dec_timesteps if not ys else olength
    for t in range(olength):
        # supervised learning: using teacher forcing
        if ys is not None:
            tf = True if np.random.random_sample() <= tf_rate else False
            emb = eys[:, t, :] if tf or t == 0 else self.embedding(
                prediction[-1])
        # else, label the data with greedy
        else:
            if t == 0:
                bos = cc(
                    torch.Tensor([self.bos for _ in range(batch_size)
                                  ]).type(torch.LongTensor))
                emb = self.embedding(bos)
            else:
                # using argmax
                if not smooth:
                    emb = self.embedding(prediction[-1])
                # smooth approximation of embedding
                else:
                    emb = F.softmax(logit * scaling,
                                    dim=-1) @ self.embedding.weight
        logit, dec_z, dec_c, c, w = \
            self.forward_step(emb, dec_z, dec_c, c, w, enc_pad, enc_len)
        ws.append(w)
        logits.append(logit)
        if not sample:
            prediction.append(torch.argmax(logit, dim=-1))
        else:
            sampled_indices = Categorical(logits=logit).sample()
            prediction.append(sampled_indices)

    logits = torch.stack(logits, dim=1)
    log_probs = F.log_softmax(logits, dim=2)
    prediction = torch.stack(prediction, dim=1)
    ws = torch.stack(ws, dim=1)

    if ys:
        ys_log_probs = torch.gather(log_probs,
                                    dim=2,
                                    index=pad_ys_out.unsqueeze(2)).squeeze(2)
    else:
        ys_log_probs = torch.gather(log_probs,
                                    dim=2,
                                    index=prediction.unsqueeze(2)).squeeze(2)

    # label smoothing
    if label_smoothing and self.ls_weight > 0 and self.training:
        loss_reg = torch.sum(log_probs * self.vlabeldist, dim=2)
        ys_log_probs = (1 - self.ls_weight) * ys_log_probs + \
            self.ls_weight * loss_reg
    return logits, ys_log_probs, prediction, ws
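# Label smoothing above mixes the target's log-probability with a
# prior-weighted sum over all classes:
# (1 - e) * log p(y) + e * sum_y' q(y') * log p(y').
# A self-contained numeric sketch (a uniform prior stands in for vlabeldist;
# all shapes are toy values):
import torch
import torch.nn.functional as F

ls_weight = 0.1
logits = torch.randn(2, 4, 10)                  # batch x length x vocab
log_probs = F.log_softmax(logits, dim=2)
targets = torch.randint(0, 10, (2, 4))
ys_log_probs = torch.gather(log_probs, 2, targets.unsqueeze(2)).squeeze(2)
prior = torch.full((10,), 1.0 / 10)             # uniform label distribution
loss_reg = torch.sum(log_probs * prior, dim=2)
smoothed = (1 - ls_weight) * ys_log_probs + ls_weight * loss_reg
print(smoothed.shape)                           # torch.Size([2, 4])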
def get_data(root_dir='/storage/feature/LibriSpeech/npy_files/train-clean-100/7402/90848',
             text_index_path='/storage/feature/LibriSpeech/text_bpe/train-clean-100/7402/7402-90848.label.txt'):
    prefix = '7402-90848'
    datas = []
    for i in range(8):
        seg_id = str(i).zfill(4)
        filename = f'{prefix}-{seg_id}.npy'
        path = os.path.join(root_dir, filename)
        data = torch.from_numpy(np.load(path)).type(torch.FloatTensor)
        datas.append(data)
    datas.sort(key=lambda x: x.size(0), reverse=True)
    ilens = np.array([data.size(0) for data in datas], dtype=np.int64)
    datas = pad_sequence(datas, batch_first=True, padding_value=0)
    ys = []
    with open(text_index_path, 'r') as f:
        for line in f:
            utt_id, indexes = line.strip().split(',', maxsplit=1)
            indexes = cc(
                torch.Tensor([int(index) + 3
                              for index in indexes.split()
                              ]).type(torch.LongTensor))
            ys.append(indexes)
    return datas, ilens, ys[:8]


data, ilens, ys = get_data()
data = cc(data)
model = cc(
    E2E(input_dim=40,
        enc_hidden_dim=800,
        enc_n_layers=3,
        subsample=[1, 2, 1],
        dropout_rate=0.3,
        dec_hidden_dim=1024,
        att_dim=512,
        conv_channels=10,
        conv_kernel_size=201,
        att_odim=800,
        output_dim=500))
log_probs, prediction, ws = model(data, ilens, ys)
p_lens = [p.size() for p in prediction]
t_lens = [t.size() for t in ys]
        return c, w


class Decoder(torch.nn.Module):
    def __init__(self, input_dim, embedding_dim, encoder_dim, att_dim,
                 hidden_dim, output_dim):
        # nn.Module must be initialized before submodules are assigned
        super(Decoder, self).__init__()
        # index 0 is padding, index 1 is GO symbol
        self.input_layer = torch.nn.Linear(input_dim + 2, embedding_dim)
        self.rnn_cell = torch.nn.LSTMCell(embedding_dim + encoder_dim,
                                          hidden_dim)
        self.output_layer = torch.nn.Linear(hidden_dim, output_dim)
        self.attention = AttLoc(encoder_dim=encoder_dim,
                                decoder_dim=hidden_dim,
                                att_dim=att_dim,
                                conv_channels=100,
                                conv_kernel_size=10)

    def forward_step(self, token, last_hidden_state, encoder_state):
        # unfinished stub: self.rnn_cell is called here without inputs
        self.rnn_cell()


if __name__ == '__main__':
    data = cc(torch.randn(32, 321, 13))
    ilens = np.ones((32,), dtype=np.int64) * 121
    net = cc(Encoder(13, 320, 4, [1, 2, 2, 1], dropout_rate=0.3,
                     output_dim=512))
    emb = cc(EmbeddingLayer(embedding_dim=512, n_latent=300))
    out, ilens = net(data, ilens)
    print(out.size())
    distr, out = emb(out)
    print(distr.size(), out.size())
    #att = cc(AttLoc(640, 320, 300, 100, 10))
    #att.reset()
    #dec = cc(Variable(torch.randn(32, 320)))
    #context, weights = att(output, dec, None)
    #print(context.size(), weights.size(), weights[0])
    #dec = cc(Variable(torch.randn(32, 320)))
    #context, weights = att(output, dec, weights)
    #print(context.size(), weights.size(), weights[0])
        self.conv_layer3 = nn.Sequential(
            nn.ConvTranspose2d(128,
                               64,
                               stride=2,
                               kernel_size=5,
                               padding=2,
                               output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64,
                               out_channel,
                               stride=2,
                               kernel_size=5,
                               padding=2,
                               output_padding=1))

    def forward(self, x):
        out = self.conv_layer1(x)
        for layer in self.conv_layer2s:
            res = F.relu(layer(out))
            out = out + res
        out = self.conv_layer3(out)
        return out


if __name__ == '__main__':
    enc = cc(Encoder())
    dec = cc(Decoder())
    data = cc(torch.randn(16, 3, 128, 128))
    e = enc(data)
    d = dec(e)
    print(d.size())
def build_model(self):
    hps = self.hps
    ns = self.hps.ns
    enc_mode = self.enc_mode
    seg_len = self.hps.seg_len
    enc_size = self.hps.enc_size
    emb_size = self.hps.emb_size
    betas = (0.5, 0.9)
    #---stage one---#
    self.Encoder = cc(
        Encoder(ns=ns,
                dp=hps.enc_dp,
                enc_size=enc_size,
                seg_len=seg_len,
                enc_mode=enc_mode))
    self.Decoder = cc(
        Decoder(ns=ns,
                c_in=enc_size,
                c_h=emb_size,
                c_a=hps.n_speakers,
                seg_len=seg_len))
    if enc_mode == 'binary':
        clf_c_in = enc_size * enc_size
    elif enc_mode == 'multilabel_binary':
        clf_c_in = 2 * enc_size
    else:
        clf_c_in = enc_size
    self.SpeakerClassifier = cc(
        SpeakerClassifier(ns=ns,
                          c_in=clf_c_in,
                          c_h=emb_size,
                          n_class=hps.n_speakers,
                          dp=hps.dis_dp,
                          seg_len=seg_len))
    #---stage one opts---#
    params = list(self.Encoder.parameters()) + list(
        self.Decoder.parameters())
    self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
    self.clf_opt = optim.Adam(self.SpeakerClassifier.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    #---stage two---#
    if self.g_mode == 'naive':
        self.Generator = cc(
            Decoder(ns=ns,
                    c_in=enc_size,
                    c_h=emb_size,
                    c_a=hps.n_speakers,
                    seg_len=seg_len))
    elif self.g_mode == 'targeted' or self.g_mode == 'targeted_residual':
        self.Generator = cc(
            Decoder(ns=ns,
                    c_in=enc_size,
                    c_h=emb_size,
                    c_a=hps.n_target_speakers,
                    seg_len=seg_len,
                    output_mask=(self.g_mode == 'targeted_residual')))
    elif self.g_mode == 'enhanced':
        self.Generator = cc(
            Enhanced_Generator(ns=ns,
                               dp=hps.enc_dp,
                               enc_size=1024,
                               emb_size=1024,
                               seg_len=seg_len,
                               n_speakers=hps.n_speakers))
    elif self.g_mode == 'spectrogram':
        self.Generator = cc(
            Spectrogram_Patcher(ns=ns,
                                c_in=513,
                                c_h=emb_size,
                                c_a=hps.n_target_speakers,
                                seg_len=seg_len))
    elif self.g_mode == 'tacotron':
        self.Generator = cc(
            Tacotron(enc_size,
                     hps.n_target_speakers,
                     mel_dim=hp.n_mels,
                     linear_dim=int(hp.n_fft / 2) + 1))
        self.tacotron_input_lengths = torch.tensor(
            [self.hps.seg_len // 8 for _ in range(hps.batch_size)])
    else:
        raise NotImplementedError('Invalid Generator mode!')
    self.PatchDiscriminator = cc(
        nn.DataParallel(
            PatchDiscriminator(ns=ns,
                               n_class=hps.n_speakers
                               if self.g_mode == 'naive' else
                               hps.n_target_speakers,
                               seg_len=seg_len)))
    #---stage two opts---#
    self.gen_opt = optim.Adam(self.Generator.parameters(),
                              lr=self.hps.lr,
                              betas=betas)
    self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                lr=self.hps.lr,
                                betas=betas)
    #---target classifier---#
    self.TargetClassifier = cc(
        nn.DataParallel(TargetClassifier(ns=ns, n_class=3,
                                         seg_len=seg_len)))
    #---target classifier opts---#
    self.tclf_opt = optim.Adam(self.TargetClassifier.parameters(),
                               lr=self.hps.lr,
                               betas=betas)
def forward(self,
            enc_output,
            enc_len,
            dec_input=None,
            tf_rate=1.0,
            max_dec_timesteps=500,
            sample=False):
    batch_size = enc_output.size(0)
    enc_len = enc_len.cpu().numpy().tolist()
    if dec_input is not None:
        pad_dec_input_in = dec_input[0]
        pad_dec_input_out = dec_input[1]
        # get length info
        batch_size, olength = pad_dec_input_out.size(
            0), pad_dec_input_out.size(1)
        # map idx to embedding
        dec_input_embedded = self.embedding(pad_dec_input_in)

    # initialization
    dec_c = self.zero_state(enc_output)
    dec_h = self.zero_state(enc_output)
    attn = None
    logits, prediction, attns = [], [], []

    # loop for each timestep
    olength = max_dec_timesteps if not dec_input else olength
    for t in range(olength):
        if dec_input is not None:
            # teacher forcing
            tf = True if np.random.random_sample() <= tf_rate else False
            if tf or t == 0:
                emb = dec_input_embedded[:, t, :]
            else:
                emb = self.embedding(prediction[-1])
        else:
            if t == 0:
                bos = cc(
                    torch.Tensor([self.bos for _ in range(batch_size)
                                  ]).type(torch.LongTensor))
                emb = self.embedding(bos)
            else:
                emb = self.embedding(prediction[-1])
        logit, dec_h, dec_c, attn = \
            self.forward_step(emb, dec_h, dec_c, attn, enc_output, enc_len)
        attns.append(attn)
        logits.append(logit)
        if not sample:
            prediction.append(torch.argmax(logit, dim=-1))
        else:
            sampled_indices = Categorical(logits=logit).sample()
            prediction.append(sampled_indices)

    logits = torch.stack(logits, dim=1)  # batch x length x output_dim
    log_probs = F.log_softmax(logits, dim=2)
    prediction = torch.stack(prediction, dim=1)  # batch x length
    attns = torch.stack(attns, dim=1)  # batch x length x enc_len

    # get the log probs of the true label: batch x length
    if dec_input:
        dec_output_log_probs = torch.gather(
            log_probs, dim=2,
            index=pad_dec_input_out.unsqueeze(2)).squeeze(2)
    else:
        dec_output_log_probs = torch.gather(
            log_probs, dim=2, index=prediction.unsqueeze(2)).squeeze(2)

    # label smoothing: q'(y|x) = (1-e)*q(y|x) + e*u(y)
    if self.ls_weight > 0:
        loss_reg = torch.sum(log_probs * self.vlabeldist, dim=2)  # u(y)
        dec_output_log_probs = (
            1 - self.ls_weight) * dec_output_log_probs + \
            self.ls_weight * loss_reg
    return logits, dec_output_log_probs, prediction, attns