def build_model(self):
    # Instantiate the requested AdaVAE-family model, plus the optional WaveNet
    # vocoder and the speaker classifier used for spoofing evaluation.
    if self.args.model_type == 'AdaVAEd':
        speaker_encoder = VoiceEncoder()
        self.model = cc(AdaVAEd(self.config, speaker_encoder))
    elif self.args.model_type == 'AdaVAE' or self.args.model_type == 'AdaVAEaug':
        self.model = cc(AdaVAE(self.config))
    elif self.args.model_type == 'AdaVAEavg':
        self.model = cc(AdaVAEavg(self.config))
    elif self.args.model_type == 'AdaAE':
        self.model = cc(AdaAE(self.config))
    elif self.args.model_type == 'AdaVAEVAE':
        self.model = cc(AdaVAEVAE(self.config))
    else:
        raise ValueError('Unknown model_type: {}'.format(self.args.model_type))
    self.model.eval()
    if self.args.use_wavenet:
        from wavenet import build_model
        self.vocoder = cc(build_model())
    if self.args.infer_sproofing:
        from classify import SpeakerClassifier
        self.Speakerclassifier = cc(SpeakerClassifier(n_class=len(self.speaker2id)))
        self.Speakerclassifier.eval()
    return
def build_model(self):
    if self.args.model_type == 'AE':
        self.model = cc(AE(self.config))
    elif self.args.model_type == 'VQVAE':
        speaker_encoder = VoiceEncoder()
        self.model = cc(VQVAE(self.config, speaker_encoder))
    elif self.args.model_type == 'DAE':
        speaker_encoder = VoiceEncoder()
        self.model = cc(AE_D(self.config, speaker_encoder))
    elif self.args.model_type == 'AutoVC':
        speaker_encoder = VoiceEncoder()
        self.model = cc(AutoVC(32, 256, 512, 32, speaker_encoder))
    elif self.args.model_type == 'Prosody':
        speaker_encoder = VoiceEncoder()
        self.model = cc(VQVAE_Prosody(self.config, speaker_encoder))
    elif self.args.model_type == 'MBV':
        speaker_encoder = VoiceEncoder()
        self.model = cc(MBV(self.config, speaker_encoder))
    elif self.args.model_type == 'NORM':
        speaker_encoder = VoiceEncoder()
        self.model = cc(MultipleNorm(self.config, speaker_encoder))
    elif self.args.model_type == 'Attn':
        speaker_encoder = VoiceEncoder()
        self.model = cc(VQVAE_attn(self.config, speaker_encoder))
    self.model.eval()
    if self.args.use_wavenet:
        from wavenet import build_model
        self.vocoder = cc(build_model())
    return
def build_model(self):
    speaker_encoder = VoiceEncoder()
    self.model = cc(AE_D(self.config, speaker_encoder))
    self.model.eval()
    if self.args.use_wavenet:
        from wavenet import build_model
        self.vocoder = cc(build_model())
    return
def build_model(self):
    if self.args.model_type == 'AdaVAEd':
        speaker_encoder = VoiceEncoder()
        self.model = cc(AdaVAEd(self.config, speaker_encoder))
    elif self.args.model_type == 'AdaVAE':
        self.model = cc(AdaVAE(self.config))
    elif self.args.model_type == 'AdaVAEavg':
        self.model = cc(AdaVAEavg(self.config))
    else:
        raise ValueError('Unknown model_type: {}'.format(self.args.model_type))
    self.model.eval()
    if self.args.use_wavenet:
        from wavenet import build_model
        self.vocoder = cc(build_model())
    return
def build_model(self):
    if self.args.model_type == 'AdaVAEGAN':
        self.model = cc(AdaVAEGAN(self.config))
        self.patch_model = cc(Decoder(**self.config['Decoder']))
        self.patch_model.eval()
    elif self.args.model_type == 'AdaVAE':
        self.model = cc(AdaVAEGAN(self.config))
    else:
        raise ValueError('Unknown model_type: {}'.format(self.args.model_type))
    self.model.eval()
    if self.args.use_wavenet:
        from wavenet import build_model
        self.vocoder = cc(build_model())
    if self.args.infer_sproofing:
        from classify import SpeakerClassifier
        self.Speakerclassifier = cc(SpeakerClassifier(n_class=len(self.speaker2id)))
        self.Speakerclassifier.eval()
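# The `cc` helper used in the build_model variants above is not defined in this
# excerpt. The sketch below is an assumption about what such a helper typically
# does in PyTorch voice-conversion code of this kind (move a module onto the GPU
# when one is available); it is not the repo's actual implementation.
import torch

def cc(net):
    # Put the module on CUDA if a GPU is available, otherwise keep it on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return net.to(device)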
sorted_datas = sorted(zip(all_datas, all_labels), key=lambda x: x[0].shape[1])
all_datas = []
all_labels = []
for data, label in sorted_datas:
    all_datas.append(data)
    all_labels.append(label)

mkdirs(args.output_dir)
# ocr = OCR(args.weights_path, label_text_path, config=config)
from wavenet import build_model
# config = json.load(open(config_path, 'r'))
old_config_path = './logdir/15_mar/sompo/keras/basemodel.json'
label_text = json.load(open('./datasets/label_text_4855.json', 'r'))
basemodel = build_model(len(label_text) + 1, old_config_path)
basemodel.load_weights('./logdir/dc3/test/keras/basemodel_retrain_485000.h5')
model = tf.keras.models.Model(basemodel.input, basemodel.outputs[-1])
ocr = OCR(None, label_text_path, config=None, model=model)
eval_ocr = Eval(ocr)
output = eval_ocr.export_report_jp(all_datas, all_labels, name=name,
                                   max_sample_per_file=1000,
                                   output_dir_report=args.output_dir,
                                   output_dir_image='{}/eval_imgs'.format(args.output_dir))
dilation_factor = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
                   1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
impulse = {0: './impulse/impulse.wav'}

# Load the seed audio, one-hot encode it, and keep the last `sample_len` timesteps.
generated_sample = np.load(seed_audio_path)
generated_sample = generated_sample.tolist()
generated_sample = q_to_one_hot(generated_sample, input_dim).astype(np.uint8)
generated_sample = generated_sample[generated_sample.shape[0] - sample_len:generated_sample.shape[0]]
# sample_list = []
# sample_list.append(generated_sample)
# sample_list = pad_sequences(sample_list, maxlen=sample_len, padding='post')
# generated_sample = sample_list[0]
generated_sample = np.reshape(generated_sample, (1, sample_len, input_dim))
pred_seed = np.reshape(generated_sample, (-1, input_dim))

model = build_model(sample_len, dilation_factor)
model.load_weights(weight_path)

generation_step = sr * sec
prev_sample = -1
equal_cnt = 0
impulse_idx = 0
for i in range(generation_step):
    preds = model.predict(np.expand_dims(pred_seed, 0))  # prediction with the model
    sampled = sample(preds[0][-1])  # multinomial sampling
    # To prevent dead silence.
    if sampled == prev_sample:
        equal_cnt += 1
    else:
        equal_cnt = 0
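# The generation loop above calls a `sample` helper that is not shown in this
# excerpt. Below is a minimal sketch, assuming it draws one quantized value from
# the softmax distribution predicted for the last timestep; the temperature
# argument is an illustrative addition, not taken from the source.
import numpy as np

def sample(preds, temperature=1.0):
    # Rescale the distribution in log space, then renormalize to a valid probability vector.
    preds = np.asarray(preds).astype(np.float64)
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    # Draw a single categorical (multinomial) sample and return its class index.
    return int(np.random.choice(len(preds), p=preds))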
run_data_init([train_initer], shuffle=True, sess=sess, batch_size=batch_size)

# In[9]:

# %%time
imgs, labels, train_paths, train_losses, train_preds = sess.run(train_tensors)
print(imgs.shape)
imgs = np.clip(imgs, 0, 1)
plot_images(imgs[..., 0], mxn=[3, 1], dpi=200)

# # 2. Build model

# In[10]:

from wavenet import build_model

basemodel = build_model(len(label_text) + 1, ngf=64)
# basemodel = Model.from_config(read_json('../lib-ocr/ocr/anson/model_config_files'))
basemodel.summary()

# ## 2.2 Build predict tensors
# For CTC loss we take only the 2nd output.
#
# For entropy loss we take the 1st output.

# In[11]:

logits_train, preds_train = basemodel(train_tensors[0])
preds_test = basemodel(test_tensors[0])[1]

# ## 2.3 Build training ops
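# A hedged sketch of what the "Build training ops" step referenced above might look
# like. The notebook's actual tensors are not shown here: `labels`, `input_length`,
# and `label_length` below are hypothetical placeholders; only `preds_train` (the
# per-timestep softmax output of the model above) is assumed from context.
import tensorflow as tf

labels = tf.compat.v1.placeholder(tf.int32, shape=(None, None))      # dense label indices
input_length = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))   # valid timesteps per sample
label_length = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))   # valid characters per sample

# Standard Keras CTC cost: preds_train has shape (batch, time, num_classes) after softmax.
ctc_loss = tf.keras.backend.ctc_batch_cost(labels, preds_train, input_length, label_length)
train_op = tf.compat.v1.train.AdamOptimizer(1e-4).minimize(tf.reduce_mean(ctc_loss))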