def main_forward(
    self,
    length: int,
    sampling_policy: SamplingPolicy,
    num_generate: int,
    local_array: np.ndarray = None,
    s_one: np.ndarray = None,
):
    """Autoregressively generate a batch of waveforms with the Chainer model.

    Three execution paths are selected in order of preference:
    the native ``yukarin_autoreg_cpp`` kernel (random sampling only),
    ``fast_generate`` (random sampling), or a plain Python loop over
    ``model.forward_one`` (random or maximum sampling).

    :param length: number of samples to generate per waveform.
    :param sampling_policy: how to pick each sample from the model output.
    :param num_generate: batch size (number of waveforms).
    :param local_array: local conditioning features; re-encoded via
        ``forward_encode`` when the model has a local encoder.
        NOTE(review): assumed shape (batch, length, feature) — the cpp path
        transposes it with (1, 0, 2); confirm against callers.
    :param s_one: per-utterance embedding forwarded to ``forward_encode``.
    :return: list of ``Wave`` objects, one per batch element.
    :raises ValueError: in the slow path, for an unknown sampling policy.
    """
    if self.model.with_local:
        # Encode conditioning features once, outside the sampling loop,
        # with training/backprop disabled (inference only).
        with chainer.using_config("train", False), chainer.using_config(
                "enable_backprop", False):
            local_array = self.model.forward_encode(l_array=local_array,
                                                    s_one=s_one).data

    # Start token: quantize an all-zero (silent) sample for every batch entry.
    c = self.xp.zeros([num_generate], dtype=np.float32)
    c = encode_single(c, bit=self.single_bit)

    # Initial GRU hidden state from the model.
    hidden_coarse = self.model.gru.init_hx(local_array)[0].data

    if self.use_cpp_inference and sampling_policy == SamplingPolicy.random:
        # Native kernel: fills `wave` (length, batch) with int32 bins in-place.
        import yukarin_autoreg_cpp

        wave = np.zeros((length, num_generate), dtype=np.int32)
        yukarin_autoreg_cpp.inference(
            batch_size=num_generate,
            length=length,
            output=wave,
            x=to_numpy(c),
            # Kernel expects time-major conditioning: (length, batch, feature).
            l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
            hidden=to_numpy(hidden_coarse),
        )
    else:
        if sampling_policy == SamplingPolicy.random:
            # Optimized pure-Python path (random sampling only).
            fast_forward_params = get_fast_forward_params(self.model)
            w_list = fast_generate(
                length=length,
                x=c,
                l_array=local_array,
                h=hidden_coarse,
                **fast_forward_params,
            )
        else:
            # Reference path: one model step per output sample.
            w_list = []
            hc = hidden_coarse
            for i in tqdm(range(length), desc="generate"):
                with chainer.using_config("train", False), chainer.using_config(
                        "enable_backprop", False):
                    c, hc = self.model.forward_one(
                        prev_x=c,
                        prev_l=local_array[:, i],
                        hidden=hc,
                    )

                if sampling_policy == SamplingPolicy.random:
                    is_random = True
                elif sampling_policy == SamplingPolicy.maximum:
                    is_random = False
                else:
                    raise ValueError(sampling_policy)

                # Sample the next input from the model's output distribution.
                c = self.model.sampling(c, maximum=not is_random)
                w_list.append(c)

        wave = self.xp.stack(w_list)
        wave = cuda.to_cpu(wave)

    # (length, batch) -> (batch, length), then decode bins to amplitudes.
    wave = wave.T
    wave = decode_single(wave, bit=self.single_bit)
    if self.mulaw:
        # Undo mu-law companding when the model was trained on mu-law audio.
        wave = decode_mulaw(wave, mu=2**self.single_bit)

    return [
        Wave(wave=w_one, sampling_rate=self.sampling_rate) for w_one in wave
    ]
def generate(
    self,
    time_length: float,
    sampling_policy: SamplingPolicy,
    num_generate: int,
    local_array: Union[numpy.ndarray, Tensor] = None,
    speaker_nums: Union[Sequence[int], Tensor] = None,
):
    """Autoregressively generate a batch of waveforms with the PyTorch predictor.

    Fast paths (GPU cpp kernel / CPU ``fast_generate``) support only
    ``SamplingPolicy.random``; the slow Python loop additionally supports
    ``maximum`` and ``corrected_random``.

    :param time_length: output duration in seconds (sample count is
        ``sampling_rate * time_length``).
    :param sampling_policy: how to pick each sample from the model output.
    :param num_generate: batch size; must not exceed ``self.max_batch_size``.
    :param local_array: local conditioning features, one row per batch entry;
        an empty feature axis is synthesized when omitted.
    :param speaker_nums: one speaker id per batch entry, embedded via
        ``forward_speaker``.
    :return: list of ``Wave`` objects, one per batch element.
    """
    assert num_generate <= self.max_batch_size
    assert local_array is None or len(local_array) == num_generate
    assert speaker_nums is None or len(speaker_nums) == num_generate

    length = int(self.sampling_rate * time_length)

    if local_array is None:
        # No conditioning: zero-width feature axis keeps shapes consistent.
        local_array = torch.empty((num_generate, length, 0)).float()
    local_array = to_tensor(local_array).to(self.device)

    if speaker_nums is not None:
        speaker_nums = to_tensor(speaker_nums).reshape(
            (-1, )).to(self.device)
        with torch.no_grad():
            s_one = self.predictor.forward_speaker(speaker_nums)
    else:
        s_one = None

    if self.predictor.with_local:
        # Encode conditioning once, outside the sampling loop.
        with torch.no_grad():
            local_array = self.predictor.forward_encode(
                l_array=local_array, s_one=s_one)

    # Start token: quantized all-zero (silent) sample per batch entry.
    x = numpy.zeros(num_generate, dtype=numpy.float32)
    x = encode_single(x, bit=self.bit_size)

    # Initial GRU hidden state (zeros).
    hidden = numpy.zeros(
        (num_generate, self.predictor.gru.hidden_size),
        dtype=numpy.float32,
    )

    # Log-probability floor used by the corrected_random policy below;
    # -999 effectively disables the correction for other policies.
    if sampling_policy == SamplingPolicy.corrected_random:
        low_probability_threshold = -18
    else:
        low_probability_threshold = -999

    if self.use_fast_inference and self.use_gpu:
        # Native GPU kernel: random sampling only.
        assert sampling_policy in [
            SamplingPolicy.random,
        ]
        import yukarin_autoreg_cpp

        wave = numpy.zeros((length, num_generate), dtype=numpy.int32)
        yukarin_autoreg_cpp.inference(
            batch_size=num_generate,
            length=length,
            output=wave,
            x=x.astype(numpy.int32),
            # NOTE(review): transpose(0, 1) makes the conditioning
            # time-major, i.e. (length, batch, ...) — confirm kernel layout.
            l_array=to_numpy(local_array.transpose(0, 1)),
            hidden=to_numpy(hidden),
        )
    elif self.use_fast_inference and not self.use_gpu:
        # Optimized CPU path: random sampling only.
        assert sampling_policy == SamplingPolicy.random
        params = get_fast_forward_params(self.predictor)
        x_list = fast_generate(
            length=length,
            x=x,
            l_array=local_array.numpy(),
            h=hidden,
            **params,
        )
        wave = numpy.stack(x_list)
    else:
        # Reference path: one predictor step per sample. A parallel greedy
        # ("maximum") trajectory (x_max / d_max) is tracked so that
        # corrected_random can penalize samples the greedy distribution
        # considers very unlikely.
        with torch.no_grad():
            x = to_tensor(x).to(self.device)
            x_max = x
            hidden = to_tensor(hidden).to(self.device)
            x_list = []
            for i in tqdm(range(length), desc="generate"):
                # Greedy-trajectory logits; its hidden update is discarded.
                d_max, _ = self.predictor.forward_one(
                    prev_x=x_max, prev_l=local_array[:, i], hidden=hidden)
                # Sampled-trajectory logits; this call advances `hidden`.
                d, hidden = self.predictor.forward_one(
                    prev_x=x, prev_l=local_array[:, i], hidden=hidden)

                if sampling_policy == SamplingPolicy.maximum:
                    is_random = False
                else:
                    is_random = True

                # Push down logits whose log-probability under the greedy
                # distribution falls below the threshold (no-op when the
                # threshold is -999).
                d[F.log_softmax(d_max.double(), dim=1) <
                  low_probability_threshold] -= 200
                x = self.predictor.sampling(d, maximum=not is_random)
                x_max = self.predictor.sampling(d, maximum=True)
                x_list.append(x)

            wave = torch.stack(x_list).cpu().numpy()

    # (length, batch) -> (batch, length), then decode bins to amplitudes.
    wave = wave.T
    wave = decode_single(wave, bit=self.bit_size)
    if self.mulaw:
        wave = decode_mulaw(wave, mu=2**self.bit_size)

    return [
        Wave(wave=w_one, sampling_rate=self.sampling_rate) for w_one in wave
    ]
# Smoke-test the native yukarin_autoreg_cpp kernel at several batch sizes and
# check that a smaller batch reproduces the leading columns of a larger one.
before_output = None
for batch_size in [1, 2, 4]:
    # Slice per-batch views of the shared fixtures; clone so the kernel's
    # in-place writes cannot corrupt the originals.
    x = base_x[:batch_size].clone()
    l_array = base_l_array[:, :batch_size].clone()
    hidden = base_hidden[:batch_size].clone()
    # x = model.xp.zeros_like(x)
    # l_array = model.xp.zeros_like(l_array)
    # hidden = model.xp.zeros_like(hidden)
    # Pre-fill with -1 so any cells the kernel leaves untouched stand out.
    output = numpy.ones((length, batch_size), dtype=numpy.int32) * -1
    r = yukarin_autoreg_cpp.inference(
        batch_size=batch_size,
        length=length,
        output=output,
        x=to_numpy(x),
        l_array=to_numpy(l_array),
        hidden=to_numpy(hidden),
    )
    print(output)
    if before_output is not None:
        # The overlapping batch entries should be identical across batch sizes.
        min_batch_size = min(before_output.shape[1], output.shape[1])
        flag = numpy.all(
            before_output[:, :min_batch_size] == output[:, :min_batch_size])
        print("before_output == output :", flag)
    before_output = output
# NOTE(review): the chunk is truncated below — the arguments of this
# fast_generate(...) call are outside the visible view.
with torch.no_grad():
    expected = torch.stack(
        fast_generate(
def generate(
    self,
    time_length: Optional[float],
    sampling_policy: SamplingPolicy,
    num_generate: int,
    coarse=None,
    local_array: np.ndarray = None,
    speaker_nums: List[int] = None,
    hidden_coarse=None,
):
    """Generate a batch of waveforms via the native yukarin_autoreg_cpp kernel.

    Only ``SamplingPolicy.random`` is supported. ``coarse`` and
    ``hidden_coarse`` allow resuming generation from a previous sample and
    GRU state instead of starting from silence.

    :param time_length: output duration in seconds.
    :param sampling_policy: must be ``SamplingPolicy.random``.
    :param num_generate: batch size; must not exceed ``self.max_batch_size``.
    :param coarse: optional previous sample per batch entry (start token).
    :param local_array: optional local conditioning features per batch entry.
    :param speaker_nums: optional speaker ids, embedded via ``forward_speaker``.
    :param hidden_coarse: optional initial GRU hidden state per batch entry.
    :return: list of ``Wave`` objects, one per batch element.
    """
    assert num_generate <= self.max_batch_size
    # Every optional batch input, when given, must match the batch size.
    for batched in (coarse, local_array, speaker_nums, hidden_coarse):
        assert batched is None or len(batched) == num_generate
    assert sampling_policy == SamplingPolicy.random

    sample_count = int(self.sampling_rate * time_length)

    # Without conditioning, use a zero-width feature axis to keep shapes valid.
    local_array = (
        self.xp.empty((num_generate, sample_count, 0), dtype=np.float32)
        if local_array is None
        else self.xp.asarray(local_array)
    )

    if speaker_nums is None:
        s_one = None
    else:
        speaker_nums = self.xp.asarray(speaker_nums).reshape((-1, ))
        with chainer.using_config("train", False), chainer.using_config(
                "enable_backprop", False):
            s_one = self.model.forward_speaker(speaker_nums).data

    if self.model.with_local:
        # Encode conditioning once before handing it to the kernel.
        with chainer.using_config("train", False), chainer.using_config(
                "enable_backprop", False):
            local_array = self.model.forward_encode(l_array=local_array,
                                                    s_one=s_one).data

    if coarse is not None:
        c = coarse
    else:
        # Start token: quantized all-zero (silent) sample per batch entry.
        c = encode_single(
            self.xp.zeros([num_generate], dtype=np.float32),
            bit=self.single_bit,
        )

    if hidden_coarse is None:
        hidden_coarse = self.model.gru.init_hx(local_array)[0].data

    # The kernel fills `wave` (sample_count, batch) with int32 bins in-place.
    wave = np.zeros((sample_count, num_generate), dtype=np.int32)
    yukarin_autoreg_cpp.inference(
        batch_size=num_generate,
        length=sample_count,
        output=wave,
        x=to_numpy(c),
        # Kernel expects time-major conditioning: (length, batch, feature).
        l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
        hidden=to_numpy(hidden_coarse),
    )

    # (length, batch) -> (batch, length), then decode bins to amplitudes.
    decoded = decode_single(wave.T, bit=self.single_bit)
    if self.mulaw:
        decoded = decode_mulaw(decoded, mu=2**self.single_bit)

    return [
        Wave(wave=one_wave, sampling_rate=self.sampling_rate)
        for one_wave in decoded
    ]