Example #1
def predict(self, path):
    self.eval()
    # Load the audio file and compute its spectrogram (short-time Fourier transform)
    wav = data.load_audio(path)
    spec = data.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension in place
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    # Convert the network output into text
    text = self.decode(out, out_len)
    self.train()  # restore training mode
    return text[0]
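The `decode` helper called above is not shown. Given the per-frame output lengths, and the (batch, classes, time) layout implied by Example #2's softmax over dim 1, a greedy best-path CTC decode is one plausible reading. The sketch below is an assumption, not the original implementation; the `vocabulary` list and blank index 0 are placeholders.

import torch

def greedy_ctc_decode(out, out_len, vocabulary, blank=0):
    """Collapse per-frame argmax predictions into strings (greedy CTC)."""
    best = out.argmax(dim=1)  # (batch, time): best class per frame
    texts = []
    for b in range(best.size(0)):
        prev, chars = blank, []
        for t in range(int(out_len[b])):
            idx = int(best[b, t])
            # CTC collapse rule: skip blanks and repeated consecutive labels
            if idx != blank and idx != prev:
                chars.append(vocabulary[idx])
            prev = idx
        texts.append(''.join(chars))
    return texts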
Example #2
import torch
import torch.nn.functional as F

# `data`, `model`, `decoder`, and `translate` are module-level objects
# defined elsewhere in the project.
def predict(wav_path):
    wav = data.load_audio(wav_path)
    spec = data.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension in place
    with torch.no_grad():
        spec = spec.cuda()
        y = model.cnn(spec)
        y = F.softmax(y, 1)  # per-frame probabilities over the vocabulary
    y_len = torch.tensor([y.size(-1)])
    y = y.permute(0, 2, 1)  # (batch, time, classes), the layout the decoder expects
    out, score, offset, out_len = decoder.decode(y, y_len)
    return translate(model.vocabulary, out[0][0], out_len[0][0])
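The four values returned by `decoder.decode` match a beam-search CTC decoder in the style of ctcdecode's CTCBeamDecoder (beam results, scores, frame offsets, output lengths). The `translate` helper is not shown; a minimal sketch, assuming it simply maps the first `out_len` token indices back to characters through the vocabulary:

def translate(vocabulary, tokens, length):
    # Hypothetical reconstruction: join the first `length` decoded indices
    # into a string using the model's character vocabulary.
    return ''.join(vocabulary[int(t)] for t in tokens[:int(length)])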
def test(model, test_dir, save_dir, image_size):
    model.eval()
    test_dirs = utils.listdir_nohidden(test_dir)
    for sub_folder in test_dirs:
        save_test_dir = os.path.join(save_dir, os.path.basename(sub_folder))

        # collect the per-frame audio feature files (.mat) in temporal order
        audio_feature_files = glob.glob(
            os.path.join(sub_folder, 'audio_sample/*.mat'))
        audio_feature_files = utils.sort_filename(audio_feature_files)
        image_test_file = os.path.join(sub_folder, 'image_sample.jpg')
        audio_test_file = os.path.join(sub_folder, 'audio_sample.wav')
        audio_duration = utils.get_wav_duration(audio_test_file)

        input_image = data.load_image(image_test_file, image_size)
        input_audios = [
            data.load_audio(audio_feature_file)
            for audio_feature_file in audio_feature_files
        ]
        input_images = [input_image] * len(input_audios)

        # convert to tensor
        input_images = torch.from_numpy(
            np.array(input_images).transpose(
                (0, 3, 1, 2))).cuda()  # (seq_len, c, h, w)
        input_audios = torch.from_numpy(
            np.array(input_audios).transpose((0, 3, 1, 2))).cuda()

        # unwrap DataParallel to query the wrapped model's type
        model_type = model.module.model_type() if isinstance(
            model, torch.nn.DataParallel) else model.model_type()

        with torch.no_grad():
            if model_type == 'RNN':
                input_images = input_images.unsqueeze(
                    0)  # (1, seq_len, c, h, w)
                input_audios = input_audios.unsqueeze(0)
                G_images = model(input_images,
                                 input_audios,
                                 valid_len=torch.tensor(
                                     [input_audios.shape[1]],
                                     dtype=torch.int32).cuda(),
                                 teacher_forcing_ratio=0)
                G_images = G_images.squeeze(0)
            else:
                G_images = model(input_images, input_audios)
        utils.save_video(audio_duration, audio_test_file,
                         G_images.cpu().detach().numpy(), save_test_dir)
    model.train()  # restore training mode once, after all test folders are processed
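A possible invocation, with placeholder paths and an assumed model constructor (`build_model` is hypothetical):

model = torch.nn.DataParallel(build_model()).cuda()  # build_model is a placeholder
test(model, test_dir='data/test', save_dir='results', image_size=128)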