Example #1
def predict(hparams, model_dir, checkpoint_path, output_dir, test_files):
    audio = Audio(hparams)

    def predict_input_fn():
        records = tf.data.TFRecordDataset(list(test_files))
        dataset = DatasetSource(records, hparams)
        batched = dataset.make_source_and_target().group_by_batch(
            batch_size=1).arrange_for_prediction()
        return batched.dataset

    estimator = WaveNetModel(hparams, model_dir)

    predictions = map(
        lambda p: PredictedAudio(p["id"], p["key"], p["predicted_waveform"],
                                 p["ground_truth_waveform"], p["mel"],
                                 p["text"]),
        estimator.predict(predict_input_fn, checkpoint_path=checkpoint_path))

    for v in predictions:
        key = v.key.decode('utf-8')
        audio_filename = f"{key}.wav"
        audio_filepath = os.path.join(output_dir, audio_filename)
        tf.logging.info(f"Saving {audio_filepath}")
        audio.save_wav(v.predicted_waveform, audio_filepath)
        png_filename = f"{key}.png"
        png_filepath = os.path.join(output_dir, png_filename)
        tf.logging.info(f"Saving {png_filepath}")
        # ToDo: pass global step
        plot_wav(png_filepath, v.predicted_waveform, v.ground_truth_waveform,
                 key, 0, v.text.decode('utf-8'), hparams.sample_rate)
Example #2
def main(args, hp):
    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args.checkpoint_path)['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args.embedder_path)
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        ref_wav, _ = librosa.load(args.reference_file, sr=16000)
        ref_mel = audio.get_mel(ref_wav)
        ref_mel = torch.from_numpy(ref_mel).float().cuda()
        dvec = embedder(ref_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
        mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
        mixed_mag = torch.from_numpy(mixed_mag).float().cuda()

        mixed_mag = mixed_mag.unsqueeze(0)
        shadow_mag = model(mixed_mag, dvec)

        # keep the magnitudes as tensors until the final conversion,
        # then reconstruct the waveform with the mixture phase
        recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
        recorded_mag = recorded_mag[0].cpu().detach().numpy()
        recorded_wav = audio.spec2wav(recorded_mag, mixed_phase)

        os.makedirs(args.out_dir, exist_ok=True)
        out_path = os.path.join(args.out_dir, 'result.wav')
        librosa.output.write_wav(out_path, recorded_wav, sr=16000)
Example #3
    def __init__(self, model_path, out_dir, text_file, sil_file,
                 use_griffin_lim, hparams):
        self.model_path = model_path
        self.out_dir = out_dir
        self.text_file = text_file
        self.sil_file = sil_file
        self.use_griffin_lim = use_griffin_lim
        self.hparams = hparams

        self.model = get_model(model_path, hparams)
        self.audio_class = Audio(hparams)

        if hparams.use_phone:
            from text.phones import Phones
            phone_class = Phones(hparams.phone_set_file)
            self.text_to_sequence = phone_class.text_to_sequence
        else:
            from text import text_to_sequence
            self.text_to_sequence = text_to_sequence

        # self.out_png_dir = os.path.join(self.out_dir, 'png')
        # os.makedirs(self.out_png_dir, exist_ok=True)

        self.out_wav_dir = os.path.join(self.out_dir, 'wav')
        os.makedirs(self.out_wav_dir, exist_ok=True)
Example #4
def main(args, hp):
    model = VoiceFilter(hp).cuda()
    chkpt_model = torch.load(args.checkpoint_path)['model']
    model.load_state_dict(chkpt_model)
    model.eval()

    embedder = SpeechEmbedder(hp).cuda()
    chkpt_embed = torch.load(args.embedder_path)
    embedder.load_state_dict(chkpt_embed)
    embedder.eval()

    audio = Audio(hp)
    dvec_wav, _ = librosa.load(args.reference_file, sr=16000)
    dvec_mel = audio.get_mel(dvec_wav)
    dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
    dvec = embedder(dvec_mel)
    dvec = dvec.unsqueeze(0)

    mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
    mag, phase = audio.wav2spec(mixed_wav)
    mag = torch.from_numpy(mag).float().cuda()

    mag = mag.unsqueeze(0)
    mask = model(mag, dvec)
    est_mag = mag * mask

    est_mag = est_mag[0].cpu().detach().numpy()
    est_wav = audio.spec2wav(est_mag, phase)

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, 'result.wav')
    librosa.output.write_wav(out_path, est_wav, sr=16000)
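A hedged driver sketch for the inference entry point above: the flag names and the HParam config loader follow the pattern of Example #15 but are assumptions, not the original script's CLI.

# Hypothetical command-line wrapper for main(args, hp) above.
# Flag names and the utils.hparams.HParam import are assumptions.
import argparse

from utils.hparams import HParam  # assumed config loader, as in Example #15

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='VoiceFilter inference (sketch)')
    parser.add_argument('-c', '--config', required=True, help='yaml configuration file')
    parser.add_argument('--checkpoint_path', required=True, help='VoiceFilter checkpoint (.pt)')
    parser.add_argument('-e', '--embedder_path', required=True, help='speaker embedder checkpoint (.pt)')
    parser.add_argument('-r', '--reference_file', required=True, help='reference wav of the target speaker')
    parser.add_argument('-m', '--mixed_file', required=True, help='mixed wav to separate')
    parser.add_argument('-o', '--out_dir', default='output', help='directory for result.wav')
    args = parser.parse_args()

    main(args, HParam(args.config))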
Example #5
    def __init__(self, model_path, out_dir, text_file, sil_file,
                 use_griffin_lim, gen_wavenet_fea, hparams):
        self.out_dir = out_dir
        self.text_file = text_file
        self.sil_file = sil_file
        self.use_griffin_lim = use_griffin_lim
        self.gen_wavenet_fea = gen_wavenet_fea
        self.hparams = hparams

        self.model = get_model(model_path, hparams)
        self.audio_class = Audio(hparams)

        if hparams.use_phone:
            from text.phones import Phones
            phone_class = Phones(hparams.phone_set_file)
            self.text_to_sequence = phone_class.text_to_sequence
        else:
            from text import text_to_sequence
            self.text_to_sequence = text_to_sequence

        if hparams.is_multi_speakers and not hparams.use_pretrained_spkemb:
            self.speaker_id_dict = gen_speaker_id_dict(hparams)

        self.out_png_dir = os.path.join(self.out_dir, 'png')
        os.makedirs(self.out_png_dir, exist_ok=True)
        if self.use_griffin_lim:
            self.out_wav_dir = os.path.join(self.out_dir, 'wav')
            os.makedirs(self.out_wav_dir, exist_ok=True)
        if self.gen_wavenet_fea:
            self.out_mel_dir = os.path.join(self.out_dir, 'mel')
            os.makedirs(self.out_mel_dir, exist_ok=True)
Example #6
    def __init__(self, hp, args, train):
        def find_all(file_format):
            # return sorted(glob.glob(os.path.join(self.data_dir, file_format)))
            return sorted(
                glob.glob(os.path.join(self.data_dir, '**', file_format),
                          recursive=True))
            # return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.hp = hp
        self.args = args
        self.train = train
        self.data_dir = hp.data.train_dir if train else hp.data.test_dir

        self.dvec_list = find_all(hp.form.dvec)
        self.target_wav_list = find_all(hp.form.target.wav)
        self.mixed_wav_list = find_all(hp.form.mixed.wav)
        self.target_mag_list = find_all(hp.form.target.mag)
        self.mixed_mag_list = find_all(hp.form.mixed.mag)

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio(hp)
Example #7
    def __init__(self,
                 in_dir,
                 out_dir,
                 hparams,
                 speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.speaker_info_filename = speaker_info_filename
        self.audio = Audio(hparams)
Example #8
class VFDataset(Dataset):
    def __init__(self, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.train = train
        self.data_dir = config.data['base_dir'] + (
            config.data['train_dir'] if train else config.data['test_dir'])

        self.dvec_list = find_all(config.form['dvec'])
        self.target_wav_list = find_all(config.form['target']['wav'])
        self.mixed_wav_list = find_all(config.form['mixed']['wav'])
        self.target_mag_list = find_all(config.form['target']['mag'])
        self.mixed_mag_list = find_all(config.form['mixed']['mag'])

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio()

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(config.data['base_dir'] + dvec_path,
                                   sr=config.audio['sample_rate'])
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         config.audio['sample_rate'])
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        config.audio['sample_rate'])
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(
                self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, config.audio['sample_rate'])
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
Example #9
class VFDataset(Dataset):
    def __init__(self, hp, args, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.hp = hp
        self.args = args
        self.train = train
        self.data_dir = hp.data.train_dir if train else hp.data.test_dir

        self.dvec_list = find_all(hp.form.dvec)
        self.target_wav_list = find_all(hp.form.target.wav)
        self.mixed_wav_list = find_all(hp.form.mixed.wav)
        self.target_mag_list = find_all(hp.form.target.mag)
        self.mixed_mag_list = find_all(hp.form.mixed.mag)

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(dvec_path, sr=self.hp.audio.sample_rate)
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(
                self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
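A minimal loader sketch for the VFDataset above. The batch size, worker count, and collate strategy are illustrative assumptions: the reference mels vary in length, so they are kept as a list, while the training magnitudes are assumed to share one fixed shape; hp and args are the same objects passed to the dataset constructor.

import torch
from torch.utils.data import DataLoader

def train_collate_fn(batch):
    # dvec mels have variable numbers of frames, so keep them in a list;
    # target/mixed magnitudes are assumed to share one fixed shape
    dvec_list = [item[0] for item in batch]
    target_mags = torch.stack([item[1] for item in batch])
    mixed_mags = torch.stack([item[2] for item in batch])
    return dvec_list, target_mags, mixed_mags

train_loader = DataLoader(VFDataset(hp, args, train=True), batch_size=8,
                          shuffle=True, num_workers=4,
                          collate_fn=train_collate_fn, drop_last=True)
test_loader = DataLoader(VFDataset(hp, args, train=False), batch_size=1,
                         shuffle=False, num_workers=0)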
Example #10
class SoundRecognitionApp:
    def __init__(self, cfg) -> None:
        self.transformer = Signal2ImageTransformer(**cfg['transforms'])
        self.audio = Audio(cfg['audio'])
        self.load_model(cfg['model'])

    def run(self):
        print("============= REALTIME START ==============")
        self.audio.start()
        self.flag = True

        try:
            while self.flag:
                status, data = self.audio.get()
                if status == Audio.ERROR:
                    print('[error]')
                    break
                elif status == Audio.WAIT:
                    continue
                mel_spec = self.preprocess(data)
                result = self.inference(mel_spec)
        except KeyboardInterrupt:
            pass
        except Exception as e:
            print(e)
        finally:
            self.audio.stop()
        print("============= REALTIME FINISH ==============")

    def preprocess(self, signal):
        return np.expand_dims(self.transformer.transform(signal), axis=0)

    def inference(self, X):
        image = torch.from_numpy(X.astype(np.float32)).clone()
        # Tensor.to() is not in-place; keep the returned tensor
        image = image.to(self.device).float()
        prob = self.model(image)['multilabel_proba'].detach().cpu().numpy()
        return prob

    def load_model(self, cfg):
        try:
            self.device = torch.device(cfg["device"])
            self.model = getattr(ml.my_model, cfg['name'])(**cfg['params'])
            self.model.load_state_dict(torch.load(cfg['path']))
            self.model.to(self.device)
        except AttributeError as e:
            print(f"Model {cfg['name']} is None. {e}")
            exit(1)
        except FileNotFoundError as e:
            print(f"{e}")
            exit(1)
        except Exception as e:
            print(f"{e}")
            exit(1)
Example #11
def crop_media(candidates, base_path, out_path='for_axlotl'):
    for session_id, session in candidates.items():
        result_top_path = os.path.join(out_path, session_id)
        if not os.path.isdir(result_top_path):
            os.makedirs(result_top_path)
        for text_path, contents in session.items():
            # the url element should always have a single file
            paths = create_local_paths(base_path, (text_path, '',
                                                   contents['urls'][0][1]))
            result_basename = os.path.basename(paths['audio_path']).split('.')[0]
            result_text = os.path.join(result_top_path, result_basename) + '.txt'
            # if audio file exists
            if os.path.isfile(paths['audio_path']):
                # if there is only one speaker in the intervention
                if os.path.isfile(result_text):
                    msg = 'skipping. processed text file %s exists'%result_text
                    logging.info(msg)
                else:
                    if len(contents['text']) == 1:
                        text = contents['text'][0][1]
                        full_text = ' '.join(tokenize(text)).lower()
                        clean_text = re.sub(token_clean, '', full_text)
                        audio_file = Audio(paths['audio_path'])
                        trimmer = Trimmer(clean_text, audio_file)
                        try:
                            start, end, start_word_i, end_word_i = \
                                trimmer.crop_longaudio()
                        except Exception as e:
                            print(e)
                            print((clean_text[:100], clean_text[-100:]))
                            raise ValueError()
                        print(text_path)
                        if start and end:
                            msg = '%s start and end matched for cropping' \
                                  % contents['text']
                            logging.info(msg)
                            full_words = full_text.split()
                            new_text = ' '.join(full_words[start_word_i:end_word_i])
                            print(start, end, new_text[:100], new_text[-100:])
                            with open(result_text, 'w') as out:
                                out.write(new_text)
                            result_audio = audio_file.segment(
                                start=start, end=end + 0.2,
                                outpath=result_top_path)
                            print(result_audio, result_text)
                        else:
                            msg = '%s matching the start and end failed' \
                                  % contents['text']
                            logging.warning(msg)
Example #12
    def __init__(self,
                 model: tf.keras.models.Model,
                 log_dir: str,
                 config: dict,
                 max_plot_frequency=10,
                 default_writer='log_dir'):
        self.model = model
        self.log_dir = Path(log_dir)
        self.config = config
        self.audio = Audio(config)
        self.plot_frequency = max_plot_frequency
        self.default_writer = default_writer
        self.writers = {}
        self.add_writer(tag=default_writer, path=self.log_dir, default=True)
Example #13
class LJSpeech:
    def __init__(self, in_dir, out_dir, hparams):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    def text_and_audio_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_audio_path())

    def process_data(self, rdd: RDD):
        return rdd.mapValues(self._process_source_and_target)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % parts[0])
        text = parts[2]
        return TextAndAudioPath(index, key, wav_path, text)

    def _extract_all_text_and_audio_path(self):
        index = 1
        with open(os.path.join(self.in_dir, 'metadata.csv'),
                  mode='r',
                  encoding='utf-8') as f:
            for line in f:
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)
                    index += 1

    def _process_source_and_target(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        n_samples = len(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        n_frames = mel_spectrogram.shape[0]
        filename = f"{paths.key}.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        tfrecord.write_preprocessed_data(paths.id, paths.key, wav,
                                         mel_spectrogram, paths.text, filepath)
        return SourceAndTargetMetaData(paths.id, paths.key, n_samples,
                                       n_frames, filepath)

    def _process_mel(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        sum_mel_powers = np.sum(mel_spectrogram, axis=1)
        n_frames = mel_spectrogram.shape[0]
        return MelMetaData(n_frames, sum_mel_powers)
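A short PySpark driver sketch for the LJSpeech preprocessor above; the directory names and Spark app name are placeholders, and hparams is assumed to be loaded elsewhere.

from pyspark import SparkContext

sc = SparkContext(appName='ljspeech-preprocess')
ljspeech = LJSpeech(in_dir='LJSpeech-1.1', out_dir='preprocessed', hparams=hparams)
rdd = ljspeech.text_and_audio_path_rdd(sc)
# collect() forces the map, writing one tfrecord per utterance and
# returning the (index, SourceAndTargetMetaData) pairs
metadata = ljspeech.process_data(rdd).collect()
sc.stop()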
Example #14
class LJSpeech:
    def __init__(self, in_dir, mel_out_dir, wav_out_dir, hparams):
        self.in_dir = in_dir
        self.mel_out_dir = mel_out_dir
        self.wav_out_dir = wav_out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        return map(lambda v: str(v), range(1, 13101))

    def record_file_path(self, record_id, kind):
        assert kind in ["source", "target"]
        return os.path.join(self.mel_out_dir,
                            f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_path())

    def process_wav(self, rdd: RDD):
        return rdd.mapValues(self._process_wav)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        text = parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        with open(os.path.join(self.in_dir, 'metadata.csv'),
                  mode='r',
                  encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _process_wav(self, paths: TextAndPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        mel_spectrogram = self.audio.normalize_mel(mel_spectrogram)

        mel_filepath = os.path.join(self.mel_out_dir, f"{paths.key}.mfbsp")
        wav_filepath = os.path.join(self.wav_out_dir, f"{paths.key}.wav")

        mel_spectrogram.tofile(mel_filepath, format="<f4")
        self.audio.save_wav(wav, wav_filepath)
Example #15
def main(args):
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }

    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()

        mag = mag.unsqueeze(0)
        mask = model(mag, dvec)
        est_mag = mag * mask

        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)

        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
Example #16
    def __init__(self, setting_path: str, audio_path: str):
        """Constructor.

        Args:
            setting_path (str): path to the settings file
            audio_path (str): path to the music file
        """
        with open(setting_path, 'r') as f:
            cfg = yaml.safe_load(f)

        self.cf = cf.ChangeFinder(**cfg['change_finder'])
        self.audio = Audio(cfg['audio'], audio_file_path=audio_path)

        self.buffer = np.zeros(cfg['model']['buffer_audio_length'],
                               dtype=np.float32)
        self.buf_num = int(cfg['model']['frame_buf_num'])
        self.spec_buf = []
        self.thr = float(cfg['model']['thr'])
Example #17
    def __init__(self, hp, train):
        def find_all(data_dir,file_format):
            return sorted(glob.glob(os.path.join(data_dir, file_format)))
        self.hp = hp
        self.train = train

        self.mixed_dir = hp.data.vfws_dir + 'mixed/train/' if train else hp.data.vfws_dir + 'mixed/test/'
        self.clean_dir = hp.data.vfws_dir + 'clean/train/' if train else hp.data.vfws_dir + 'clean/test/'

        self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
        self.mixed_wav_list  = find_all(self.mixed_dir, hp.form.mixed.wav)
        self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
        self.mixed_mag_list  = find_all(self.mixed_dir, hp.form.mixed.mag)

        assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

        self.audio = Audio(hp)
Example #18
def trainer(model_name):
    chkpt_path = None  #@param
    device = xm.xla_device()
    pt_dir = os.path.join('.', config.log['chkpt_dir'], model_name)
    os.makedirs(pt_dir, exist_ok=True)

    log_dir = os.path.join('.', config.log['log_dir'], model_name)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(
                                    log_dir,
                                    '%s-%d.log' % (model_name, time.time()))),
                            logging.StreamHandler()
                        ])
    logger = logging.getLogger()
    writer = MyWriter(log_dir)

    trainloader = create_dataloader(train=True)
    testloader = create_dataloader(train=False)

    embedder_pt = torch.load(
        '/drive/content/My Drive/ColabDisk/embedder_cpu.pt')
    embedder = SpeechEmbedder().to(device)
    embedder.load_state_dict(embedder_pt)
    embedder.eval()

    model = VoiceFilter().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train['adam'])
    audio = Audio()

    starting_epoch = 1

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint_file = torch.load(chkpt_path)
        model.load_state_dict(checkpoint_file['model'])
        optimizer.load_state_dict(checkpoint_file['optimizer'])
        starting_epoch = checkpoint_file['epoch']
    else:
        logger.info("Starting new training run")

    for epoch in range(starting_epoch, config.train['epoch'] + 1):
        para_loader = pl.ParallelLoader(trainloader,
                                        [device]).per_device_loader(device)
        train(embedder, model, optimizer, para_loader, writer, logger, epoch,
              pt_dir, device)
        xm.master_print("Finished training epoch {}".format(epoch))
        logger.info("Starting to validate epoch...")
        para_loader = pl.ParallelLoader(testloader,
                                        [device]).per_device_loader(device)
        validate(audio, model, embedder, para_loader, writer, epoch, device)

    model_saver(model, optimizer, pt_dir, config.train['epoch'])
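A minimal launch sketch for the TPU trainer above. The model name is a placeholder; multi-core training would normally go through torch_xla.distributed.xla_multiprocessing.spawn with an index-taking wrapper, but a plain single-process call matches the signature shown here.

if __name__ == '__main__':
    # placeholder model name; checkpoints are saved under the chkpt_dir configured in config.log
    trainer('voicefilter_tpu')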
Example #19
    def __init__(self, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))
        self.train = train
        self.data_dir = config.data['records_dir'] + (
            config.data['train_dir'] if train else config.data['test_dir'])

        self.dvec_list = find_all(config.form['dvec'])
        self.target_wav_list = find_all(config.form['target']['wav'])
        self.mixed_wav_list = find_all(config.form['mixed']['wav'])
        self.target_mag_list = find_all(config.form['target']['mag'])
        self.mixed_mag_list = find_all(config.form['mixed']['mag'])

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio()
Example #20
    def __init__(self, config: configparser.ConfigParser, debug: bool):
        super().__init__(command_prefix=determine_prefix,
                         description="NerdyBot - Always one step ahead!")

        self.config = config
        self.debug = debug
        self.client_id = config["bot"]["client_id"]
        self.token = config["bot"]["token"]
        self.ops = config["bot"]["ops"]
        self.moderator_role = config["bot"]["moderator_role_name"]
        self.modules = json.loads(config["bot"]["modules"])
        self.restart = True
        self.log = self._get_logger()
        self.uptime = datetime.utcnow()

        self.audio = Audio(self)
        self.last_cmd_cache = {}
        self.usr_cmd_err_spam = {}
        self.usr_cmd__err_spam_threshold = int(
            config["bot"]["error_spam_threshold"])
        self.convMan = ConversationManager(self)

        # database variables
        if "database" not in config:
            self.log.error(
                "No Database specified! Fallback to local SQLite Database!")
            db_connection_string = "sqlite:///db.db"
        else:
            database_config = config["database"]
            db_type = database_config["db_type"]
            db_name = database_config["db_name"]
            db_username = ""
            db_password = ""
            db_host = ""
            db_port = ""

            if any(s in db_type for s in ("mysql", "mariadb")):
                db_type = f'{database_config["db_type"]}+pymysql'
            if "db_password" in database_config and database_config["db_password"]:
                db_password = f':{database_config["db_password"]}'
            if "db_username" in database_config and database_config["db_username"]:
                db_username = database_config["db_username"]
            if "db_host" in database_config and database_config["db_host"]:
                db_host = f'@{database_config["db_host"]}'
            if "db_port" in database_config and database_config["db_port"]:
                db_port = f':{database_config["db_port"]}'

            db_authentication = f"{db_username}{db_password}{db_host}{db_port}"
            db_connection_string = f"{db_type}://{db_authentication}/{db_name}"

        self.ENGINE = create_engine(db_connection_string, echo=self.debug)
        self.SESSION = sessionmaker(bind=self.ENGINE, expire_on_commit=False)

        self.create_all()
        self._import_modules()
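For illustration, here is how the connection string assembled above resolves under an assumed [database] section; all values are placeholders, not from the source, and without a [database] section the code falls back to sqlite:///db.db.

# mirrors the f-string assembly in __init__ above, with placeholder values
db_type = 'mariadb' + '+pymysql'
db_username = 'bot'
db_password = ':secret'
db_host = '@localhost'
db_port = ':3306'
db_name = 'nerdybot'
db_authentication = f"{db_username}{db_password}{db_host}{db_port}"
db_connection_string = f"{db_type}://{db_authentication}/{db_name}"
assert db_connection_string == "mariadb+pymysql://bot:secret@localhost:3306/nerdybot"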
Example #21
def main(args, hp):
    with open('out1.txt') as f:
        for line in f:
            res = line.rstrip('\n').split('\t')
            with torch.no_grad():
                model = VoiceFilter(hp)
                chkpt_model = torch.load(args.checkpoint_path, map_location='cpu')['model']
                model.load_state_dict(chkpt_model)
                model.eval()

                embedder = SpeechEmbedder(hp)
                chkpt_embed = torch.load(args.embedder_path, map_location='cpu')
                embedder.load_state_dict(chkpt_embed)
                embedder.eval()

                audio = Audio(hp)
                dvec_wav, _ = librosa.load(res[1], sr=16000)
                dvec_mel = audio.get_mel(dvec_wav)
                dvec_mel = torch.from_numpy(dvec_mel).float()
                dvec = embedder(dvec_mel)
                dvec = dvec.unsqueeze(0)

                mixed_wav, _ = librosa.load(res[0], sr=16000)
                mag, phase = audio.wav2spec(mixed_wav)
                mag = torch.from_numpy(mag).float()

                mag = mag.unsqueeze(0)
                mask = model(mag, dvec)
                est_mag = mag * mask

                est_mag = est_mag[0].cpu().detach().numpy()
                est_wav = audio.spec2wav(est_mag, phase)

                os.makedirs('/root/voicefilter/res', exist_ok=True)
                out_path = os.path.join('/root/voicefilter/res', f'{res[2]}')
                librosa.output.write_wav(out_path, est_wav, sr=16000)
Example #22
def main(audio_filepath, text_filepath):
    text = get_text(text_filepath)
    token_clean = r'\.|,|;|:|\?|!|\.\.\.'
    tokenized_text = ' '.join(tokenize(text))
    clean_text = re.sub(token_clean,'',tokenized_text).lower()
    audio_file = Audio(audio_filepath)
    trimmer = Trimmer(clean_text, audio_file)
    start, end, start_word_index, end_word_index = trimmer.crop_longaudio()
    if start and end:
        if end_word_index is None:
            end_word_index = -1
        else:
            end_word_index += 1
        print(start, end, tokenized_text.split()[start_word_index],
                      tokenized_text.split()[end_word_index])
Example #23
def prop_media(candidates, base_path, out_path='for_axlotl'):
    axlotl_input = []
    for session_id, session in candidates.items():
        result_top_path = os.path.join(out_path, session_id)
        if not os.path.isdir(result_top_path):
            os.makedirs(result_top_path)
        for text_path, contents in session.items():
            # the url element should always have a single file
            paths = create_local_paths(base_path, (text_path, '',
                                                   contents['urls'][0][1]))
            result_basename = os.path.basename(paths['audio_path']).split('.')[0]
            yaml_name = os.path.basename(text_path).split('.')[0]
            text_filename = '-'.join([session_id, yaml_name, result_basename]) + '.txt'
            result_text = os.path.join(result_top_path, text_filename)
            # if audio file exists
            if os.path.isfile(paths['audio_path']):
                if os.path.isfile(result_text):
                    msg = 'skipping. processed text file %s exists'%result_text
                    #logging.info(msg)
                else:
                    # if there is one or two speaker in the intervention
                    if len(contents['text']) < 3:
                        text = ' '.join([text[1] for text in contents['text']])
                        audio_file = Audio(paths['audio_path'])
                        # words per minute; accept as an axlotl input only if reasonable
                        wpm = len(text.split()) / audio_file.duration * 60
                        if wpm < 95. or wpm > 195.:
                            msg = '%s wpm is not reasonable: %4.2f. skipping' \
                                  % (text_path, wpm)
                            logging.warning(msg)
                        else:
                            with open(result_text, 'w') as out:
                                out.write(text)
                if os.path.isfile(result_text):
                    #logging.info('text, audio: %s,%s'%(result_text,
                    #                                   paths['audio_path']))
                    axlotl_input.append((result_text,paths['audio_path']))
        with open('axlotl_input.csv', 'w') as out:
            for text, audio in axlotl_input:
                out.write('%s,%s\n'%(os.path.abspath(text),
                                     os.path.abspath(audio)))
Example #24
class VFWSDataset(Dataset):
    def __init__(self, hp, train):
        def find_all(data_dir,file_format):
            return sorted(glob.glob(os.path.join(data_dir, file_format)))
        self.hp = hp
        self.train = train

        self.mixed_dir = hp.data.vfws_dir + 'mixed/train/' if train else hp.data.vfws_dir + 'mixed/test/'
        self.clean_dir = hp.data.vfws_dir + 'clean/train/' if train else hp.data.vfws_dir + 'clean/test/'

        self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
        self.mixed_wav_list  = find_all(self.mixed_dir, hp.form.mixed.wav)
        self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
        self.mixed_mag_list  = find_all(self.mixed_dir, hp.form.mixed.mag)

        assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.target_mag_list)

    def __getitem__(self, idx):
        if self.train :  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx], self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx], self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
Example #25
        # the train split is assumed to live under a 'train' subdirectory, mirroring 'test' below
        train_folders = [x for x in glob.glob(os.path.join(args.current_corpus_dir, 'train'))
                            if os.path.isdir(x)]
        test_folders = [x for x in glob.glob(os.path.join(args.current_corpus_dir, 'test'))
                            if os.path.isdir(x)]

    #train_spk = all files in train_folders
    train_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
                    for spk in train_folders]
    train_spk = [x for x in train_spk if len(x) >= 2]

    #test_spk = all files in test_folders
    test_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
                    for spk in test_folders]
    test_spk = [x for x in test_spk if len(x) >= 2]

    audio = Audio(hp)

    def train_wrapper(num):
        '''Randomly chose 2 speakers from training set and mix them'''
        spk1, spk2 = random.sample(train_spk, 2)
        s1_dvec, s1_target = random.sample(spk1, 2)
        s2 = random.choice(spk2)
        mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=True)

    def test_wrapper(num):
        '''Randomly chose 2 speakers from testing set and mix them'''
        spk1, spk2 = random.sample(test_spk, 2)
        s1_dvec, s1_target = random.sample(spk1, 2)
        s2 = random.choice(spk2)
        mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=False)
Example #26
            ref_mel, eliminated_wav, mixed_wav, expected_hidden_wav, eliminated_mag, expected_hidden_mag, mixed_mag, mixed_phase, dvec_path, eliminated_wav_path, mixed_wav_path = \
                batch[0]
            # print("expected_focused: {}".format(expected_focused_wav_path))
            print("Mixed: {}".format(mixed_wav_path))
            model = VoiceFilter(hp).cuda()
            chkpt_model = torch.load(args.checkpoint_path,
                                     map_location='cuda:0')['model']
            model.load_state_dict(chkpt_model)
            model.eval()

            embedder = SpeechEmbedder(hp).cuda()
            chkpt_embed = torch.load(args.embedder_path)
            embedder.load_state_dict(chkpt_embed)
            embedder.eval()

            audio = Audio(hp)
            dvec_wav, _ = librosa.load(dvec_path, sr=16000)
            ref_mel = audio.get_mel(dvec_wav)
            ref_mel = torch.from_numpy(ref_mel).float().cuda()
            dvec = embedder(ref_mel)
            dvec = dvec.unsqueeze(0)  # (1, 256)

            mixed_wav, _ = librosa.load(mixed_wav_path, sr=16000)
            mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
            mixed_mag = torch.from_numpy(mixed_mag).float().cuda()

            mixed_mag = mixed_mag.unsqueeze(0)

            shadow_mag = model(mixed_mag, dvec)
            # shadow_mag.size() = [1, 301, 601]
Example #27
class VCTK:
    def __init__(self,
                 in_dir,
                 out_dir,
                 hparams,
                 speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.speaker_info_filename = speaker_info_filename
        self.audio = Audio(hparams)

    def list_files(self):
        def wav_files(speaker_info: SpeakerInfo):
            wav_dir = os.path.join(self.in_dir, f"wav48/p{speaker_info.id}")
            return [
                os.path.join(wav_dir, wav_file)
                for wav_file in sorted(os.listdir(wav_dir))
                if wav_file.endswith('.wav')
            ]

        def text_files(speaker_info: SpeakerInfo):
            txt_dir = os.path.join(self.in_dir, f"txt/p{speaker_info.id}")
            return [
                os.path.join(txt_dir, txt_file)
                for txt_file in sorted(os.listdir(txt_dir))
                if txt_file.endswith('.txt')
            ]

        def text_and_wav_records(file_pairs, speaker_info):
            def create_record(txt_f, wav_f, speaker_info):
                # strip() removes a set of characters, not a suffix; splitext is safer
                key1 = os.path.splitext(os.path.basename(wav_f))[0]
                key2 = os.path.splitext(os.path.basename(txt_f))[0]
                assert key1 == key2
                return TxtWavRecord(0, key1, txt_f, wav_f, speaker_info)

            return [
                create_record(txt_f, wav_f, speaker_info)
                for txt_f, wav_f in file_pairs
            ]

        records = sum([
            text_and_wav_records(zip(text_files(si), wav_files(si)), si)
            for si in self._load_speaker_info()
        ], [])
        return [
            TxtWavRecord(i, r.key, r.txt_path, r.wav_path, r.speaker_info)
            for i, r in enumerate(records)
        ]

    def process_sources(self, rdd: RDD):
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: RDD):
        return TargetRDD(
            rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        with open(os.path.join(self.in_dir, self.speaker_info_filename),
                  mode='r',
                  encoding='utf8') as f:
            for l in f.readlines()[1:]:
                si = l.split()
                gender = 0 if si[2] == 'F' else 1
                if str(si[0]) != "315":  # FixMe: Why 315 is missing?
                    yield SpeakerInfo(int(si[0]), int(si[1]), gender)

    def _process_wav(self, record: TxtWavRecord):
        wav = self.audio.load_wav(record.wav_path)
        wav = self.audio.trim(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram,
                                       file_path)
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram),
                                            axis=0))

    def _process_txt(self, record: TxtWavRecord):
        with open(os.path.join(self.in_dir, record.txt_path),
                  mode='r',
                  encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
            sequence, clean_text = text_to_sequence(txt, basic_cleaners)
            source = np.array(sequence, dtype=np.int64)
            file_path = os.path.join(self.out_dir,
                                     f"{record.key}.source.tfrecord")
            write_preprocessed_source_data(record.id, record.key, source,
                                           clean_text, record.speaker_info.id,
                                           record.speaker_info.age,
                                           record.speaker_info.gender,
                                           file_path)
            return record.key
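A short PySpark driver sketch for the VCTK preprocessor above; paths are placeholders, hparams is assumed to be loaded elsewhere, and since the TargetRDD wrapper's interface is not shown above, only the source pass is collected here.

from pyspark import SparkContext

sc = SparkContext(appName='vctk-preprocess')
vctk = VCTK(in_dir='VCTK-Corpus', out_dir='preprocessed', hparams=hparams)
records = vctk.list_files()
# writes <key>.source.tfrecord files and returns the record keys
keys = vctk.process_sources(sc.parallelize(records)).collect()
# returns a TargetRDD wrapping the lazy per-record mel extraction
targets = vctk.process_targets(sc.parallelize(records))
sc.stop()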
Example #28
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch Voice Filter')
    parser.add_argument('-b',
                        '--base_dir',
                        type=str,
                        default='.',
                        help="Root directory of run.")
    parser.add_argument('--checkpoint_path',
                        type=str,
                        default=None,
                        help='Path to last checkpoint')
    parser.add_argument('-e',
                        '--embedder_path',
                        type=str,
                        required=True,
                        help="path of embedder model pt file")
    parser.add_argument(
        '-m',
        '--model',
        type=str,
        required=True,
        help="Name of the model. Used for both logging and saving checkpoints."
    )
    args = parser.parse_args()

    chkpt_path = args.checkpoint_path

    pt_dir = os.path.join(args.base_dir, config.log['chkpt_dir'], args.model)
    os.makedirs(pt_dir, exist_ok=True)

    log_dir = os.path.join(args.base_dir, config.log['log_dir'], args.model)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(
                                    log_dir,
                                    '%s-%d.log' % (args.model, time.time()))),
                            logging.StreamHandler()
                        ])
    logger = logging.getLogger()
    writer = MyWriter(log_dir)

    trainloader = create_dataloader(train=True)
    testloader = create_dataloader(train=False)

    embedder_pt = torch.load(args.embedder_path)
    embedder = SpeechEmbedder().cuda()
    embedder.load_state_dict(embedder_pt)
    embedder.eval()

    model = nn.DataParallel(VoiceFilter())
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train['adam'])
    audio = Audio()

    starting_step = 0
    starting_epoch = 1

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint_file = torch.load(chkpt_path)
        model.load_state_dict(checkpoint_file['model'])
        starting_epoch = checkpoint_file['epoch']
        starting_step = checkpoint_file['step']
    else:
        logger.info("Starting new training run")

    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    for epoch in range(starting_epoch, config.train['epoch'] + 1):
        train(embedder, model, optimizer, trainloader, writer, logger, epoch,
              pt_dir, starting_step)
        validate(audio, model, embedder, testloader, writer, epoch)
        scheduler.step()
        starting_step = 0

    model_saver(model, pt_dir, config.train['epoch'],
                config.train['train_step_pre_epoch'])
Example #29
    def __init__(self, in_dir, out_dir, hparams):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)
Example #30
        phonemes.extend(batch)
    audio_data = np.concatenate([np.array(audio_data), np.expand_dims(phonemes, axis=1)], axis=1)
    if args.CACHE_PHON:
        np.save(phon_path, audio_data, allow_pickle=True)

print('\nBuilding dataset and writing files')
np.random.seed(42)
np.random.shuffle(audio_data)
test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')

test_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
              audio_data[:config['n_test']]]
train_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
               audio_data[config['n_test']:-1]]

with open(test_metafile, 'w+', encoding='utf-8') as test_f:
    test_f.writelines(test_lines)
with open(train_metafile, 'w+', encoding='utf-8') as train_f:
    train_f.writelines(train_lines)

audio = Audio(config)
for i in tqdm.tqdm(range(len(audio_data))):
    filename, _, _ = audio_data[i]
    wav_path = os.path.join(args.WAV_DIR, filename.replace('"', '') + '.wav')
    y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
    mel = audio.mel_spectrogram(y)
    mel_path = os.path.join(mel_dir, filename)
    np.save(mel_path, mel.T)
print('\nDone')