Example #1
# Assumed imports for this snippet; AudioProcessor, text_to_sequence, load_wav,
# pad_data and prepare_tensor are project-specific helpers.
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


class TextSpeechDataset(Dataset):
    def __init__(self, root_dir, annotations_file, parameters):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(os.path.join(root_dir, annotations_file), sep="|", header=None)
        self.cleaners = parameters["text_cleaner"]
        self.outputs_per_step = parameters["outputs_per_step"]
        self.sample_rate = parameters["sample_rate"]

        self.ap = AudioProcessor(sample_rate=self.sample_rate, preemphasis=parameters["preemphasis"],
                                 frequency=parameters["frequency"], frame_length=parameters["frame_length"],
                                 frame_shift=parameters["frame_shift"], min_dbs=parameters["min_dbs"],
                                 ref_dbs=parameters["ref_dbs"],
                                 mels_size=parameters["mels_size"],
                                 griff_lim_iters=parameters["griff_lim_iters"],
                                 power=parameters["spectro_power"]
                                 )

    def __len__(self):
        return self.annotations.shape[0]

    def __getitem__(self, index):
        text = self.annotations.iloc[index][1]
        text = np.asarray(text_to_sequence(text=text, cleaner_names=[self.cleaners]), dtype=np.int32)

        wav_path = os.path.join(self.root_dir, "wavs", f"{self.annotations.iloc[index][0]}.wav")
        wav = load_wav(filename=wav_path, sample_rate=self.sample_rate)
        return {"text": text, "wav": wav}

    def collate_fn(self, batch):
        wavs = [sample["wav"] for sample in batch]
        texts = [sample["text"] for sample in batch]

        # pad every sample in the batch to the longest text / waveform
        max_text_len = max(len(txt) for txt in texts)
        texts = np.stack([pad_data(txt, max_text_len) for txt in texts]).astype(np.int32)
        max_wav_len = max(len(waveform) for waveform in wavs)
        # keep waveforms as float32; the original int32 cast would zero out normalized audio
        wavs = np.stack([pad_data(x=waveform, length=max_wav_len) for waveform in wavs]).astype(np.float32)

        linears = [self.ap.wav_to_linear_spectrogram(waveform=waveform).astype(np.float32) for waveform in wavs]
        mels = [self.ap.wav_to_mel_spectrogram(waveform=waveform).astype(np.float32) for waveform in wavs]
        mel_lengths = [mel.shape[1] + 1 for mel in mels]  # +1, presumably for an appended zero frame

        # pad the spectrograms to a multiple of outputs_per_step, then move time to axis 1
        linears = prepare_tensor(inputs=linears, out_steps=self.outputs_per_step)
        mels = prepare_tensor(inputs=mels, out_steps=self.outputs_per_step)
        linears = linears.transpose(0, 2, 1)
        mels = mels.transpose(0, 2, 1)

        texts = torch.LongTensor(texts)
        linears = torch.FloatTensor(linears)
        mels = torch.FloatTensor(mels)
        mel_lengths = torch.LongTensor(mel_lengths)

        return texts, linears, mels, mel_lengths
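
A minimal usage sketch, assuming an LJSpeech-style layout (root_dir/wavs/*.wav plus a pipe-separated metadata file) and a parameters dict with the keys read in __init__; all values below are illustrative, not from the original:

# Hypothetical wiring; key values are illustrative.
params = {"text_cleaner": "english_cleaners", "outputs_per_step": 5,
          "sample_rate": 22050, "preemphasis": 0.97, "frequency": 1025,
          "frame_length": 50, "frame_shift": 12.5, "min_dbs": -100,
          "ref_dbs": 20, "mels_size": 80, "griff_lim_iters": 60,
          "spectro_power": 1.5}

dataset = TextSpeechDataset(root_dir="data/LJSpeech-1.1",
                            annotations_file="metadata.csv",
                            parameters=params)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
                                     collate_fn=dataset.collate_fn)
texts, linears, mels, mel_lengths = next(iter(loader))  # one padded batch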
Example #3
def run_train(c, args, model_params):
    # train_dataloader, eval_dataloader, TensorboardWriter, save_config_file,
    # copy_config_dict and train are project-specific helpers.
    c = copy_config_dict(c)

    ap = AudioProcessor(**c.audio)

    if args.seed is None:
        log_path = os.path.join(c.train_config['logs_path'], c.model_name)
    else:
        log_path = os.path.join(c.train_config['logs_path'], str(args.seed), c.model_name)
        c.train_config['seed'] = args.seed

    os.makedirs(log_path, exist_ok=True)

    tensorboard = TensorboardWriter(os.path.join(log_path, 'tensorboard'))

    trainloader = train_dataloader(copy_config_dict(c), ap,
                                   class_balancer_batch=c.dataset['class_balancer_batch'])
    max_seq_len = trainloader.dataset.get_max_seq_lenght()
    c.dataset['max_seq_len'] = max_seq_len
    model_params['config'] = copy_config_dict(c)
    # save the config in the train dir; it is needed for testing before training and for reproducibility
    save_config_file(c, os.path.join(log_path, 'config.json'))

    # in eval, 'one_window' uses overlapping windows
    if c.dataset['temporal_control'] == 'one_window':
        c.dataset['temporal_control'] = 'overlapping'

    evaloader = eval_dataloader(c, ap, max_seq_len=max_seq_len)

    return train(args, log_path, args.checkpoint_path, trainloader, evaloader,
                 tensorboard, c, c.model_name, ap, cuda=True, model_params=model_params)
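
A hedged sketch of a driver for run_train; the flag names follow the attributes the function reads (args.seed, args.checkpoint_path), and load_config is assumed to exist in this project:

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config_path', type=str, required=True,
                        help="json file with configurations")
    parser.add_argument('--checkpoint_path', type=str, default=None)
    parser.add_argument('--seed', type=int, default=None)
    args = parser.parse_args()

    c = load_config(args.config_path)
    run_train(c, args, model_params={})  # run_train fills model_params['config']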
Example #4
def run_train(c, args, model_params=None):

    ap = AudioProcessor(**c.audio)

    log_path = os.path.join(c.train_config['logs_path'], c.model_name)

    os.makedirs(log_path, exist_ok=True)

    tensorboard = TensorboardWriter(os.path.join(log_path, 'tensorboard'))
    print(c.dataset['train_csv'], c.dataset['eval_csv'])

    trainloader = train_dataloader(
        c, ap, class_balancer_batch=c.dataset['class_balancer_batch'])
    max_seq_len = trainloader.dataset.get_max_seq_lenght()
    c.dataset['max_seq_len'] = max_seq_len

    print(c.dataset['train_csv'], c.dataset['eval_csv'])

    # save the config in the train dir; it is needed for testing before training and for reproducibility
    save_config_file(c, os.path.join(log_path, 'config.json'))

    evaloader = eval_dataloader(c, ap, max_seq_len=max_seq_len)

    return train(args,
                 log_path,
                 args.checkpoint_path,
                 trainloader,
                 evaloader,
                 tensorboard,
                 c,
                 c.model_name,
                 ap,
                 cuda=True,
                 model_params=model_params)
Example #5
        type=str,
        required=False,
        default=False,
        help="Librispeech format; if true, load with Librispeech format")
    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    if args.train_data_csv:
        os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
    if args.test_data_csv:
        os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)

    cpu_num = cpu_count()  # num threads = num cpu cores

    config = load_config(args.config)
    ap = AudioProcessor(config.audio)

    sample_rate = config.audio[config.audio['backend']]['sample_rate']
    audio_len = config.audio['audio_len']
    form = config.dataset['format']
    output_dir_train = os.path.join(args.out_dir, 'train')
    output_dir_test = os.path.join(args.out_dir, 'test')

    dataset_root_dir = args.dataset_root_dir
    train_data_csv = None
    test_data_csv = None

    with open(args.noise_csv) as f:
        noise_files = f.readlines()

    if args.train_data_csv:
        train_data_csv = pd.read_csv(args.train_data_csv, sep=',').values
Example #6
    outpath = os.path.join(
        output_dir,
        os.path.basename(mixed_path).replace(".wav", "_predict.wav"))
    soundfile.write(outpath, est_wav, 16000)
    return est_wav, target_wav, mixed_wav, emb_wav


print("... Load GE2E encoder model ...")
encoder.load_model(Path('encoder/saved_models/pretrained_en.pt'))

checkpoint_path = 'models/demo5.pt'
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model_c = load_config_from_str(checkpoint['config_str'])

ap = AudioProcessor(model_c.audio)  # create AudioProcessor for model
model_name = model_c.model_name
cuda = False
if model_name == 'voicefilter':
    print('initialized with voicefilter')
    model = VoiceFilter(model_c)
elif model_name == 'voicesplit':
    model = VoiceSplit(model_c)
else:
    raise Exception(" The model '" + model_name + "' is not suported")
if model_c.train_config['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_c.train_config['learning_rate'])
else:
    raise Exception("The %s optimizer is not supported" %
                    model_c.train_config['optimizer'])
                        help="Root directory of run.")
    parser.add_argument('-c', '--config_path', type=str, required=False, default=None,
                        help="json file with configurations")
    parser.add_argument('--checkpoints_path', type=str, required=True,
                        help="path of checkpoint pt file, for continue training")
    args = parser.parse_args()

    all_checkpoints = sorted(glob(os.path.join(args.checkpoints_path, '*.pt')))
    if args.config_path:
        c = load_config(args.config_path)
    else:  # load config from the checkpoint
        checkpoint = torch.load(all_checkpoints[0], map_location='cpu')
        c = load_config_from_str(checkpoint['config_str'])

    ap = AudioProcessor(c.audio)

    log_path = os.path.join(c.train_config['logs_path'], c.model_name)
    audio_config = c.audio[c.audio['backend']]
    tensorboard = TensorboardWriter(log_path, audio_config)
    # set test dataset dir
    c.dataset['test_dir'] = args.dataset_dir
    # set the batch size for test
    c.test_config['batch_size'] = 5
    testdataloader = test_dataloader(c, ap)  # renamed to avoid shadowing the test_dataloader function
    best_loss = float('inf')
    best_loss_checkpoint = ''
    sdrs_checkpoint = []
    for i in tqdm.tqdm(range(len(all_checkpoints))):
        checkpoint = all_checkpoints[i]
        mean_loss = test(args, log_path, checkpoint, testdataloader, tensorboard, c, c.model_name, ap, cuda=True)
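
The snippet is cut off inside the loop; a hypothetical continuation (not the original code) showing how the variables declared above would typically be updated to keep the best checkpoint:

        sdrs_checkpoint.append(mean_loss)
        if mean_loss < best_loss:  # keep the checkpoint with the lowest mean loss
            best_loss = mean_loss
            best_loss_checkpoint = checkpoint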
Example #8
                        help="path of checkpoint pt file, for continue training")
    parser.add_argument('--batch_size', type=int, default=20,
                        help="Batch size for test")
    parser.add_argument('--num_workers', type=int, default=10,
                        help="Number of Workers for test data load")
    parser.add_argument('--no_insert_noise', action='store_true', default=False,
                        help="Do not insert noise in test")  # argparse type=bool would treat any string as True
    parser.add_argument('--num_noise_control', type=int, default=1,
                        help="Number of noise samples to insert for control")
    parser.add_argument('--num_noise_patient', type=int, default=0,
                        help="Number of noise samples to insert for patient")
                        
    args = parser.parse_args()

    c = load_config(args.config_path)
    ap = AudioProcessor(**c.audio)
    
    c.data_aumentation['insert_noise'] = not args.no_insert_noise

    # set values for noise insertion in test
    c.data_aumentation["num_noise_control"] = args.num_noise_control
    c.data_aumentation["num_noise_patient"] = args.num_noise_patient

    print("Insert noise ?", c.data_aumentation['insert_noise'])

    c.dataset['test_csv'] = args.test_csv
    c.dataset['test_data_root_path'] = args.test_root_dir
Example #9
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--wavfile_path", required=True)
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    args = parser.parse_args()

    config = load_config(args.config)

    filepath = args.wavfile_path

    # extract spectrogram
    config.audio['feature'] = 'spectrogram'

    ap = AudioProcessor(**config.audio)
    spectrogram = ap.get_feature_from_audio_path(filepath)
    print("Spectrogram with shape:", spectrogram.shape)

    # extract melspectrogram
    config.audio['feature'] = 'melspectrogram'
    ap = AudioProcessor(**config.audio)
    melspectrogram = ap.get_feature_from_audio_path(filepath)
    print("MelSpectrogram with shape:", melspectrogram.shape)

    # extract mfcc
    config.audio['feature'] = 'mfcc'
    ap = AudioProcessor(**config.audio)
    mfcc = ap.get_feature_from_audio_path(filepath)
    print("MFCC with shape:", mfcc.shape)
Example #10
def run_test_all_seeds(args, cuda=True, debug=False, return_potential=False):
    runs_list = os.listdir(args.experiment_dir)
    runs_list.sort()
    num_runs = len(runs_list)

    votes = []
    wav_files = []
    targets = []
    # define loss function
    criterion = nn.BCELoss(reduction='sum')
    for run in runs_list:
        blockPrint()
        run_dir = os.path.join(args.experiment_dir, run)
        if os.path.isfile(run_dir):
            continue
        model_name = os.listdir(run_dir)[0]
        checkpoint_path = os.path.join(run_dir, model_name,
                                       'best_checkpoint.pt')
        config_path = os.path.join(run_dir, model_name, 'config.json')

        c = load_config(config_path)
        ap = AudioProcessor(**c.audio)

        c.dataset['test_csv'] = args.test_csv
        c.dataset['test_data_root_path'] = args.test_root_dir

        c.test_config['batch_size'] = args.batch_size
        c.test_config['num_workers'] = args.num_workers
        max_seq_len = c.dataset['max_seq_len']

        c.train_config['seed'] = 0

        testdataloader = test_dataloader(c, ap, max_seq_len=max_seq_len)

        # load model
        model = return_model(c)
        enablePrint()
        if checkpoint_path is not None:
            print("Loading checkpoint: %s" % checkpoint_path)
            try:
                checkpoint = torch.load(checkpoint_path, map_location='cpu')
                model.load_state_dict(checkpoint['model'])
                print("Model loaded successfully!")
            except Exception as e:
                raise ValueError(
                    "You need to pass a valid checkpoint; check your config.json, because loading this checkpoint caused the error: "
                    + str(e))
        blockPrint()
        # move the model to GPU
        if cuda:
            model = model.cuda()

        model.train(False)

        vote, targets, wav_path = test(criterion,
                                       ap,
                                       model,
                                       c,
                                       testdataloader,
                                       cuda=cuda,
                                       confusion_matrix=True,
                                       debug=debug,
                                       simples_vote=args.simples_vote)
        wav_files.append(wav_path)
        votes.append(vote)
        if len(wav_files):
            if wav_files[-1] != wav_files[0]:
                raise ValueError(
                    "Different files or order for the test in different seeds or folds"
                )

    # mean vote; rounding is necessary when a composite vote is used
    preds = np.mean(np.array(votes), axis=0)
    if not return_potential:
        preds = preds.round()

    file_names = wav_files[0]

    if debug and not return_potential:
        enablePrint()
        targets = np.array(targets)
        preds = np.array(preds)
        names = np.array(file_names)
        idxs = np.nonzero(targets == c.dataset['control_class'])
        control_target = targets[idxs]
        control_preds = preds[idxs]
        names_control = names[idxs]

        idxs = np.nonzero(targets == c.dataset['patient_class'])

        patient_target = targets[idxs]
        patient_preds = preds[idxs]
        names_patient = names[idxs]

        if debug:
            print('+' * 40)
            print("Control Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                control_preds != c.dataset['control_class'])
            inc_names = names_control[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('+' * 40)
            print('-' * 40)
            print("Patient Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                patient_preds != c.dataset['patient_class'])
            inc_names = names_patient[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('-' * 40)

        acc_control = (control_preds == control_target).mean()
        acc_patient = (patient_preds == patient_target).mean()
        acc_balanced = (acc_control + acc_patient) / 2

        f1 = f1_score(targets.tolist(), preds.tolist())
        uar = recall_score(targets.tolist(), preds.tolist(), average='macro')
        print("======== Confusion Matrix ==========")
        y_target = pd.Series(targets, name='Target')
        y_pred = pd.Series(preds, name='Predicted')
        df_confusion = pd.crosstab(y_target,
                                   y_pred,
                                   rownames=['Target'],
                                   colnames=['Predicted'],
                                   margins=True)
        print(df_confusion)

        print("Test\n ", "Acurracy Control: ", acc_control,
              "Acurracy Patient: ", acc_patient, "Acurracy Balanced",
              acc_balanced)
        print("F1:", f1, "UAR:", uar)

    if return_potential:
        return preds, file_names
    else:
        df = pd.DataFrame({
            'filename': file_names,
            'prediction': preds.astype(int)
        })
        df['prediction'] = df['prediction'].replace(
            int(c.dataset['control_class']), 'negative',
            regex=True).replace(int(c.dataset['patient_class']),
                                'positive',
                                regex=True)
        if args.output_csv:
            out_csv_path = args.output_csv
        else:
            out_csv_path = os.path.join(
                args.experiment_dir, os.path.basename(c.dataset['test_csv']))

        df.to_csv(out_csv_path, index=False)
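
With return_potential=True the rounding and CSV export are skipped and the soft ensemble scores are returned to the caller; a sketch of thresholding them manually (the 0.5 cutoff is illustrative, not from the original):

preds, file_names = run_test_all_seeds(args, cuda=True, debug=False,
                                       return_potential=True)
labels = (np.array(preds) >= 0.5).astype(int)  # hard labels from soft votes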
Example #11
    "audio_len": 3,
    "voicefilter": {
        "n_fft": 1200,
        "num_mels": 40,
        "num_freq": 601,
        "sample_rate": 16000,
        "hop_length": 160,
        "win_length": 400,
        "min_level_db": -100.0,
        "ref_level_db": 20.0,
        "preemphasis": 0.97,
        "power": 1.5,
        "griffin_lim_iters": 60
    }
}
ap = AudioProcessor(config)


os.listdir(TEST_DATA)


# Preprocess dataset
train_files = sorted(glob(os.path.join(TRAIN_DATA, glob_re_wav_emb)))
test_files = sorted(glob(os.path.join(TEST_DATA, glob_re_wav_emb)))

if len(train_files) == 0 or len(test_files) == 0:
    print("check train and test paths; files not found in directory")
files = train_files + test_files
Example #12
]
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        required=True,
                        help='JSON file for configuration')
    args = parser.parse_args()

    config = load_config(args.config)
    # extract spectrogram
    config.audio['feature'] = 'spectrogram'

    ap = AudioProcessor(**config.audio)
    max_amp = 0
    min_amp = float('inf')
    for samples_file in samples:
        for sample in samples_file:
            loc_max_amp = 0
            wav = ap.load_wav(sample[0])

            inicio = sample[1]  # start time in seconds
            fim = sample[2]  # end time in seconds
            slice_start = int(inicio * ap.sample_rate)
            slice_end = int(fim * ap.sample_rate)
            wav = wav[:, slice_start:slice_end]
            wav_max_amp = wav.max().numpy()
            if wav_max_amp > loc_max_amp:
                loc_max_amp = wav_max_amp
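
The snippet is cut off mid-loop; a hypothetical continuation (not the original code) that would fold each sample's local peak into the global statistics:

            if loc_max_amp > max_amp:
                max_amp = loc_max_amp
            if 0 < loc_max_amp < min_amp:
                min_amp = loc_max_amp
    print("max amplitude:", max_amp, "min amplitude:", min_amp)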