def parse_audio(audio_path: str, audio_extension: str = 'pcm') -> Tensor:
    """Load an audio file and return its dB-scaled mel spectrogram.

    Args:
        audio_path: Path of the audio file, handed to ``load_audio``.
        audio_extension: Forwarded to ``load_audio`` as ``extension``.

    Returns:
        A ``torch.FloatTensor`` built from the dB-scaled mel spectrogram
        (80 mel bands, ``n_fft=320``, ``hop_length=160``, ``sr=16000``).
    """
    sound = load_audio(audio_path, extension=audio_extension)
    # Pass the signal by keyword: recent librosa releases made the audio
    # argument keyword-only, and ``y=`` is accepted by older releases too.
    melspectrogram = librosa.feature.melspectrogram(
        y=sound, sr=16000, n_mels=80, n_fft=320, hop_length=160)
    # NOTE(review): melspectrogram presumably returns a power spectrogram by
    # default, for which power_to_db would be the matching conversion; kept
    # as amplitude_to_db so existing feature scaling is unchanged - confirm.
    log_melspectrogram = librosa.amplitude_to_db(melspectrogram)
    return torch.FloatTensor(log_melspectrogram)
def __init__(self, *args, **kwargs):
    """
    SpectrogramDataset that groups utterances into buckets by their length.

    Bucket boundaries come from numpy's histogram (``bins="auto"``); every
    sample index is then stored under the bin its length falls into.
    Used by BucketingSampler to sample utterances from the same bin.
    """
    super(SpectrogramDatasetWithLength, self).__init__(*args, **kwargs)
    paths = [path for (path, _) in self.ids]
    lengths = [len(load_audio(path)) for path in paths]
    # Only the bin edges are needed; the per-bin counts are discarded.
    _, edges = np.histogram(lengths, bins="auto")
    bin_ids = np.digitize(lengths, bins=edges)
    buckets = defaultdict(list)
    for sample_idx, bin_id in enumerate(bin_ids):
        buckets[bin_id].append(sample_idx)
    self.bins_to_samples = buckets
"""
This file injects noise into the training data to increase robustness.
"""
import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Where to save the mixed audio')
# type=int so a value given on the command line is not passed on as a string.
parser.add_argument('--sample-rate', type=int, default=16000, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

# Command-line tool: mix a noise file into an input recording and save it.
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Where to save the mixed audio')
# type=int so a value given on the command line is not passed on as a string.
parser.add_argument('--sample-rate', type=int, default=16000, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
# Names of the per-score error counters, in the order they are reported.
_ERROR_COUNTER_NAMES = (
    # Per file:
    'clean_errors',
    'split_errors',
    # Per chunk:
    'tiefix_errors',
    'hum2mid_errors',
    'audio_errors',
    'krnseq_errors',
    'double_symbol_errors',
    'encoding_errors',
    'length_errors',
)


def _synthesize_audio(chunk_path, audio_path, tempo_scale, args, logger):
    """Render a kern chunk to audio; return False when hum2mid fails."""
    mid_path = chunk_path.with_suffix('.mid')
    # Tempo and instrument extracted from *MM and *I indications
    status = os.system(
        f'hum2mid {str(chunk_path)} -C -v 100 -t {tempo_scale} -o {str(mid_path)} >/dev/null 2>&1'  # noqa E501
    )
    if os.WEXITSTATUS(status) != 0:
        logger.error(f"hum2mid error={status} on {chunk_path}")
        return False
    # The exit status of this pipeline is not inspected; a failed synthesis
    # surfaces later when the caller tries to load the audio file.
    os.system(
        f'fluidsynth --sample-rate={args.sample_rate} -O s16 -T raw -i -l -F - {args.soundfont} {str(mid_path)} | '  # noqa E501
        f'ffmpeg -y -f s16le -ar {args.sample_rate} -ac 2 -i pipe: '  # noqa E501
        f'-ar {args.sample_rate} -ac 1 -ab {args.bit_rate} -strict -2 {str(audio_path)} 2>/dev/null'  # noqa E501
    )
    return True


def _process_score(score_path, samples, args, labels, logger):
    """Clean, split and synthesize one score; return its error counters."""
    errors = dict.fromkeys(_ERROR_COUNTER_NAMES, 0)

    # Remove grace notes, ornaments, etc...
    kern = Kern(Path(args.data_dir) / score_path,
                remove_splits=args.remove_splits)
    kern.spines.override_instruments(args.instruments)
    try:
        if not kern.clean():
            logger.error(f'Cannot clean kern {score_path}')
            errors['clean_errors'] += 1
            return errors
    except Exception as e:
        logger.exception(
            f"Exception while cleaning {score_path} audio. Reason: {e}")
        errors['clean_errors'] += 1
        return errors

    root_path = Path(args.out_dir) / score_path.parent
    root_path.mkdir(parents=True, exist_ok=True)
    krn_path = Path(args.out_dir) / score_path
    krn_path_clean = krn_path.with_suffix('.clean.krn')
    kern.save(krn_path_clean)

    # Set seed to ensure same chunk sizes and tempo scaling
    np.random.seed(bytearray(score_path.name, 'utf-8'))
    try:
        kern_chunks = kern.split(args.chunk_sizes, args.train_stride)
    except Exception as e:
        logger.exception(f'Exception {e} while splitting {score_path}')
        errors['split_errors'] += 1
        return errors

    # random scale between +ts and -ts
    ts = 1 + args.tempo_scaling * (2 * np.random.rand(len(kern_chunks)) - 1)

    for i, chunk in enumerate(kern_chunks):
        chunk_path = krn_path.with_suffix(f'.{i:03d}.krn')
        chunk.save(chunk_path)

        # Fix ties with tiefix command
        process = subprocess.run(['tiefix', chunk_path],
                                 capture_output=True,
                                 encoding='iso-8859-1')
        if process.returncode != 0:
            logger.error(
                f"tiefix error={process.returncode} on {chunk_path}")
            logger.error(process.stdout)
            errors['tiefix_errors'] += 1
            continue
        chunk = Kern(data=process.stdout, remove_splits=args.remove_splits)
        chunk.save(chunk_path)

        audio_path = chunk_path.with_suffix('.flac')
        if args.resynthesize or not audio_path.exists():
            if not _synthesize_audio(chunk_path, audio_path, ts[i], args,
                                     logger):
                errors['hum2mid_errors'] += 1
                continue

        try:
            y = load_audio(str(audio_path))
        except Exception as e:
            logger.exception(
                f"Exception while loading {chunk_path} audio. Reason: {e}")
            errors['audio_errors'] += 1
            continue
        duration = len(y) / args.sample_rate

        try:
            krnseq = chunk.tosequence()
        except Exception as e:
            logger.exception(f"Discarded {chunk_path} due to error in kern"
                             f" sequence conversion. Reason {e}")
            errors['krnseq_errors'] += 1
            continue
        if krnseq is None:
            logger.warning(
                f"Discarded {chunk_path} for double dots/sharps/flats")
            errors['double_symbol_errors'] += 1
            continue

        try:
            seq = labels.encode(krnseq)
        except Exception as e:
            logger.warning(f"Discarded {chunk_path} during label encoding."
                           f" Reason: {e}")
            errors['encoding_errors'] += 1
            continue
        seqlen = labels.ctclen(seq)

        krnseq_path = chunk_path.with_suffix('.krnseq')
        krnseq_path.write_text(krnseq)
        seq_path = chunk_path.with_suffix('.seq')
        with seq_path.open(mode="wb") as f:
            f.write(pickle.dumps(seq))

        # The same message is logged for both the too-long and the
        # too-short-per-symbol case.
        if duration > args.max_duration or \
                duration < seqlen * args.min_duration_symbol:
            logger.warning(f"Sequence too long in {chunk_path} "
                           f"len={seqlen} duration={duration:.2f}")
            errors['length_errors'] += 1
            continue

        samples.append([str(audio_path), str(seq_path), duration])

    return errors


def process_sample(q, samples, args, labels, invalid_counters):
    """Worker loop: turn queued scores into (audio, sequence, duration) samples.

    Score paths are taken from ``q`` until a ``None`` sentinel is received.
    Each score is cleaned, split into chunks, tie-fixed, synthesized to audio
    and label-encoded; every accepted chunk is appended to ``samples`` as
    ``[audio_path, seq_path, duration]``.

    Args:
        q: Queue-like object; ``q.get()`` yields score paths (objects with
            ``.name`` and ``.parent``) and finally ``None``.
        samples: List-like collector for the accepted chunks.
        args: Options namespace (data/out dirs, chunking, synthesis and
            duration limits are read from it).
        labels: Encoder providing ``encode`` and ``ctclen``.
        invalid_counters: Mapping from counter name to a list; one value per
            processed score is appended to each list.

    Counters are recorded for every score, including scores that fail during
    cleaning or splitting, so ``clean_errors`` and ``split_errors`` reflect
    those failures instead of being skipped.
    """
    logger = logging.getLogger('data_prep')
    while True:
        score_path = q.get()
        if score_path is None:
            break
        errors = _process_score(score_path, samples, args, labels, logger)
        for name in _ERROR_COUNTER_NAMES:
            invalid_counters[name].append(errors[name])
def get_signal(self, path):
    """Load the audio at ``path`` and wrap it in a gradient-tracking ``Variable``."""
    return Variable(load_audio(path), requires_grad=True)
from data.data_loader import load_audio, NoiseInjection

# Command-line tool: mix a noise file into an input recording and save it.
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Where to save the mixed audio')
# type=int so a value given on the command line compares equal to the
# sample rate reported by load_audio instead of being a string.
parser.add_argument('--sample-rate', type=int, default=16000, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data, sample_rate_ = load_audio(args.input_path)
# Explicit check rather than assert, which is stripped under ``python -O``.
if sample_rate_ != args.sample_rate:
    raise ValueError(
        f'Input sample rate {sample_rate_} does not match --sample-rate {args.sample_rate}')
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
def combine_datasets(data_dir, dataset_config, sample_rate=22050):
    """Merge the per-dataset partition CSVs found under ``data_dir``.

    For each of the ``test``, ``train`` and ``val`` partitions, every CSV
    matching ``*{dataset_config}/{partition}*.csv`` is concatenated, the rows
    are sorted by audio duration and the result is written to
    ``{partition}_{group}_{dataset_config}.csv`` inside ``data_dir`` (where
    ``group`` is the directory name). The first labels JSON found for the
    configuration is copied to ``labels_{group}_{dataset_config}.json``.

    Args:
        data_dir: Root directory to search (``str`` or ``Path``).
        dataset_config: Configuration name used in the glob patterns.
        sample_rate: Sample rate used to convert sample counts to seconds.

    Raises:
        IndexError: If no labels file matches the configuration.
    """
    logger = config_logger('combine_datasets', console_level='INFO')
    if not isinstance(data_dir, Path):
        data_dir = Path(data_dir)
    group = data_dir.name

    # Label file must be the same. Just make a copy and rename.
    labels_path = list(data_dir.rglob(f'*{dataset_config}/labels*.json'))[0]
    outpath = data_dir / f'labels_{group}_{dataset_config}.json'
    logger.info(f'Saving encoder labels to {outpath}')
    outpath.write_text(labels_path.read_text())

    dataset_partitions = ['test', 'train', 'val']
    extension = 'csv'
    for dataset_partition in dataset_partitions:
        logger.info(
            f'Looking for partition {dataset_partition} from configuration {dataset_config}.'
        )
        all_filenames = list(
            data_dir.rglob(
                f'*{dataset_config}/{dataset_partition}*.{extension}'))
        logger.info(f'Found {len(all_filenames)} files.')

        # combine all files in the list
        combined_csv = pd.concat([
            pd.read_csv(f, header=None).assign(filename=f.name)
            for f in all_filenames
        ]).rename(columns={
            0: "audio",
            1: "seq"
        })
        logger.info(combined_csv.groupby('filename')['audio'].count())
        logger.info(
            f"Combined partition {dataset_partition}: {combined_csv['audio'].count()} samples."
        )

        durations = []
        audio_errors = 0
        for audio_file in combined_csv['audio'].tolist():
            try:
                y = load_audio(str(audio_file))
                duration = len(y) / sample_rate
            except Exception as e:
                logger.exception(
                    f"Exception while loading {audio_file} audio. Reason: {e}")
                audio_errors += 1
                # Record a zero duration so ``durations`` keeps one entry per
                # row; skipping the append would make ``assign`` below fail
                # with a length mismatch.
                duration = 0
            durations.append(duration)
        combined_csv = combined_csv.assign(duration=durations)

        # SortaGrad
        combined_csv = combined_csv.sort_values(by='duration')
        total_duration = combined_csv['duration'].sum()
        logger.info(f'Total duration: {total_duration/60/60} hours.')
        logger.info(f'Found {audio_errors} errors during loading.')

        # export to csv
        outpath = data_dir / f'{dataset_partition}_{group}_{dataset_config}.{extension}'
        logger.info(f'Saving to {outpath}')
        combined_csv.drop(['filename', 'duration'],
                          axis=1).to_csv(outpath,
                                         index=False,
                                         header=False,
                                         encoding='iso-8859-1')
def process_sample(q, samples, args, labels):
    """Worker loop: turn queued scores into (audio, sequence, duration) samples.

    Pulls score paths from ``q`` until ``None`` arrives. Each score is
    cleaned, split into chunks, tie-fixed, rendered to audio and
    label-encoded; accepted chunks are appended to ``samples`` as
    ``[audio_path, seq_path, duration]``. Failures are printed and the
    offending score or chunk is skipped.
    """
    while True:
        score_path = q.get()
        if score_path is None:
            return

        # Remove grace notes, ornaments, etc...
        score = Kern(Path(args.data_dir) / score_path)
        score.spines.override_instruments(args.instruments)
        try:
            if not score.clean():
                print(f'Cannot clean kern {score_path}')
                continue
        except Exception as e:
            print(f"Exception while cleaning {score_path} audio. Reason: {e}")
            continue

        (Path(args.out_dir) / score_path.parent).mkdir(parents=True,
                                                       exist_ok=True)
        krn_path = Path(args.out_dir) / score_path

        # Seed from the file name so chunk sizes and tempo scaling repeat
        np.random.seed(bytearray(score_path.name, 'utf-8'))
        try:
            chunks = score.split(args.chunk_sizes, args.train_stride)
        except Exception as e:
            print(f'Exception {e} while splitting {score_path}')
            continue

        # One random tempo factor per chunk, in [1 - ts, 1 + ts]
        tempo_scales = 1 + args.tempo_scaling * (
            2 * np.random.rand(len(chunks)) - 1)

        for i, chunk in enumerate(chunks):
            chunk_path = krn_path.with_suffix(f'.{i:03d}.krn')
            chunk.save(chunk_path)

            # Fix ties with tiefix command
            tiefix = subprocess.run(['tiefix', chunk_path],
                                    encoding='iso-8859-1',
                                    stdout=subprocess.PIPE)
            if tiefix.returncode != 0:
                print(f"tiefix error={tiefix.returncode} on {chunk_path}")
                print(tiefix.stdout)
                continue
            chunk = Kern(data=tiefix.stdout)
            chunk.save(chunk_path)

            audio_path = chunk_path.with_suffix('.flac')
            if args.resynthesize or not audio_path.exists():
                mid_path = chunk_path.with_suffix('.mid')
                # Tempo and instrument extracted from *MM and *I indications
                status = os.system(
                    f'hum2mid {str(chunk_path)} -C -v 100 -t {tempo_scales[i]} -o {str(mid_path)} >/dev/null 2>&1'
                )
                if os.WEXITSTATUS(status) != 0:
                    print(f"hum2mid error={status} on {krn_path}")
                    continue
                # Exit status of the synthesis pipeline is not inspected.
                status = os.system(
                    f'fluidsynth --sample-rate={args.sample_rate} -O s16 -T raw -i -l -F - {args.soundfont} {str(mid_path)} | '
                    f'ffmpeg -y -f s16le -ar {args.sample_rate} -ac 2 -i pipe: '
                    f'-ar {args.sample_rate} -ac 1 -ab {args.bit_rate} -strict -2 {str(audio_path)} 2>/dev/null'
                )

            try:
                y = load_audio(str(audio_path))
            except Exception as e:
                print(
                    f"Exception while loading {chunk_path} audio. Reason: {e}")
                continue
            duration = len(y) / args.sample_rate

            krnseq = chunk.tosequence()
            if krnseq is None:
                # Chunk discarded silently (double dots/sharps/flats).
                continue

            try:
                seq = labels.encode(krnseq)
            except Exception as e:
                print(f"Discarded {chunk_path}. Reason: {e}")
                continue
            seqlen = labels.ctclen(seq)

            chunk_path.with_suffix('.krnseq').write_text(krnseq)
            seq_path = chunk_path.with_suffix('.seq')
            seq_path.write_bytes(pickle.dumps(seq))

            # Chunks outside the duration limits are dropped silently.
            out_of_range = (duration > args.max_duration
                            or duration < seqlen * args.min_duration_symbol)
            if not out_of_range:
                samples.append([str(audio_path), str(seq_path), duration])