def split_with_vad_wav(
    wav_path: Path,
    out_dir: Path,
    min_dur: float,
    max_dur: float,
    max_silence: float,
    strict_min_dur: bool,
    shift: float = 0,
) -> "list[Path]":
    """Split a WAV file into FLAC segments using ``AudioRegion.split``.

    The file is loaded as an ``AudioRegion`` and split with the given
    duration/silence settings.  For every detected region, the matching
    samples are written to ``<out_dir>/<out_dir.stem>_<i>.flac`` (16-bit PCM
    FLAC) and the region boundaries, offset by ``shift``, are stored with
    ``save_timestamp`` at the path returned by
    ``get_path_timestamp(segment_path, ".vad.timestamp")``.

    Args:
        wav_path: Path of the input audio file; must have a ``.wav`` suffix.
        out_dir: Directory receiving the segments.  Its stem is also used as
            the file-name prefix of every segment.
        min_dur: Forwarded to ``AudioRegion.split`` as ``min_dur``.
        max_dur: Forwarded to ``AudioRegion.split`` as ``max_dur``.
        max_silence: Forwarded to ``AudioRegion.split`` as ``max_silence``.
        strict_min_dur: Forwarded to ``AudioRegion.split`` as
            ``strict_min_dur``.
        shift: Offset added to the start and end of every region before the
            timestamps are saved (the audio slicing itself is not shifted).

    Returns:
        The paths of the written FLAC segments, in detection order.

    Raises:
        AssertionError: If ``wav_path`` does not end in ``.wav``.
    """
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type is unchanged for callers.
    if Path(wav_path).suffix != ".wav":
        raise AssertionError(f"expected a '.wav' file, got '{wav_path}'")

    audio_region = AudioRegion.load(str(wav_path))
    out_dir = Path(out_dir)
    regions = audio_region.split(
        min_dur=min_dur,
        max_dur=max_dur,
        max_silence=max_silence,
        strict_min_dur=strict_min_dur,
    )

    # The samples are read separately so segments can be sliced as float32.
    waveform, sr = sf.read(wav_path, dtype="float32")

    segment_paths = []
    for i, r in enumerate(regions):
        # Region boundaries are scaled by the sample rate to get sample indices.
        start = int(r._meta.start * sr)
        end = int(r._meta.end * sr)

        path_seg = out_dir / f"{out_dir.stem}_{i}.flac"
        path_timestamp = get_path_timestamp(path_seg, ".vad.timestamp")
        save_timestamp(
            path_timestamp, r._meta.start + shift, r._meta.end + shift
        )
        sf.write(
            str(path_seg),
            waveform[start:end],
            sr,
            subtype="PCM_16",
            format="FLAC",
        )
        segment_paths.append(path_seg)
    return segment_paths
def test_StreamSaverWorker_wav(self):
    """Check that ``save_stream`` writes a WAV file matching the reference audio."""
    with TemporaryDirectory() as tmpdir:
        target_path = os.path.join(tmpdir, "output.wav")

        # Run the whole pipeline: the saver wraps the reader and the
        # tokenizer consumes the saver until the stream is exhausted.
        stream_saver = StreamSaverWorker(self.reader, target_path)
        stream_saver.start()
        tokenizer_worker = TokenizerWorker(stream_saver)
        tokenizer_worker.start_all()
        tokenizer_worker.join()
        stream_saver.join()

        saved_path = stream_saver.save_stream()

        # Reference audio is the raw fixture; the second region is what the
        # saver actually wrote to disk.
        reference_region = AudioRegion.load(
            "tests/data/test_split_10HZ_mono.raw", sr=10, sw=2, ch=1
        )
        saved_region = AudioRegion.load(saved_path)

        self.assertEqual(saved_path, target_path)
        self.assertEqual(reference_region, saved_region)
        self.assertEqual(stream_saver.data, bytes(saved_region))
def test_StreamSaverWorker_encode_audio(self):
    """Check the fallback path when no external encoder can produce the OGG file.

    ``auditok.workers._run_subprocess`` is patched to return
    ``(1, None, None)`` for every call; the test then expects
    ``save_stream`` to raise ``AudioEncodingWarning`` after trying
    ffmpeg, avconv and sox in that order.
    """
    with TemporaryDirectory() as tmpdir, patch(
        "auditok.workers._run_subprocess"
    ) as patch_rsp:
        patch_rsp.return_value = (1, None, None)

        target_path = os.path.join(tmpdir, "output.ogg")
        # Intermediate WAV file the saver is expected to leave behind.
        fallback_wav_path = target_path + ".wav"

        stream_saver = StreamSaverWorker(self.reader, target_path)
        stream_saver.start()
        tokenizer_worker = TokenizerWorker(stream_saver)
        tokenizer_worker.start_all()
        tokenizer_worker.join()
        stream_saver.join()

        with self.assertRaises(AudioEncodingWarning) as rt_warn:
            stream_saver.save_stream()

        expected_message = (
            "Couldn't save audio data in the desired format "
            "'ogg'. Either none of 'ffmpeg', 'avconv' or 'sox' "
            "is installed or this format is not recognized.\n"
            "Audio file was saved as '{}'"
        ).format(fallback_wav_path)
        self.assertEqual(expected_message, str(rt_warn.exception))

        # ffmpeg and avconv share the same argument list; sox uses its own.
        ffmpeg_avconv_args = [
            "-y",
            "-f",
            "wav",
            "-i",
            fallback_wav_path,
            "-f",
            "ogg",
            target_path,
        ]
        sox_command = ["sox", "-t", "wav", fallback_wav_path, target_path]
        expected_calls = [
            call([encoder] + ffmpeg_avconv_args)
            for encoder in ("ffmpeg", "avconv")
        ]
        expected_calls.append(call(sox_command))
        self.assertEqual(patch_rsp.mock_calls, expected_calls)

        reference_region = AudioRegion.load(
            "tests/data/test_split_10HZ_mono.raw", sr=10, sw=2, ch=1
        )
        self.assertTrue(stream_saver._exported)
        self.assertEqual(stream_saver.data, bytes(reference_region))
def main(argv=None):
    """Command-line entry point of the audio tokenization tool.

    Builds the argument parser, creates the reader/observer workers and the
    tokenizer worker, then polls once per second until only the main thread
    is left (``EndOfProcessing``) or the user interrupts the program.  On
    either event the workers are stopped, the acquired stream is saved when
    the reader is a ``StreamSaverWorker``, and the signal is plotted when
    ``--plot`` or ``--save-image`` was given.

    Args:
        argv: Command-line arguments without the program name.  Defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        ``0`` once processing has finished or has been interrupted.
    """
    program_name = os.path.basename(sys.argv[0])
    if argv is None:
        argv = sys.argv[1:]

    # Bound before the ``try`` so that an interrupt arriving while the parser
    # or the workers are still being set up does not make the handler below
    # fail with UnboundLocalError.
    tokenizer_worker = None
    try:
        parser = ArgumentParser(
            prog=program_name, description="An Audio Tokenization tool"
        )
        parser.add_argument(
            "--version", "-v", action="version", version=__version__
        )

        group = parser.add_argument_group("Input-Output options")
        group.add_argument(
            dest="input",
            help="Input audio or video file. Use '-' for stdin "
            "[default: read from microphone using pyaudio]",
            metavar="input",
            nargs="?",
            default=None,
        )
        group.add_argument(
            "-I",
            "--input-device-index",
            dest="input_device_index",
            help="Audio device index [default: %(default)s]. "
            "Optional and only effective when using PyAudio",
            type=int,
            default=None,
            metavar="INT",
        )
        group.add_argument(
            "-F",
            "--audio-frame-per-buffer",
            dest="frame_per_buffer",
            help="Audio frame per buffer [default: %(default)s]. "
            "Optional and only effective when using PyAudio",
            type=int,
            default=1024,
            metavar="INT",
        )
        group.add_argument(
            "-f",
            "--input-format",
            dest="input_format",
            type=str,
            default=None,
            help="Input audio file format. If not given, guess format from "
            "extension. If output file name has no extension, guess format "
            "from file header (requires pydub). If none of the previous is "
            "true, raise an error",
            metavar="STRING",
        )
        group.add_argument(
            "-M",
            "--max-read",
            dest="max_read",
            type=float,
            default=None,
            help="Maximum data (in seconds) to read from microphone or file "
            "[default: read until the end of file/stream]",
            metavar="FLOAT",
        )
        group.add_argument(
            "-L",
            "--large-file",
            dest="large_file",
            action="store_true",
            default=False,
            help="Whether input file should be treated as a large file. "
            "If True, data will be read from file on demand, otherwise all "
            "audio data is loaded to memory before tokenization.",
        )
        group.add_argument(
            "-O",
            "--save-stream",
            dest="save_stream",
            type=str,
            default=None,
            help="Save acquired audio data (from file or microphone) to disk."
            " If omitted no data will be saved. [default: omitted]",
            metavar="FILE",
        )
        group.add_argument(
            "-o",
            "--save-detections-as",
            dest="save_detections_as",
            type=str,
            default=None,
            help="File name format for detections."
            "The following placeholders can be used to build output file name "
            "for each detection: {id} (sequential, starts from 1), {start}, "
            "{end} and {duration}. Time placeholders are in seconds. "
            "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
            metavar="STRING",
        )
        group.add_argument(
            "-T",
            "--output-format",
            dest="output_format",
            type=str,
            default=None,
            help="Audio format used to save detections and/or main stream. "
            "If not supplied, then it will: (1. be guessed from extension or "
            "(2. use raw format",
            metavar="STRING",
        )
        group.add_argument(
            "-u",
            "--use-channel",
            dest="use_channel",
            type=str,
            default=None,
            help="Which channel to use for tokenization when input stream is "
            "multi-channel (0 is the first channel). Default is None, meaning "
            "that all channels will be considered for tokenization (i.e., get "
            "any valid audio event regardless of the channel it occurs in). "
            "This value can also be 'mix' (alias 'avg' or 'average') and "
            "means mix down all audio channels into one channel (i.e. compute "
            "average channel) and use the resulting channel for tokenization. "
            "Whatever option is used, saved audio events will contain the same"
            " number of channels as input stream. "
            "[Default: None, use all channels]",
            metavar="INT/STRING",
        )

        group = parser.add_argument_group(
            "Tokenization options", "Set tokenizer options."
        )
        group.add_argument(
            "-a",
            "--analysis-window",
            dest="analysis_window",
            default=0.01,
            type=float,
            help="Size of analysis window in seconds [default: %(default)s "
            "(10ms)]",
            metavar="FLOAT",
        )
        group.add_argument(
            "-n",
            "--min-duration",
            dest="min_duration",
            type=float,
            default=0.2,
            help="Min duration of a valid audio event in seconds "
            "[default: %(default)s]",
            metavar="FLOAT",
        )
        group.add_argument(
            "-m",
            "--max-duration",
            dest="max_duration",
            type=float,
            default=5,
            help="Max duration of a valid audio event in seconds "
            "[default: %(default)s]",
            metavar="FLOAT",
        )
        group.add_argument(
            "-s",
            "--max-silence",
            dest="max_silence",
            type=float,
            default=0.3,
            help="Max duration of a consecutive silence within a valid audio "
            "event in seconds [default: %(default)s]",
            metavar="FLOAT",
        )
        group.add_argument(
            "-d",
            "--drop-trailing-silence",
            dest="drop_trailing_silence",
            action="store_true",
            default=False,
            help="Drop trailing silence from a detection [default: keep "
            "trailing silence]",
        )
        group.add_argument(
            "-R",
            "--strict-min-duration",
            dest="strict_min_duration",
            action="store_true",
            default=False,
            help="Reject an event shorter than --min-duration even if it's "
            "adjacent to the latest valid event that reached max-duration "
            "[default: keep such events]",
        )
        group.add_argument(
            "-e",
            "--energy-threshold",
            dest="energy_threshold",
            type=float,
            default=50,
            help="Log energy threshold for detection [default: %(default)s]",
            metavar="FLOAT",
        )

        group = parser.add_argument_group(
            "Audio parameters",
            "Define audio parameters if data is read from a "
            "headerless file (raw or stdin) or you want to use "
            "different microphone parameters.",
        )
        group.add_argument(
            "-r",
            "--rate",
            dest="sampling_rate",
            type=int,
            default=16000,
            help="Sampling rate of audio data [default: %(default)s]",
            metavar="INT",
        )
        group.add_argument(
            "-c",
            "--channels",
            dest="channels",
            type=int,
            default=1,
            help="Number of channels of audio data [default: %(default)s]",
            metavar="INT",
        )
        group.add_argument(
            "-w",
            "--width",
            dest="sample_width",
            type=int,
            default=2,
            help="Number of bytes per audio sample [default: %(default)s]",
            metavar="INT",
        )

        group = parser.add_argument_group(
            "Do something with audio events",
            "Use these options to print, play back or plot detections.",
        )
        group.add_argument(
            "-C",
            "--command",
            dest="command",
            type=str,
            help="Command to call when an audio detection occurs. Use '{file}' "
            "as a placeholder for the temporary wav file that will contain "
            "event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
            " or \"-C 'play -q {file}'\" to play audio with sox)",
            metavar="STRING",
        )
        group.add_argument(
            "-E",
            "--echo",
            dest="echo",
            action="store_true",
            default=False,
            help="Play back each detection immediately using pyaudio",
        )
        group.add_argument(
            "-B",
            "--progress-bar",
            dest="progress_bar",
            action="store_true",
            default=False,
            help="Show a progress bar when playing audio",
        )
        group.add_argument(
            "-p",
            "--plot",
            dest="plot",
            action="store_true",
            default=False,
            help="Plot and show audio signal and detections (requires "
            "matplotlib)",
        )
        group.add_argument(
            "--save-image",
            dest="save_image",
            type=str,
            help="Save plotted audio signal and detections as a picture or a "
            "PDF file (requires matplotlib)",
            metavar="FILE",
        )
        group.add_argument(
            "--printf",
            dest="printf",
            type=str,
            default="{id} {start} {end}",
            help="Print audio events information, one per line, using this "
            "format. Format can contain text with the following placeholders: "
            "{id} (sequential, starts from 1), {start}, {end}, {duration} and "
            "{timestamp}. The first 3 time placeholders are in seconds and "
            "their format can be set using --time-format argument. "
            "{timestamp} is the system timestamp (date and time) of the event "
            "and can be set using --timestamp-format argument.\n"
            "Example: '[{id}]: {start} -> {end} -- {timestamp}'",
            metavar="STRING",
        )
        group.add_argument(
            "--time-format",
            dest="time_format",
            type=str,
            default="%S",
            help="Format used to print {start}, {end} and {duration} "
            "placeholders used with --printf [default= %(default)s]. The "
            "following formats are accepted:\n"
            "%%S: absolute time in seconds. %%I: absolute time in ms. If at "
            "least one of (%%h, %%m, %%s, %%i) is used, convert time into "
            "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
            "supplied fields are printed. Note that %%S and %%I can only be "
            "used alone",
            metavar="STRING",
        )
        group.add_argument(
            "--timestamp-format",
            dest="timestamp_format",
            type=str,
            default="%Y/%m/%d %H:%M:%S",
            help="Format used to print {timestamp}. Should be a format "
            "accepted by 'datetime' standard module. Default: "
            "'%%Y/%%m/%%d %%H:%%M:%%S'",
        )

        parser.add_argument(
            "-q",
            "--quiet",
            dest="quiet",
            action="store_true",
            default=False,
            help="Do not print any information about detections [default: "
            "print 'id', 'start' and 'end' of each detection]",
        )
        parser.add_argument(
            "-D",
            "--debug",
            dest="debug",
            action="store_true",
            default=False,
            help="Print processing operations to STDOUT",
        )
        parser.add_argument(
            "--debug-file",
            dest="debug_file",
            type=str,
            default=None,
            help="Print processing operations to FILE",
            metavar="FILE",
        )

        args = parser.parse_args(argv)
        logger = make_logger(args.debug, args.debug_file)
        kwargs = make_kwargs(args)
        reader, observers = initialize_workers(
            logger=logger, **kwargs.io, **kwargs.miscellaneous
        )
        tokenizer_worker = workers.TokenizerWorker(
            reader, observers, logger=logger, **kwargs.split
        )
        tokenizer_worker.start_all()

        # Poll until every worker thread has finished; only the main thread
        # is then left in ``threading.enumerate()``.
        while True:
            time.sleep(1)
            if len(threading.enumerate()) == 1:
                raise EndOfProcessing

    except (KeyboardInterrupt, EndOfProcessing):
        # ``reader`` and ``args`` are always bound once ``tokenizer_worker``
        # has been created, so all clean-up lives under this guard.
        if tokenizer_worker is not None:
            tokenizer_worker.stop_all()

            if isinstance(reader, workers.StreamSaverWorker):
                reader.join()
                try:
                    reader.save_stream()
                except AudioEncodingWarning as ae_warn:
                    print(str(ae_warn), file=sys.stderr)

            if args.plot or args.save_image is not None:
                # Imported lazily: plotting is only needed for these options.
                from .plotting import plot

                reader.rewind()
                record = AudioRegion(
                    reader.data, reader.sr, reader.sw, reader.ch
                )
                detections = (
                    (det.start, det.end)
                    for det in tokenizer_worker.detections
                )
                plot(
                    record,
                    detections=detections,
                    energy_threshold=args.energy_threshold,
                    show=True,
                    save_as=args.save_image,
                )
        return 0