def load(data_path):
    """Load the transcript (a list of dicts) that belongs to `data_path`.

    `data_path` must exist. The source is chosen by the path:

    - `*.rttm`: every non-blank line is split on whitespace and turned into a
      dict with `audio_name` (field 1), `begin` (field 3), `end`
      (field 3 + field 4) and `speaker_name` (field 7); field indices are
      zero-based.
    - `*.json` / `*.json.gz`: the file is opened with `utils.open_maybe_gz`
      and the parsed JSON is returned unchanged.
    - any other path that has a `<data_path>.json` sidecar: the sidecar is
      parsed and `audio_path` of every entry is set to `data_path`.
    - otherwise: a single-entry transcript `[{'audio_path': data_path}]`.
    """
    assert os.path.exists(data_path)

    if data_path.endswith('.rttm'):
        with open(data_path) as f:
            transcript = [
                dict(
                    audio_name=fields[1],
                    begin=float(fields[3]),
                    end=float(fields[3]) + float(fields[4]),
                    speaker_name=fields[7],
                )
                for fields in map(str.split, f)
                # A blank (or whitespace-only) line splits into [], which
                # would otherwise fail with IndexError on fields[1].
                if fields
            ]

    elif data_path.endswith(('.json', '.json.gz')):
        with utils.open_maybe_gz(data_path) as f:
            transcript = json.load(f)

    elif os.path.exists(data_path + '.json'):
        with open(data_path + '.json') as f:
            transcript = json.load(f)
        # The sidecar describes data_path itself, so point every entry at it.
        for t in transcript:
            t['audio_path'] = data_path

    else:
        transcript = [dict(audio_path=data_path)]

    return transcript
def read_transcript(data_path):
    """Read the transcript (parsed JSON) that belongs to `data_path`.

    `data_path` must exist. If it is itself a `.json` / `.json.gz` file, it is
    opened with `utils.open_maybe_gz` and parsed. Otherwise, if a
    `<data_path>.json` sidecar exists, the sidecar is parsed and returned as
    is. If neither applies, a single-entry transcript
    `[{'audio_path': data_path}]` is returned.
    """
    assert os.path.exists(data_path)

    # Files are opened in `with` blocks so the handles are closed
    # deterministically instead of being left to the garbage collector.
    if data_path.endswith(('.json', '.json.gz')):
        with utils.open_maybe_gz(data_path) as f:
            return json.load(f)

    if os.path.exists(data_path + '.json'):
        with open(data_path + '.json') as f:
            return json.load(f)

    return [dict(audio_path=data_path)]
def csv2json(input_path, gz, group, reset_begin_end, csv_sep, audio_name_pattern=None):
    r"""
    Convert a csv transcripts file to a .csv.json transcripts file.

    Each line in the `input_path` file must have the format:
    'audio_path,transcription,begin,end\n'
    (only the first four separated fields are used; blank lines are skipped).

    csv_sep can be 'comma', representing ',', or 'tab', representing '\t'.

    gz - when true, '.gz' is appended to the output file name.

    group - when >= 0, the '/'-separated component of audio_path at this
    index is stored under the 'group' key of each transcription.

    reset_begin_end - when true, 'begin' is set to 0.0 and 'end' is set to
    the duration parsed from the audio file name.

    audio_name_pattern - a regex pattern that is used when reset_begin_end is
    True. It must contain at least two named capturing groups:
    (?P<begin>...) and (?P<end>...). By default, the Kontur calls pattern is
    used.

    The result is written to input_path + '.json' (+ '.gz') and the output
    path is printed.
    """
    audio_name_regex = re.compile(audio_name_pattern) if audio_name_pattern else re.compile(
        r'(?P<begin>\d+\.?\d*)-(?P<end>\d+\.?\d*)_\d+\.?\d*_[01]_1\d{9}\.?\d*\.wav'
    )  # default is Kontur calls pattern, match example: '198.38-200.38_2.0_0_1582594487.376404.wav'

    def duration(audio_name):
        # Duration in seconds, derived from the begin/end encoded in the file name.
        match = audio_name_regex.fullmatch(audio_name)
        assert match is not None, f'audio_name {audio_name!r} must match {audio_name_regex.pattern}'
        begin, end = float(match['begin']), float(match['end'])
        assert begin < end < 10_000, 'sanity check: begin and end must be below 10_000 seconds'
        return end - begin

    csv_sep = dict(tab='\t', comma=',')[csv_sep]

    res = []
    with utils.open_maybe_gz(input_path) as f:
        for line in f:
            assert '"' not in line, f'{input_path!r} lines must not contain any quotation marks!'

            # Strip only the line terminator: slicing off the last character
            # unconditionally would eat the last digit of `end` when the
            # final line has no trailing newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue

            audio_path, ref, begin, end = line.split(csv_sep)[:4]
            transcription = dict(audio_path=audio_path, ref=ref, begin=float(begin), end=float(end))

            if reset_begin_end:
                transcription['begin'] = 0.0
                transcription['end'] = duration(os.path.basename(audio_path))

            # add input_path folder name to the 'group' key of each transcription
            # todo: rename --group parameter to something more sensible!
            if group >= 0:
                transcription['group'] = audio_path.split('/')[group]

            res.append(transcription)

    output_path = input_path + '.json' + ('.gz' if gz else '')
    # Close the output explicitly so buffered (and gzip) data is fully flushed
    # before the path is reported.
    with utils.open_maybe_gz(output_path, 'w') as f:
        json.dump(res, f, ensure_ascii=False, indent=2, sort_keys=True)
    print(output_path)