Ejemplo n.º 1
0
def load(data_path):
    """Load a transcript (a list of dicts) for ``data_path``.

    The input format is chosen from the path:

    * ``*.rttm`` - every non-blank line is split on whitespace and converted
      to a dict with ``audio_name`` (field 1), ``begin`` (field 3), ``end``
      (field 3 + field 4) and ``speaker_name`` (field 7).
    * ``*.json`` / ``*.json.gz`` - the parsed JSON is returned as is.
    * any other path that has a ``<data_path>.json`` file next to it - that
      JSON is loaded and every entry gets ``audio_path`` set to ``data_path``.
    * otherwise - the single-entry transcript ``[{'audio_path': data_path}]``.

    Raises:
        AssertionError: if ``data_path`` does not exist.
    """
    assert os.path.exists(data_path)

    if data_path.endswith('.rttm'):
        with open(data_path) as f:
            # Skip blank lines (e.g. a trailing empty line); indexing their
            # empty field list would otherwise raise IndexError.
            rows = (line.split() for line in f if line.strip())
            transcript = [
                dict(audio_name=fields[1],
                     begin=float(fields[3]),
                     end=float(fields[3]) + float(fields[4]),
                     speaker_name=fields[7])
                for fields in rows
            ]

    elif data_path.endswith(('.json', '.json.gz')):
        with utils.open_maybe_gz(data_path) as f:
            transcript = json.load(f)

    elif os.path.exists(data_path + '.json'):
        # Sidecar transcript stored next to the audio file.
        with open(data_path + '.json') as f:
            transcript = json.load(f)
        for t in transcript:
            t['audio_path'] = data_path

    else:
        # No transcript available: describe the audio file itself.
        transcript = [dict(audio_path=data_path)]

    return transcript
Ejemplo n.º 2
0
 def read_transcript(data_path):
     """Read the transcript (a list of dicts) that belongs to ``data_path``.

     * ``*.json`` / ``*.json.gz`` - the file itself is parsed and returned.
     * a path with a ``<data_path>.json`` file next to it - that file is
       parsed and returned.
     * otherwise - ``[{'audio_path': data_path}]`` is returned.

     Raises:
         AssertionError: if ``data_path`` does not exist.
     """
     assert os.path.exists(data_path)

     # Files are opened via ``with`` so the handles are closed deterministically
     # instead of being left for the garbage collector.
     if data_path.endswith(('.json', '.json.gz')):
         with utils.open_maybe_gz(data_path) as f:
             return json.load(f)

     sidecar_path = data_path + '.json'
     if os.path.exists(sidecar_path):
         with open(sidecar_path) as f:
             return json.load(f)

     return [dict(audio_path=data_path)]
Ejemplo n.º 3
0
def csv2json(input_path, gz, group, reset_begin_end, csv_sep, audio_name_pattern=None):
	""" Convert a csv transcripts file to a .csv.json transcripts file. Each line in `input_path` file must have format:
		'audio_path,transcription,begin,end\n'
		gz - if true, the output is written to `input_path + '.json.gz'`, otherwise to `input_path + '.json'`.
		group - if >= 0, index of the '/'-separated component of audio_path that is stored under the 'group' key.
		reset_begin_end - if true, 'begin' is set to 0.0 and 'end' to the duration parsed from the audio file name.
		csv_sep could be 'comma', representing ',', or 'tab', representing '\t'.
		audio_name_pattern - is a regex pattern, that is used, when reset_begin_end is True. It must contain at least
			two named capturing groups: (?P<begin>...) and (?P<end>...). By default, Kontur calls pattern will be used.
		The output path is printed to stdout; nothing is returned.
	"""
	# default is Kontur calls pattern, match example: '198.38-200.38_2.0_0_1582594487.376404.wav'
	audio_name_regex = re.compile(audio_name_pattern) if audio_name_pattern else re.compile(
		r'(?P<begin>\d+\.?\d*)-(?P<end>\d+\.?\d*)_\d+\.?\d*_[01]_1\d{9}\.?\d*\.wav'
	)

	def duration(audio_name):
		# duration in seconds, parsed from the begin/end groups of the audio file name
		match = audio_name_regex.fullmatch(audio_name)
		assert match is not None, f'audio_name {audio_name!r} must match {audio_name_regex.pattern}'
		begin, end = float(match['begin']), float(match['end'])
		assert begin < end < 10_000, 'sanity check: begin and end must be below 10_000 seconds'
		return end - begin

	csv_sep = dict(tab = '\t', comma = ',')[csv_sep]
	res = []
	# open via `with` so the input handle is closed once all lines are consumed
	with utils.open_maybe_gz(input_path) as input_file:
		for line in input_file:
			assert '"' not in line, f'{input_path!r} lines must not contain any quotation marks!'
			# strip only the line terminator: a last line without a trailing newline must not lose
			# the final character of its `end` field (as the unconditional `line[:-1]` would do)
			audio_path, ref, begin, end = line.rstrip('\n').split(csv_sep)[:4]
			transcription = dict(audio_path = audio_path, ref = ref, begin = float(begin), end = float(end))
			if reset_begin_end:
				transcription['begin'] = 0.0
				transcription['end'] = duration(os.path.basename(audio_path))

			# add the selected audio_path component to the 'group' key of each transcription
			# todo: rename --group parameter to something more sensible!
			if group >= 0:
				transcription['group'] = audio_path.split('/')[group]
			res.append(transcription)

	output_path = input_path + '.json' + ('.gz' if gz else '')
	# close the output explicitly so buffered (and gzip-compressed) data is fully flushed to disk
	with utils.open_maybe_gz(output_path, 'w') as output_file:
		json.dump(res, output_file, ensure_ascii = False, indent = 2, sort_keys = True)
	print(output_path)