def get_wav_fnames(dirname, ignore_list_file=None):
    """
    Collects the WAV files that should be decoded.

    Arguments:
        dirname -- the directory to search for WAVs
        ignore_list_file -- a file of absolute paths or globs (can be mixed)
            specifying the files that should be skipped

    Returns a list of tuples (filename, WAV unique ID), the unique ID being
    `basename(filename)'.

    """
    # Handed to the search as `notrx' -- presumably names matching it
    # (i.e. ending in `_all.wav') are left out; confirm in
    # `find_with_ignorelist'.
    skip_rx = re.compile('^.*_all\\.wav$')
    search_opts = dict(mindepth=0, maxdepth=None, notrx=skip_rx)

    found = find_with_ignorelist(dirname,
                                 '*.wav',
                                 ignore_list_file=ignore_list_file,
                                 find_kwargs=search_opts)

    pairs = []
    for wav_fname in found:
        pairs.append((wav_fname, basename(wav_fname)))
    return pairs
def get_wav_fnames(dirname, ignore_list_file=None):
    """
    Finds the WAV files to decode under `dirname'.

    Arguments:
        dirname -- the directory to search for WAVs
        ignore_list_file -- a file of absolute paths or globs (can be mixed)
            specifying the files that should be skipped

    Returns a list of (filename, WAV unique ID) tuples; the unique ID is
    whatever `basename' yields for the filename.

    """
    search_opts = {}
    search_opts['mindepth'] = 0
    search_opts['maxdepth'] = None
    # NOTE(review): `notrx' looks like a "must not match" pattern that drops
    # the `*_all.wav' files -- verify against `find_with_ignorelist'.
    search_opts['notrx'] = re.compile('^.*_all\\.wav$')

    matches = find_with_ignorelist(
        dirname, '*.wav',
        ignore_list_file=ignore_list_file,
        find_kwargs=search_opts)

    def with_id(path):
        # Pair a path with its unique ID.
        return (path, basename(path))

    return [with_id(path) for path in matches]
def _turn_utterances(turn):
    """
    Splits a `Turn' element into utterances at its `Sync' child elements.

    Arguments:
        turn -- a DOM element with `startTime' and `endTime' attributes

    Returns a list of (text, start time, end time) tuples.  Segments whose
    end time is not greater than their start time are left out.

    """
    currtime = float(turn.getAttribute('startTime'))
    currtext = ''
    utterances = []
    for node in turn.childNodes:
        if node.nodeType == node.ELEMENT_NODE and node.tagName == 'Sync':
            starttime = currtime
            currtime = float(node.getAttribute('time'))
            if currtime > starttime:
                utterances.append((currtext, starttime, currtime))
            # NOTE(review): the text is assumed to be reset at every Sync,
            # not only after a segment was emitted -- confirm against the
            # original indentation.
            currtext = ''
        elif node.nodeType == node.TEXT_NODE:
            currtext += ' ' + node.data.strip()

    # Add the last utterance, which is not followed by a Sync tag.
    starttime = currtime
    end_attr = turn.getAttribute('endTime')
    try:
        currtime = float(end_attr)
    except ValueError:
        # Fall back to the first whitespace-separated token of the attribute.
        currtime = float(end_attr.split()[0])
    if currtime > starttime:
        utterances.append((currtext, starttime, currtime))
    return utterances


def convert(args):
    """
    Looks for recordings (*.mp2) and transcriptions (*.trs) under the
    `args.infname' directory.  Each recording referred to by a transcription
    is converted to WAV and cut into utterances; every utterance is written
    as a .wav file, along with its normalised transcription (.wav.trn), into
    a randomly chosen two-level subdirectory (00-99/00-99) of `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')
        number of missing files (file basenames referred in transcription
            logs but missing in the file system)
        number of missing transcriptions (utterances of at least 0.2 s whose
            transcription is empty or consists of whitespace only)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0
    n_missing_trs = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    trs_paths = find_with_ignorelist(infname, '*.trs', ignore_list_file)
    trs_dict = {os.path.basename(fpath): fpath for fpath in trs_paths}

    # Find all audio files, create dictionary of paths by basename.
    audio_paths = find_with_ignorelist(infname, '*.mp2')
    audio_dict = {os.path.splitext(os.path.basename(fpath))[0]: fpath
                  for fpath in audio_paths}
    n_collisions = len(audio_paths) - len(audio_dict)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    for trs_path in trs_dict.values():
        if verbose:
            print("Processing %s" % trs_path)

        # Parse the file.
        doc = xml.dom.minidom.parse(trs_path)
        fname = doc.getElementsByTagName(
            "Trans")[0].attributes['audio_filename'].value
        if fname not in audio_dict or not os.path.isfile(audio_dict[fname]):
            if verbose:
                print("Lost audio file: %s" % fname)
            n_missing_audio += 1
            continue
        audio_path = audio_dict[fname]

        # Convert the whole recording to a temporary WAV that the
        # utterances are cut from.
        tmp_wav_path = os.path.join(outdir, fname + '.wav')
        to_wav(audio_path, tmp_wav_path)

        # Turns and utterances are numbered from 1 in the output names.
        for i, turn in enumerate(doc.getElementsByTagName("Turn"), 1):
            utterances = _turn_utterances(turn)
            for j, (trs, starttime, endtime) in enumerate(utterances, 1):
                if (endtime - starttime) < 0.2:
                    print("Too short segment")
                    continue

                # The accumulated text starts with a space whenever a text
                # node was seen, so test the stripped text.
                if not trs.strip():
                    n_missing_trs += 1

                wav_name = '%s_%02d_%04d.wav' % (fname, i, j)
                wav_path = os.path.join(
                    outdir,
                    "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)),
                    wav_name)
                wav_dir = os.path.dirname(wav_path)
                if not os.path.exists(wav_dir):
                    os.makedirs(wav_dir)

                if verbose:
                    print("")
                    print("src: %s" % os.path.basename(audio_path))
                    print("tgt: %s" % wav_name)
                    print("time: %s %s" % (starttime, endtime))
                    print("orig transcription: %s" % trs.upper().strip())

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print("normalised trans:  %s" % trs)

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print("... excluded")
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Extract utterance from audio.
                segment_to_wav(tmp_wav_path, wav_path, starttime, endtime)
                size += os.path.getsize(wav_path)
                seconds += endtime - starttime

        # The full-length temporary WAV is no longer needed.
        os.remove(tmp_wav_path)

    return (size, seconds, n_collisions, n_overwrites, n_missing_audio,
            n_missing_trs)
def convert(args):
    """
    Looks for transcriptions (*.trans.txt) under the `args.infname'
    directory.  Each of their lines names a recording and gives its
    transcription; the recording is expected as NAME.flac next to the
    transcription file.  Recordings whose transcription is not excluded are
    converted to WAV and stored, along with the normalised transcription
    (.wav.trn), in a randomly chosen two-level subdirectory (00-99/00-99) of
    `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        number of collisions (recording names listed more than once)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    n_overwrites = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, '*.trans.txt',
                                     ignore_list_file)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    flac_names = []
    for txt_path in txt_paths:
        if verbose:
            print("Processing %s" % txt_path)
        path_prefix = os.path.dirname(txt_path)

        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            for line in txt_file:
                # Each line contains the name of the audio file and the
                # transcription.  Lines without the separating space (e.g.
                # blank lines) cannot be unpacked and are skipped.
                fields = line.split(' ', 1)
                if len(fields) != 2:
                    continue
                flac_name, trs = fields
                flac_names.append(flac_name)

                flac_path = os.path.join(path_prefix, flac_name + '.flac')
                wav_path = os.path.join(
                    outdir,
                    "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)),
                    flac_name + '.wav')

                if not os.path.isfile(flac_path):
                    continue

                # Process transcription
                if verbose:
                    print("")
                    print("# f: %s" % (flac_name + '.flac'))
                    print("orig transcription: %s" % trs.upper().strip())

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print("normalised trans:  %s" % trs)

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print("... excluded")
                    continue

                wc.update(trs.split())

                # Only now is anything written, so neither a missing
                # recording nor an excluded transcription leaves an empty
                # directory or a WAV without a transcription behind.
                wav_dir = os.path.dirname(wav_path)
                if not os.path.exists(wav_dir):
                    os.makedirs(wav_dir)

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Process audio file
                to_wav(flac_path, wav_path)
                size += os.path.getsize(wav_path)

    n_collisions = len(flac_names) - len(set(flac_names))

    return size, n_collisions, n_overwrites
def convert(args):
    """
    Looks for recordings (fe_*.sph) and transcriptions (fe_*.txt) under the
    `args.infname' directory.  Every utterance listed in a transcription is
    cut out of the recording of the same name with `sph2pipe', converted to
    WAV and stored, along with its normalised transcription (.wav.trn), in a
    randomly chosen two-level subdirectory (00-99/00-99) of `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')
        number of missing files (file basenames referred in transcription
            logs but missing in the file system)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, 'fe_*.txt', ignore_list_file)

    # Find all audio files, create dictionary of paths by basename.
    sph_paths = find_with_ignorelist(infname, 'fe_*.sph')
    sph_dict = {os.path.basename(fpath): fpath for fpath in sph_paths}
    n_collisions = len(sph_paths) - len(sph_dict)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    for txt_path in txt_paths:
        if verbose:
            print("Processing %s" % txt_path)

        src_name = os.path.splitext(os.path.basename(txt_path))[0]
        sph_name = src_name + '.sph'
        if sph_name not in sph_dict or not os.path.isfile(sph_dict[sph_name]):
            if verbose:
                print("Lost audio file: %s" % sph_name)
            n_missing_audio += 1
            continue
        sph_path = sph_dict[sph_name]

        utterances = []
        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            for line in txt_file:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue  # ignore comments and empty lines

                # Each line contains start time, end time, speaker id and
                # transcription.  Lines with fewer fields cannot be unpacked
                # and are skipped.
                fields = line.split(' ', 3)
                if len(fields) != 4:
                    if verbose:
                        print("Malformed line skipped: %s" % stripped)
                    continue
                start, end, speaker, trs = fields
                start = float(start)
                end = float(end)
                # Speaker ids starting with 'A' are read from channel 1,
                # all others from channel 2.
                channel = 1 if speaker[0] == 'A' else 2
                utterances.append({'start': start, 'end': end, 'trs': trs,
                                   'channel': channel})

        # Utterances are numbered from 0 in the output names.
        for i, utt in enumerate(utterances):
            trs = utt['trs']
            wav_name = '%s_%03d.wav' % (src_name, i)
            wav_path = os.path.join(
                outdir,
                "{r:02}".format(r=random.randint(0, 99)),
                "{r:02}".format(r=random.randint(0, 99)),
                wav_name)
            wav_dir = os.path.dirname(wav_path)
            if not os.path.exists(wav_dir):
                os.makedirs(wav_dir)

            if verbose:
                print("")
                print("src: %s" % sph_name)
                print("tgt: %s" % wav_name)
                print("time: %s %s" % (utt['start'], utt['end']))
                print("channel: %s" % utt['channel'])
                print("orig transcription: %s" % trs.upper().strip())

            trs = norm_mod.normalise_text(trs)

            if verbose:
                print("normalised trans:  %s" % trs)

            if known_words is not None:
                excluded = norm_mod.exclude_by_dict(trs, known_words)
            else:
                excluded = norm_mod.exclude_asr(trs)
            if excluded:
                if verbose:
                    print("... excluded")
                continue

            # Check for very short utterances
            if utt['end'] - utt['start'] < 1:
                if verbose:
                    print("... too short")
                continue

            wc.update(trs.split())

            if save_transcription(wav_path + '.trn', trs):
                n_overwrites += 1

            # Extract utterance from audio
            tmp_path = wav_path + '.tmp'
            cmd = ['sph2pipe',
                   '-f', 'wav',
                   '-t', '%f:%f' % (utt['start'], utt['end']),
                   '-c', str(utt['channel']),
                   sph_path,
                   tmp_path]
            if subprocess.call(cmd) != 0:
                # Do not feed a missing or partial file to `to_wav'.  The
                # transcription has already been stored at this point.
                print("sph2pipe failed: %s" % wav_name)
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                continue

            # Convert to valid WAV
            to_wav(tmp_path, wav_path)
            os.remove(tmp_path)

            size += os.path.getsize(wav_path)
            seconds += utt['end'] - utt['start']

    return size, seconds, n_collisions, n_overwrites, n_missing_audio
def _turn_utterances(turn):
    """
    Splits a `Turn' element into utterances at its `Sync' child elements.

    Arguments:
        turn -- a DOM element with `startTime' and `endTime' attributes

    Returns a list of (text, start time, end time) tuples.  Segments whose
    end time is not greater than their start time are left out.

    """
    currtime = float(turn.getAttribute('startTime'))
    currtext = ''
    utterances = []
    for node in turn.childNodes:
        if node.nodeType == node.ELEMENT_NODE and node.tagName == 'Sync':
            starttime = currtime
            currtime = float(node.getAttribute('time'))
            if currtime > starttime:
                utterances.append((currtext, starttime, currtime))
            # NOTE(review): the text is assumed to be reset at every Sync,
            # not only after a segment was emitted -- confirm against the
            # original indentation.
            currtext = ''
        elif node.nodeType == node.TEXT_NODE:
            currtext += ' ' + node.data.strip()

    # Add the last utterance, which is not followed by a Sync tag.
    starttime = currtime
    end_attr = turn.getAttribute('endTime')
    try:
        currtime = float(end_attr)
    except ValueError:
        # Fall back to the first whitespace-separated token of the attribute.
        currtime = float(end_attr.split()[0])
    if currtime > starttime:
        utterances.append((currtext, starttime, currtime))
    return utterances


def convert(args):
    """
    Looks for recordings (*.mp2) and transcriptions (*.trs) under the
    `args.infname' directory.  Each recording referred to by a transcription
    is converted to WAV and cut into utterances; every utterance is written
    as a .wav file, along with its normalised transcription (.wav.trn), into
    a randomly chosen two-level subdirectory (00-99/00-99) of `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')
        number of missing files (file basenames referred in transcription
            logs but missing in the file system)
        number of missing transcriptions (utterances of at least 0.2 s whose
            transcription is empty or consists of whitespace only)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0
    n_missing_trs = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    trs_paths = find_with_ignorelist(infname, '*.trs', ignore_list_file)
    trs_dict = {os.path.basename(fpath): fpath for fpath in trs_paths}

    # Find all audio files, create dictionary of paths by basename.
    audio_paths = find_with_ignorelist(infname, '*.mp2')
    audio_dict = {os.path.splitext(os.path.basename(fpath))[0]: fpath
                  for fpath in audio_paths}
    n_collisions = len(audio_paths) - len(audio_dict)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    for trs_path in trs_dict.values():
        if verbose:
            print("Processing %s" % trs_path)

        # Parse the file.
        doc = xml.dom.minidom.parse(trs_path)
        fname = doc.getElementsByTagName(
            "Trans")[0].attributes['audio_filename'].value
        if fname not in audio_dict or not os.path.isfile(audio_dict[fname]):
            if verbose:
                print("Lost audio file: %s" % fname)
            n_missing_audio += 1
            continue
        audio_path = audio_dict[fname]

        # Convert the whole recording to a temporary WAV that the
        # utterances are cut from.
        tmp_wav_path = os.path.join(outdir, fname + '.wav')
        to_wav(audio_path, tmp_wav_path)

        # Turns and utterances are numbered from 1 in the output names.
        for i, turn in enumerate(doc.getElementsByTagName("Turn"), 1):
            utterances = _turn_utterances(turn)
            for j, (trs, starttime, endtime) in enumerate(utterances, 1):
                if (endtime - starttime) < 0.2:
                    print("Too short segment")
                    continue

                # The accumulated text starts with a space whenever a text
                # node was seen, so test the stripped text.
                if not trs.strip():
                    n_missing_trs += 1

                wav_name = '%s_%02d_%04d.wav' % (fname, i, j)
                wav_path = os.path.join(
                    outdir,
                    "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)),
                    wav_name)
                wav_dir = os.path.dirname(wav_path)
                if not os.path.exists(wav_dir):
                    os.makedirs(wav_dir)

                if verbose:
                    print("")
                    print("src: %s" % os.path.basename(audio_path))
                    print("tgt: %s" % wav_name)
                    print("time: %s %s" % (starttime, endtime))
                    print("orig transcription: %s" % trs.upper().strip())

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print("normalised trans:  %s" % trs)

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print("... excluded")
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Extract utterance from audio.
                segment_to_wav(tmp_wav_path, wav_path, starttime, endtime)
                size += os.path.getsize(wav_path)
                seconds += endtime - starttime

        # The full-length temporary WAV is no longer needed.
        os.remove(tmp_wav_path)

    return (size, seconds, n_collisions, n_overwrites, n_missing_audio,
            n_missing_trs)
def convert(args):
    """
    Looks for transcriptions (*.trans.txt) under the `args.infname'
    directory.  Each of their lines names a recording and gives its
    transcription; the recording is expected as NAME.flac next to the
    transcription file.  Recordings whose transcription is not excluded are
    converted to WAV and stored, along with the normalised transcription
    (.wav.trn), in a randomly chosen two-level subdirectory (00-99/00-99) of
    `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        number of collisions (recording names listed more than once)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    n_overwrites = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, '*.trans.txt',
                                     ignore_list_file)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    flac_names = []
    for txt_path in txt_paths:
        if verbose:
            print("Processing %s" % txt_path)
        path_prefix = os.path.dirname(txt_path)

        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            for line in txt_file:
                # Each line contains the name of the audio file and the
                # transcription.  Lines without the separating space (e.g.
                # blank lines) cannot be unpacked and are skipped.
                fields = line.split(' ', 1)
                if len(fields) != 2:
                    continue
                flac_name, trs = fields
                flac_names.append(flac_name)

                flac_path = os.path.join(path_prefix, flac_name + '.flac')
                wav_path = os.path.join(
                    outdir,
                    "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)),
                    flac_name + '.wav')

                if not os.path.isfile(flac_path):
                    continue

                # Process transcription
                if verbose:
                    print("")
                    print("# f: %s" % (flac_name + '.flac'))
                    print("orig transcription: %s" % trs.upper().strip())

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print("normalised trans:  %s" % trs)

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print("... excluded")
                    continue

                wc.update(trs.split())

                # Only now is anything written, so neither a missing
                # recording nor an excluded transcription leaves an empty
                # directory or a WAV without a transcription behind.
                wav_dir = os.path.dirname(wav_path)
                if not os.path.exists(wav_dir):
                    os.makedirs(wav_dir)

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Process audio file
                to_wav(flac_path, wav_path)
                size += os.path.getsize(wav_path)

    n_collisions = len(flac_names) - len(set(flac_names))

    return size, n_collisions, n_overwrites
def convert(args):
    """
    Looks for recordings (fe_*.sph) and transcriptions (fe_*.txt) under the
    `args.infname' directory.  Every utterance listed in a transcription is
    cut out of the recording of the same name with `sph2pipe', converted to
    WAV and stored, along with its normalised transcription (.wav.trn), in a
    randomly chosen two-level subdirectory (00-99/00-99) of `args.outdir'.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.
    Blank lines in that file are ignored.  The file is closed here.

    Arguments:
        args -- an object with the attributes `infname', `outdir',
            `language', `verbose', `ignore' and `dictionary'

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
            present in `args.outdir')
        number of missing files (file basenames referred in transcription
            logs but missing in the file system)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=("normalise_text", "exclude_asr",
                                    "exclude_by_dict"))

    # Read in the dictionary.  Blank lines are skipped, since
    # `line.split()[0]' would raise an IndexError on them.
    if dict_file:
        try:
            known_words = set(line.split()[0] for line in dict_file
                              if line.strip())
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, "fe_*.txt", ignore_list_file)

    # Find all audio files, create dictionary of paths by basename.
    sph_paths = find_with_ignorelist(infname, "fe_*.sph")
    sph_dict = {os.path.basename(fpath): fpath for fpath in sph_paths}
    n_collisions = len(sph_paths) - len(sph_dict)

    # Process the files.
    # Messages go through single-argument print(...) calls, which emit the
    # same text under the print statement and the print function.
    for txt_path in txt_paths:
        if verbose:
            print("Processing %s" % txt_path)

        src_name = os.path.splitext(os.path.basename(txt_path))[0]
        sph_name = src_name + ".sph"
        if sph_name not in sph_dict or not os.path.isfile(sph_dict[sph_name]):
            if verbose:
                print("Lost audio file: %s" % sph_name)
            n_missing_audio += 1
            continue
        sph_path = sph_dict[sph_name]

        utterances = []
        with codecs.open(txt_path, "r", "UTF-8") as txt_file:
            for line in txt_file:
                stripped = line.strip()
                if not stripped or stripped.startswith("#"):
                    continue  # ignore comments and empty lines

                # Each line contains start time, end time, speaker id and
                # transcription.  Lines with fewer fields cannot be unpacked
                # and are skipped.
                fields = line.split(" ", 3)
                if len(fields) != 4:
                    if verbose:
                        print("Malformed line skipped: %s" % stripped)
                    continue
                start, end, speaker, trs = fields
                start = float(start)
                end = float(end)
                # Speaker ids starting with 'A' are read from channel 1,
                # all others from channel 2.
                channel = 1 if speaker[0] == "A" else 2
                utterances.append({"start": start, "end": end, "trs": trs,
                                   "channel": channel})

        # Utterances are numbered from 0 in the output names.
        for i, utt in enumerate(utterances):
            trs = utt["trs"]
            wav_name = "%s_%03d.wav" % (src_name, i)
            wav_path = os.path.join(
                outdir,
                "{r:02}".format(r=random.randint(0, 99)),
                "{r:02}".format(r=random.randint(0, 99)),
                wav_name)
            wav_dir = os.path.dirname(wav_path)
            if not os.path.exists(wav_dir):
                os.makedirs(wav_dir)

            if verbose:
                print("")
                print("src: %s" % sph_name)
                print("tgt: %s" % wav_name)
                print("time: %s %s" % (utt["start"], utt["end"]))
                print("channel: %s" % utt["channel"])
                print("orig transcription: %s" % trs.upper().strip())

            trs = norm_mod.normalise_text(trs)

            if verbose:
                print("normalised trans:  %s" % trs)

            if known_words is not None:
                excluded = norm_mod.exclude_by_dict(trs, known_words)
            else:
                excluded = norm_mod.exclude_asr(trs)
            if excluded:
                if verbose:
                    print("... excluded")
                continue

            # Check for very short utterances
            if utt["end"] - utt["start"] < 1:
                if verbose:
                    print("... too short")
                continue

            wc.update(trs.split())

            if save_transcription(wav_path + ".trn", trs):
                n_overwrites += 1

            # Extract utterance from audio
            tmp_path = wav_path + ".tmp"
            cmd = [
                "sph2pipe",
                "-f", "wav",
                "-t", "%f:%f" % (utt["start"], utt["end"]),
                "-c", str(utt["channel"]),
                sph_path,
                tmp_path,
            ]
            if subprocess.call(cmd) != 0:
                # Do not feed a missing or partial file to `to_wav'.  The
                # transcription has already been stored at this point.
                print("sph2pipe failed: %s" % wav_name)
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                continue

            # Convert to valid WAV
            to_wav(tmp_path, wav_path)
            os.remove(tmp_path)

            size += os.path.getsize(wav_path)
            seconds += utt["end"] - utt["start"]

    return size, seconds, n_collisions, n_overwrites, n_missing_audio