def convert(args):
    """
    Looks for .wav files and session logs under the `args.infname'
    directory.  Copies .wav files and their transcriptions linked from the
    log to `args.outdir' using the `extract_wavs_trns' function.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.  The
    file is closed by this function.

    Attributes read from `args':
        infname    -- where to look for wavs (passed to `find_wavs')
        outdir     -- output directory (passed to `extract_wavs_trns')
        language   -- passed through to `extract_wavs_trns'
        verbose    -- if true, each processed call log dir is printed
        ignore     -- passed to `find_wavs' as `ignore_list_file'
        dictionary -- open dictionary file or a false value

    Returns a tuple of four numbers:
        number of collisions (files at different paths with same basename)
        number of overwrites, summed over `extract_wavs_trns' calls (files
            with the same basename as previously present in `args.outdir')
        number of missing wavs, summed over `extract_wavs_trns' calls
            (presumably file basenames referred in the logs but missing in
            the file system, e.g. because they were ignored)
        number of missing transcriptions, summed over `extract_wavs_trns'
            calls (exact meaning is defined by that function)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.
    if dict_file:
        known_words = set()
        try:
            for line in dict_file:
                fields = line.split()
                # Blank lines have no first column; skip them instead of
                # failing with an IndexError.
                if fields:
                    known_words.add(fields[0])
        finally:
            # Close the file even if reading it fails.
            dict_file.close()
    else:
        known_words = None

    # Find wavs.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)

    # Map file basenames to the directories they live in -- NOTE this can be
    # destructive if multiple files have the same basename.
    # wav_mapping: wav basename -> path to call log dir
    wav_mapping = {}
    for fpath in wav_paths:
        prefix, name = os.path.split(fpath)
        wav_mapping[name] = prefix
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all session logs.
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its session file
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Many wavs share one dir, so
    # walking the raw mapping values would count a dir once per wav file.
    for prefix in sorted(set(wav_mapping.values())):
        norm_fname = os.path.join(prefix, 'asr_transcribed.xml')
        if os.path.isfile(norm_fname):
            sess_fnames[prefix] = norm_fname
        elif os.path.isfile(os.path.join(prefix, 'session.xml')):
            # There is a session log, but no ASR-transcribed one.
            n_notnorm_trss += 1
        else:
            n_missing_trss += 1

    # Single-argument print calls behave the same on Python 2 and 3.
    print("")
    print("Number of sessions:  %s" % (len(sess_fnames),))
    print("Number of untranscribed sessions:  %s" % (n_notnorm_trss,))
    print("Number of missing sessions:  %s" % (n_missing_trss,))
    print("")

    # Copy files referred in the session logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sess_fnames.items():
        if verbose:
            print("Processing call log dir: %s" % (prefix,))

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    print("Size of copied audio data: %s" % (size,))

    # 16000 samples per second, 2 bytes per sample.  Divide as floats so the
    # duration is not truncated to whole seconds on Python 2.
    sec = size / float(16000 * 2)
    hour = sec / 3600.0
    print("Length of audio data in hours (for 16kHz 16b WAVs): %s" % (hour,))

    # Return the numbers of collisions, overwrites, missing wavs and missing
    # transcriptions.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
def convert(args):
    """
    Looks for .wav files and transcription logs under the `args.infname'
    directory.  Copies .wav files and their transcriptions linked from the
    log to `args.outdir' using the `extract_wavs_trns' function.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.  The
    file is closed by this function.

    Attributes read from `args':
        infname    -- where to look for wavs (passed to `find_wavs')
        outdir     -- output directory (passed to `extract_wavs_trns')
        verbose    -- if true, each processed call log dir and its
                      transcription file are printed
        ignore     -- passed to `find_wavs' as `ignore_list_file'
        dictionary -- open dictionary file or a false value

    Returns a tuple of four numbers:
        number of collisions (files at different paths with same basename)
        number of overwrites, summed over `extract_wavs_trns' calls (files
            with the same basename as previously present in `args.outdir')
        number of missing wavs, summed over `extract_wavs_trns' calls
            (presumably file basenames referred in transcription logs but
            missing in the file system, e.g. because they were ignored)
        number of missing transcriptions, summed over `extract_wavs_trns'
            calls (exact meaning is defined by that function)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.
    if dict_file:
        known_words = set()
        try:
            for line in dict_file:
                fields = line.split()
                # Blank lines have no first column; skip them instead of
                # failing with an IndexError.
                if fields:
                    known_words.add(fields[0])
        finally:
            # Close the file even if reading it fails.
            dict_file.close()
    else:
        known_words = None

    # Find wavs.  Ignored files are expected to be filtered out by
    # `find_wavs' itself, so no extra filtering by basename is done here.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)

    # Map file basenames to the directories they live in -- NOTE this can be
    # destructive if multiple files have the same basename.
    # wav_mapping: wav basename -> path to call log dir
    wav_mapping = {}
    for fpath in wav_paths:
        prefix, name = os.path.split(fpath)
        wav_mapping[name] = prefix
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    # Candidate file names in order of preference; only the first one is the
    # normalised form, the others count as unnormalised transcriptions.
    norm_basename = 'user-transcription.norm.xml'
    candidates = (norm_basename,
                  'user-transcription.xml',
                  'user-transcription-all.xml')
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its transcription file
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Many wavs share one dir, so
    # walking the raw mapping values would count a dir once per wav file.
    for prefix in sorted(set(wav_mapping.values())):
        for basename in candidates:
            trs_fname = os.path.join(prefix, basename)
            if os.path.isfile(trs_fname):
                sess_fnames[prefix] = trs_fname
                if basename != norm_basename:
                    n_notnorm_trss += 1
                break
        else:
            # None of the candidate files exists in this dir.
            n_missing_trss += 1

    # Single-argument print calls behave the same on Python 2 and 3.
    print("")
    print("Number of sessions : %s" % (len(sess_fnames),))
    print("Number of unnormalised transcriptions: %s" % (n_notnorm_trss,))
    print("Number of missing transcriptions : %s" % (n_missing_trss,))
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sess_fnames.items():
        if verbose:
            print("Processing call log dir: %s" % (prefix,))
            print(" session file name:  %s" % (call_log,))

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    print("Size of copied audio data: %s" % (size,))

    # 16000 samples per second, 2 bytes per sample.  Divide as floats so the
    # duration is not truncated to whole seconds on Python 2.
    sec = size / float(16000 * 2)
    hour = sec / 3600.0
    print("Length of audio data in hours (for 16kHz 16b WAVs): %s" % (hour,))

    # Return the numbers of collisions, overwrites, missing wavs and missing
    # transcriptions.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
def convert(args):
    """
    Looks for .wav files and session logs under the `args.infname'
    directory.  Copies .wav files and their transcriptions linked from the
    log to `args.outdir' using the `extract_wavs_trns' function.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.  The
    file is closed by this function.

    Attributes read from `args':
        infname    -- where to look for wavs (passed to `find_wavs')
        outdir     -- output directory (passed to `extract_wavs_trns')
        language   -- passed through to `extract_wavs_trns'
        verbose    -- if true, each processed call log dir is printed
        ignore     -- passed to `find_wavs' as `ignore_list_file'
        dictionary -- open dictionary file or a false value

    Returns a tuple of four numbers:
        number of collisions (files at different paths with same basename)
        number of overwrites, summed over `extract_wavs_trns' calls (files
            with the same basename as previously present in `args.outdir')
        number of missing wavs, summed over `extract_wavs_trns' calls
            (presumably file basenames referred in the logs but missing in
            the file system, e.g. because they were ignored)
        number of missing transcriptions, summed over `extract_wavs_trns'
            calls (exact meaning is defined by that function)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.
    if dict_file:
        known_words = set()
        try:
            for line in dict_file:
                fields = line.split()
                # Blank lines have no first column; skip them instead of
                # failing with an IndexError.
                if fields:
                    known_words.add(fields[0])
        finally:
            # Close the file even if reading it fails.
            dict_file.close()
    else:
        known_words = None

    # Find wavs.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)

    # Map file basenames to the directories they live in -- NOTE this can be
    # destructive if multiple files have the same basename.
    # wav_mapping: wav basename -> path to call log dir
    wav_mapping = {}
    for fpath in wav_paths:
        prefix, name = os.path.split(fpath)
        wav_mapping[name] = prefix
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all session logs.
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its session file
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Many wavs share one dir, so
    # walking the raw mapping values would count a dir once per wav file.
    for prefix in sorted(set(wav_mapping.values())):
        norm_fname = os.path.join(prefix, 'asr_transcribed.xml')
        if os.path.isfile(norm_fname):
            sess_fnames[prefix] = norm_fname
        elif os.path.isfile(os.path.join(prefix, 'session.xml')):
            # There is a session log, but no ASR-transcribed one.
            n_notnorm_trss += 1
        else:
            n_missing_trss += 1

    # Single-argument print calls behave the same on Python 2 and 3.
    print("")
    print("Number of sessions:  %s" % (len(sess_fnames),))
    print("Number of untranscribed sessions:  %s" % (n_notnorm_trss,))
    print("Number of missing sessions:  %s" % (n_missing_trss,))
    print("")

    # Copy files referred in the session logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sess_fnames.items():
        if verbose:
            print("Processing call log dir: %s" % (prefix,))

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    print("Size of copied audio data: %s" % (size,))

    # 16000 samples per second, 2 bytes per sample.  Divide as floats so the
    # duration is not truncated to whole seconds on Python 2.
    sec = size / float(16000 * 2)
    hour = sec / 3600.0
    print("Length of audio data in hours (for 16kHz 16b WAVs): %s" % (hour,))

    # Return the numbers of collisions, overwrites, missing wavs and missing
    # transcriptions.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
def convert(args):
    """
    Looks for .wav files and transcription logs under the `args.infname'
    directory.  Copies .wav files and their transcriptions linked from the
    log to `args.outdir' using the `extract_wavs_trns' function.

    `args.dictionary' may refer to an open file listing the only words to be
    allowed in transcriptions in the first whitespace-separated column.  The
    file is closed by this function.

    Attributes read from `args':
        infname    -- where to look for wavs (passed to `find_wavs')
        outdir     -- output directory (passed to `extract_wavs_trns')
        verbose    -- if true, each processed call log dir and its
                      transcription file are printed
        ignore     -- passed to `find_wavs' as `ignore_list_file'
        dictionary -- open dictionary file or a false value

    Returns a tuple of four numbers:
        number of collisions (files at different paths with same basename)
        number of overwrites, summed over `extract_wavs_trns' calls (files
            with the same basename as previously present in `args.outdir')
        number of missing wavs, summed over `extract_wavs_trns' calls
            (presumably file basenames referred in transcription logs but
            missing in the file system, e.g. because they were ignored)
        number of missing transcriptions, summed over `extract_wavs_trns'
            calls (exact meaning is defined by that function)

    """
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.
    if dict_file:
        known_words = set()
        try:
            for line in dict_file:
                fields = line.split()
                # Blank lines have no first column; skip them instead of
                # failing with an IndexError.
                if fields:
                    known_words.add(fields[0])
        finally:
            # Close the file even if reading it fails.
            dict_file.close()
    else:
        known_words = None

    # Find wavs.  Ignored files are expected to be filtered out by
    # `find_wavs' itself, so no extra filtering by basename is done here.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)

    # Map file basenames to the directories they live in -- NOTE this can be
    # destructive if multiple files have the same basename.
    # wav_mapping: wav basename -> path to call log dir
    wav_mapping = {}
    for fpath in wav_paths:
        prefix, name = os.path.split(fpath)
        wav_mapping[name] = prefix
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    # Candidate file names in order of preference; only the first one is the
    # normalised form, the others count as unnormalised transcriptions.
    norm_basename = 'user-transcription.norm.xml'
    candidates = (norm_basename,
                  'user-transcription.xml',
                  'user-transcription-all.xml')
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its transcription file
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Many wavs share one dir, so
    # walking the raw mapping values would count a dir once per wav file.
    for prefix in sorted(set(wav_mapping.values())):
        for basename in candidates:
            trs_fname = os.path.join(prefix, basename)
            if os.path.isfile(trs_fname):
                sess_fnames[prefix] = trs_fname
                if basename != norm_basename:
                    n_notnorm_trss += 1
                break
        else:
            # None of the candidate files exists in this dir.
            n_missing_trss += 1

    # Single-argument print calls behave the same on Python 2 and 3.
    print("")
    print("Number of sessions : %s" % (len(sess_fnames),))
    print("Number of unnormalised transcriptions: %s" % (n_notnorm_trss,))
    print("Number of missing transcriptions : %s" % (n_missing_trss,))
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sess_fnames.items():
        if verbose:
            print("Processing call log dir: %s" % (prefix,))
            print(" session file name:  %s" % (call_log,))

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    print("Size of copied audio data: %s" % (size,))

    # 16000 samples per second, 2 bytes per sample.  Divide as floats so the
    # duration is not truncated to whole seconds on Python 2.
    sec = size / float(16000 * 2)
    hour = sec / 3600.0
    print("Length of audio data in hours (for 16kHz 16b WAVs): %s" % (hour,))

    # Return the numbers of collisions, overwrites, missing wavs and missing
    # transcriptions.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs