def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping, known_words=None, verbose=False): """Extracts wavs and their transcriptions from the named in `sess_fname', a CUED call log file. Extracting means copying them to `outdir'. Recordings themselves are expected to reside in `dirname'. If `known_words', a collection of words present in the phonetic dictionary, is provided, transcriptions are excluded which contain other words. If `known_words' is not provided, excluded are transcriptions that contain any of _excluded_characters. Returns the total size of audio files copied to `outdir', the number of overwritten files by the output files, the number of wav files that were missing from the `wav_mapping' dictionary, and the number of transcriptions not present for existing recordings. """ # Parse the file. try: doc = ElementTree.parse(sess_fname) except IOError as error: if verbose: print '!!! Could not parse "{fname}": {msg!s}.'.format( fname=sess_fname, msg=error) return 0, 0, 0, 0 uturns = doc.findall(".//userturn") annotations = doc.findall('.//annotation') print "# annotations: ", len(annotations) if len(annotations) > 1: # FIXME: This is bad! We waste about 1/3 of all data from CF. However, it is not possible to deduce # what transcription to use. print "Transcription was rejected as we have more then two transcriptions and " \ "we cannot decide which one is better." return 0, 0, 0, 0 for a in annotations: r = False if 'worker_id' in a.attrib and a.attrib['worker_id'] == '19113916': r = True if r: print "Transcription was rejected because of unreliable annotator." return 0, 0, 0, 0 size = 0 n_overwrites = 0 n_missing_wav = 0 n_missing_trs = 0 for uturn in uturns: # trs.text = uturn.getElementsByTagName("trs.text") # rec = uturn.getElementsByTagName("rec") rec = uturn.find("rec") trs = uturn.find("transcription") if trs is None: # this may be CF style transcription trs2 = uturn.find("transcriptions") if trs2 is not None: trs3 = trs2.findall("transcription") if trs3 is None: if rec is not None: n_missing_trs += 1 continue else: trs = trs3[-1].text else: continue else: trs = trs.text # Check this is the wav from this directory. wav_basename = rec.attrib['fname'].strip() if wav_basename in wav_mapping: wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename) if os.path.dirname(wav_fname) != dirname: missing_wav = True else: missing_wav = False else: missing_wav = True if not missing_wav: if verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print "# f:", wav_basename print "orig transcription:", trs.upper().strip() trs = normalise_text(trs) if verbose: print "normalised trans: ", trs if known_words is not None: excluded = exclude_by_dict(trs, known_words) else: excluded = exclude_asr(trs) if excluded: print "... excluded" continue wc.update(trs.split()) trs_fname = os.path.join(outdir, wav_basename + '.trn') try: szx = os.path.getsize(wav_fname) except OSError: print "Lost audio file:", wav_fname else: try: #shutil.copy2(wav_fname, outdir) tgt = os.path.join(outdir, os.path.basename(wav_fname)) cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format( src=wav_fname, tgt=tgt) print cmd os.system(cmd) size += os.path.getsize(tgt) except shutil.Error as e: print >>sys.stderr, \ ("Isn't the `outdir' with previously copied files " "below `infname' within the filesystem?\n") raise e n_overwrites += save_transcription(trs_fname, trs) shutil.copystat(sess_fname, trs_fname) else: n_missing_wav += 1 if args.verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print "(WW) Ignoring or missing_wav the file '{0}'."\ .format(wav_basename) if verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print return size, n_overwrites, n_missing_wav, n_missing_trs
def extract_trns_sems_from_file(fname, verbose, fields=None, normalise=True, do_exclude=True, known_words=None, robust=False): """ Extracts transcriptions and their semantic annotation from a CUED call log file. Arguments: fname -- path towards the call log file verbose -- print lots of output? fields -- names of fields that should be required for the output. Field names are strings corresponding to the element names in the transcription XML format. (default: all five of them) normalise -- whether to do normalisation on transcriptions do_exclude -- whether to exclude transcriptions not considered suitable known_words -- a collection of words. If provided, transcriptions are excluded which contain other words. If not provided, excluded are transcriptions that contain any of _excluded_characters. What "excluded" means depends on whether the transcriptions are required by being specified in `fields'. robust -- whether to assign recordings to turns robustly or trust where they are in the log. This could be useful for older CUED logs where the elements sometimes escape to another <turn> than they belong. However, in cases where `robust' leads to finding the correct recording for the user turn, the log is damaged at other places too, and the resulting turn record would be misleading. Therefore, we recommend leaving robust=False. Returns a list of TurnRecords. """ if verbose: print 'Processing', fname # Interpret the arguments. if fields is None: fields = ("transcription", "semitran", "semihyp", "asrhyp", "rec") rec_filter = _make_rec_filter(fields) # Load the file. doc = xml.dom.minidom.parse(fname) uturns = doc.getElementsByTagName("userturn") if robust: audios = [audio for audio in doc.getElementsByTagName("rec") if not audio.getAttribute('fname').endswith('_all.wav')] trns_sems = [] for uturn in uturns: transcription = uturn.getElementsByTagName("transcription") cued_da = uturn.getElementsByTagName("semitran") cued_dahyp = uturn.getElementsByTagName("semihyp") asrhyp = uturn.getElementsByTagName("asrhyp") audio = uturn.getElementsByTagName("rec") # If there was something recognised but nothing recorded, if in the # robust mode, if asrhyp and not audio and robust: # Look for the recording elsewhere. audio = [_find_audio_for_turn(uturn, audios)] # This is the first form of the turn record, containing lists of XML # elements and suited only for internal use. rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio) if not rec_filter(rec): # Skip this node, it contains a wrong number of elements of either # transcription, cued_da, cued_dahyp, asrhyp, or audio. continue # XXX Here we take always the first tag having the respective tag name. transcription = get_text_from_xml_node( rec.transcription[0]).lower() if rec.transcription else None asrhyp = get_text_from_xml_node( rec.asrhyp[0]).lower() if rec.asrhyp else None # Filter the transcription and the ASR hypothesis through normalisation # and excluding non-conformant utterances. if transcription is not None: if normalise: transcription = normalise_text(transcription) if do_exclude: if known_words is not None: trs_excluded = exclude_by_dict(transcription, known_words) else: trs_excluded = exclude_asr(transcription) if trs_excluded: if verbose: print 'Excluded transcription: "{trs}".'.format( trs=transcription) if 'transcription' in fields: continue transcription = None if asrhyp is not None: if normalise: asrhyp = normalise_text(asrhyp) if do_exclude: if known_words is not None: asr_excluded = exclude_by_dict(asrhyp, known_words) else: asr_excluded = exclude_asr(asrhyp) if asr_excluded: if verbose: print 'Excluded ASR hypothesis: "{asr}".'.format( asr=asrhyp) if 'asrhyp' in fields: continue asrhyp = None cued_da = get_text_from_xml_node( rec.cued_da[0]) if rec.cued_da else None cued_dahyp = get_text_from_xml_node( rec.cued_dahyp[0]) if rec.cued_dahyp else None audio = rec.audio[0].getAttribute( 'fname').strip() if rec.audio else None # Construct the resulting turn record. rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio) if verbose: print "#1 f:", rec.audio print "#2 t:", rec.transcription, "# s:", rec.cued_da print "#3 a:", rec.asrhyp, "# s:", rec.cued_dahyp print if rec.cued_da or 'semitran' not in fields: trns_sems.append(rec) return trns_sems
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping, known_words=None, verbose=False): """Extracts wavs and their transcriptions from the named in `sess_fname', a CUED call log file. Extracting means copying them to `outdir'. Recordings themselves are expected to reside in `dirname'. If `known_words', a collection of words present in the phonetic dictionary, is provided, transcriptions are excluded which contain other words. If `known_words' is not provided, excluded are transcriptions that contain any of _excluded_characters. Returns the total size of audio files copied to `outdir', the number of overwritten files by the output files, the number of wav files that were missing from the `wav_mapping' dictionary, and the number of transcriptions not present for existing recordings. """ # Parse the file. try: doc = ElementTree.parse(sess_fname) except IOError as error: if verbose: print '!!! Could not parse "{fname}": {msg!s}.'.format(fname=sess_fname, msg=error) return 0, 0, 0, 0 uturns = doc.findall(".//userturn") annotations = doc.findall('.//annotation') print "# annotations: ", len(annotations) if len(annotations) > 1: # FIXME: This is bad! We waste about 1/3 of all data from CF. However, it is not possible to deduce # what transcription to use. print "Transcription was rejected as we have more then two transcriptions and " \ "we cannot decide which one is better." return 0, 0, 0, 0 for a in annotations: r = False if 'worker_id' in a.attrib and a.attrib['worker_id'] == '19113916': r = True if r: print "Transcription was rejected because of unreliable annotator." return 0, 0, 0, 0 size = 0 n_overwrites = 0 n_missing_wav = 0 n_missing_trs = 0 for uturn in uturns: # trs.text = uturn.getElementsByTagName("trs.text") # rec = uturn.getElementsByTagName("rec") rec = uturn.find("rec") trs = uturn.find("transcription") if trs is None: # this may be CF style transcription trs2 = uturn.find("transcriptions") if trs2 is not None: trs3 = trs2.findall("transcription") if trs3 is None: if rec is not None: n_missing_trs += 1 continue else: trs = trs3[-1].text else: continue else: trs = trs.text # Check this is the wav from this directory. wav_basename = rec.attrib['fname'].strip() if wav_basename in wav_mapping: wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename) if os.path.dirname(wav_fname) != dirname: missing_wav = True else: missing_wav = False else: missing_wav = True if not missing_wav: if verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print "# f:", wav_basename print "orig transcription:", trs.upper().strip() trs = normalise_text(trs) if verbose: print "normalised trans: ", trs if known_words is not None: excluded = exclude_by_dict(trs, known_words) else: excluded = exclude_asr(trs) if excluded: print "... excluded" continue wc.update(trs.split()) # trs_fname = os.path.join(outdir, wav_basename + '.trn') sub_dir = "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99)) trs_fname = os.path.join(outdir, sub_dir[0], sub_dir[1], wav_basename + '.trn') if not os.path.exists(os.path.dirname(trs_fname)): os.makedirs(os.path.dirname(trs_fname)) try: szx = os.path.getsize(wav_fname) except OSError: print "Lost audio file:", wav_fname else: try: #shutil.copy2(wav_fname, outdir) # tgt = os.path.join(outdir, os.path.basename(wav_fname)) tgt = os.path.join(outdir, sub_dir[0], sub_dir[1], os.path.basename(wav_fname)) cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(src=wav_fname, tgt=tgt) print cmd os.system(cmd) size += os.path.getsize(tgt) except shutil.Error as e: print >>sys.stderr, \ ("Isn't the `outdir' with previously copied files " "below `infname' within the filesystem?\n") raise e n_overwrites += save_transcription(trs_fname, trs) shutil.copystat(sess_fname, trs_fname) else: n_missing_wav += 1 if args.verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print "(WW) Ignoring or missing_wav the file '{0}'."\ .format(wav_basename) if verbose: term_width = getTerminalSize()[1] or 80 print '-' * term_width print return size, n_overwrites, n_missing_wav, n_missing_trs