def process_dm(self):
    """Runs one system turn of the dialogue.

    Opens a "system" turn in the session logger, lets the dialogue manager
    log its state, asks it for the output dialogue act, logs and displays
    that act, and finally generates and displays the system utterance.
    Horizontal rules spanning the terminal are printed around the output.
    """
    def print_rule(fill):
        # A rule as wide as the terminal (120 columns when the reported width
        # is 0 or None), followed by an empty line.  The single-argument
        # parenthesised form prints the same under the print statement.
        print(fill * (getTerminalSize()[1] or 120))
        print("")

    session_logger = self.cfg['Logging']['session_logger']

    # new turn
    session_logger.turn("system")
    self.dm.log_state()
    print_rule("=")

    sys_da = self.dm.da_out()
    self.output_sys_da(sys_da)
    session_logger.dialogue_act("system", sys_da)

    self.output_sys_utt(self.nlg.generate(sys_da))
    print_rule('-')
def process_utterance_hyp(self, obs):
    """Processes one user input hypothesis.

    Parses `obs' with the SLU component, displays the resulting dialogue
    acts, records them in the session logger as a "user" turn, prints a
    horizontal rule and hands the dialogue acts over to the dialogue manager
    together with the first value stored in `obs'.

    `obs' must provide a `values()' method returning an indexable sequence
    (as a Python 2 dict does).
    """
    das = self.slu.parse(obs)
    self.output_usr_da(das)

    session_logger = self.cfg['Logging']['session_logger']
    session_logger.turn("user")
    session_logger.slu("user", "*", das)

    # A rule as wide as the terminal; 120 columns when the width is 0 or None.
    rule_width = getTerminalSize()[1] or 120
    print('-' * rule_width)
    print("")

    first_value = obs.values()[0]
    self.dm.da_in(das, first_value)
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, lang='cs', verbose=False):
    """Extracts wavs and their transcriptions from the call log `sess_fname'.

    Extracting means converting the recordings with sox (mono, 16 kHz,
    16 bit) into `outdir' and saving the normalised transcription next to
    each of them as `<wav name>.trn'.  Only turns whose `speaker' attribute
    is "user" are processed, and only recordings that `wav_mapping' places
    in `dirname'.  The last `asr_transcription' element of a turn is used as
    its transcription.

    Arguments:
        dirname -- directory the recordings are expected to reside in
        sess_fname -- path to the XML call log
        outdir -- directory to write the wavs and transcriptions to
        wav_mapping -- maps a wav basename to the directory containing it
        known_words -- if provided, a collection of words present in the
            phonetic dictionary; transcriptions are then excluded using the
            normalisation module's `exclude_by_dict', otherwise using its
            `exclude_asr'
        lang -- key into `_LANG2NORMALISATION_MOD' selecting the text
            normalisation module
        verbose -- print progress information

    Returns a tuple of: the total size of the source audio files that were
    converted, the number of files overwritten by the saved transcriptions,
    the number of wavs missing from `wav_mapping' or not residing in
    `dirname', and the number of recordings without a transcription.

    The whole log is rejected, i.e. (0, 0, 0, 0) is returned, if it cannot
    be read, if it has more than one annotation, or if its annotation comes
    from the annotator known to be unreliable.

    """
    # Imported here so that the function does not rely on a file-level import.
    import subprocess

    # ID of the annotator whose transcriptions are not trusted.
    unreliable_worker_id = '19113916'
    nothing_extracted = (0, 0, 0, 0)

    # Import the appropriate normalisation module.
    norm_mod = __import__(_LANG2NORMALISATION_MOD[lang],
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Without a document there is nothing to do, verbose or not.
        return nothing_extracted

    annotations = doc.findall('.//annotation')
    if len(annotations) > 1:
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return nothing_extracted
    if any(annotation.attrib.get('worker_id') == unreliable_worker_id
           for annotation in annotations):
        print("Transcription was rejected because of unreliable annotator.")
        return nothing_extracted

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in doc.findall(".//turn"):
        if uturn.attrib.get('speaker') != "user":
            continue

        rec = uturn.find("rec")
        # findall() returns a (possibly empty) list, never None.
        asr_trss = uturn.findall("asr_transcription")
        # FIXME: Is the last transcription the right thing to be used?
        # Probably. Must be checked!
        # An element without any text has `text' set to None; that is
        # treated the same way as a missing transcription.
        trs = asr_trss[-1].text if asr_trss else None
        if trs is None:
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: nothing to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        wav_fname = None
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
        if wav_fname is None or os.path.dirname(wav_fname) != dirname:
            n_missing_wav += 1
            if verbose:
                print('-' * (getTerminalSize()[1] or 80))
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        # Strings are concatenated rather than formatted so that non-ASCII
        # transcriptions print the same way as with a multi-item print.
        if verbose:
            print('-' * (getTerminalSize()[1] or 80))
            print("# f: " + wav_basename)
            print("orig transcription: " + trs.upper().strip())

        trs = norm_mod.normalise_text(trs)

        if verbose:
            print("normalised trans:  " + trs)

        if known_words is not None:
            excluded = norm_mod.exclude_by_dict(trs, known_words)
        else:
            excluded = norm_mod.exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        wc.update(trs.split())

        try:
            wav_size = os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: " + wav_fname)
            continue

        tgt = os.path.join(outdir, os.path.basename(wav_fname))
        # The command is passed as an argument list (no shell involved), so
        # paths containing spaces or shell metacharacters are handled safely.
        sox_cmd = ['sox', '--ignore-length', wav_fname,
                   '-c', '1', '-r', '16000', '-b', '16', tgt]
        print(' '.join(sox_cmd))
        try:
            sox_failed = subprocess.call(sox_cmd) != 0
        except OSError:
            # sox could not be started at all.
            sox_failed = True
        if sox_failed:
            # Do not save a transcription for audio that was not produced.
            print("Audio conversion failed: " + wav_fname)
            continue

        size += wav_size
        trs_fname = os.path.join(outdir, wav_basename + '.trn')
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        print('-' * (getTerminalSize()[1] or 80))
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
# Make sure the alex package is visible. if __name__ == '__main__': import autopath from alex.utils.fs import find _LANG2NORMALISATION_MOD = { 'cs': 'alex.corpustools.text_norm_cs', 'en': 'alex.corpustools.text_norm_en', 'es': 'alex.corpustools.text_norm_es', } from alex.utils.ui import getTerminalSize try: _term_width = getTerminalSize()[1] except: _term_width = 80 def unique_str(): """Generates a fairly unique string.""" return hex(random.randint(0, 256 * 256 * 256 * 256 - 1))[2:] def cut_wavs(src, tgt, start, end): """Cuts out the interval `start'--`end' from the wav file `src' and saves it to `tgt'. """ existed = os.path.exists(tgt)
# Make sure the alex package is visible. if __name__ == "__main__": import autopath from alex.utils.fs import find _LANG2NORMALISATION_MOD = { "cs": "alex.corpustools.text_norm_cs", "en": "alex.corpustools.text_norm_en", "es": "alex.corpustools.text_norm_es", } from alex.utils.ui import getTerminalSize try: _term_width = getTerminalSize()[1] except: _term_width = 80 def unique_str(): """Generates a fairly unique string.""" return hex(random.randint(0, 256 * 256 * 256 * 256 - 1))[2:] def cut_wavs(src, tgt, start, end): """Cuts out the interval `start'--`end' from the wav file `src' and saves it to `tgt'. """ existed = os.path.exists(tgt)
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, lang='cs', verbose=False):
    """Extracts wavs and their transcriptions from the call log `sess_fname'.

    Extracting means converting the recordings with sox (mono, 16 kHz,
    16 bit) into `outdir' and saving the normalised transcription next to
    each of them as `<wav name>.trn'.  Only turns whose `speaker' attribute
    is "user" are processed, and only recordings that `wav_mapping' places
    in `dirname'.  The last `asr_transcription' element of a turn is used as
    its transcription.

    Arguments:
        dirname -- directory the recordings are expected to reside in
        sess_fname -- path to the XML call log
        outdir -- directory to write the wavs and transcriptions to
        wav_mapping -- maps a wav basename to the directory containing it
        known_words -- if provided, a collection of words present in the
            phonetic dictionary; transcriptions are then excluded using the
            normalisation module's `exclude_by_dict', otherwise using its
            `exclude_asr'
        lang -- key into `_LANG2NORMALISATION_MOD' selecting the text
            normalisation module
        verbose -- print progress information

    Returns a tuple of: the total size of the source audio files that were
    converted, the number of files overwritten by the saved transcriptions,
    the number of wavs missing from `wav_mapping' or not residing in
    `dirname', and the number of recordings without a transcription.

    The whole log is rejected, i.e. (0, 0, 0, 0) is returned, if it cannot
    be read, if it has more than one annotation, or if its annotation comes
    from the annotator known to be unreliable.

    """
    # Imported here so that the function does not rely on a file-level import.
    import subprocess

    # ID of the annotator whose transcriptions are not trusted.
    unreliable_worker_id = '19113916'
    nothing_extracted = (0, 0, 0, 0)

    # Import the appropriate normalisation module.
    norm_mod = __import__(_LANG2NORMALISATION_MOD[lang],
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Without a document there is nothing to do, verbose or not.
        return nothing_extracted

    annotations = doc.findall('.//annotation')
    if len(annotations) > 1:
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return nothing_extracted
    if any(annotation.attrib.get('worker_id') == unreliable_worker_id
           for annotation in annotations):
        print("Transcription was rejected because of unreliable annotator.")
        return nothing_extracted

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in doc.findall(".//turn"):
        if uturn.attrib.get('speaker') != "user":
            continue

        rec = uturn.find("rec")
        # findall() returns a (possibly empty) list, never None.
        asr_trss = uturn.findall("asr_transcription")
        # FIXME: Is the last transcription the right thing to be used?
        # Probably. Must be checked!
        # An element without any text has `text' set to None; that is
        # treated the same way as a missing transcription.
        trs = asr_trss[-1].text if asr_trss else None
        if trs is None:
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: nothing to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        wav_fname = None
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
        if wav_fname is None or os.path.dirname(wav_fname) != dirname:
            n_missing_wav += 1
            if verbose:
                print('-' * (getTerminalSize()[1] or 80))
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        # Strings are concatenated rather than formatted so that non-ASCII
        # transcriptions print the same way as with a multi-item print.
        if verbose:
            print('-' * (getTerminalSize()[1] or 80))
            print("# f: " + wav_basename)
            print("orig transcription: " + trs.upper().strip())

        trs = norm_mod.normalise_text(trs)

        if verbose:
            print("normalised trans:  " + trs)

        if known_words is not None:
            excluded = norm_mod.exclude_by_dict(trs, known_words)
        else:
            excluded = norm_mod.exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        wc.update(trs.split())

        try:
            wav_size = os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: " + wav_fname)
            continue

        tgt = os.path.join(outdir, os.path.basename(wav_fname))
        # The command is passed as an argument list (no shell involved), so
        # paths containing spaces or shell metacharacters are handled safely.
        sox_cmd = ['sox', '--ignore-length', wav_fname,
                   '-c', '1', '-r', '16000', '-b', '16', tgt]
        print(' '.join(sox_cmd))
        try:
            sox_failed = subprocess.call(sox_cmd) != 0
        except OSError:
            # sox could not be started at all.
            sox_failed = True
        if sox_failed:
            # Do not save a transcription for audio that was not produced.
            print("Audio conversion failed: " + wav_fname)
            continue

        size += wav_size
        trs_fname = os.path.join(outdir, wav_basename + '.trn')
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        print('-' * (getTerminalSize()[1] or 80))
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, verbose=False):
    """Extracts wavs and their transcriptions from the call log `sess_fname'.

    Extracting means converting the recordings with sox (mono, 16 kHz,
    16 bit) and saving the normalised transcription next to each of them as
    `<wav name>.trn'.  Both files of a recording go to a randomly chosen
    two-level sub-directory `outdir/NN/NN'.  Only recordings that
    `wav_mapping' places in `dirname' are processed.

    The transcription of a user turn is taken from its `transcription'
    element or, failing that, from the last `transcription' element inside
    its `transcriptions' element (CF style).

    Arguments:
        dirname -- directory the recordings are expected to reside in
        sess_fname -- path to the XML call log
        outdir -- root directory to write the wavs and transcriptions to
        wav_mapping -- maps a wav basename to the directory containing it
        known_words -- if provided, a collection of words present in the
            phonetic dictionary; transcriptions are then excluded using
            `exclude_by_dict', otherwise using `exclude_asr'
        verbose -- print progress information

    Returns a tuple of: the total size of the audio files written below
    `outdir', the number of files overwritten by the saved transcriptions,
    the number of wavs missing from `wav_mapping' or not residing in
    `dirname', and the number of recordings without a transcription.

    The whole log is rejected, i.e. (0, 0, 0, 0) is returned, if it cannot
    be read, if it has more than one annotation, or if its annotation comes
    from the annotator known to be unreliable.

    """
    # Imported here so that the function does not rely on a file-level import.
    import subprocess

    # ID of the annotator whose transcriptions are not trusted.
    unreliable_worker_id = '19113916'
    nothing_extracted = (0, 0, 0, 0)

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Without a document there is nothing to do, verbose or not.
        return nothing_extracted

    annotations = doc.findall('.//annotation')
    print("# annotations:  " + str(len(annotations)))
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However,
        # it is not possible to deduce what transcription to use.
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return nothing_extracted
    if any(annotation.attrib.get('worker_id') == unreliable_worker_id
           for annotation in annotations):
        print("Transcription was rejected because of unreliable annotator.")
        return nothing_extracted

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in doc.findall(".//userturn"):
        rec = uturn.find("rec")

        # Elements are compared with None explicitly: the truth value of an
        # element depends on whether it has children.
        trs_el = uturn.find("transcription")
        if trs_el is None:
            # this may be CF style transcription
            trs_container = uturn.find("transcriptions")
            if trs_container is None:
                continue
            # findall() returns a (possibly empty) list, never None.
            candidates = trs_container.findall("transcription")
            if candidates:
                trs_el = candidates[-1]

        # An element without any text has `text' set to None; that is
        # treated the same way as a missing transcription.
        trs = trs_el.text if trs_el is not None else None
        if trs is None:
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: nothing to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        wav_fname = None
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
        if wav_fname is None or os.path.dirname(wav_fname) != dirname:
            n_missing_wav += 1
            if verbose:
                print('-' * (getTerminalSize()[1] or 80))
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        # Strings are concatenated rather than formatted so that non-ASCII
        # transcriptions print the same way as with a multi-item print.
        if verbose:
            print('-' * (getTerminalSize()[1] or 80))
            print("# f: " + wav_basename)
            print("orig transcription: " + trs.upper().strip())

        trs = normalise_text(trs)

        if verbose:
            print("normalised trans:  " + trs)

        if known_words is not None:
            excluded = exclude_by_dict(trs, known_words)
        else:
            excluded = exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        wc.update(trs.split())

        # Spread the output over outdir/NN/NN sub-directories.
        tgt_dir = os.path.join(outdir,
                               "{r:02}".format(r=random.randint(0, 99)),
                               "{r:02}".format(r=random.randint(0, 99)))
        if not os.path.exists(tgt_dir):
            os.makedirs(tgt_dir)

        # Only checks that the source recording is accessible.
        try:
            os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: " + wav_fname)
            continue

        tgt = os.path.join(tgt_dir, os.path.basename(wav_fname))
        # The command is passed as an argument list (no shell involved), so
        # paths containing spaces or shell metacharacters are handled safely.
        sox_cmd = ['sox', '--ignore-length', wav_fname,
                   '-c', '1', '-r', '16000', '-b', '16', tgt]
        print(' '.join(sox_cmd))
        try:
            sox_failed = subprocess.call(sox_cmd) != 0
        except OSError:
            # sox could not be started at all.
            sox_failed = True
        if sox_failed or not os.path.exists(tgt):
            # Do not save a transcription for audio that was not produced.
            print("Audio conversion failed: " + wav_fname)
            continue

        size += os.path.getsize(tgt)
        trs_fname = os.path.join(tgt_dir, wav_basename + '.trn')
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        print('-' * (getTerminalSize()[1] or 80))
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, verbose=False):
    """Extracts wavs and their transcriptions from the call log `sess_fname'.

    Extracting means converting the recordings with sox (mono, 16 kHz,
    16 bit) and saving the normalised transcription next to each of them as
    `<wav name>.trn'.  Both files of a recording go to a randomly chosen
    two-level sub-directory `outdir/NN/NN'.  Only recordings that
    `wav_mapping' places in `dirname' are processed.

    The transcription of a user turn is taken from its `transcription'
    element or, failing that, from the last `transcription' element inside
    its `transcriptions' element (CF style).

    Arguments:
        dirname -- directory the recordings are expected to reside in
        sess_fname -- path to the XML call log
        outdir -- root directory to write the wavs and transcriptions to
        wav_mapping -- maps a wav basename to the directory containing it
        known_words -- if provided, a collection of words present in the
            phonetic dictionary; transcriptions are then excluded using
            `exclude_by_dict', otherwise using `exclude_asr'
        verbose -- print progress information

    Returns a tuple of: the total size of the audio files written below
    `outdir', the number of files overwritten by the saved transcriptions,
    the number of wavs missing from `wav_mapping' or not residing in
    `dirname', and the number of recordings without a transcription.

    The whole log is rejected, i.e. (0, 0, 0, 0) is returned, if it cannot
    be read, if it has more than one annotation, or if its annotation comes
    from the annotator known to be unreliable.

    """
    # Imported here so that the function does not rely on a file-level import.
    import subprocess

    # ID of the annotator whose transcriptions are not trusted.
    unreliable_worker_id = '19113916'
    nothing_extracted = (0, 0, 0, 0)

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Without a document there is nothing to do, verbose or not.
        return nothing_extracted

    annotations = doc.findall('.//annotation')
    print("# annotations:  " + str(len(annotations)))
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However,
        # it is not possible to deduce what transcription to use.
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return nothing_extracted
    if any(annotation.attrib.get('worker_id') == unreliable_worker_id
           for annotation in annotations):
        print("Transcription was rejected because of unreliable annotator.")
        return nothing_extracted

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in doc.findall(".//userturn"):
        rec = uturn.find("rec")

        # Elements are compared with None explicitly: the truth value of an
        # element depends on whether it has children.
        trs_el = uturn.find("transcription")
        if trs_el is None:
            # this may be CF style transcription
            trs_container = uturn.find("transcriptions")
            if trs_container is None:
                continue
            # findall() returns a (possibly empty) list, never None.
            candidates = trs_container.findall("transcription")
            if candidates:
                trs_el = candidates[-1]

        # An element without any text has `text' set to None; that is
        # treated the same way as a missing transcription.
        trs = trs_el.text if trs_el is not None else None
        if trs is None:
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: nothing to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        wav_fname = None
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
        if wav_fname is None or os.path.dirname(wav_fname) != dirname:
            n_missing_wav += 1
            if verbose:
                print('-' * (getTerminalSize()[1] or 80))
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        # Strings are concatenated rather than formatted so that non-ASCII
        # transcriptions print the same way as with a multi-item print.
        if verbose:
            print('-' * (getTerminalSize()[1] or 80))
            print("# f: " + wav_basename)
            print("orig transcription: " + trs.upper().strip())

        trs = normalise_text(trs)

        if verbose:
            print("normalised trans:  " + trs)

        if known_words is not None:
            excluded = exclude_by_dict(trs, known_words)
        else:
            excluded = exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        wc.update(trs.split())

        # Spread the output over outdir/NN/NN sub-directories.
        tgt_dir = os.path.join(outdir,
                               "{r:02}".format(r=random.randint(0, 99)),
                               "{r:02}".format(r=random.randint(0, 99)))
        if not os.path.exists(tgt_dir):
            os.makedirs(tgt_dir)

        # Only checks that the source recording is accessible.
        try:
            os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: " + wav_fname)
            continue

        tgt = os.path.join(tgt_dir, os.path.basename(wav_fname))
        # The command is passed as an argument list (no shell involved), so
        # paths containing spaces or shell metacharacters are handled safely.
        sox_cmd = ['sox', '--ignore-length', wav_fname,
                   '-c', '1', '-r', '16000', '-b', '16', tgt]
        print(' '.join(sox_cmd))
        try:
            sox_failed = subprocess.call(sox_cmd) != 0
        except OSError:
            # sox could not be started at all.
            sox_failed = True
        if sox_failed or not os.path.exists(tgt):
            # Do not save a transcription for audio that was not produced.
            print("Audio conversion failed: " + wav_fname)
            continue

        size += os.path.getsize(tgt)
        trs_fname = os.path.join(tgt_dir, wav_basename + '.trn')
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        print('-' * (getTerminalSize()[1] or 80))
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs