# Collect normalised transcriptions (tt) and (wav path, transcription)
# pairs (pt) from transcribed call-log XML files.
# NOTE(review): `files`, `various`, `normalise_text` and `exclude_lm` are
# bound outside this excerpt.
tt = []
pt = []
for fn in files:
    # print "Processing:", fn
    doc = xml.dom.minidom.parse(fn)
    turns = doc.getElementsByTagName("turn")
    for turn in turns:
        recs_list = turn.getElementsByTagName("rec")
        trans_list = turn.getElementsByTagName("asr_transcription")

        if trans_list:
            # FIXME-style assumption: the last transcription is used as the
            # authoritative one.
            trans = trans_list[-1]

            t = various.get_text_from_xml_node(trans)
            t = normalise_text(t)

            # Skip utterances unsuitable for language modelling.
            if exclude_lm(t):
                continue

            # The silence does not have a label in the language model.
            t = t.replace('_SIL_', '')

            tt.append(t)

            # NOTE(review): assumes every turn with a transcription also has
            # at least one <rec> element; otherwise this raises IndexError.
            wav_file = recs_list[0].getAttribute('fname')
            wav_path = os.path.realpath(os.path.join(os.path.dirname(fn), wav_file))
            pt.append((wav_path, t))
def process_call_log(fn):
    """Process one transcribed call-log XML file in a worker process.

    For every user turn it extracts the normalised transcription, runs (or
    reads from the log) ASR, and applies the handcrafted SLU on the
    transcription.

    Arguments:
        fn -- path towards the asr_transcribed.xml call log

    Returns a tuple:
        (asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount)
    where each of the first five items is a list of (wav file name, value)
    pairs, fcount is the number of processed files (always 1 here) and
    tcount the number of processed turns.

    NOTE(review): relies on module-level `asr_rec` and `slu` objects and on
    `sys.argv` flags ('--asr-log') -- confirm they are initialised before
    the worker pool starts.
    """
    name = multiprocessing.current_process().name
    asr = []
    nbl = []
    sem = []
    trn = []
    trn_hdc_sem = []
    fcount = 0
    tcount = 0

    f_dir = os.path.dirname(fn)

    print "Process name:", name
    print "File #", fcount
    fcount += 1
    print "Processing:", fn

    doc = xml.dom.minidom.parse(fn)
    turns = doc.getElementsByTagName("turn")

    for i, turn in enumerate(turns):
        # Only user turns carry recordings/transcriptions of interest.
        if turn.getAttribute('speaker') != 'user':
            continue

        recs = turn.getElementsByTagName("rec")
        trans = turn.getElementsByTagName("asr_transcription")
        asrs = turn.getElementsByTagName("asr")

        # Require exactly one recording per user turn.
        if len(recs) != 1:
            print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                turn=i, fn=fn, recs=len(recs))
            continue

        # Recovery heuristics for logs where the <asr> element escaped into
        # the following turn (0 here, 2 in the next turn).
        if len(asrs) == 0 and (i + 1) < len(turns):
            next_asrs = turns[i + 1].getElementsByTagName("asr")
            if len(next_asrs) != 2:
                print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(
                    turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                continue
            print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(next_asrs))
            hyps = next_asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 1:
            hyps = asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 2:
            # Two <asr> elements in one turn: trust the last one.
            print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(
                fn=fn, asrs=len(asrs))
            hyps = asrs[-1].getElementsByTagName("hypothesis")
        else:
            print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(asrs))
            continue

        if len(trans) == 0:
            print "Skipping a turn in {fn} - trans: {trans}".format(
                fn=fn, trans=len(trans))
            continue

        wav_key = recs[0].getAttribute('fname')
        wav_path = os.path.join(f_dir, wav_key)

        # FIXME: Check whether the last transcription is really the best! FJ
        t = various.get_text_from_xml_node(trans[-1])
        t = normalise_text(t)

        # Either decode the wav live or take the ASR hypothesis logged in
        # the call log, depending on the '--asr-log' flag.
        if '--asr-log' not in sys.argv:
            asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
            a = unicode(asr_rec_nbl.get_best())
        else:
            a = various.get_text_from_xml_node(hyps[0])
            a = normalise_semi_words(a)

        if exclude_slu(t) or 'DOM Element:' in a:
            print "Skipping transcription:", unicode(t)
            print "Skipping ASR output: ", unicode(a)
            continue

        # The silence does not have a label in the language model.
        t = t.replace('_SIL_', '')
        trn.append((wav_key, t))

        print
        print "Transcritpiton #", tcount
        tcount += 1
        print "Parsing transcription:", unicode(t)
        print " ASR:", unicode(a)

        # HDC SLU on transcription
        s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da()
        trn_hdc_sem.append((wav_key, s))

        # 1 best ASR
        asr.append((wav_key, a))

        # N best ASR
        n = UtteranceNBList()
        if '--asr-log' not in sys.argv:
            n = asr_rec_nbl
            print 'ASR RECOGNITION NBLIST\n', unicode(n)
        else:
            for h in hyps:
                txt = various.get_text_from_xml_node(h)
                txt = normalise_semi_words(txt)
                # abs() guards against negative probabilities in the log.
                n.add(abs(float(h.getAttribute('p'))), Utterance(txt))

        n.merge()
        n.normalise()

        nbl.append((wav_key, n.serialise()))

        # there is no manual semantics in the transcriptions yet
        sem.append((wav_key, None))

    return asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount
def extract_trns_sems_from_file(fname, verbose, fields=None, normalise=True, do_exclude=True, known_words=None, robust=False):
    """ Extracts transcriptions and their semantic annotation from a CUED
    call log file.

    Arguments:
        fname -- path towards the call log file
        verbose -- print lots of output?
        fields -- names of fields that should be required for the output.
            Field names are strings corresponding to the element names in
            the transcription XML format. (default: all five of them)
        normalise -- whether to do normalisation on transcriptions
        do_exclude -- whether to exclude transcriptions not considered
            suitable
        known_words -- a collection of words. If provided, transcriptions
            are excluded which contain other words. If not provided,
            excluded are transcriptions that contain any of
            _excluded_characters. What "excluded" means depends on whether
            the transcriptions are required by being specified in `fields'.
        robust -- whether to assign recordings to turns robustly or trust
            where they are in the log. This could be useful for older CUED
            logs where the elements sometimes escape to another <turn> than
            they belong. However, in cases where `robust' leads to finding
            the correct recording for the user turn, the log is damaged at
            other places too, and the resulting turn record would be
            misleading. Therefore, we recommend leaving robust=False.

    Returns a list of TurnRecords.
    """
    if verbose:
        print 'Processing', fname

    # Interpret the arguments.
    if fields is None:
        fields = ("transcription", "semitran", "semihyp", "asrhyp", "rec")
    rec_filter = _make_rec_filter(fields)

    # Load the file.
    doc = xml.dom.minidom.parse(fname)
    uturns = doc.getElementsByTagName("userturn")
    if robust:
        # Candidate recordings anywhere in the log; '_all.wav' files are the
        # whole-call recordings and never belong to a single turn.
        audios = [audio for audio in doc.getElementsByTagName("rec")
                  if not audio.getAttribute('fname').endswith('_all.wav')]

    trns_sems = []
    for uturn in uturns:
        transcription = uturn.getElementsByTagName("transcription")
        cued_da = uturn.getElementsByTagName("semitran")
        cued_dahyp = uturn.getElementsByTagName("semihyp")
        asrhyp = uturn.getElementsByTagName("asrhyp")
        audio = uturn.getElementsByTagName("rec")

        # If there was something recognised but nothing recorded, if in the
        # robust mode,
        if asrhyp and not audio and robust:
            # Look for the recording elsewhere.
            audio = [_find_audio_for_turn(uturn, audios)]

        # This is the first form of the turn record, containing lists of XML
        # elements and suited only for internal use.
        rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio)
        if not rec_filter(rec):
            # Skip this node, it contains a wrong number of elements of either
            # transcription, cued_da, cued_dahyp, asrhyp, or audio.
            continue

        # XXX Here we take always the first tag having the respective tag name.
        transcription = get_text_from_xml_node(
            rec.transcription[0]).lower() if rec.transcription else None
        asrhyp = get_text_from_xml_node(
            rec.asrhyp[0]).lower() if rec.asrhyp else None

        # Filter the transcription and the ASR hypothesis through normalisation
        # and excluding non-conformant utterances.
        if transcription is not None:
            if normalise:
                transcription = normalise_text(transcription)
            if do_exclude:
                if known_words is not None:
                    trs_excluded = exclude_by_dict(transcription, known_words)
                else:
                    trs_excluded = exclude_asr(transcription)
                if trs_excluded:
                    if verbose:
                        print 'Excluded transcription: "{trs}".'.format(
                            trs=transcription)
                    # Drop the whole turn only when the transcription is a
                    # required field; otherwise just blank it.
                    if 'transcription' in fields:
                        continue
                    transcription = None

        if asrhyp is not None:
            if normalise:
                asrhyp = normalise_text(asrhyp)
            if do_exclude:
                if known_words is not None:
                    asr_excluded = exclude_by_dict(asrhyp, known_words)
                else:
                    asr_excluded = exclude_asr(asrhyp)
                if asr_excluded:
                    if verbose:
                        print 'Excluded ASR hypothesis: "{asr}".'.format(
                            asr=asrhyp)
                    # Same required-field logic as for the transcription.
                    if 'asrhyp' in fields:
                        continue
                    asrhyp = None

        cued_da = get_text_from_xml_node(
            rec.cued_da[0]) if rec.cued_da else None
        cued_dahyp = get_text_from_xml_node(
            rec.cued_dahyp[0]) if rec.cued_dahyp else None
        audio = rec.audio[0].getAttribute(
            'fname').strip() if rec.audio else None

        # Construct the resulting turn record.
        rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio)
        if verbose:
            print "#1 f:", rec.audio
            print "#2 t:", rec.transcription, "# s:", rec.cued_da
            print "#3 a:", rec.asrhyp, "# s:", rec.cued_dahyp
            print

        # Keep only turns with a semantic annotation unless 'semitran' is
        # not a required field.
        if rec.cued_da or 'semitran' not in fields:
            trns_sems.append(rec)
    return trns_sems
def extract_from_xml(indomain_data_dir, outdir, cfg): glob = 'asr_transcribed.xml' asr = asr_factory(cfg) print 'Collecting files under %s with glob %s' % (indomain_data_dir, glob) files = [] for root, dirnames, filenames in os.walk(indomain_data_dir, followlinks=True): for filename in fnmatch.filter(filenames, glob): files.append(os.path.join(root, filename)) # DEBUG example # files = [ # '/ha/projects/vystadial/data/call-logs/2013-05-30-alex-aotb-prototype/part1/2013-06-27-09-33-25.116055-CEST-00420221914256/asr_transcribed.xml'] try: trn, dec, dec_len, wav_len = [], [], [], [] for fn in files: doc = xml.dom.minidom.parse(fn) turns = doc.getElementsByTagName("turn") f_dir = os.path.dirname(fn) for turn in turns: if turn.getAttribute('speaker') != 'user': continue recs = turn.getElementsByTagName("rec") trans = turn.getElementsByTagName("asr_transcription") if len(recs) != 1: print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=turn.getAttribute('turn_number'), fn=fn, recs=len(recs)) continue if len(trans) == 0: print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans)) continue wav_file = recs[0].getAttribute('fname') # FIXME: Check whether the last transcription is really the best! FJ t = various.get_text_from_xml_node(trans[-1]) t = normalise_text(t) if exclude_lm(t): continue # TODO is it still valid? OP # The silence does not have a label in the language model. t = t.replace('_SIL_', '') trn.append((wav_file, t)) wav_path = os.path.join(f_dir, wav_file) best, dec_dur, fw_dur, wav_dur = decode_info(asr, cfg, wav_path, t) dec.append((wav_file, best)) wav_len.append((wav_file, wav_dur)) dec_len.append((wav_file, dec_dur)) except Exception as e: print 'PARTIAL RESULTS were saved to %s' % outdir print e raise e finally: trn_dict = dict(trn) dec_dict = dict(dec) wavlen_dict = dict(wav_len) declen_dict = dict(dec_len) compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict)
def get_results(self, timeout=0.6): """" Waits for the complete recognition results from the Julius ASR server. Timeout specifies how long it will wait for the end of message. """ msg = "" # Get results from the server. time_slept = 0.0 while time_slept < timeout: msg_part = self.read_server_message(self.msg_timeout) if not msg_part: # Wait and check whether there is a message. time.sleep(self.cfg['Hub']['main_loop_sleep_time']) time_slept += self.cfg['Hub']['main_loop_sleep_time'] if self.debug >= 2: print "gr.time_slept:", time_slept continue msg += msg_part + '\n' if self.debug: print msg if '<CONFNET>' in msg: break else: raise JuliusASRTimeoutException( "Timeout when waiting for the Julius server results.") # Process the results. """ Typical result returned by the Julius ASR. <STARTPROC/> <INPUT STATUS="LISTEN" TIME="1343896296"/> <INPUT STATUS="STARTREC" TIME="1343896311"/> <STARTRECOG/> <INPUT STATUS="ENDREC" TIME="1343896312"/> <ENDRECOG/> <INPUTPARAM FRAMES="164" MSEC="1640"/> <RECOGOUT> <SHYPO RANK="1" SCORE="-7250.111328"> <WHYPO WORD="" CLASSID="<s>" PHONE="sil" CM="0.887"/> <WHYPO WORD="I'M" CLASSID="I'M" PHONE="ah m" CM="0.705"/> <WHYPO WORD="LOOKING" CLASSID="LOOKING" PHONE="l uh k ih ng" CM="0.992"/> <WHYPO WORD="FOR" CLASSID="FOR" PHONE="f er" CM="0.757"/> <WHYPO WORD="A" CLASSID="A" PHONE="ah" CM="0.672"/> <WHYPO WORD="PUB" CLASSID="PUB" PHONE="p ah b" CM="0.409"/> <WHYPO WORD="" CLASSID="</s>" PHONE="sil" CM="1.000"/> </SHYPO> </RECOGOUT> <GRAPHOUT NODENUM="43" ARCNUM="70"> <NODE GID="0" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="2"/> <NODE GID="1" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="3"/> <NODE GID="2" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="4"/> <NODE GID="3" WORD="I" CLASSID="I" PHONE="ay" BEGIN="3" END="5"/> <NODE GID="4" WORD="NO" CLASSID="NO" PHONE="n ow" BEGIN="3" END="7"/> <NODE GID="5" WORD="I" CLASSID="I" PHONE="ay" BEGIN="4" END="6"/> <NODE GID="6" WORD="UH" CLASSID="UH" PHONE="ah" BEGIN="4" END="6"/> <NODE 
GID="7" WORD="I'M" CLASSID="I'M" PHONE="ay m" BEGIN="4" END="27"/> ... <NODE GID="38" WORD="PUB" CLASSID="PUB" PHONE="p ah b" BEGIN="79" END="104"/> <NODE GID="39" WORD="AH" CLASSID="AH" PHONE="aa" BEGIN="81" END="110"/> <NODE GID="40" WORD="LOT" CLASSID="LOT" PHONE="l aa t" BEGIN="81" END="110"/> <NODE GID="41" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="105" END="163"/> <NODE GID="42" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="111" END="163"/> <ARC FROM="0" TO="4"/> <ARC FROM="0" TO="3"/> <ARC FROM="1" TO="7"/> <ARC FROM="1" TO="5"/> <ARC FROM="1" TO="6"/> ... <ARC FROM="38" TO="41"/> <ARC FROM="39" TO="42"/> <ARC FROM="40" TO="42"/> </GRAPHOUT> <CONFNET> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.950">I</ALTERNATIVE> <ALTERNATIVE PROB="0.020">HI</ALTERNATIVE> <ALTERNATIVE PROB="0.013">NO</ALTERNATIVE> <ALTERNATIVE PROB="0.010"></ALTERNATIVE> <ALTERNATIVE PROB="0.006">UH</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.945">AM</ALTERNATIVE> <ALTERNATIVE PROB="0.055">I'M</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">LOOKING</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">FOR</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">A</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.963">PUB</ALTERNATIVE> <ALTERNATIVE PROB="0.016">AH</ALTERNATIVE> <ALTERNATIVE PROB="0.012">BAR</ALTERNATIVE> <ALTERNATIVE PROB="0.008">LOT</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> </CONFNET> <INPUT STATUS="LISTEN" TIME="1343896312"/> """ msg = "<RESULTS>" + msg + "</RESULTS>" msg = msg.replace("<s>", "<s>").replace("</s>", "</s>") nblist = UtteranceNBList() doc = xml.dom.minidom.parseString(msg) recogout = doc.getElementsByTagName("RECOGOUT") for el in recogout: shypo = el.getElementsByTagName("SHYPO") for el in shypo: whypo = el.getElementsByTagName("WHYPO") utterance = "" cm = 1.0 for el in whypo: word = el.getAttribute("WORD") utterance += " " + word if word: cm *= float(el.getAttribute("CM")) 
nblist.add(cm, Utterance(utterance)) nblist.merge() nblist.add_other() cn = UtteranceConfusionNetwork() confnet = doc.getElementsByTagName("CONFNET") for el in confnet: word = el.getElementsByTagName("WORD") for el in word: alternative = el.getElementsByTagName("ALTERNATIVE") word_list = [] for el in alternative: prob = float(el.getAttribute("PROB")) text = get_text_from_xml_node(el) word_list.append([prob, text]) # Filter out empty hypotheses. if len(word_list) == 0: continue if len(word_list) == 1 and len(word_list[0][1]) == 0: continue # Add the word into the confusion network. cn.add(word_list) cn.merge() cn.normalise() cn.prune() cn.normalise() cn.sort() return nblist, cn
def main():
    """Generate SLU training/dev/test data from in-domain transcribed call logs.

    Builds the handcrafted SLU pipeline and an ASR recogniser, walks the
    `indomain_data` directory for asr_transcribed.xml logs, extracts
    transcriptions / ASR outputs / n-best lists with their HDC SLU parses,
    and saves them with save_wavaskey -- first the unique and 'all' sets,
    then (unless '--uniq' is given) deterministically shuffled 80/10/10
    train/dev/test splits.
    """
    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTIENSLUPreprocessing(cldb)
    slu = PTIENHDCSLU(preprocessing, cfg={'SLU': {PTIENHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoEN/data/utt2da_dict.txt")}}})
    cfg = Config.load_configs(['../kaldi.cfg',], use_default=True)
    asr_rec = asr_factory(cfg)

    # Output file names.
    fn_uniq_trn = 'uniq.trn'
    fn_uniq_trn_hdc_sem = 'uniq.trn.hdc.sem'
    fn_uniq_trn_sem = 'uniq.trn.sem'

    fn_all_sem = 'all.sem'
    fn_all_trn = 'all.trn'
    fn_all_trn_hdc_sem = 'all.trn.hdc.sem'
    fn_all_asr = 'all.asr'
    fn_all_asr_hdc_sem = 'all.asr.hdc.sem'
    fn_all_nbl = 'all.nbl'
    fn_all_nbl_hdc_sem = 'all.nbl.hdc.sem'

    fn_train_sem = 'train.sem'
    fn_train_trn = 'train.trn'
    fn_train_trn_hdc_sem = 'train.trn.hdc.sem'
    fn_train_asr = 'train.asr'
    fn_train_asr_hdc_sem = 'train.asr.hdc.sem'
    fn_train_nbl = 'train.nbl'
    fn_train_nbl_hdc_sem = 'train.nbl.hdc.sem'

    fn_dev_sem = 'dev.sem'
    fn_dev_trn = 'dev.trn'
    fn_dev_trn_hdc_sem = 'dev.trn.hdc.sem'
    fn_dev_asr = 'dev.asr'
    fn_dev_asr_hdc_sem = 'dev.asr.hdc.sem'
    fn_dev_nbl = 'dev.nbl'
    fn_dev_nbl_hdc_sem = 'dev.nbl.hdc.sem'

    fn_test_sem = 'test.sem'
    fn_test_trn = 'test.trn'
    fn_test_trn_hdc_sem = 'test.trn.hdc.sem'
    fn_test_asr = 'test.asr'
    fn_test_asr_hdc_sem = 'test.asr.hdc.sem'
    fn_test_nbl = 'test.nbl'
    fn_test_nbl_hdc_sem = 'test.nbl.hdc.sem'

    indomain_data_dir = "indomain_data"

    print "Generating the SLU train and test data"
    print "-"*120
    ###############################################################################################

    # Collect logs up to six directory levels deep.
    files = []
    files.append(glob.glob(os.path.join(indomain_data_dir, 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', '*', 'asr_transcribed.xml')))
    files = various.flatten(files)

    sem = []
    trn = []
    trn_hdc_sem = []
    asr = []
    asr_hdc_sem = []
    nbl = []
    nbl_hdc_sem = []

    # Hard cap on the number of processed files.
    for fn in files[:100000]:
        f_dir = os.path.dirname(fn)

        print "Processing:", fn
        doc = xml.dom.minidom.parse(fn)
        turns = doc.getElementsByTagName("turn")

        for i, turn in enumerate(turns):
            # Only user turns carry recordings/transcriptions of interest.
            if turn.getAttribute('speaker') != 'user':
                continue

            recs = turn.getElementsByTagName("rec")
            trans = turn.getElementsByTagName("asr_transcription")
            asrs = turn.getElementsByTagName("asr")

            if len(recs) != 1:
                print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=i,fn=fn, recs=len(recs))
                continue

            # Recovery heuristics for logs where the <asr> element escaped
            # into the following turn (0 here, 2 in the next turn).
            if len(asrs) == 0 and (i + 1) < len(turns):
                next_asrs = turns[i+1].getElementsByTagName("asr")
                if len(next_asrs) != 2:
                    print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                    continue
                print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(turn=i, fn=fn, asrs=len(next_asrs))
                hyps = next_asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 1:
                hyps = asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 2:
                print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(fn=fn, asrs=len(asrs))
                hyps = asrs[-1].getElementsByTagName("hypothesis")
            else:
                print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(turn=i,fn=fn, asrs=len(asrs))
                continue

            if len(trans) == 0:
                print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans))
                continue

            wav_key = recs[0].getAttribute('fname')
            wav_path = os.path.join(f_dir, wav_key)

            # FIXME: Check whether the last transcription is really the best! FJ
            t = various.get_text_from_xml_node(trans[-1])
            t = normalise_text(t)

            # Live ASR decoding vs. logged ASR output ('--asr-log').
            if '--asr-log' not in sys.argv:
                asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
                a = unicode(asr_rec_nbl.get_best())
            else:
                a = various.get_text_from_xml_node(hyps[0])
                a = normalise_semi_words(a)

            if exclude_slu(t) or 'DOM Element:' in a:
                print "Skipping transcription:", unicode(t)
                print "Skipping ASR output: ", unicode(a)
                continue

            # The silence does not have a label in the language model.
            t = t.replace('_SIL_','')
            trn.append((wav_key, t))

            print "Parsing transcription:", unicode(t)
            print " ASR:", unicode(a)

            # HDC SLU on transcription
            s = slu.parse_1_best({'utt':Utterance(t)}).get_best_da()
            trn_hdc_sem.append((wav_key, s))

            if '--uniq' not in sys.argv:
                # HDC SLU on 1 best ASR
                if '--asr-log' not in sys.argv:
                    a = unicode(asr_rec_nbl.get_best())
                else:
                    a = various.get_text_from_xml_node(hyps[0])
                    a = normalise_semi_words(a)

                asr.append((wav_key, a))

                s = slu.parse_1_best({'utt':Utterance(a)}).get_best_da()
                asr_hdc_sem.append((wav_key, s))

                # HDC SLU on N best ASR
                n = UtteranceNBList()
                if '--asr-log' not in sys.argv:
                    n = asr_rec_nbl
                    print 'ASR RECOGNITION NBLIST\n',unicode(n)
                else:
                    for h in hyps:
                        txt = various.get_text_from_xml_node(h)
                        txt = normalise_semi_words(txt)
                        # abs() guards against negative probabilities in the log.
                        n.add(abs(float(h.getAttribute('p'))),Utterance(txt))

                n.merge()
                n.normalise()

                nbl.append((wav_key, n.serialise()))

                if '--fast' not in sys.argv:
                    s = slu.parse_nblist({'utt_nbl':n}).get_best_da()
                    nbl_hdc_sem.append((wav_key, s))

            # there is no manual semantics in the transcriptions yet
            sem.append((wav_key, None))

    # Deduplicate by transcription text: keep only the first occurrence of
    # each distinct transcription.
    uniq_trn = {}
    uniq_trn_hdc_sem = {}
    uniq_trn_sem = {}
    trn_set = set()

    sem = dict(trn_hdc_sem)
    for k, v in trn:
        if not v in trn_set:
            trn_set.add(v)
            uniq_trn[k] = v
            uniq_trn_hdc_sem[k] = sem[k]
            uniq_trn_sem[k] = v + " <=> " + unicode(sem[k])

    save_wavaskey(fn_uniq_trn, uniq_trn)
    save_wavaskey(fn_uniq_trn_hdc_sem, uniq_trn_hdc_sem, trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
    save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)

    # all
    save_wavaskey(fn_all_trn, dict(trn))
    save_wavaskey(fn_all_trn_hdc_sem, dict(trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

    # NOTE(review): the split section below is grouped under the '--uniq'
    # guard -- the exact guard extent could not be recovered from the
    # original formatting; verify against the project history.
    if '--uniq' not in sys.argv:
        save_wavaskey(fn_all_asr, dict(asr))
        save_wavaskey(fn_all_asr_hdc_sem, dict(asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_all_nbl, dict(nbl))
        save_wavaskey(fn_all_nbl_hdc_sem, dict(nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # Re-seeding before each shuffle keeps all parallel lists aligned.
        seed_value = 10

        random.seed(seed_value)
        random.shuffle(trn)
        random.seed(seed_value)
        random.shuffle(trn_hdc_sem)
        random.seed(seed_value)
        random.shuffle(asr)
        random.seed(seed_value)
        random.shuffle(asr_hdc_sem)
        random.seed(seed_value)
        random.shuffle(nbl)
        random.seed(seed_value)
        random.shuffle(nbl_hdc_sem)

        # 80% train / 10% dev / 10% test splits.

        # trn
        train_trn = trn[:int(0.8*len(trn))]
        dev_trn = trn[int(0.8*len(trn)):int(0.9*len(trn))]
        test_trn = trn[int(0.9*len(trn)):]

        save_wavaskey(fn_train_trn, dict(train_trn))
        save_wavaskey(fn_dev_trn, dict(dev_trn))
        save_wavaskey(fn_test_trn, dict(test_trn))

        # trn_hdc_sem
        train_trn_hdc_sem = trn_hdc_sem[:int(0.8*len(trn_hdc_sem))]
        dev_trn_hdc_sem = trn_hdc_sem[int(0.8*len(trn_hdc_sem)):int(0.9*len(trn_hdc_sem))]
        test_trn_hdc_sem = trn_hdc_sem[int(0.9*len(trn_hdc_sem)):]

        save_wavaskey(fn_train_trn_hdc_sem, dict(train_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_trn_hdc_sem, dict(dev_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_trn_hdc_sem, dict(test_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # asr
        train_asr = asr[:int(0.8*len(asr))]
        dev_asr = asr[int(0.8*len(asr)):int(0.9*len(asr))]
        test_asr = asr[int(0.9*len(asr)):]

        save_wavaskey(fn_train_asr, dict(train_asr))
        save_wavaskey(fn_dev_asr, dict(dev_asr))
        save_wavaskey(fn_test_asr, dict(test_asr))

        # asr_hdc_sem
        train_asr_hdc_sem = asr_hdc_sem[:int(0.8*len(asr_hdc_sem))]
        dev_asr_hdc_sem = asr_hdc_sem[int(0.8*len(asr_hdc_sem)):int(0.9*len(asr_hdc_sem))]
        test_asr_hdc_sem = asr_hdc_sem[int(0.9*len(asr_hdc_sem)):]

        save_wavaskey(fn_train_asr_hdc_sem, dict(train_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_asr_hdc_sem, dict(dev_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_asr_hdc_sem, dict(test_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # n-best lists
        train_nbl = nbl[:int(0.8*len(nbl))]
        dev_nbl = nbl[int(0.8*len(nbl)):int(0.9*len(nbl))]
        test_nbl = nbl[int(0.9*len(nbl)):]

        save_wavaskey(fn_train_nbl, dict(train_nbl))
        save_wavaskey(fn_dev_nbl, dict(dev_nbl))
        save_wavaskey(fn_test_nbl, dict(test_nbl))

        # nbl_hdc_sem
        train_nbl_hdc_sem = nbl_hdc_sem[:int(0.8*len(nbl_hdc_sem))]
        dev_nbl_hdc_sem = nbl_hdc_sem[int(0.8*len(nbl_hdc_sem)):int(0.9*len(nbl_hdc_sem))]
        test_nbl_hdc_sem = nbl_hdc_sem[int(0.9*len(nbl_hdc_sem)):]

        save_wavaskey(fn_train_nbl_hdc_sem, dict(train_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_nbl_hdc_sem, dict(dev_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_nbl_hdc_sem, dict(test_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
# NOTE(review): fragment of a feedback-copying routine; the enclosing
# definition and the bindings of `files`, `indir`, `verbose`, `force`,
# `password`, `user` and `server` are outside this excerpt.
files.extend(
    glob.glob(os.path.join(indir, '*', '*', '*', '*', '*', 'feedback.xml')))

for f in sorted(files):
    if verbose:
        print "Input feedback:", f

    # A sentinel file "<name>.copied" marks feedback that was already
    # transferred; --force re-copies regardless.
    copy_feedback = force or not os.path.exists(f + ".copied")

    if copy_feedback:
        doc = xml.dom.minidom.parse(f)
        els = doc.getElementsByTagName("dialogueId")
        if els:
            # The remote target path is stored inside the feedback XML.
            target = get_text_from_xml_node(els[0])
            if verbose:
                print "Target:", target

            # NOTE(review): shell=True with an interpolated password exposes
            # the password in the process list and is shell-injection prone
            # -- consider subprocess.call with an argument list.
            cmd = "pscp -p -pw %s %s %s@%s:%s" % (password, f, user, server, target)
            subprocess.call(cmd, shell=True)

            # Record that this feedback file has been copied.
            fc = open(f + ".copied", "w")
            fc.write("copied\n")
            fc.close()
    else:
        if verbose:
            print "The feedback was already copied:"
    # NOTE(review): this chunk begins mid-statement -- the opening
    # `files.extend(` of this call lies outside the excerpt.
    glob.glob(os.path.join(indir, '*', '*', '*', '*', 'feedback.xml')))
files.extend(glob.glob(
    os.path.join(indir, '*', '*', '*', '*', '*', 'feedback.xml')))

for f in sorted(files):
    if verbose:
        print "Input feedback:", f

    # A sentinel file "<name>.copied" marks feedback that was already
    # transferred; --force re-copies regardless.
    copy_feedback = force or not os.path.exists(f + ".copied")

    if copy_feedback:
        doc = xml.dom.minidom.parse(f)
        els = doc.getElementsByTagName("dialogueId")
        if els:
            # The remote target path is stored inside the feedback XML.
            target = get_text_from_xml_node(els[0])
            if verbose:
                print "Target:", target

            # NOTE(review): shell=True with an interpolated password exposes
            # the password in the process list and is shell-injection prone
            # -- consider subprocess.call with an argument list.
            cmd = "pscp -p -pw %s %s %s@%s:%s" % (
                password, f, user, server, target)
            subprocess.call(cmd, shell=True)

            # Record that this feedback file has been copied.
            fc = open(f + ".copied", "w")
            fc.write("copied\n")
            fc.close()
    else:
        if verbose:
            print "The feedback was already copied:"
def extract_from_xml(indomain_data_dir, outdir, cfg): """Extract transcription and Waves from xml Args: indomain_data_dir(path): path where the xml logs are stored outdir: directory to save the references and wave, Wav file names pairs cfg: Alex configuration """ glob = 'asr_transcribed.xml' asr = asr_factory(cfg) print 'Collecting files under %s with glob %s' % (indomain_data_dir, glob) files = [] for root, dirnames, filenames in os.walk(indomain_data_dir, followlinks=True): for filename in fnmatch.filter(filenames, glob): files.append(os.path.join(root, filename)) # DEBUG example # files = [ # '/ha/projects/vystadial/data/call-logs/2013-05-30-alex-aotb-prototype/part1/2013-06-27-09-33-25.116055-CEST-00420221914256/asr_transcribed.xml'] try: trn, dec, dec_len, wav_len = [], [], [], [] for fn in files: doc = xml.dom.minidom.parse(fn) turns = doc.getElementsByTagName("turn") f_dir = os.path.dirname(fn) for turn in turns: if turn.getAttribute('speaker') != 'user': continue recs = turn.getElementsByTagName("rec") trans = turn.getElementsByTagName("asr_transcription") if len(recs) != 1: print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format( turn=turn.getAttribute('turn_number'), fn=fn, recs=len(recs)) continue if len(trans) == 0: print "Skipping a turn in {fn} - trans: {trans}".format( fn=fn, trans=len(trans)) continue wav_file = recs[0].getAttribute('fname') # FIXME: Check whether the last transcription is really the best! FJ t = various.get_text_from_xml_node(trans[-1]) t = normalise_text(t) if exclude_lm(t): continue # TODO is it still valid? OP # The silence does not have a label in the language model. 
t = t.replace('_SIL_', '') trn.append((wav_file, t)) wav_path = os.path.join(f_dir, wav_file) best, dec_dur, fw_dur, wav_dur = decode_info( asr, cfg, outdir, wav_path, t) dec.append((wav_file, best)) wav_len.append((wav_file, wav_dur)) dec_len.append((wav_file, dec_dur)) except Exception as e: print 'PARTIAL RESULTS were saved to %s' % outdir print e raise e finally: trn_dict = dict(trn) dec_dict = dict(dec) wavlen_dict = dict(wav_len) declen_dict = dict(dec_len) compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict)
def process_call_log(fn):
    """Process one transcribed call-log XML file in a worker process.

    For every user turn it extracts the normalised transcription, runs (or
    reads from the log) ASR, and applies the handcrafted SLU on the
    transcription.

    Arguments:
        fn -- path towards the asr_transcribed.xml call log

    Returns a tuple:
        (asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount)
    where each of the first five items is a list of (wav file name, value)
    pairs, fcount is the number of processed files (always 1 here) and
    tcount the number of processed turns.

    NOTE(review): relies on module-level `asr_rec` and `slu` objects and on
    `sys.argv` flags ('--asr-log') -- confirm they are initialised before
    the worker pool starts.
    """
    name = multiprocessing.current_process().name
    asr = []
    nbl = []
    sem = []
    trn = []
    trn_hdc_sem = []
    fcount = 0
    tcount = 0

    f_dir = os.path.dirname(fn)

    print "Process name:", name
    print "File #", fcount
    fcount += 1
    print "Processing:", fn

    doc = xml.dom.minidom.parse(fn)
    turns = doc.getElementsByTagName("turn")

    for i, turn in enumerate(turns):
        # Only user turns carry recordings/transcriptions of interest.
        if turn.getAttribute('speaker') != 'user':
            continue

        recs = turn.getElementsByTagName("rec")
        trans = turn.getElementsByTagName("asr_transcription")
        asrs = turn.getElementsByTagName("asr")

        # Require exactly one recording per user turn.
        if len(recs) != 1:
            print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=i, fn=fn, recs=len(recs))
            continue

        # Recovery heuristics for logs where the <asr> element escaped into
        # the following turn (0 here, 2 in the next turn).
        if len(asrs) == 0 and (i + 1) < len(turns):
            next_asrs = turns[i + 1].getElementsByTagName("asr")
            if len(next_asrs) != 2:
                print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(turn=i, fn=fn, asrs=len(
                    asrs), next_asrs=len(
                    next_asrs))
                continue
            print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(next_asrs))
            hyps = next_asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 1:
            hyps = asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 2:
            # Two <asr> elements in one turn: trust the last one.
            print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(
                fn=fn, asrs=len(asrs))
            hyps = asrs[-1].getElementsByTagName("hypothesis")
        else:
            print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(turn=i, fn=fn, asrs=len(asrs))
            continue

        if len(trans) == 0:
            print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans))
            continue

        wav_key = recs[0].getAttribute('fname')
        wav_path = os.path.join(f_dir, wav_key)

        # FIXME: Check whether the last transcription is really the best! FJ
        t = various.get_text_from_xml_node(trans[-1])
        t = normalise_text(t)

        # Either decode the wav live or take the ASR hypothesis logged in
        # the call log, depending on the '--asr-log' flag.
        if '--asr-log' not in sys.argv:
            asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
            a = unicode(asr_rec_nbl.get_best())
        else:
            a = various.get_text_from_xml_node(hyps[0])
            a = normalise_semi_words(a)

        if exclude_slu(t) or 'DOM Element:' in a:
            print "Skipping transcription:", unicode(t)
            print "Skipping ASR output: ", unicode(a)
            continue

        # The silence does not have a label in the language model.
        t = t.replace('_SIL_', '')
        trn.append((wav_key, t))

        print
        print "Transcritpiton #", tcount
        tcount += 1
        print "Parsing transcription:", unicode(t)
        print " ASR:", unicode(a)

        # HDC SLU on transcription
        s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da()
        trn_hdc_sem.append((wav_key, s))

        # 1 best ASR
        asr.append((wav_key, a))

        # N best ASR
        n = UtteranceNBList()
        if '--asr-log' not in sys.argv:
            n = asr_rec_nbl
            print 'ASR RECOGNITION NBLIST\n', unicode(n)
        else:
            for h in hyps:
                txt = various.get_text_from_xml_node(h)
                txt = normalise_semi_words(txt)
                # abs() guards against negative probabilities in the log.
                n.add(abs(float(h.getAttribute('p'))), Utterance(txt))

        n.merge()
        n.normalise()

        nbl.append((wav_key, n.serialise()))

        # there is no manual semantics in the transcriptions yet
        sem.append((wav_key, None))

    return asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount