# Walk every transcribed log file and collect the usable transcriptions in
# ``tt`` together with their (wav path, transcription) pairs in ``pt``.
for fn in files:
    log_dir = os.path.dirname(fn)
    doc = xml.dom.minidom.parse(fn)

    for turn in doc.getElementsByTagName("turn"):
        recs_list = turn.getElementsByTagName("rec")
        trans_list = turn.getElementsByTagName("asr_transcription")

        # Turns without any transcription contribute nothing.
        if not trans_list:
            continue

        # Only the last transcription of the turn is used.
        t = normalise_text(various.get_text_from_xml_node(trans_list[-1]))
        if exclude_lm(t):
            continue

        # The silence does not have a label in the language model.
        t = t.replace('_SIL_', '')
        tt.append(t)

        wav_file = recs_list[0].getAttribute('fname')
        wav_path = os.path.realpath(os.path.join(log_dir, wav_file))
        pt.append((wav_path, t))

# Fixed seed so that the shuffled order is reproducible between runs.
random.seed(10)
sf = list(zip(tt, pt))
random.shuffle(sf)
def extract_from_xml(indomain_data_dir, outdir, cfg):
    """Decode every transcribed user turn found under a call-log directory.

    The directory tree is walked (following symlinks) for files named
    ``asr_transcribed.xml``. For each ``turn`` element whose ``speaker``
    attribute is ``user`` and which has exactly one ``rec`` and at least one
    ``asr_transcription``, the last transcription is normalised, the
    recording is decoded with ``decode_info``, and the reference, the
    decoded hypothesis and both durations are collected.

    The collected data is handed to ``compute_save_stat`` in a ``finally``
    clause, so statistics are computed from the partial results even when
    processing stops early.

    Args:
        indomain_data_dir: root directory searched for the xml logs.
        outdir: directory passed to ``compute_save_stat``.
        cfg: configuration passed to ``asr_factory`` and ``decode_info``.

    Raises:
        Exception: any error raised while parsing or decoding is printed and
            re-raised with its original traceback.
    """
    # Named ``pattern`` so the local does not shadow the stdlib ``glob`` module.
    pattern = 'asr_transcribed.xml'
    asr = asr_factory(cfg)

    print('Collecting files under %s with glob %s' % (indomain_data_dir, pattern))
    files = []
    for root, _dirnames, filenames in os.walk(indomain_data_dir, followlinks=True):
        files.extend(os.path.join(root, filename)
                     for filename in fnmatch.filter(filenames, pattern))

    # Bound before the ``try`` so the ``finally`` clause can always read them.
    # Every list holds (wav file name, value) pairs; they become dicts keyed
    # by the ``fname`` attribute, so a repeated name overwrites the earlier
    # entry.
    trn, dec, dec_len, wav_len = [], [], [], []
    try:
        for fn in files:
            doc = xml.dom.minidom.parse(fn)
            f_dir = os.path.dirname(fn)

            for turn in doc.getElementsByTagName("turn"):
                if turn.getAttribute('speaker') != 'user':
                    continue

                recs = turn.getElementsByTagName("rec")
                trans = turn.getElementsByTagName("asr_transcription")

                if len(recs) != 1:
                    print("Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                        turn=turn.getAttribute('turn_number'), fn=fn, recs=len(recs)))
                    continue

                if not trans:
                    print("Skipping a turn in {fn} - trans: {trans}".format(
                        fn=fn, trans=len(trans)))
                    continue

                wav_file = recs[0].getAttribute('fname')

                # FIXME: Check whether the last transcription is really the best! FJ
                t = normalise_text(various.get_text_from_xml_node(trans[-1]))
                if exclude_lm(t):
                    continue

                # TODO is it still valid? OP
                # The silence does not have a label in the language model.
                t = t.replace('_SIL_', '')
                trn.append((wav_file, t))

                # NOTE(review): if decode_info raises, ``trn`` keeps an entry
                # that has no counterpart in ``dec`` - confirm that
                # compute_save_stat copes with that.
                wav_path = os.path.join(f_dir, wav_file)
                best, dec_dur, _fw_dur, wav_dur = decode_info(asr, cfg, wav_path, t)
                dec.append((wav_file, best))
                wav_len.append((wav_file, wav_dur))
                dec_len.append((wav_file, dec_dur))
    except Exception as e:
        print('PARTIAL RESULTS were saved to %s' % outdir)
        print(e)
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it from this line on Python 2.
        raise
    finally:
        compute_save_stat(outdir, dict(trn), dict(dec), dict(wav_len), dict(dec_len))
def extract_from_xml(indomain_data_dir, outdir, cfg):
    """Extract transcriptions and wave files from xml logs and decode them.

    The directory tree is walked (following symlinks) for files named
    ``asr_transcribed.xml``. For each ``turn`` element whose ``speaker``
    attribute is ``user`` and which has exactly one ``rec`` and at least one
    ``asr_transcription``, the last transcription is normalised, the
    recording is decoded with ``decode_info``, and the reference, the
    decoded hypothesis and both durations are collected.

    The collected data is handed to ``compute_save_stat`` in a ``finally``
    clause, so statistics are computed from the partial results even when
    processing stops early.

    Args:
        indomain_data_dir(path): path where the xml logs are stored
        outdir: directory passed to ``decode_info`` and ``compute_save_stat``
            (the place where the references and wave file name pairs are
            to be saved)
        cfg: Alex configuration, passed to ``asr_factory`` and
            ``decode_info``

    Raises:
        Exception: any error raised while parsing or decoding is printed and
            re-raised with its original traceback.
    """
    # Named ``pattern`` so the local does not shadow the stdlib ``glob`` module.
    pattern = 'asr_transcribed.xml'
    asr = asr_factory(cfg)

    print('Collecting files under %s with glob %s' % (indomain_data_dir, pattern))
    files = []
    for root, _dirnames, filenames in os.walk(indomain_data_dir, followlinks=True):
        files.extend(os.path.join(root, filename)
                     for filename in fnmatch.filter(filenames, pattern))

    # Bound before the ``try`` so the ``finally`` clause can always read them.
    # Every list holds (wav file name, value) pairs; they become dicts keyed
    # by the ``fname`` attribute, so a repeated name overwrites the earlier
    # entry.
    trn, dec, dec_len, wav_len = [], [], [], []
    try:
        for fn in files:
            doc = xml.dom.minidom.parse(fn)
            f_dir = os.path.dirname(fn)

            for turn in doc.getElementsByTagName("turn"):
                if turn.getAttribute('speaker') != 'user':
                    continue

                recs = turn.getElementsByTagName("rec")
                trans = turn.getElementsByTagName("asr_transcription")

                if len(recs) != 1:
                    print("Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                        turn=turn.getAttribute('turn_number'), fn=fn, recs=len(recs)))
                    continue

                if not trans:
                    print("Skipping a turn in {fn} - trans: {trans}".format(
                        fn=fn, trans=len(trans)))
                    continue

                wav_file = recs[0].getAttribute('fname')

                # FIXME: Check whether the last transcription is really the best! FJ
                t = normalise_text(various.get_text_from_xml_node(trans[-1]))
                if exclude_lm(t):
                    continue

                # TODO is it still valid? OP
                # The silence does not have a label in the language model.
                t = t.replace('_SIL_', '')
                trn.append((wav_file, t))

                # NOTE(review): if decode_info raises, ``trn`` keeps an entry
                # that has no counterpart in ``dec`` - confirm that
                # compute_save_stat copes with that.
                wav_path = os.path.join(f_dir, wav_file)
                best, dec_dur, _fw_dur, wav_dur = decode_info(
                    asr, cfg, outdir, wav_path, t)
                dec.append((wav_file, best))
                wav_len.append((wav_file, wav_dur))
                dec_len.append((wav_file, dec_dur))
    except Exception as e:
        print('PARTIAL RESULTS were saved to %s' % outdir)
        print(e)
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it from this line on Python 2.
        raise
    finally:
        compute_save_stat(outdir, dict(trn), dict(dec), dict(wav_len), dict(dec_len))