Exemple #1
0
        # Parse every XML log listed in `files` and, for each <turn> that has
        # at least one <asr_transcription>, collect the normalised text of the
        # last transcription together with the path of the turn's recording.
        for fn in files:
#            print "Processing:", fn
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")

            for turn in turns:
                recs_list = turn.getElementsByTagName("rec")
                trans_list = turn.getElementsByTagName("asr_transcription")

                if trans_list:
                    # Only the last transcription of the turn is used.
                    trans = trans_list[-1]

                    t = various.get_text_from_xml_node(trans)
                    t = normalise_text(t)

                    # Skip transcriptions rejected by exclude_lm().
                    if exclude_lm(t):
                        continue

                    # The silence does not have a label in the language model.
                    t = t.replace('_SIL_', '')

                    tt.append(t)

                    # NOTE(review): recs_list[0] raises IndexError when a turn
                    # has a transcription but no <rec> element - confirm the
                    # logs always contain one.
                    wav_file = recs_list[0].getAttribute('fname')
                    # The 'fname' attribute is resolved relative to the
                    # directory of the XML file.
                    wav_path = os.path.realpath(os.path.join(os.path.dirname(fn), wav_file))

                    pt.append((wav_path, t))

        # Fixed seed, so the shuffle below gives the same order on every run.
        random.seed(10)
        # tt and pt are appended in lockstep above, so zip() pairs each text
        # with its own (wav_path, text) tuple.
        sf = [(a, b) for a, b in zip(tt, pt)]
        random.shuffle(sf)
Exemple #2
0
def extract_from_xml(indomain_data_dir, outdir, cfg):
    """Decode every transcribed user turn found under a directory tree.

    Walks ``indomain_data_dir`` (following symlinks) for files named
    ``asr_transcribed.xml``.  For each ``<turn>`` whose ``speaker`` attribute
    is ``user`` and which has exactly one ``<rec>`` and at least one
    ``<asr_transcription>``, the last transcription is normalised and kept as
    the reference, and the recording is passed to ``decode_info``.

    The collected references, decoded hypotheses, wave durations and decoding
    durations are handed to ``compute_save_stat`` in a ``finally`` clause, so
    whatever was gathered before a failure is still processed.

    Args:
        indomain_data_dir: root directory searched for the XML logs.
        outdir: directory passed on to ``compute_save_stat``.
        cfg: configuration passed to ``asr_factory`` and ``decode_info``.

    Raises:
        Exception: anything raised while parsing or decoding is re-raised
            (with its original traceback) once the partial results have been
            handed to ``compute_save_stat``.
    """
    glob = 'asr_transcribed.xml'
    asr = asr_factory(cfg)

    print('Collecting files under %s with glob %s' % (indomain_data_dir, glob))
    files = []
    for root, _, filenames in os.walk(indomain_data_dir, followlinks=True):
        for filename in fnmatch.filter(filenames, glob):
            files.append(os.path.join(root, filename))

    # DEBUG example
    # files = [
    #     '/ha/projects/vystadial/data/call-logs/2013-05-30-alex-aotb-prototype/part1/2013-06-27-09-33-25.116055-CEST-00420221914256/asr_transcribed.xml']

    # Created before the ``try`` so the ``finally`` clause always sees them.
    trn, dec, dec_len, wav_len = [], [], [], []
    try:
        for fn in files:
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")
            f_dir = os.path.dirname(fn)

            for turn in turns:
                if turn.getAttribute('speaker') != 'user':
                    continue

                recs = turn.getElementsByTagName("rec")
                trans = turn.getElementsByTagName("asr_transcription")

                if len(recs) != 1:
                    print("Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                        turn=turn.getAttribute('turn_number'),
                        fn=fn,
                        recs=len(recs)))
                    continue

                if not trans:
                    print("Skipping a turn in {fn} - trans: {trans}".format(
                        fn=fn, trans=len(trans)))
                    continue

                wav_file = recs[0].getAttribute('fname')
                # FIXME: Check whether the last transcription is really the best! FJ
                t = various.get_text_from_xml_node(trans[-1])
                t = normalise_text(t)

                if exclude_lm(t):
                    continue

                # TODO is it still valid? OP
                # The silence does not have a label in the language model.
                t = t.replace('_SIL_', '')
                # NOTE(review): results are keyed by the bare 'fname'
                # attribute; identical names in different directories would
                # overwrite each other in the dicts built below - confirm
                # the names are unique.
                trn.append((wav_file, t))

                wav_path = os.path.join(f_dir, wav_file)
                best, dec_dur, _, wav_dur = decode_info(asr, cfg, wav_path, t)
                dec.append((wav_file, best))
                wav_len.append((wav_file, wav_dur))
                dec_len.append((wav_file, dec_dur))

    except Exception as e:
        print('PARTIAL RESULTS were saved to %s' % outdir)
        print(e)
        # Bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it here under Python 2.
        raise
    finally:
        # Runs on success and on failure alike.
        trn_dict = dict(trn)
        dec_dict = dict(dec)
        wavlen_dict = dict(wav_len)
        declen_dict = dict(dec_len)
        compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict)
Exemple #3
0
        # Parse every XML log listed in `files` and, for each <turn> that has
        # at least one <asr_transcription>, collect the normalised text of the
        # last transcription together with the path of the turn's recording.
        for fn in files:
#            print "Processing:", fn
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")

            for turn in turns:
                recs_list = turn.getElementsByTagName("rec")
                trans_list = turn.getElementsByTagName("asr_transcription")

                if trans_list:
                    # Only the last transcription of the turn is used.
                    trans = trans_list[-1]

                    t = various.get_text_from_xml_node(trans)
                    t = normalise_text(t)

                    # Skip transcriptions rejected by exclude_lm().
                    if exclude_lm(t):
                        continue

                    # The silence does not have a label in the language model.
                    t = t.replace('_SIL_', '')

                    tt.append(t)

                    # NOTE(review): recs_list[0] raises IndexError when a turn
                    # has a transcription but no <rec> element - confirm the
                    # logs always contain one.
                    wav_file = recs_list[0].getAttribute('fname')
                    # The 'fname' attribute is resolved relative to the
                    # directory of the XML file.
                    wav_path = os.path.realpath(os.path.join(os.path.dirname(fn), wav_file))

                    pt.append((wav_path, t))

        # Fixed seed, so the shuffle below gives the same order on every run.
        random.seed(10)
        # tt and pt are appended in lockstep above, so zip() pairs each text
        # with its own (wav_path, text) tuple.
        sf = [(a, b) for a, b in zip(tt, pt)]
        random.shuffle(sf)
Exemple #4
0
def extract_from_xml(indomain_data_dir, outdir, cfg):
    """Extract transcriptions and wave files from XML logs and decode them.

    Walks ``indomain_data_dir`` (following symlinks) for files named
    ``asr_transcribed.xml``.  For each ``<turn>`` whose ``speaker`` attribute
    is ``user`` and which has exactly one ``<rec>`` and at least one
    ``<asr_transcription>``, the last transcription is normalised and kept as
    the reference, and the recording is passed to ``decode_info``.

    The collected references, decoded hypotheses, wave durations and decoding
    durations are handed to ``compute_save_stat`` in a ``finally`` clause, so
    whatever was gathered before a failure is still processed.

    Args:
        indomain_data_dir(path): path where the xml logs are stored
        outdir: directory to save the references and wave, Wav file names
            pairs; passed to ``decode_info`` and ``compute_save_stat``
        cfg: Alex configuration

    Raises:
        Exception: anything raised while parsing or decoding is re-raised
            (with its original traceback) once the partial results have been
            handed to ``compute_save_stat``.
    """

    glob = 'asr_transcribed.xml'
    asr = asr_factory(cfg)

    print('Collecting files under %s with glob %s' % (indomain_data_dir, glob))
    files = []
    for root, _, filenames in os.walk(indomain_data_dir,
                                      followlinks=True):
        for filename in fnmatch.filter(filenames, glob):
            files.append(os.path.join(root, filename))

    # DEBUG example
    # files = [
    #     '/ha/projects/vystadial/data/call-logs/2013-05-30-alex-aotb-prototype/part1/2013-06-27-09-33-25.116055-CEST-00420221914256/asr_transcribed.xml']

    # Created before the ``try`` so the ``finally`` clause always sees them.
    trn, dec, dec_len, wav_len = [], [], [], []
    try:
        for fn in files:
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")
            f_dir = os.path.dirname(fn)

            for turn in turns:
                if turn.getAttribute('speaker') != 'user':
                    continue

                recs = turn.getElementsByTagName("rec")
                trans = turn.getElementsByTagName("asr_transcription")

                if len(recs) != 1:
                    print("Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                        turn=turn.getAttribute('turn_number'),
                        fn=fn,
                        recs=len(recs)))
                    continue

                if not trans:
                    print("Skipping a turn in {fn} - trans: {trans}".format(
                        fn=fn, trans=len(trans)))
                    continue

                wav_file = recs[0].getAttribute('fname')
                # FIXME: Check whether the last transcription is really the best! FJ
                t = various.get_text_from_xml_node(trans[-1])
                t = normalise_text(t)

                if exclude_lm(t):
                    continue

                # TODO is it still valid? OP
                # The silence does not have a label in the language model.
                t = t.replace('_SIL_', '')
                # NOTE(review): results are keyed by the bare 'fname'
                # attribute; identical names in different directories would
                # overwrite each other in the dicts built below - confirm
                # the names are unique.
                trn.append((wav_file, t))

                wav_path = os.path.join(f_dir, wav_file)
                best, dec_dur, _, wav_dur = decode_info(
                    asr, cfg, outdir, wav_path, t)
                dec.append((wav_file, best))
                wav_len.append((wav_file, wav_dur))
                dec_len.append((wav_file, dec_dur))

    except Exception as e:
        print('PARTIAL RESULTS were saved to %s' % outdir)
        print(e)
        # Bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it here under Python 2.
        raise
    finally:
        # Runs on success and on failure alike.
        trn_dict = dict(trn)
        dec_dict = dict(dec)
        wavlen_dict = dict(wav_len)
        declen_dict = dict(dec_len)
        compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict)