Ejemplo n.º 1
0
def extract_trns_sems_from_file(fname, verbose, fields=None, normalise=True,
                                do_exclude=True, known_words=None,
                                robust=False):
    """
    Extracts transcriptions and their semantic annotation from a CUED call log
    file.

    Arguments:
        fname -- path towards the call log file
        verbose -- print lots of output?
        fields -- names of fields that should be required for the output.
            Field names are strings corresponding to the element names in the
            transcription XML format.  (default: all five of them)
        normalise -- whether to do normalisation on transcriptions
        do_exclude -- whether to exclude transcriptions not considered suitable
        known_words -- a collection of words.  If provided, transcriptions are
            excluded which contain other words.  If not provided, excluded are
            transcriptions that contain any of _excluded_characters.  What
            "excluded" means depends on whether the transcriptions are required
            by being specified in `fields'.
        robust -- whether to assign recordings to turns robustly or trust where
            they are in the log.  This could be useful for older CUED logs
            where the elements sometimes escape to another <turn> than they
            belong.  However, in cases where `robust' leads to finding the
            correct recording for the user turn, the log is damaged at other
            places too, and the resulting turn record would be misleading.
            Therefore, we recommend leaving robust=False.

    Returns a list of TurnRecords.

    """

    if verbose:
        print 'Processing', fname

    # Interpret the arguments.
    # By default, every one of the five annotation elements is required.
    if fields is None:
        fields = ("transcription", "semitran", "semihyp", "asrhyp", "rec")
    rec_filter = _make_rec_filter(fields)

    # Load the file.
    doc = xml.dom.minidom.parse(fname)
    uturns = doc.getElementsByTagName("userturn")
    if robust:
        # Collect all per-turn recordings in the whole document; files ending
        # in '_all.wav' are skipped (presumably whole-call recordings --
        # TODO confirm).
        audios = [audio for audio in doc.getElementsByTagName("rec")
                  if not audio.getAttribute('fname').endswith('_all.wav')]

    trns_sems = []
    for uturn in uturns:
        # Gather the (lists of) annotation elements of this user turn.
        transcription = uturn.getElementsByTagName("transcription")
        cued_da = uturn.getElementsByTagName("semitran")
        cued_dahyp = uturn.getElementsByTagName("semihyp")
        asrhyp = uturn.getElementsByTagName("asrhyp")
        audio = uturn.getElementsByTagName("rec")
        # If there was something recognised but nothing recorded, if in the
        # robust mode,
        if asrhyp and not audio and robust:
            # Look for the recording elsewhere.
            audio = [_find_audio_for_turn(uturn, audios)]

        # This is the first form of the turn record, containing lists of XML
        # elements and suited only for internal use.
        rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio)
        if not rec_filter(rec):
            # Skip this node, it contains a wrong number of elements of either
            # transcription, cued_da, cued_dahyp, asrhyp, or audio.
            continue

        # XXX Here we take always the first tag having the respective tag name.
        transcription = get_text_from_xml_node(
            rec.transcription[0]).lower() if rec.transcription else None
        asrhyp = get_text_from_xml_node(
            rec.asrhyp[0]).lower() if rec.asrhyp else None
        # Filter the transcription and the ASR hypothesis through normalisation
        # and excluding non-conformant utterances.
        if transcription is not None:
            if normalise:
                transcription = normalise_text(transcription)
            if do_exclude:
                if known_words is not None:
                    trs_excluded = exclude_by_dict(transcription, known_words)
                else:
                    trs_excluded = exclude_asr(transcription)
                if trs_excluded:
                    if verbose:
                        print 'Excluded transcription: "{trs}".'.format(
                            trs=transcription)
                    # If the transcription is required, drop the whole turn;
                    # otherwise, just null this one field out.
                    if 'transcription' in fields:
                        continue
                    transcription = None
        if asrhyp is not None:
            if normalise:
                asrhyp = normalise_text(asrhyp)
            if do_exclude:
                if known_words is not None:
                    asr_excluded = exclude_by_dict(asrhyp, known_words)
                else:
                    asr_excluded = exclude_asr(asrhyp)
                if asr_excluded:
                    if verbose:
                        print 'Excluded ASR hypothesis: "{asr}".'.format(
                            asr=asrhyp)
                    # Same policy as for the transcription above.
                    if 'asrhyp' in fields:
                        continue
                    asrhyp = None

        # Extract the remaining fields as plain strings (or None if absent).
        cued_da = get_text_from_xml_node(
            rec.cued_da[0]) if rec.cued_da else None
        cued_dahyp = get_text_from_xml_node(
            rec.cued_dahyp[0]) if rec.cued_dahyp else None
        audio = rec.audio[0].getAttribute(
            'fname').strip() if rec.audio else None
        # Construct the resulting turn record.
        rec = TurnRecord(transcription, cued_da, cued_dahyp, asrhyp, audio)

        if verbose:
            print "#1 f:", rec.audio
            print "#2 t:", rec.transcription, "# s:", rec.cued_da
            print "#3 a:", rec.asrhyp, "# s:", rec.cued_dahyp
            print

        # Keep the record only if it has a semantic transcription, or if one
        # was not required in the first place.
        if rec.cued_da or 'semitran' not in fields:
            trns_sems.append(rec)

    return trns_sems
Ejemplo n.º 2
0
        # Accumulators: `tt` collects normalised transcription texts, `pt`
        # collects (absolute wav path, transcription) pairs.
        tt = []
        pt = []
        for fn in files:
#            print "Processing:", fn
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")

            for turn in turns:
                recs_list = turn.getElementsByTagName("rec")
                trans_list = turn.getElementsByTagName("asr_transcription")

                if trans_list:
                    # Take the last transcription of the turn.
                    trans = trans_list[-1]

                    t = various.get_text_from_xml_node(trans)
                    t = normalise_text(t)

                    if exclude_lm(t):
                        print t + " was excluded!"
                        continue

                    # The silence does not have a label in the language model.
                    t = t.replace('_SIL_', '')

                    tt.append(t)

                    # NOTE(review): assumes a <rec> element is present whenever
                    # a transcription is -- an empty recs_list would raise
                    # IndexError here; confirm against the log format.
                    wav_file = recs_list[0].getAttribute('fname')
                    # Resolve the wav path relative to the log file's directory.
                    wav_path = os.path.realpath(os.path.join(os.path.dirname(fn), wav_file))

                    pt.append((wav_path, t))
Ejemplo n.º 3
0
        # Accumulators: `tt` collects normalised transcription texts, `pt`
        # collects (absolute wav path, transcription) pairs.
        tt = []
        pt = []
        for fn in files:
#            print "Processing:", fn
            doc = xml.dom.minidom.parse(fn)
            turns = doc.getElementsByTagName("turn")

            for turn in turns:
                recs_list = turn.getElementsByTagName("rec")
                trans_list = turn.getElementsByTagName("asr_transcription")

                if trans_list:
                    # Take the last transcription of the turn.
                    trans = trans_list[-1]

                    t = various.get_text_from_xml_node(trans)
                    t = normalise_text(t)

                    if exclude_lm(t):
                        print t + " was excluded!"
                        continue

                    # The silence does not have a label in the language model.
                    t = t.replace('_SIL_', '')

                    tt.append(t)

                    # NOTE(review): assumes a <rec> element is present whenever
                    # a transcription is -- an empty recs_list would raise
                    # IndexError here; confirm against the log format.
                    wav_file = recs_list[0].getAttribute('fname')
                    # Resolve the wav path relative to the log file's directory.
                    wav_path = os.path.realpath(os.path.join(os.path.dirname(fn), wav_file))

                    pt.append((wav_path, t))
Ejemplo n.º 4
0
def extract_wavs_trns(dirname,
                      sess_fname,
                      outdir,
                      wav_mapping,
                      known_words=None,
                      verbose=False):
    """Extracts wavs and their transcriptions from the named in `sess_fname',
    a CUED call log file. Extracting means copying them to `outdir'. Recordings
    themselves are expected to reside in `dirname'.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, excluded are transcriptions that contain any
    of _excluded_characters.

    Returns the total size of audio files copied to `outdir', the number of
    overwritten files by the output files, the number of wav files that were
    missing from the `wav_mapping' dictionary, and the number of transcriptions
    not present for existing recordings.

    """
    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print '!!! Could not parse "{fname}": {msg!s}.'.format(
                fname=sess_fname, msg=error)
            return 0, 0, 0, 0
    uturns = doc.findall(".//userturn")

    annotations = doc.findall('.//annotation')
    print "# annotations: ", len(annotations)
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However, it is not possible to deduce
        # what transcription to use.
        print "Transcription was rejected as we have more then two transcriptions and " \
              "we cannot decide which one is better."
        return 0, 0, 0, 0
    for a in annotations:
        r = False
        if 'worker_id' in a.attrib and a.attrib['worker_id'] == '19113916':
            r = True

        if r:
            print "Transcription was rejected because of unreliable annotator."
            return 0, 0, 0, 0

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        # trs.text = uturn.getElementsByTagName("trs.text")
        # rec = uturn.getElementsByTagName("rec")
        rec = uturn.find("rec")
        trs = uturn.find("transcription")
        if trs is None:
            # this may be CF style transcription

            trs2 = uturn.find("transcriptions")
            if trs2 is not None:
                trs3 = trs2.findall("transcription")

                if trs3 is None:
                    if rec is not None:
                        n_missing_trs += 1
                    continue
                else:
                    trs = trs3[-1].text
            else:
                continue
        else:
            trs = trs.text

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            if os.path.dirname(wav_fname) != dirname:
                missing_wav = True
            else:
                missing_wav = False
        else:
            missing_wav = True

        if not missing_wav:
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print '-' * term_width
                print "# f:", wav_basename
                print "orig transcription:", trs.upper().strip()

            trs = normalise_text(trs)

            if verbose:
                print "normalised trans:  ", trs

            if known_words is not None:
                excluded = exclude_by_dict(trs, known_words)
            else:
                excluded = exclude_asr(trs)
            if excluded:
                print "... excluded"
                continue

            wc.update(trs.split())

            trs_fname = os.path.join(outdir, wav_basename + '.trn')

            try:
                szx = os.path.getsize(wav_fname)
            except OSError:
                print "Lost audio file:", wav_fname
            else:
                try:
                    #shutil.copy2(wav_fname, outdir)
                    tgt = os.path.join(outdir, os.path.basename(wav_fname))
                    cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(
                        src=wav_fname, tgt=tgt)
                    print cmd
                    os.system(cmd)
                    size += os.path.getsize(tgt)
                except shutil.Error as e:
                    print >>sys.stderr, \
                        ("Isn't the `outdir' with previously copied files "
                         "below `infname' within the filesystem?\n")
                    raise e
                n_overwrites += save_transcription(trs_fname, trs)
                shutil.copystat(sess_fname, trs_fname)
        else:
            n_missing_wav += 1
            if args.verbose:
                term_width = getTerminalSize()[1] or 80
                print '-' * term_width
                print "(WW) Ignoring or missing_wav the file '{0}'."\
                    .format(wav_basename)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print '-' * term_width
        print
    return size, n_overwrites, n_missing_wav, n_missing_trs
Ejemplo n.º 5
0
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, verbose=False):
    """Extracts wavs and their transcriptions from the named in `sess_fname',
    a CUED call log file. Extracting means copying them to `outdir'. Recordings
    themselves are expected to reside in `dirname'.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, excluded are transcriptions that contain any
    of _excluded_characters.

    Returns the total size of audio files copied to `outdir', the number of
    overwritten files by the output files, the number of wav files that were
    missing from the `wav_mapping' dictionary, and the number of transcriptions
    not present for existing recordings.

    """
    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print '!!! Could not parse "{fname}": {msg!s}.'.format(fname=sess_fname, msg=error)
        # Bail out unconditionally: `doc' is undefined past this point.
        # (Previously this return sat inside the `if verbose' branch, causing
        # a NameError below whenever parsing failed with verbose=False.)
        return 0, 0, 0, 0
    uturns = doc.findall(".//userturn")

    annotations = doc.findall('.//annotation')
    print "# annotations: ", len(annotations)
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However,
        # it is not possible to deduce what transcription to use.
        print "Transcription was rejected as we have more than one transcription and " \
              "we cannot decide which one is better."
        return 0, 0, 0, 0
    for a in annotations:
        # Reject the whole session if it comes from a known unreliable
        # annotator.
        r = False
        if 'worker_id' in a.attrib and a.attrib['worker_id'] == '19113916':
            r = True

        if r:
            print "Transcription was rejected because of unreliable annotator."
            return 0, 0, 0, 0


    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        # trs.text = uturn.getElementsByTagName("trs.text")
        # rec = uturn.getElementsByTagName("rec")
        rec = uturn.find("rec")
        trs = uturn.find("transcription")
        if trs is None:
            # This may be a CF-style transcription, nested under
            # a <transcriptions> element.
            trs2 = uturn.find("transcriptions")
            if trs2 is not None:
                trs3 = trs2.findall("transcription")

                # findall() returns a (possibly empty) list, never None, so
                # test for emptiness.  (The original `trs3 is None' check
                # could never fire, and `trs3[-1]' would raise IndexError on
                # an empty list.)
                if not trs3:
                    if rec is not None:
                        n_missing_trs += 1
                    continue
                else:
                    # Take the last transcription of the turn.
                    trs = trs3[-1].text
            else:
                continue
        else:
            trs = trs.text

        # Check this is the wav from this directory.
        # NOTE(review): assumes a <rec> element is always present whenever a
        # transcription is -- a turn with a transcription but no <rec> would
        # raise AttributeError here; confirm against the log format.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            if os.path.dirname(wav_fname) != dirname:
                missing_wav = True
            else:
                missing_wav = False
        else:
            missing_wav = True

        if not missing_wav:
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print '-' * term_width
                print "# f:", wav_basename
                print "orig transcription:", trs.upper().strip()

            trs = normalise_text(trs)

            if verbose:
                print "normalised trans:  ", trs

            # Exclude non-conformant transcriptions.
            if known_words is not None:
                excluded = exclude_by_dict(trs, known_words)
            else:
                excluded = exclude_asr(trs)
            if excluded:
                print "... excluded"
                continue

            # Update the global word counter.
            wc.update(trs.split())

#            trs_fname = os.path.join(outdir, wav_basename + '.trn')
            # Spread the output over a random two-level directory structure to
            # avoid huge flat directories.
            sub_dir = "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99))
            trs_fname = os.path.join(outdir, sub_dir[0], sub_dir[1], wav_basename + '.trn')
            if not os.path.exists(os.path.dirname(trs_fname)):
                os.makedirs(os.path.dirname(trs_fname))

            # Use getsize as a cheap existence check for the audio file.
            try:
                szx = os.path.getsize(wav_fname)
            except OSError:
                print "Lost audio file:", wav_fname
            else:
                try:
                    #shutil.copy2(wav_fname, outdir)
#                    tgt = os.path.join(outdir, os.path.basename(wav_fname))
                    # Resample to 16kHz mono 16bit on the way to the target
                    # sub-directory.
                    tgt = os.path.join(outdir, sub_dir[0], sub_dir[1], os.path.basename(wav_fname))
                    cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(src=wav_fname, tgt=tgt)
                    print cmd
                    os.system(cmd)
                    size += os.path.getsize(tgt)
                except shutil.Error as e:
                    print >>sys.stderr, \
                        ("Isn't the `outdir' with previously copied files "
                         "below `infname' within the filesystem?\n")
                    raise e
                n_overwrites += save_transcription(trs_fname, trs)
                shutil.copystat(sess_fname, trs_fname)
        else:
            n_missing_wav += 1
            # Use the `verbose' parameter, not the out-of-scope global `args'.
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print '-' * term_width
                print "(WW) Ignoring or missing_wav the file '{0}'."\
                    .format(wav_basename)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print '-' * term_width
        print
    return size, n_overwrites, n_missing_wav, n_missing_trs