Beispiel #1
0
    def process_dm(self):
        """Perform one system turn of the dialogue.

        Opens a "system" turn in the session logger and has the dialogue
        manager log its state.  The system dialogue act is then taken from the
        dialogue manager, shown via `output_sys_da' and recorded in the session
        log.  Finally the NLG component generates an utterance from the act,
        which is shown via `output_sys_utt'.  A terminal-wide rule followed by
        an empty line is printed before the act and after the utterance.

        """
        def print_rule(char):
            # A line of `char' across the terminal followed by an empty line;
            # 120 columns are used when the reported width is falsy.
            print(char * (getTerminalSize()[1] or 120))
            print("")

        session_logger = self.cfg['Logging']['session_logger']

        # new turn
        session_logger.turn("system")
        self.dm.log_state()

        print_rule("=")
        system_da = self.dm.da_out()
        self.output_sys_da(system_da)
        session_logger.dialogue_act("system", system_da)

        system_utt = self.nlg.generate(system_da)
        self.output_sys_utt(system_utt)
        print_rule('-')
Beispiel #2
0
Datei: thub.py Projekt: AoJ/alex
    def process_dm(self):
        """Perform one system turn of the dialogue.

        Opens a "system" turn in the session logger and has the dialogue
        manager log its state.  The system dialogue act is then taken from the
        dialogue manager, shown via `output_sys_da' and recorded in the session
        log.  Finally the NLG component generates an utterance from the act,
        which is shown via `output_sys_utt'.  A terminal-wide rule followed by
        an empty line is printed before the act and after the utterance.

        """
        def print_rule(char):
            # A line of `char' across the terminal followed by an empty line;
            # 120 columns are used when the reported width is falsy.
            print(char * (getTerminalSize()[1] or 120))
            print("")

        session_logger = self.cfg['Logging']['session_logger']

        # new turn
        session_logger.turn("system")
        self.dm.log_state()

        print_rule("=")
        system_da = self.dm.da_out()
        self.output_sys_da(system_da)
        session_logger.dialogue_act("system", system_da)

        system_utt = self.nlg.generate(system_da)
        self.output_sys_utt(system_utt)
        print_rule('-')
Beispiel #3
0
    def process_utterance_hyp(self, obs):
        """Handle one user turn given the observation `obs'.

        `obs' is parsed by the SLU component and the resulting dialogue acts
        are shown via `output_usr_da'.  A "user" turn is opened in the session
        logger and the acts are recorded in it.  After a terminal-wide rule and
        an empty line are printed, the acts are handed to the dialogue manager
        together with the first item of `obs.values()'.

        """
        user_das = self.slu.parse(obs)
        self.output_usr_da(user_das)

        session_logger = self.cfg['Logging']['session_logger']
        session_logger.turn("user")
        session_logger.slu("user", "*", user_das)

        # 120 columns are used when the reported terminal width is falsy.
        rule_width = getTerminalSize()[1] or 120
        print('-' * rule_width)
        print("")

        first_obs = obs.values()[0]
        self.dm.da_in(user_das, first_obs)
Beispiel #4
0
Datei: thub.py Projekt: AoJ/alex
    def process_utterance_hyp(self, obs):
        """Handle one user turn given the observation `obs'.

        `obs' is parsed by the SLU component and the resulting dialogue acts
        are shown via `output_usr_da'.  A "user" turn is opened in the session
        logger and the acts are recorded in it.  After a terminal-wide rule and
        an empty line are printed, the acts are handed to the dialogue manager
        together with the first item of `obs.values()'.

        """
        user_das = self.slu.parse(obs)
        self.output_usr_da(user_das)

        session_logger = self.cfg['Logging']['session_logger']
        session_logger.turn("user")
        session_logger.slu("user", "*", user_das)

        # 120 columns are used when the reported terminal width is falsy.
        rule_width = getTerminalSize()[1] or 120
        print('-' * rule_width)
        print("")

        first_obs = obs.values()[0]
        self.dm.da_in(user_das, first_obs)
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping, known_words=None, lang='cs', verbose=False):
    """Extracts wavs and their ASR transcriptions named in `sess_fname',
    a CUED call log file. Extracting means converting the recordings with sox
    into `outdir' and saving the normalised transcriptions next to them.
    Recordings themselves are expected to reside in `dirname'.

    Only turns whose `speaker' attribute is "user" are processed, and the last
    `asr_transcription' element of a turn is the one used.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, the `exclude_asr' function of the
    normalisation module decides which transcriptions are excluded.

    Arguments:
        dirname -- directory a recording has to be mapped to by `wav_mapping'
            in order to be extracted
        sess_fname -- path to the session log file (XML)
        outdir -- directory where the wavs and `.trn' files are written
        wav_mapping -- dictionary mapping wav basenames to their directories
        known_words -- optional collection of words for `exclude_by_dict'
        lang -- key into _LANG2NORMALISATION_MOD selecting the normalisation
            module
        verbose -- whether to print details about the processed recordings

    Returns a 4-tuple of
        - the total size of the source audio files that were extracted,
        - the sum of the values returned by `save_transcription' (the number
          of overwritten files),
        - the number of recordings skipped because they are missing from
          `wav_mapping' or are mapped to a directory other than `dirname',
        - the number of user turns with a recording but no transcription.
    All four numbers are zero if the session file cannot be read, if it holds
    more than one annotation, or if its annotation comes from the rejected
    worker.

    """

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name, fromlist=('normalise_text', 'exclude_asr', 'exclude_by_dict'))

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Give up on this session whether or not the failure was reported:
        # `doc' is unbound at this point.
        return 0, 0, 0, 0
    uturns = doc.findall(".//turn")

    annotations = doc.findall('.//annotation')
    if len(annotations) > 1:
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return 0, 0, 0, 0
    for a in annotations:
        if a.attrib.get('worker_id') == '19113916':
            print("Transcription was rejected because of unreliable annotator.")
            return 0, 0, 0, 0

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        # Turns without a `speaker' attribute are skipped like non-user turns.
        if uturn.attrib.get('speaker') != "user":
            continue

        rec = uturn.find("rec")
        trs_elems = uturn.findall("asr_transcription")
        if not trs_elems:
            # findall() yields an empty list, never None, when nothing matches.
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: there is no wav to extract.
            continue
        # FIXME: Is the last transcription the right thing to be used? Probably. Must be checked!
        trs = trs_elems[-1].text

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            missing_wav = os.path.dirname(wav_fname) != dirname
        else:
            missing_wav = True

        if missing_wav:
            n_missing_wav += 1
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print('-' * term_width)
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        if verbose:
            term_width = getTerminalSize()[1] or 80
            print('-' * term_width)
            print("# f: %s" % (wav_basename,))
            print("orig transcription: %s" % (trs.upper().strip(),))

        trs = norm_mod.normalise_text(trs)
        if verbose:
            print("normalised trans:   %s" % (trs,))

        if known_words is not None:
            excluded = norm_mod.exclude_by_dict(trs, known_words)
        else:
            excluded = norm_mod.exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        # NOTE(review): `wc' is not defined in this function; presumably a
        # module-level word counter -- confirm.
        wc.update(trs.split())

        trs_fname = os.path.join(outdir, wav_basename + '.trn')

        try:
            size += os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: %s" % (wav_fname,))
            continue

        # Convert with sox instead of copying the file verbatim.
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command, so file names with spaces or shell metacharacters break it.
        tgt = os.path.join(outdir, os.path.basename(wav_fname))
        cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(src=wav_fname, tgt=tgt)
        print(cmd)
        os.system(cmd)
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print('-' * term_width)
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
# Make sure the alex package is visible.
if __name__ == '__main__':
    import autopath

from alex.utils.fs import find

# Maps a language code to the name of the module providing text normalisation
# for that language.
_LANG2NORMALISATION_MOD = {
    'cs': 'alex.corpustools.text_norm_cs',
    'en': 'alex.corpustools.text_norm_en',
    'es': 'alex.corpustools.text_norm_es',
}

from alex.utils.ui import getTerminalSize
# Terminal width in columns; 80 is used when the width cannot be obtained or
# is reported as a falsy value.  Only ordinary errors are caught, so
# KeyboardInterrupt and SystemExit still propagate.
try:
    _term_width = getTerminalSize()[1] or 80
except Exception:
    _term_width = 80


def unique_str():
    """Generates a fairly unique string.

    Returns a random number between 0 and 2**32 - 1 written as one to eight
    lowercase hexadecimal digits, without the `0x' prefix.

    """
    # Formatting with 'x' instead of slicing hex() keeps the Python 2 long
    # suffix `L' out of the result on builds where the number is a long.
    return '{0:x}'.format(random.randint(0, 256 * 256 * 256 * 256 - 1))


def cut_wavs(src, tgt, start, end):
    """Cuts out the interval `start'--`end' from the wav file `src' and saves
    it to `tgt'.

    """
    # Record whether `tgt' already exists at this point.
    # NOTE(review): the rest of the body is not visible in this excerpt, so
    # how `existed' is used cannot be confirmed here.
    existed = os.path.exists(tgt)
# Make sure the alex package is visible.
if __name__ == "__main__":
    import autopath

from alex.utils.fs import find

# Maps a language code to the name of the module providing text normalisation
# for that language.
_LANG2NORMALISATION_MOD = {
    "cs": "alex.corpustools.text_norm_cs",
    "en": "alex.corpustools.text_norm_en",
    "es": "alex.corpustools.text_norm_es",
}

from alex.utils.ui import getTerminalSize

# Terminal width in columns; 80 is used when the width cannot be obtained or
# is reported as a falsy value.  Only ordinary errors are caught, so
# KeyboardInterrupt and SystemExit still propagate.
try:
    _term_width = getTerminalSize()[1] or 80
except Exception:
    _term_width = 80


def unique_str():
    """Generates a fairly unique string.

    Returns a random number between 0 and 2**32 - 1 written as one to eight
    lowercase hexadecimal digits, without the `0x' prefix.

    """
    # Formatting with 'x' instead of slicing hex() keeps the Python 2 long
    # suffix `L' out of the result on builds where the number is a long.
    return '{0:x}'.format(random.randint(0, 256 * 256 * 256 * 256 - 1))


def cut_wavs(src, tgt, start, end):
    """Cuts out the interval `start'--`end' from the wav file `src' and saves
    it to `tgt'.

    """
    # Record whether `tgt' already exists at this point.
    # NOTE(review): the rest of the body is not visible in this excerpt, so
    # how `existed' is used cannot be confirmed here.
    existed = os.path.exists(tgt)
Beispiel #8
0
def extract_wavs_trns(dirname,
                      sess_fname,
                      outdir,
                      wav_mapping,
                      known_words=None,
                      lang='cs',
                      verbose=False):
    """Extracts wavs and their ASR transcriptions named in `sess_fname',
    a CUED call log file. Extracting means converting the recordings with sox
    into `outdir' and saving the normalised transcriptions next to them.
    Recordings themselves are expected to reside in `dirname'.

    Only turns whose `speaker' attribute is "user" are processed, and the last
    `asr_transcription' element of a turn is the one used.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, the `exclude_asr' function of the
    normalisation module decides which transcriptions are excluded.

    Arguments:
        dirname -- directory a recording has to be mapped to by `wav_mapping'
            in order to be extracted
        sess_fname -- path to the session log file (XML)
        outdir -- directory where the wavs and `.trn' files are written
        wav_mapping -- dictionary mapping wav basenames to their directories
        known_words -- optional collection of words for `exclude_by_dict'
        lang -- key into _LANG2NORMALISATION_MOD selecting the normalisation
            module
        verbose -- whether to print details about the processed recordings

    Returns a 4-tuple of
        - the total size of the source audio files that were extracted,
        - the sum of the values returned by `save_transcription' (the number
          of overwritten files),
        - the number of recordings skipped because they are missing from
          `wav_mapping' or are mapped to a directory other than `dirname',
        - the number of user turns with a recording but no transcription.
    All four numbers are zero if the session file cannot be read, if it holds
    more than one annotation, or if its annotation comes from the rejected
    worker.

    """

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'
                  .format(fname=sess_fname, msg=error))
        # Give up on this session whether or not the failure was reported:
        # `doc' is unbound at this point.
        return 0, 0, 0, 0
    uturns = doc.findall(".//turn")

    annotations = doc.findall('.//annotation')
    if len(annotations) > 1:
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return 0, 0, 0, 0
    for a in annotations:
        if a.attrib.get('worker_id') == '19113916':
            print("Transcription was rejected because of unreliable annotator.")
            return 0, 0, 0, 0

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        # Turns without a `speaker' attribute are skipped like non-user turns.
        if uturn.attrib.get('speaker') != "user":
            continue

        rec = uturn.find("rec")
        trs_elems = uturn.findall("asr_transcription")
        if not trs_elems:
            # findall() yields an empty list, never None, when nothing matches.
            if rec is not None:
                n_missing_trs += 1
            continue
        if rec is None:
            # A transcription without a recording: there is no wav to extract.
            continue
        # FIXME: Is the last transcription the right thing to be used? Probably. Must be checked!
        trs = trs_elems[-1].text

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            missing_wav = os.path.dirname(wav_fname) != dirname
        else:
            missing_wav = True

        if missing_wav:
            n_missing_wav += 1
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print('-' * term_width)
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        if verbose:
            term_width = getTerminalSize()[1] or 80
            print('-' * term_width)
            print("# f: %s" % (wav_basename,))
            print("orig transcription: %s" % (trs.upper().strip(),))

        trs = norm_mod.normalise_text(trs)
        if verbose:
            print("normalised trans:   %s" % (trs,))

        if known_words is not None:
            excluded = norm_mod.exclude_by_dict(trs, known_words)
        else:
            excluded = norm_mod.exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        # NOTE(review): `wc' is not defined in this function; presumably a
        # module-level word counter -- confirm.
        wc.update(trs.split())

        trs_fname = os.path.join(outdir, wav_basename + '.trn')

        try:
            size += os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: %s" % (wav_fname,))
            continue

        # Convert with sox instead of copying the file verbatim.
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command, so file names with spaces or shell metacharacters break it.
        tgt = os.path.join(outdir, os.path.basename(wav_fname))
        cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(
            src=wav_fname, tgt=tgt)
        print(cmd)
        os.system(cmd)
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print('-' * term_width)
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
Beispiel #9
0
def extract_wavs_trns(dirname, sess_fname, outdir, wav_mapping,
                      known_words=None, verbose=False):
    """Extracts wavs and their transcriptions named in `sess_fname',
    a CUED call log file. Extracting means converting the recordings with sox
    into a randomly chosen two-level subdirectory of `outdir' and saving the
    normalised transcriptions next to them. Recordings themselves are expected
    to reside in `dirname'.

    The transcription of a user turn is taken from its `transcription'
    element, or, failing that, from the last `transcription' element inside
    its `transcriptions' element.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, `exclude_asr' decides which transcriptions
    are excluded.

    Arguments:
        dirname -- directory a recording has to be mapped to by `wav_mapping'
            in order to be extracted
        sess_fname -- path to the session log file (XML)
        outdir -- directory under which the wavs and `.trn' files are written
        wav_mapping -- dictionary mapping wav basenames to their directories
        known_words -- optional collection of words for `exclude_by_dict'
        verbose -- whether to print details about the processed recordings

    Returns a 4-tuple of
        - the total size of the converted audio files written under `outdir',
        - the sum of the values returned by `save_transcription' (the number
          of overwritten files),
        - the number of recordings skipped because they are missing from
          `wav_mapping' or are mapped to a directory other than `dirname',
        - the number of user turns with a recording and a `transcriptions'
          element that holds no transcription.
    All four numbers are zero if the session file cannot be read, if it holds
    more than one annotation, or if its annotation comes from the rejected
    worker.

    """
    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'.format(fname=sess_fname, msg=error))
        # Give up on this session whether or not the failure was reported:
        # `doc' is unbound at this point.
        return 0, 0, 0, 0
    uturns = doc.findall(".//userturn")

    annotations = doc.findall('.//annotation')
    print("# annotations:  %s" % (len(annotations),))
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However, it is not possible to deduce
        # what transcription to use.
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return 0, 0, 0, 0
    for a in annotations:
        if a.attrib.get('worker_id') == '19113916':
            print("Transcription was rejected because of unreliable annotator.")
            return 0, 0, 0, 0

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        rec = uturn.find("rec")
        trs_elem = uturn.find("transcription")
        if trs_elem is None:
            # this may be CF style transcription
            trs_group = uturn.find("transcriptions")
            if trs_group is None:
                continue
            trs_list = trs_group.findall("transcription")
            if not trs_list:
                # findall() yields an empty list, never None, when nothing
                # matches.
                if rec is not None:
                    n_missing_trs += 1
                continue
            trs = trs_list[-1].text
        else:
            trs = trs_elem.text
        if rec is None:
            # A transcription without a recording: there is no wav to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            missing_wav = os.path.dirname(wav_fname) != dirname
        else:
            missing_wav = True

        if missing_wav:
            n_missing_wav += 1
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print('-' * term_width)
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        if verbose:
            term_width = getTerminalSize()[1] or 80
            print('-' * term_width)
            print("# f: %s" % (wav_basename,))
            print("orig transcription: %s" % (trs.upper().strip(),))

        trs = normalise_text(trs)

        if verbose:
            print("normalised trans:   %s" % (trs,))

        if known_words is not None:
            excluded = exclude_by_dict(trs, known_words)
        else:
            excluded = exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        # NOTE(review): `wc' is not defined in this function; presumably a
        # module-level word counter -- confirm.
        wc.update(trs.split())

        # Spread the output over up to 100 x 100 randomly picked
        # subdirectories of `outdir'.
        sub_dir = ("{r:02}".format(r=random.randint(0, 99)),
                   "{r:02}".format(r=random.randint(0, 99)))
        trs_fname = os.path.join(outdir, sub_dir[0], sub_dir[1], wav_basename + '.trn')
        trs_dir = os.path.dirname(trs_fname)
        if not os.path.exists(trs_dir):
            os.makedirs(trs_dir)

        try:
            # Only probes that the source recording is accessible; the size
            # that is counted is the one of the converted file below.
            os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: %s" % (wav_fname,))
            continue

        # Convert with sox instead of copying the file verbatim.
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command, so file names with spaces or shell metacharacters break it.
        tgt = os.path.join(outdir, sub_dir[0], sub_dir[1], os.path.basename(wav_fname))
        cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(src=wav_fname, tgt=tgt)
        print(cmd)
        os.system(cmd)
        size += os.path.getsize(tgt)
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print('-' * term_width)
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs
def extract_wavs_trns(dirname,
                      sess_fname,
                      outdir,
                      wav_mapping,
                      known_words=None,
                      verbose=False):
    """Extracts wavs and their transcriptions named in `sess_fname',
    a CUED call log file. Extracting means converting the recordings with sox
    into a randomly chosen two-level subdirectory of `outdir' and saving the
    normalised transcriptions next to them. Recordings themselves are expected
    to reside in `dirname'.

    The transcription of a user turn is taken from its `transcription'
    element, or, failing that, from the last `transcription' element inside
    its `transcriptions' element.

    If `known_words', a collection of words present in the phonetic dictionary,
    is provided, transcriptions are excluded which contain other words. If
    `known_words' is not provided, `exclude_asr' decides which transcriptions
    are excluded.

    Arguments:
        dirname -- directory a recording has to be mapped to by `wav_mapping'
            in order to be extracted
        sess_fname -- path to the session log file (XML)
        outdir -- directory under which the wavs and `.trn' files are written
        wav_mapping -- dictionary mapping wav basenames to their directories
        known_words -- optional collection of words for `exclude_by_dict'
        verbose -- whether to print details about the processed recordings

    Returns a 4-tuple of
        - the total size of the converted audio files written under `outdir',
        - the sum of the values returned by `save_transcription' (the number
          of overwritten files),
        - the number of recordings skipped because they are missing from
          `wav_mapping' or are mapped to a directory other than `dirname',
        - the number of user turns with a recording and a `transcriptions'
          element that holds no transcription.
    All four numbers are zero if the session file cannot be read, if it holds
    more than one annotation, or if its annotation comes from the rejected
    worker.

    """
    # Parse the file.
    try:
        doc = ElementTree.parse(sess_fname)
    except IOError as error:
        if verbose:
            print('!!! Could not parse "{fname}": {msg!s}.'.format(
                fname=sess_fname, msg=error))
        # Give up on this session whether or not the failure was reported:
        # `doc' is unbound at this point.
        return 0, 0, 0, 0
    uturns = doc.findall(".//userturn")

    annotations = doc.findall('.//annotation')
    print("# annotations:  %s" % (len(annotations),))
    if len(annotations) > 1:
        # FIXME: This is bad! We waste about 1/3 of all data from CF. However, it is not possible to deduce
        # what transcription to use.
        print("Transcription was rejected as we have more then two transcriptions and "
              "we cannot decide which one is better.")
        return 0, 0, 0, 0
    for a in annotations:
        if a.attrib.get('worker_id') == '19113916':
            print("Transcription was rejected because of unreliable annotator.")
            return 0, 0, 0, 0

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for uturn in uturns:
        rec = uturn.find("rec")
        trs_elem = uturn.find("transcription")
        if trs_elem is None:
            # this may be CF style transcription
            trs_group = uturn.find("transcriptions")
            if trs_group is None:
                continue
            trs_list = trs_group.findall("transcription")
            if not trs_list:
                # findall() yields an empty list, never None, when nothing
                # matches.
                if rec is not None:
                    n_missing_trs += 1
                continue
            trs = trs_list[-1].text
        else:
            trs = trs_elem.text
        if rec is None:
            # A transcription without a recording: there is no wav to extract.
            continue

        # Check this is the wav from this directory.
        wav_basename = rec.attrib['fname'].strip()
        if wav_basename in wav_mapping:
            wav_fname = os.path.join(wav_mapping[wav_basename], wav_basename)
            missing_wav = os.path.dirname(wav_fname) != dirname
        else:
            missing_wav = True

        if missing_wav:
            n_missing_wav += 1
            if verbose:
                term_width = getTerminalSize()[1] or 80
                print('-' * term_width)
                print("(WW) Ignoring or missing_wav the file '{0}'."
                      .format(wav_basename))
            continue

        if verbose:
            term_width = getTerminalSize()[1] or 80
            print('-' * term_width)
            print("# f: %s" % (wav_basename,))
            print("orig transcription: %s" % (trs.upper().strip(),))

        trs = normalise_text(trs)

        if verbose:
            print("normalised trans:   %s" % (trs,))

        if known_words is not None:
            excluded = exclude_by_dict(trs, known_words)
        else:
            excluded = exclude_asr(trs)
        if excluded:
            print("... excluded")
            continue

        # NOTE(review): `wc' is not defined in this function; presumably a
        # module-level word counter -- confirm.
        wc.update(trs.split())

        # Spread the output over up to 100 x 100 randomly picked
        # subdirectories of `outdir'.
        sub_dir = ("{r:02}".format(r=random.randint(0, 99)),
                   "{r:02}".format(r=random.randint(0, 99)))
        trs_fname = os.path.join(outdir, sub_dir[0], sub_dir[1],
                                 wav_basename + '.trn')
        trs_dir = os.path.dirname(trs_fname)
        if not os.path.exists(trs_dir):
            os.makedirs(trs_dir)

        try:
            # Only probes that the source recording is accessible; the size
            # that is counted is the one of the converted file below.
            os.path.getsize(wav_fname)
        except OSError:
            print("Lost audio file: %s" % (wav_fname,))
            continue

        # Convert with sox instead of copying the file verbatim.
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command, so file names with spaces or shell metacharacters break it.
        tgt = os.path.join(outdir, sub_dir[0], sub_dir[1],
                           os.path.basename(wav_fname))
        cmd = "sox --ignore-length {src} -c 1 -r 16000 -b 16 {tgt}".format(
            src=wav_fname, tgt=tgt)
        print(cmd)
        os.system(cmd)
        size += os.path.getsize(tgt)
        n_overwrites += save_transcription(trs_fname, trs)
        shutil.copystat(sess_fname, trs_fname)

    if verbose:
        term_width = getTerminalSize()[1] or 80
        print('-' * term_width)
        print("")
    return size, n_overwrites, n_missing_wav, n_missing_trs