Python find_with_ignorelistの例、alex.corpustools.cued.find_with_ignorelist Pythonの例

コード例 #1

0

ファイルを表示

ファイル: get_jasr_confnets.py プロジェクト: jakub-stejskal/alex

def get_wav_fnames(dirname, ignore_list_file=None):
    """
    Collects the WAV files under a directory that should be decoded.

    The search is delegated to `find_with_ignorelist' with the glob `*.wav'.

    Arguments:
        dirname -- the directory to search for WAVs
        ignore_list_file -- a file of absolute paths or globs (can be mixed)
            specifying logs that should be skipped; handed on unchanged to
            `find_with_ignorelist'

    Returns a list of tuples (filename, WAV unique ID), the unique ID being
    `basename(filename)'.

    """
    # `notrx' presumably makes the search skip paths matching the regex,
    # i.e. files named `*_all.wav' -- confirm against find_with_ignorelist.
    search_opts = dict(mindepth=0,
                       maxdepth=None,
                       notrx=re.compile('^.*_all\\.wav$'))
    found = find_with_ignorelist(dirname, '*.wav',
                                 ignore_list_file=ignore_list_file,
                                 find_kwargs=search_opts)

    # Pair every path with its ID.
    pairs = []
    for wav_path in found:
        pairs.append((wav_path, basename(wav_path)))
    return pairs

コード例 #2

0

ファイルを表示

def get_wav_fnames(dirname, ignore_list_file=None):
    """
    Finds the WAV files below `dirname' that are to be decoded.

    Arguments:
        dirname -- the directory to search for WAVs
        ignore_list_file -- a file of absolute paths or globs (can be mixed)
            specifying logs that should be skipped; passed straight to
            `find_with_ignorelist'

    Returns a list of tuples (filename, WAV unique ID), where the ID is
    whatever `basename' yields for the filename.

    """
    # Regex handed over as `notrx'; presumably paths matching it (names
    # ending in `_all.wav') are left out of the result -- TODO confirm.
    skip_rx = re.compile('^.*_all\\.wav$')
    options = {}
    options['mindepth'] = 0
    options['maxdepth'] = None
    options['notrx'] = skip_rx

    matches = find_with_ignorelist(dirname,
                                   '*.wav',
                                   ignore_list_file=ignore_list_file,
                                   find_kwargs=options)
    return list((path, basename(path)) for path in matches)

コード例 #3

0

ファイルを表示

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' directory. `args.dictionary' may
    refer to an open file listing the only words to be allowed in
    transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of missing files (file basenames referred in transcription logs
                                but missing in the file system)
        number of missing transcriptions

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0
    n_missing_trs = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    trs_paths = find_with_ignorelist(infname, '*.trs', ignore_list_file)
    trs_dict = {os.path.split(fpath)[1]: fpath for fpath in trs_paths}

    # Find all audio files, create dictionary of paths by basename.
    audio_paths = find_with_ignorelist(infname, '*.mp2')
    audio_dict = {
        os.path.splitext(os.path.split(fpath)[1])[0]: fpath
        for fpath in audio_paths
    }
    n_collisions = len(audio_paths) - len(audio_dict)

    # Process the files.
    for trs_path in trs_dict.values():
        if verbose:
            print "Processing", trs_path

        # Parse the file.
        doc = xml.dom.minidom.parse(trs_path)
        fname = doc.getElementsByTagName(
            "Trans")[0].attributes['audio_filename'].value
        if not fname in audio_dict or not os.path.isfile(audio_dict[fname]):
            if verbose:
                print "Lost audio file:", fname
            n_missing_audio += 1
            continue
        audio_path = audio_dict[fname]

        # Convert audio to wav.
        tmp_wav_path = os.path.join(outdir, fname + '.wav')
        to_wav(audio_path, tmp_wav_path)

        turns = doc.getElementsByTagName("Turn")

        i = 0
        for turn in turns:
            i += 1

            currtime = float(turn.getAttribute('startTime'))
            currtext = ''

            utterances = []

            # Process all child nodes.
            for node in turn.childNodes:
                if node.nodeType == node.ELEMENT_NODE and node.tagName == 'Sync':
                    starttime = currtime
                    currtime = float(node.getAttribute('time'))

                    if currtime > starttime:
                        utterances += [(currtext, starttime, currtime)]

                    currtext = ''
                elif node.nodeType == node.TEXT_NODE:
                    currtext += ' ' + node.data.strip()

            # Add the last utterance, which is not followed by a Sync tag.
            starttime = currtime
            try:
                currtime = float(turn.getAttribute('endTime'))
            except ValueError:
                currtime = float(turn.getAttribute('endTime').split()[0])

            if currtime > starttime:
                utterances += [(currtext, starttime, currtime)]

            j = 0
            for (trs, starttime, endtime) in utterances:
                j += 1

                if (endtime - starttime) < 0.2:
                    print "Too short segment"
                    continue

                if not trs:  # empty transcription
                    n_missing_trs += 1

                wav_name = '%s_%02d_%04d.wav' % (fname, i, j)
                #wav_path = os.path.join(outdir, wav_name)
                wav_path = os.path.join(
                    outdir, "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)), wav_name)

                if not os.path.exists(os.path.dirname(wav_path)):
                    os.makedirs(os.path.dirname(wav_path))

                if verbose:
                    print
                    print "src:", os.path.split(audio_path)[1]
                    print "tgt:", wav_name
                    print "time:", starttime, endtime
                    print "orig transcription:", trs.upper().strip()

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print "normalised trans:  ", trs

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)

                if excluded:
                    if verbose:
                        print "... excluded"
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Extract utterance from audio.
                segment_to_wav(tmp_wav_path, wav_path, starttime, endtime)
                size += os.path.getsize(wav_path)
                seconds += endtime - starttime

        os.remove(tmp_wav_path)

    return size, seconds, n_collisions, n_overwrites, n_missing_audio, n_missing_trs

コード例 #4

0

ファイルを表示

ファイル: librispeech2ufal-audio.py プロジェクト: UFAL-DSG/alex

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' using the `extract_wavs_trns'
    function. `args.dictionary' may refer to an open file listing the only
    words to be allowed in transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of ignored files (file basenames referred in transcription logs
                                but missing in the file system, presumably
                                because specified by one of the ignoring
                                mechanisms)

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    n_overwrites = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name, fromlist=( 'normalise_text', 'exclude_asr', 'exclude_by_dict'))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(args.infname, '*.trans.txt', ignore_list_file)

    # Process the files.
    flac_names = []
    for txt_path in txt_paths:
        if verbose:
            print "Processing", txt_path

        path_prefix = os.path.split(txt_path)[0]
        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            for line in txt_file:
                # Each line contains the name of the audio file and the transcription
                (flac_name, trs) = line.split(' ', 1)
                flac_names += [flac_name]

                # Process audio file
                flac_path = os.path.join(path_prefix, flac_name + '.flac')
                wav_path = os.path.join(outdir, "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99)), flac_name + '.wav')

                if not os.path.exists(os.path.dirname(wav_path)):
                    os.makedirs(os.path.dirname(wav_path))

                if not os.path.isfile(flac_path):
                    continue
                    
                to_wav(flac_path, wav_path)
                size += os.path.getsize(wav_path)
        
                # Process transcription
                if verbose:
                    print
                    print "# f:", flac_name + '.flac'
                    print "orig transcription:", trs.upper().strip()

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print "normalised trans:  ", trs

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print "... excluded"
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

    n_collisions = len(flac_names) - len(set(flac_names))

    return size, n_collisions, n_overwrites

コード例 #5

0

ファイルを表示

ファイル: fisherptwo2ufal-audio.py プロジェクト: henrypig/alex-1

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' function. `args.dictionary' may
    refer to an open file listing the only words to be allowed in
    transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of missing files (file basenames referred in transcription logs
                                but missing in the file system)

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name, fromlist=( 'normalise_text', 'exclude_asr', 'exclude_by_dict'))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, 'fe_*.txt', ignore_list_file)

    # Find all audio files, create dictionary of paths by basename.
    sph_paths = find_with_ignorelist(infname, 'fe_*.sph')
    sph_dict = {os.path.split(fpath)[1]: fpath for fpath in sph_paths}
    n_collisions = len(sph_paths) - len(sph_dict)

    # Process the files.
    for txt_path in txt_paths:
        if verbose:
            print "Processing", txt_path

        txt_name = os.path.split(txt_path)[1]
        src_name = os.path.splitext(txt_name)[0]
        sph_name = src_name + '.sph'

        if not sph_name in sph_dict or not os.path.isfile(sph_dict[sph_name]):
            if verbose:
                print "Lost audio file:", sph_name
            n_missing_audio += 1
            continue

        sph_path = sph_dict[sph_name]

        utterances = []

        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            i = 1
            for line in txt_file:
                if len(line.strip()) == 0 or line.strip()[0] == '#': continue #ignore comments and empty lines

                # Each line contains start time, end time, speaker id and transcription
                (start, end, speaker, trs) = line.split(' ', 3)
                start = float(start)
                end = float(end)
                channel = 1 if speaker[0] == 'A' else 2
                utterances += [{'start': start, 'end': end, 'trs': trs, 'channel': channel}]
                
                i += 1

        for i in range(len(utterances)):
            utt = utterances[i]
            trs = utt['trs']

            wav_name = '%s_%03d.wav' % (src_name, i)
#            wav_path = os.path.join(outdir, wav_name)
            wav_path = os.path.join(outdir, "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99)), wav_name)

            if not os.path.exists(os.path.dirname(wav_path)):
                os.makedirs(os.path.dirname(wav_path))


            if verbose:
                print
                print "src:", sph_name
                print "tgt:", wav_name
                print "time:", utt['start'], utt['end']
                print "channel:", utt['channel']
                print "orig transcription:", trs.upper().strip()

            trs = norm_mod.normalise_text(trs)

            if verbose:
                print "normalised trans:  ", trs

            if known_words is not None:
                excluded = norm_mod.exclude_by_dict(trs, known_words)
            else:
                excluded = norm_mod.exclude_asr(trs)
            if excluded:
                if verbose:
                    print "... excluded"
                continue

            # Check for very short utterances
            if utt['end']-utt['start'] < 1:
                if verbose:
                    print "... too short"
                continue

            wc.update(trs.split())

            if save_transcription(wav_path + '.trn', trs):
                n_overwrites += 1

            # Extract utterance from audio
            tmp_path = wav_path + '.tmp'
            cmd = ['sph2pipe', '-f', 'wav', '-t', '%f:%f' % (utt['start'], utt['end']), '-c', str(utt['channel']), sph_path, tmp_path]
            subprocess.call(cmd)

            # Convert to valid WAV
            to_wav(tmp_path, wav_path)
            os.remove(tmp_path)
            size += os.path.getsize(wav_path)
            seconds += utt['end'] - utt['start']

    return size, seconds, n_collisions, n_overwrites, n_missing_audio

コード例 #6

0

ファイルを表示

ファイル: malach-en2ufal-audio.py プロジェクト: UFAL-DSG/alex

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' directory. `args.dictionary' may
    refer to an open file listing the only words to be allowed in
    transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of missing files (file basenames referred in transcription logs
                                but missing in the file system)
        number of missing transcriptions

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0
    n_missing_trs = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name, fromlist=( 'normalise_text', 'exclude_asr', 'exclude_by_dict'))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    trs_paths = find_with_ignorelist(infname, '*.trs', ignore_list_file)
    trs_dict = {os.path.split(fpath)[1]: fpath for fpath in trs_paths}

    # Find all audio files, create dictionary of paths by basename.
    audio_paths = find_with_ignorelist(infname, '*.mp2')
    audio_dict = {os.path.splitext(os.path.split(fpath)[1])[0]: fpath for fpath in audio_paths}
    n_collisions = len(audio_paths) - len(audio_dict)

    # Process the files.
    for trs_path in trs_dict.values():
        if verbose:
            print "Processing", trs_path

        # Parse the file.
        doc = xml.dom.minidom.parse(trs_path)
        fname = doc.getElementsByTagName("Trans")[0].attributes['audio_filename'].value
        if not fname in audio_dict or not os.path.isfile(audio_dict[fname]):
            if verbose:
                print "Lost audio file:", fname
            n_missing_audio += 1
            continue
        audio_path = audio_dict[fname]

        # Convert audio to wav.
        tmp_wav_path = os.path.join(outdir, fname + '.wav')
        to_wav(audio_path, tmp_wav_path)

        turns = doc.getElementsByTagName("Turn")


        i = 0
        for turn in turns:
            i += 1

            currtime = float(turn.getAttribute('startTime'))
            currtext = ''

            utterances = []

            # Process all child nodes.
            for node in turn.childNodes:
                if node.nodeType == node.ELEMENT_NODE and node.tagName == 'Sync':
                    starttime = currtime
                    currtime = float(node.getAttribute('time'))

                    if currtime > starttime:
                        utterances += [(currtext, starttime, currtime)]

                    currtext = ''
                elif node.nodeType == node.TEXT_NODE:
                    currtext += ' ' + node.data.strip()

            # Add the last utterance, which is not followed by a Sync tag.
            starttime = currtime
            try:
                currtime = float(turn.getAttribute('endTime'))
            except ValueError:
                currtime = float(turn.getAttribute('endTime').split()[0])
            
            if currtime > starttime:
                utterances += [(currtext, starttime, currtime)]

            j = 0
            for (trs, starttime, endtime) in utterances:
                j += 1

                if (endtime - starttime) < 0.2:
                    print "Too short segment"
                    continue
                    
                if not trs: # empty transcription
                    n_missing_trs += 1

                wav_name = '%s_%02d_%04d.wav' % (fname, i, j)
                #wav_path = os.path.join(outdir, wav_name)
                wav_path = os.path.join(outdir, "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99)), wav_name)

                if not os.path.exists(os.path.dirname(wav_path)):
                    os.makedirs(os.path.dirname(wav_path))

                if verbose:
                    print
                    print "src:", os.path.split(audio_path)[1]
                    print "tgt:", wav_name
                    print "time:", starttime, endtime
                    print "orig transcription:", trs.upper().strip()

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print "normalised trans:  ", trs

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)

                if excluded:
                    if verbose:
                        print "... excluded"
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

                # Extract utterance from audio.
                segment_to_wav(tmp_wav_path, wav_path, starttime, endtime)
                size += os.path.getsize(wav_path)
                seconds += endtime - starttime
        
        os.remove(tmp_wav_path)


    return size, seconds, n_collisions, n_overwrites, n_missing_audio, n_missing_trs

コード例 #7

0

ファイルを表示

ファイル: librispeech2ufal-audio.py プロジェクト: henrypig/alex-1

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' using the `extract_wavs_trns'
    function. `args.dictionary' may refer to an open file listing the only
    words to be allowed in transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of ignored files (file basenames referred in transcription logs
                                but missing in the file system, presumably
                                because specified by one of the ignoring
                                mechanisms)

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    n_overwrites = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name,
                          fromlist=('normalise_text', 'exclude_asr',
                                    'exclude_by_dict'))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(args.infname, '*.trans.txt',
                                     ignore_list_file)

    # Process the files.
    flac_names = []
    for txt_path in txt_paths:
        if verbose:
            print "Processing", txt_path

        path_prefix = os.path.split(txt_path)[0]
        with codecs.open(txt_path, 'r', 'UTF-8') as txt_file:
            for line in txt_file:
                # Each line contains the name of the audio file and the transcription
                (flac_name, trs) = line.split(' ', 1)
                flac_names += [flac_name]

                # Process audio file
                flac_path = os.path.join(path_prefix, flac_name + '.flac')
                wav_path = os.path.join(
                    outdir, "{r:02}".format(r=random.randint(0, 99)),
                    "{r:02}".format(r=random.randint(0, 99)),
                    flac_name + '.wav')

                if not os.path.exists(os.path.dirname(wav_path)):
                    os.makedirs(os.path.dirname(wav_path))

                if not os.path.isfile(flac_path):
                    continue

                to_wav(flac_path, wav_path)
                size += os.path.getsize(wav_path)

                # Process transcription
                if verbose:
                    print
                    print "# f:", flac_name + '.flac'
                    print "orig transcription:", trs.upper().strip()

                trs = norm_mod.normalise_text(trs)

                if verbose:
                    print "normalised trans:  ", trs

                if known_words is not None:
                    excluded = norm_mod.exclude_by_dict(trs, known_words)
                else:
                    excluded = norm_mod.exclude_asr(trs)
                if excluded:
                    if verbose:
                        print "... excluded"
                    continue

                wc.update(trs.split())

                if save_transcription(wav_path + '.trn', trs):
                    n_overwrites += 1

    n_collisions = len(flac_names) - len(set(flac_names))

    return size, n_collisions, n_overwrites

コード例 #8

0

ファイルを表示

ファイル: fisherptwo2ufal-audio.py プロジェクト: sih4sing5hong5/alex

def convert(args):
    """
    Looks for recordings and transcriptions under the `args.infname'
    directory.  Converts audio files to WAVs and copies the .wav files
    and their transcriptions to `args.outdir' function. `args.dictionary' may
    refer to an open file listing the only words to be allowed in
    transcriptions in the first whitespace-separated column.

    Returns a tuple of:
        total audio size
        total audio length in seconds
        number of collisions (files at different paths with same basename)
        number of overwrites (files with the same basename as previously
                             present in `args.outdir')
        number of missing files (file basenames referred in transcription logs
                                but missing in the file system)

    """

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    size = 0
    seconds = 0
    n_overwrites = 0
    n_missing_audio = 0

    # Import the appropriate normalisation module.
    norm_mod_name = _LANG2NORMALISATION_MOD[lang]
    norm_mod = __import__(norm_mod_name, fromlist=("normalise_text", "exclude_asr", "exclude_by_dict"))

    # Read in the dictionary.
    if dict_file:
        known_words = set(line.split()[0] for line in dict_file)
        dict_file.close()
    else:
        known_words = None

    # Find transcription files.
    txt_paths = find_with_ignorelist(infname, "fe_*.txt", ignore_list_file)

    # Find all audio files, create dictionary of paths by basename.
    sph_paths = find_with_ignorelist(infname, "fe_*.sph")
    sph_dict = {os.path.split(fpath)[1]: fpath for fpath in sph_paths}
    n_collisions = len(sph_paths) - len(sph_dict)

    # Process the files.
    for txt_path in txt_paths:
        if verbose:
            print "Processing", txt_path

        txt_name = os.path.split(txt_path)[1]
        src_name = os.path.splitext(txt_name)[0]
        sph_name = src_name + ".sph"

        if not sph_name in sph_dict or not os.path.isfile(sph_dict[sph_name]):
            if verbose:
                print "Lost audio file:", sph_name
            n_missing_audio += 1
            continue

        sph_path = sph_dict[sph_name]

        utterances = []

        with codecs.open(txt_path, "r", "UTF-8") as txt_file:
            i = 1
            for line in txt_file:
                if len(line.strip()) == 0 or line.strip()[0] == "#":
                    continue  # ignore comments and empty lines

                # Each line contains start time, end time, speaker id and transcription
                (start, end, speaker, trs) = line.split(" ", 3)
                start = float(start)
                end = float(end)
                channel = 1 if speaker[0] == "A" else 2
                utterances += [{"start": start, "end": end, "trs": trs, "channel": channel}]

                i += 1

        for i in range(len(utterances)):
            utt = utterances[i]
            trs = utt["trs"]

            wav_name = "%s_%03d.wav" % (src_name, i)
            #            wav_path = os.path.join(outdir, wav_name)
            wav_path = os.path.join(
                outdir, "{r:02}".format(r=random.randint(0, 99)), "{r:02}".format(r=random.randint(0, 99)), wav_name
            )

            if not os.path.exists(os.path.dirname(wav_path)):
                os.makedirs(os.path.dirname(wav_path))

            if verbose:
                print
                print "src:", sph_name
                print "tgt:", wav_name
                print "time:", utt["start"], utt["end"]
                print "channel:", utt["channel"]
                print "orig transcription:", trs.upper().strip()

            trs = norm_mod.normalise_text(trs)

            if verbose:
                print "normalised trans:  ", trs

            if known_words is not None:
                excluded = norm_mod.exclude_by_dict(trs, known_words)
            else:
                excluded = norm_mod.exclude_asr(trs)
            if excluded:
                if verbose:
                    print "... excluded"
                continue

            # Check for very short utterances
            if utt["end"] - utt["start"] < 1:
                if verbose:
                    print "... too short"
                continue

            wc.update(trs.split())

            if save_transcription(wav_path + ".trn", trs):
                n_overwrites += 1

            # Extract utterance from audio
            tmp_path = wav_path + ".tmp"
            cmd = [
                "sph2pipe",
                "-f",
                "wav",
                "-t",
                "%f:%f" % (utt["start"], utt["end"]),
                "-c",
                str(utt["channel"]),
                sph_path,
                tmp_path,
            ]
            subprocess.call(cmd)

            # Convert to valid WAV
            to_wav(tmp_path, wav_path)
            os.remove(tmp_path)
            size += os.path.getsize(wav_path)
            seconds += utt["end"] - utt["start"]

    return size, seconds, n_collisions, n_overwrites, n_missing_audio