Ejemplo n.º 1
0
def convert(args):
    """
    Copy transcribed .wav files found under `args.infname' to `args.outdir'.

    Looks for .wav files under `args.infname' (using `find_wavs', which is
    given `args.ignore') and, in every directory holding at least one of
    them, for an `asr_transcribed.xml' call log.  Each log found is passed to
    `extract_wavs_trns' together with `args.outdir', the wav mapping, the
    set of known words, `args.language' and `args.verbose'.  Directories
    that only have a `session.xml' are counted as untranscribed, directories
    with neither file as missing; both kinds are skipped.  Statistics are
    printed to standard output.

    Arguments:
        args -- object with the attributes `infname', `outdir', `language',
                `verbose', `ignore' and `dictionary'.  `dictionary' may be
                an open file listing, in its first whitespace-separated
                column, the only words to be allowed in transcriptions;
                it is closed by this function.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (sum of the second value returned by
                             `extract_wavs_trns'; presumably files with the
                             same basename as previously present in
                             `args.outdir' -- confirm there)
        number of missing wavs (sum of the third value returned by
                               `extract_wavs_trns'; presumably basenames
                               referred in the logs but missing in the file
                               system -- confirm there)
        number of missing transcriptions (sum of the fourth value returned
                                         by `extract_wavs_trns' -- confirm
                                         its meaning there)

    """
    def report(label, value):
        # Same output as the statement form `print label, value', but valid
        # on both Python 2 and Python 3.
        print("%s %s" % (label, value))

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.  Blank lines are skipped (they used to raise
    # IndexError) and the file is closed even if reading fails.
    if dict_file:
        try:
            known_words = set()
            for line in dict_file:
                columns = line.split()
                if columns:
                    known_words.add(columns[0])
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find wavs.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)
    # Map file basenames to the directory they live in -- NOTE this is
    # destructive if multiple files have the same basename; the number of
    # entries lost that way is reported as collisions.
    # wav_mapping: wav basename -> path to call log dir
    swapped = [os.path.split(fpath) for fpath in wav_paths]
    wav_mapping = {name: prefix for (prefix, name) in swapped}
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its call log
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Iterating the raw mapping
    # values would visit a dir once per wav it contains and inflate the
    # counters below.  Sorting only makes the processing order reproducible.
    for prefix in sorted(set(wav_mapping.values())):
        norm_fname = os.path.join(prefix, 'asr_transcribed.xml')
        if os.path.isfile(norm_fname):
            sess_fnames[prefix] = norm_fname
        elif os.path.isfile(os.path.join(prefix, 'session.xml')):
            # The session was recorded but has no transcription: skip it.
            n_notnorm_trss += 1
        else:
            n_missing_trss += 1

    print("")
    report("Number of sessions:                   ", len(sess_fnames))
    report("Number of untranscribed sessions:     ", n_notnorm_trss)
    report("Number of missing sessions:           ", n_missing_trss)
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sorted(sess_fnames.items()):
        if verbose:
            report("Processing call log dir:", prefix)

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    report("Size of copied audio data:", size)

    # 16000 samples per second times 2 bytes per sample, 3600 seconds per
    # hour.  A single float division avoids truncating to whole seconds
    # under Python 2 integer division.
    hour = size / (16000 * 2 * 3600.0)

    report("Length of audio data in hours (for 16kHz 16b WAVs):", hour)
    # Return the collision, overwrite and missing-file counts.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
Ejemplo n.º 2
0
def convert(args):
    """
    Copy transcribed .wav files found under `args.infname' to `args.outdir'.

    Looks for .wav files under `args.infname' (using `find_wavs', which is
    given `args.ignore') and, in every directory holding at least one of
    them, for a transcription log.  `user-transcription.norm.xml' is
    preferred; `user-transcription.xml' and `user-transcription-all.xml' are
    used as fallbacks, in this order, and counted as unnormalised.  Each log
    found is passed to `extract_wavs_trns' together with `args.outdir', the
    wav mapping, the set of known words and `args.verbose'.  Directories
    with none of the three files are counted as missing and skipped.
    Statistics are printed to standard output.

    Arguments:
        args -- object with the attributes `infname', `outdir', `verbose',
                `ignore' and `dictionary'.  `dictionary' may be an open file
                listing, in its first whitespace-separated column, the only
                words to be allowed in transcriptions; it is closed by this
                function.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (sum of the second value returned by
                             `extract_wavs_trns'; presumably files with the
                             same basename as previously present in
                             `args.outdir' -- confirm there)
        number of missing wavs (sum of the third value returned by
                               `extract_wavs_trns'; presumably basenames
                               referred in the logs but missing in the file
                               system -- confirm there)
        number of missing transcriptions (sum of the fourth value returned
                                         by `extract_wavs_trns' -- confirm
                                         its meaning there)

    """
    def report(label, value):
        # Same output as the statement form `print label, value', but valid
        # on both Python 2 and Python 3.
        print("%s %s" % (label, value))

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.  Blank lines are skipped (they used to raise
    # IndexError) and the file is closed even if reading fails.
    if dict_file:
        try:
            known_words = set()
            for line in dict_file:
                columns = line.split()
                if columns:
                    known_words.add(columns[0])
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find wavs.  NOTE: filtering out ignored wavs is, according to the
    # original author's note, expected to be done by `find_wavs' itself, so
    # no basename-based filtering happens here any more.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)
    # Map file basenames to the directory they live in -- NOTE this is
    # destructive if multiple files have the same basename; the number of
    # entries lost that way is reported as collisions.
    # wav_mapping: wav basename -> path to call log dir
    swapped = [os.path.split(fpath) for fpath in wav_paths]
    wav_mapping = {name: prefix for (prefix, name) in swapped}
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    # Candidate log basenames in order of preference; only the first one is
    # a normalised transcription.
    trs_basenames = ('user-transcription.norm.xml',
                     'user-transcription.xml',
                     'user-transcription-all.xml')
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its transcription log
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Iterating the raw mapping
    # values would visit a dir once per wav it contains and inflate the
    # counters below.  Sorting only makes the processing order reproducible.
    for prefix in sorted(set(wav_mapping.values())):
        for rank, basename in enumerate(trs_basenames):
            trs_fname = os.path.join(prefix, basename)
            if os.path.isfile(trs_fname):
                sess_fnames[prefix] = trs_fname
                if rank > 0:
                    # One of the fallbacks had to be used.
                    n_notnorm_trss += 1
                break
        else:
            # None of the candidate logs exists in this dir.
            n_missing_trss += 1

    print("")
    report("Number of sessions                   :", len(sess_fnames))
    report("Number of unnormalised transcriptions:", n_notnorm_trss)
    report("Number of missing transcriptions     :", n_missing_trss)
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sorted(sess_fnames.items()):
        if verbose:
            report("Processing call log dir:", prefix)
            report(" session file name:     ", call_log)

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    report("Size of copied audio data:", size)

    # 16000 samples per second times 2 bytes per sample, 3600 seconds per
    # hour.  A single float division avoids truncating to whole seconds
    # under Python 2 integer division.
    hour = size / (16000 * 2 * 3600.0)

    report("Length of audio data in hours (for 16kHz 16b WAVs):", hour)
    # Return the collision, overwrite and missing-file counts.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
def convert(args):
    """
    Copy transcribed .wav files found under `args.infname' to `args.outdir'.

    Looks for .wav files under `args.infname' (using `find_wavs', which is
    given `args.ignore') and, in every directory holding at least one of
    them, for an `asr_transcribed.xml' call log.  Each log found is passed to
    `extract_wavs_trns' together with `args.outdir', the wav mapping, the
    set of known words, `args.language' and `args.verbose'.  Directories
    that only have a `session.xml' are counted as untranscribed, directories
    with neither file as missing; both kinds are skipped.  Statistics are
    printed to standard output.

    Arguments:
        args -- object with the attributes `infname', `outdir', `language',
                `verbose', `ignore' and `dictionary'.  `dictionary' may be
                an open file listing, in its first whitespace-separated
                column, the only words to be allowed in transcriptions;
                it is closed by this function.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (sum of the second value returned by
                             `extract_wavs_trns'; presumably files with the
                             same basename as previously present in
                             `args.outdir' -- confirm there)
        number of missing wavs (sum of the third value returned by
                               `extract_wavs_trns'; presumably basenames
                               referred in the logs but missing in the file
                               system -- confirm there)
        number of missing transcriptions (sum of the fourth value returned
                                         by `extract_wavs_trns' -- confirm
                                         its meaning there)

    """
    def report(label, value):
        # Same output as the statement form `print label, value', but valid
        # on both Python 2 and Python 3.
        print("%s %s" % (label, value))

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.  Blank lines are skipped (they used to raise
    # IndexError) and the file is closed even if reading fails.
    if dict_file:
        try:
            known_words = set()
            for line in dict_file:
                columns = line.split()
                if columns:
                    known_words.add(columns[0])
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find wavs.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)
    # Map file basenames to the directory they live in -- NOTE this is
    # destructive if multiple files have the same basename; the number of
    # entries lost that way is reported as collisions.
    # wav_mapping: wav basename -> path to call log dir
    swapped = [os.path.split(fpath) for fpath in wav_paths]
    wav_mapping = {name: prefix for (prefix, name) in swapped}
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its call log
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Iterating the raw mapping
    # values would visit a dir once per wav it contains and inflate the
    # counters below.  Sorting only makes the processing order reproducible.
    for prefix in sorted(set(wav_mapping.values())):
        norm_fname = os.path.join(prefix, 'asr_transcribed.xml')
        if os.path.isfile(norm_fname):
            sess_fnames[prefix] = norm_fname
        elif os.path.isfile(os.path.join(prefix, 'session.xml')):
            # The session was recorded but has no transcription: skip it.
            n_notnorm_trss += 1
        else:
            n_missing_trss += 1

    print("")
    report("Number of sessions:                   ", len(sess_fnames))
    report("Number of untranscribed sessions:     ", n_notnorm_trss)
    report("Number of missing sessions:           ", n_missing_trss)
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sorted(sess_fnames.items()):
        if verbose:
            report("Processing call log dir:", prefix)

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    report("Size of copied audio data:", size)

    # 16000 samples per second times 2 bytes per sample, 3600 seconds per
    # hour.  A single float division avoids truncating to whole seconds
    # under Python 2 integer division.
    hour = size / (16000 * 2 * 3600.0)

    report("Length of audio data in hours (for 16kHz 16b WAVs):", hour)
    # Return the collision, overwrite and missing-file counts.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs
Ejemplo n.º 4
0
def convert(args):
    """
    Copy transcribed .wav files found under `args.infname' to `args.outdir'.

    Looks for .wav files under `args.infname' (using `find_wavs', which is
    given `args.ignore') and, in every directory holding at least one of
    them, for a transcription log.  `user-transcription.norm.xml' is
    preferred; `user-transcription.xml' and `user-transcription-all.xml' are
    used as fallbacks, in this order, and counted as unnormalised.  Each log
    found is passed to `extract_wavs_trns' together with `args.outdir', the
    wav mapping, the set of known words and `args.verbose'.  Directories
    with none of the three files are counted as missing and skipped.
    Statistics are printed to standard output.

    Arguments:
        args -- object with the attributes `infname', `outdir', `verbose',
                `ignore' and `dictionary'.  `dictionary' may be an open file
                listing, in its first whitespace-separated column, the only
                words to be allowed in transcriptions; it is closed by this
                function.

    Returns a tuple of:
        number of collisions (files at different paths with same basename)
        number of overwrites (sum of the second value returned by
                             `extract_wavs_trns'; presumably files with the
                             same basename as previously present in
                             `args.outdir' -- confirm there)
        number of missing wavs (sum of the third value returned by
                               `extract_wavs_trns'; presumably basenames
                               referred in the logs but missing in the file
                               system -- confirm there)
        number of missing transcriptions (sum of the fourth value returned
                                         by `extract_wavs_trns' -- confirm
                                         its meaning there)

    """
    def report(label, value):
        # Same output as the statement form `print label, value', but valid
        # on both Python 2 and Python 3.
        print("%s %s" % (label, value))

    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    verbose = args.verbose
    ignore_list_file = args.ignore
    dict_file = args.dictionary

    # Read in the dictionary.  Blank lines are skipped (they used to raise
    # IndexError) and the file is closed even if reading fails.
    if dict_file:
        try:
            known_words = set()
            for line in dict_file:
                columns = line.split()
                if columns:
                    known_words.add(columns[0])
        finally:
            dict_file.close()
    else:
        known_words = None

    # Find wavs.  NOTE: filtering out ignored wavs is, according to the
    # original author's note, expected to be done by `find_wavs' itself, so
    # no basename-based filtering happens here any more.
    wav_paths = find_wavs(infname, ignore_list_file=ignore_list_file)
    # Map file basenames to the directory they live in -- NOTE this is
    # destructive if multiple files have the same basename; the number of
    # entries lost that way is reported as collisions.
    # wav_mapping: wav basename -> path to call log dir
    swapped = [os.path.split(fpath) for fpath in wav_paths]
    wav_mapping = {name: prefix for (prefix, name) in swapped}
    n_collisions = len(wav_paths) - len(wav_mapping)

    # Get all transcription logs.
    # Candidate log basenames in order of preference; only the first one is
    # a normalised transcription.
    trs_basenames = ('user-transcription.norm.xml',
                     'user-transcription.xml',
                     'user-transcription-all.xml')
    n_notnorm_trss = 0
    n_missing_trss = 0
    # sess_fnames: call log dir -> path to its transcription log
    sess_fnames = dict()
    # Visit every call log dir exactly once.  Iterating the raw mapping
    # values would visit a dir once per wav it contains and inflate the
    # counters below.  Sorting only makes the processing order reproducible.
    for prefix in sorted(set(wav_mapping.values())):
        for rank, basename in enumerate(trs_basenames):
            trs_fname = os.path.join(prefix, basename)
            if os.path.isfile(trs_fname):
                sess_fnames[prefix] = trs_fname
                if rank > 0:
                    # One of the fallbacks had to be used.
                    n_notnorm_trss += 1
                break
        else:
            # None of the candidate logs exists in this dir.
            n_missing_trss += 1

    print("")
    report("Number of sessions                   :", len(sess_fnames))
    report("Number of unnormalised transcriptions:", n_notnorm_trss)
    report("Number of missing transcriptions     :", n_missing_trss)
    print("")

    # Copy files referred in the transcription logs to `outdir'.
    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for prefix, call_log in sorted(sess_fnames.items()):
        if verbose:
            report("Processing call log dir:", prefix)
            report(" session file name:     ", call_log)

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(prefix, call_log, outdir, wav_mapping,
                              known_words, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    # Print statistics.
    report("Size of copied audio data:", size)

    # 16000 samples per second times 2 bytes per sample, 3600 seconds per
    # hour.  A single float division avoids truncating to whole seconds
    # under Python 2 integer division.
    hour = size / (16000 * 2 * 3600.0)

    report("Length of audio data in hours (for 16kHz 16b WAVs):", hour)
    # Return the collision, overwrite and missing-file counts.
    return n_collisions, n_overwrites, n_missing_wav, n_missing_trs