Esempio n. 1
0
def convert(args):
    # TODO docstring
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    trs_only = args.only_transcriptions
    ignore_list_file = args.ignore
    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        for path_or_glob in ignore_list_file:
            path_or_glob = path_or_glob.rstrip('\n')
            # For lines that list absolute paths,
            if os.path.abspath(path_or_glob) == os.path.normpath(path_or_glob):
                # add them to the list of paths to ignore.
                ignore_paths.add(path_or_glob)
            # For other lines, treat them as basename globs.
            else:
                ignore_globs.add(path_or_glob)
        ignore_list_file.close()

    # Get all but the ignored transcriptions.
    if os.path.isdir(infname):
        trs_paths = find(infname, '*.trs', ignore_globs=ignore_globs, ignore_paths=ignore_paths)
    else:
        trs_paths = list()
        with open(infname, 'r') as inlist:
            for line in inlist:
                trs_paths.extend(find(line.strip(), '*.trs',
                                      mindepth=1, maxdepth=1,
                                      ignore_globs=ignore_globs,
                                      ignore_paths=ignore_paths))

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for trs_path in trs_paths:

        if verbose:
            print u"Processing transcription file: ", trs_path

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(trs_path, outdir, trs_only, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    print u"Size of copied audio data:", size

    sec = size / (2*16000)
    hour = sec / 3600.0

    print u"Length of audio data in hours (for 16kHz 16bit WAVs output):", hour
    # Return the number of file collisions and overwrites.
    return n_overwrites, n_missing_wav, n_missing_trs
Esempio n. 2
0
def get_call_data_from_fs(rootdir):
    """
    Collects sizes and timestamps of calls from call-log directories.

    Arguments:
        rootdir -- the directory passed to `find' to look up the `voip-*'
            call-log directories

    The following naming is assumed:
        call-log directory basenames: voip-<phone>-<YYMMDD_HHMMSS>
        call-log files:
            jurcic-<num>-<YYMMDD_HHMMSS>_<ms-start>_<ms-end>.wav
            where num =~ /\\d\\d\\d/, ms-start =~ /\\d{7}/, ms-end =~ /\\d{7}/
            (ms-start and ms-end actually denote hundredths of seconds, not
            milliseconds)

    Returns a tuple (call_size, call_timestamps) of two dictionaries keyed
    by the phone number.  The first one maps to the total size in bytes of
    the matching wavs, the second one to the list of call timestamps.

    """
    wav_glob = 'jurcic-???-??????_??????_???????_???????.wav'

    # Split the basename of every call-log directory on dashes.  The first
    # item of the split is handed to `set_and_ret' together with the full
    # path (presumably to be replaced by it -- the path is read back from
    # index 0 below).
    records = []
    for log_dir in find(rootdir, 'voip-*', mindepth=0, prune=True):
        records.append(set_and_ret(basename(log_dir).split('-'), 0, log_dir))

    sizes_by_phone = {}
    stamps_by_phone = {}
    for record in records:
        log_dir, phone_no, stamp_str = record[0], record[1], record[2]
        # The date string reads YYMMDD_HHMMSS; years are counted from 2000.
        called_at = datetime(2000 + int(stamp_str[:2]),
                             int(stamp_str[2:4]),
                             int(stamp_str[4:6]),
                             int(stamp_str[7:9]),
                             int(stamp_str[9:11]),
                             int(stamp_str[11:13]))
        stamp = get_timestamp(called_at)
        # Sum up the sizes (in bytes) of all wavs logged for this call.
        n_bytes = sum(getsize(wav)
                      for wav in iglob(os.path.join(log_dir, wav_glob)))
        # Record the call under its phone number.
        stamps_by_phone.setdefault(phone_no, []).append(stamp)
        sizes_by_phone[phone_no] = sizes_by_phone.get(phone_no, 0) + n_bytes
    return sizes_by_phone, stamps_by_phone
Esempio n. 3
0
def get_call_data_from_fs(rootdir):
    """
    Gathers the total size and the timestamps of calls for each phone number.

    Arguments:
        rootdir -- where `find' should look for `voip-*' call-log directories

    Returns the pair (call_size, call_timestamps).  Both are dictionaries
    indexed by the phone number: `call_size' holds the total size in bytes
    of the call-log wavs, `call_timestamps' holds a list with one timestamp
    per call-log directory.

    """
    # Call-log directory basenames are assumed to look like this:
    #   voip-<phone>-<YYMMDD_HHMMSS>
    # and call-log files like this:
    #   jurcic-<num>-<YYMMDD_HHMMSS>_<ms-start>_<ms-end>.wav
    # with three digits for num and seven digits for both ms-start and
    # ms-end (which are in fact hundredths of seconds, not milliseconds).
    wav_pattern = 'jurcic-???-??????_??????_???????_???????.wav'

    def parse_call_time(text):
        # `text' reads YYMMDD_HHMMSS; the year is taken to be 20YY.
        return datetime(year=2000 + int(text[:2]),
                        month=int(text[2:4]),
                        day=int(text[4:6]),
                        hour=int(text[7:9]),
                        minute=int(text[9:11]),
                        second=int(text[11:13]))

    log_dirs = find(rootdir, 'voip-*', mindepth=0, prune=True)
    # Each entry is the dash-split basename as processed by `set_and_ret',
    # called with index 0 and the full path; index 0 is read back as the
    # path below.
    entries = [set_and_ret(basename(path).split('-'), 0, path)
               for path in log_dirs]

    timestamps = dict()
    sizes = dict()
    for entry in entries:
        call_dir = entry[0]
        phone_no = entry[1]
        when = get_timestamp(parse_call_time(entry[2]))
        # Add up the sizes of the wavs of this call (in bytes).
        call_bytes = 0
        for wav_path in iglob(os.path.join(call_dir, wav_pattern)):
            call_bytes += getsize(wav_path)
        # Store the timestamp.
        if phone_no in timestamps:
            timestamps[phone_no].append(when)
        else:
            timestamps[phone_no] = [when]
        # Store the size.
        try:
            sizes[phone_no] += call_bytes
        except KeyError:
            sizes[phone_no] = call_bytes
    return sizes, timestamps
Esempio n. 4
0
    # Command-line options: the input directory, the output directory and
    # the verbosity switch.
    parser.add_argument('-i', '--input',
                        action="store",
                        help='an input directory with all wav files')
    parser.add_argument('-o', '--output',
                        action="store",
                        help='an output directory for the converted wav')
    parser.add_argument('-v',
                        action="store_true",
                        dest="verbose",
                        help='set verbose output')
    
    args = parser.parse_args()

    
    # Look up the `*.trn' files for the input directory; the depth limits
    # are interpreted by `find'.
    trn_files = find(args.input, '*.trn', mindepth=1, maxdepth=5)
    
    for fn in trn_files:
        if args.verbose:
            print "Processing file:", fn
            
        # Resolve symlinks first, so that the real location is checked.
        real_fn = os.path.realpath(fn)
        base_fn = os.path.basename(fn)

        # The wav is expected at the path of the `.trn' file with '.trn'
        # removed.
        # NOTE(review): str.replace removes every occurrence of '.trn' in
        # the path, not just the trailing extension -- confirm that paths
        # never contain '.trn' elsewhere.
        wav_fn = real_fn.replace('.trn', '')
        trn_fn = real_fn

        # Skip the file if the wav or its `.trn' companion does not exist.
        if not os.path.exists(wav_fn) or not os.path.exists(wav_fn+'.trn'):
            print "Does not exists {fn} or {fnt}".format(fn=wav_fn, fnt=wav_fn+'.trn')
            continue
Esempio n. 5
0
                        help='a list of test wav files')
    parser.add_argument('-v',
                        action="store_true",
                        dest="verbose",
                        help='set verbose output')
    
    args = parser.parse_args()

    
    # Read the basenames of the files listed for the dev and the test set.
    with open(args.devlist, 'r') as f:
        dev_files = set([os.path.basename(fn.strip()) for fn in f.readlines()])
    with open(args.testlist, 'r') as f:
        test_files = set([os.path.basename(fn.strip()) for fn in f.readlines()])
        
    
    # Look up the `*.wav' files for the `all' directory; the depth limits
    # are interpreted by `find'.
    all_files = find(args.all, '*.wav', mindepth=1, maxdepth=5)
    
    # Symlink every wav into the dev, test or train directory, depending on
    # which list (if any) contains its basename.
    # NOTE(review): the paths are put into the shell commands unquoted, so
    # names with spaces or shell metacharacters would break the commands.
    # NOTE(review): the link source is built from the basename only, so the
    # subdirectory of a wav found deeper below `args.all' is dropped --
    # confirm that all wavs sit directly in `args.all'.
    for fn in all_files:
        if args.verbose:
            print "Processing file:", fn
            
        base_fn = os.path.basename(fn)
        
        if base_fn in dev_files:
            # Link the wav and its `.trn' companion into the dev directory.
            os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.dev, base_fn)))
            os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn+'.trn'), tgt = os.path.join(args.dev, base_fn+'.trn')))
        elif base_fn in test_files:
            # Link the wav and its `.trn' companion into the test directory.
            os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.test, base_fn)))
            os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn+'.trn'), tgt = os.path.join(args.test, base_fn+'.trn')))
        else:
            # Wavs in neither list are linked into the train directory.
            os.system("ln -s {src} {tgt}".format(src = os.path.join('..', args.all, base_fn), tgt = os.path.join(args.train, base_fn)))
Esempio n. 6
0
    # Command-line options: the input directory, the output directory and
    # the verbosity switch.
    parser.add_argument('-i',
                        '--input',
                        action="store",
                        help='an input directory with all wav files')
    parser.add_argument('-o',
                        '--output',
                        action="store",
                        help='an output directory for the converted wav')
    parser.add_argument('-v',
                        action="store_true",
                        dest="verbose",
                        help='set verbose output')

    args = parser.parse_args()

    # Look up the `*.trn' files for the input directory; the depth limits
    # are interpreted by `find'.
    trn_files = find(args.input, '*.trn', mindepth=1, maxdepth=5)

    for fn in trn_files:
        if args.verbose:
            print "Processing file:", fn

        # Resolve symlinks first, so that the real location is checked.
        real_fn = os.path.realpath(fn)
        base_fn = os.path.basename(fn)

        # The wav is expected at the path of the `.trn' file with '.trn'
        # removed.
        # NOTE(review): str.replace removes every occurrence of '.trn' in
        # the path, not just the trailing extension -- confirm that paths
        # never contain '.trn' elsewhere.
        wav_fn = real_fn.replace('.trn', '')
        trn_fn = real_fn

        # Skip the file if the wav or its `.trn' companion does not exist.
        if not os.path.exists(wav_fn) or not os.path.exists(wav_fn + '.trn'):
            print "Does not exists {fn} or {fnt}".format(fn=wav_fn,
                                                         fnt=wav_fn + '.trn')
            continue
def convert(args):
    # TODO docstring
    # Unpack the arguments.
    infname = args.infname
    outdir = args.outdir
    lang = args.language
    verbose = args.verbose
    trs_only = args.only_transcriptions
    ignore_list_file = args.ignore
    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        for path_or_glob in ignore_list_file:
            path_or_glob = path_or_glob.rstrip('\n')
            # For lines that list absolute paths,
            if os.path.abspath(path_or_glob) == os.path.normpath(path_or_glob):
                # add them to the list of paths to ignore.
                ignore_paths.add(path_or_glob)
            # For other lines, treat them as basename globs.
            else:
                ignore_globs.add(path_or_glob)
        ignore_list_file.close()

    # Get all but the ignored transcriptions.
    if os.path.isdir(infname):
        trs_paths = find(infname,
                         '*.trs',
                         mindepth=1,
                         ignore_globs=ignore_globs,
                         ignore_paths=ignore_paths)
    else:
        trs_paths = list()
        with open(infname, 'r') as inlist:
            for line in inlist:
                trs_paths.extend(
                    find(line.strip(),
                         '*.trs',
                         mindepth=1,
                         maxdepth=1,
                         ignore_globs=ignore_globs,
                         ignore_paths=ignore_paths))

    size = 0
    n_overwrites = 0
    n_missing_wav = 0
    n_missing_trs = 0
    for trs_path in trs_paths:

        if verbose:
            print u"Processing transcription file: ", trs_path

        cursize, cur_n_overwrites, cur_n_missing_wav, cur_n_missing_trs = \
            extract_wavs_trns(trs_path, outdir, trs_only, lang, verbose)
        size += cursize
        n_overwrites += cur_n_overwrites
        n_missing_wav += cur_n_missing_wav
        n_missing_trs += cur_n_missing_trs

    print u"Size of copied audio data:", size

    sec = size / (2 * 16000)
    hour = sec / 3600.0

    print u"Length of audio data in hours (for 16kHz 16bit WAVs output):", hour
    # Return the number of file collisions and overwrites.
    return n_overwrites, n_missing_wav, n_missing_trs
Esempio n. 8
0
def find_with_ignorelist(infname, pat, ignore_list_file=None, find_kwargs=None):
    """
    Finds specific files below the paths specified and returns their filenames.

    Arguments:
        infname -- either a directory, or a file.  In the first case, files
            matching `pat' are looked for below that directory.  In the
            latter case, the file is read line by line, each line specifying
            a directory or a glob determining the files to include.
        pat -- globbing pattern specifying the files to look for
        ignore_list_file -- an open file of absolute paths or globs (can be
            mixed) specifying files that should be excluded from the
            results; the file is closed by this function
        find_kwargs -- if provided, this dictionary is used as additional
            keyword arguments for the function `utils.fs.find' for finding
            positive examples of files (not the ignored ones); the
            dictionary passed in is copied and left untouched

    Returns a set of paths to files satisfying the criteria.

    """
    # Work on a private copy, so that neither the caller's dictionary nor
    # a default value shared between calls can get modified.
    find_kwargs = dict(find_kwargs) if find_kwargs is not None else {}

    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        # Make sure the file gets closed even if reading it fails.
        try:
            for path_or_glob in ignore_list_file:
                path_or_glob = path_or_glob.rstrip("\n")
                # Blank lines specify neither a path nor a glob.
                if not path_or_glob:
                    continue
                # For lines that list absolute paths,
                if os.path.abspath(path_or_glob) == os.path.normpath(path_or_glob):
                    # add them to the list of paths to ignore.
                    ignore_paths.add(path_or_glob)
                # For other lines, treat them as basename globs.
                else:
                    ignore_globs.add(path_or_glob)
        finally:
            ignore_list_file.close()

    # Get all files matching `pat', skipping ignore globs and ignore paths.
    #
    # First option: the infile is actually a directory.  Then, take all
    # matching files from below that directory.
    if os.path.isdir(infname):
        find_kwargs = _build_find_kwargs(find_kwargs, ignore_globs=ignore_globs, ignore_paths=ignore_paths)
        if "mindepth" not in find_kwargs:
            find_kwargs["mindepth"] = 1
        file_paths = set(find(infname, pat, **find_kwargs))
    # Second option: the infile is a file listing all paths to check for
    # matching files.
    else:
        file_paths = set()
        find_kwargs = _build_find_kwargs(
            find_kwargs, mindepth=1, maxdepth=1, ignore_globs=ignore_globs, ignore_paths=ignore_paths
        )
        with open(infname, "r") as inlist:
            for line in inlist:
                line = line.rstrip("\n")
                # If the line contains directories:
                if os.path.isdir(line):
                    file_paths.update(find(line, pat, **find_kwargs))
                # If it is not a directory name, treat the line as a file glob.
                else:
                    new_paths = [os.path.abspath(f) for f in glob.glob(line)]
                    file_paths.update(new_paths)

    # Find all files in ignore paths and remove them from the returned files,
    # to be sure that symlinks from other, not ignored paths did not add them.
    for ignore_path in ignore_paths:
        file_paths.difference_update(find(ignore_path, pat, mindepth=1, maxdepth=1))
    for ignore_glob in ignore_globs:
        file_paths.difference_update(os.path.abspath(fname) for fname in glob.glob(ignore_glob))

    return file_paths
Esempio n. 9
0
                        action="store",
                        help='a list of test wav files')
    parser.add_argument('-v',
                        action="store_true",
                        dest="verbose",
                        help='set verbose output')

    args = parser.parse_args()

    # Read the basenames of the files listed for the dev and the test set.
    with open(args.devlist, 'r') as f:
        dev_files = set([os.path.basename(fn.strip()) for fn in f.readlines()])
    with open(args.testlist, 'r') as f:
        test_files = set(
            [os.path.basename(fn.strip()) for fn in f.readlines()])

    # Look up the `*.wav' files for the `all' directory; the depth limits
    # are interpreted by `find'.
    all_files = find(args.all, '*.wav', mindepth=1, maxdepth=5)

    # NOTE(review): the paths are put into the shell commands unquoted, so
    # names with spaces or shell metacharacters would break the commands.
    # NOTE(review): the link source is built from the basename only, so the
    # subdirectory of a wav found deeper below `args.all' is dropped --
    # confirm that all wavs sit directly in `args.all'.
    for fn in all_files:
        if args.verbose:
            print "Processing file:", fn

        base_fn = os.path.basename(fn)

        if base_fn in dev_files:
            # Link the wav and its `.trn' companion into the dev directory.
            os.system("ln -s {src} {tgt}".format(
                src=os.path.join('..', args.all, base_fn),
                tgt=os.path.join(args.dev, base_fn)))
            os.system("ln -s {src} {tgt}".format(
                src=os.path.join('..', args.all, base_fn + '.trn'),
                tgt=os.path.join(args.dev, base_fn + '.trn')))
        elif base_fn in test_files:
Esempio n. 10
0
def find_with_ignorelist(infname,
                         pat,
                         ignore_list_file=None,
                         find_kwargs=None):
    """
    Finds specific files below the paths specified and returns their filenames.

    Arguments:
        infname -- either a directory, or a file.  In the first case, files
            matching `pat' are looked for below that directory.  In the
            latter case, the file is read line by line, each line specifying
            a directory or a glob determining the files to include.
        pat -- globbing pattern specifying the files to look for
        ignore_list_file -- an open file of absolute paths or globs (can be
            mixed) specifying files that should be excluded from the
            results; the file is closed by this function
        find_kwargs -- if provided, this dictionary is used as additional
            keyword arguments for the function `utils.fs.find' for finding
            positive examples of files (not the ignored ones); the
            dictionary passed in is copied and left untouched

    Returns a set of paths to files satisfying the criteria.

    """
    # Work on a private copy, so that neither the caller's dictionary nor
    # a default value shared between calls can get modified.
    find_kwargs = dict(find_kwargs) if find_kwargs is not None else {}

    # Read in the ignore list.
    ignore_paths = set()
    ignore_globs = set()
    if ignore_list_file:
        # Make sure the file gets closed even if reading it fails.
        try:
            for path_or_glob in ignore_list_file:
                path_or_glob = path_or_glob.rstrip('\n')
                # Blank lines specify neither a path nor a glob.
                if not path_or_glob:
                    continue
                # For lines that list absolute paths,
                if (os.path.abspath(path_or_glob) ==
                        os.path.normpath(path_or_glob)):
                    # add them to the list of paths to ignore.
                    ignore_paths.add(path_or_glob)
                # For other lines, treat them as basename globs.
                else:
                    ignore_globs.add(path_or_glob)
        finally:
            ignore_list_file.close()

    # Get all files matching `pat', skipping ignore globs and ignore paths.
    #
    # First option: the infile is actually a directory.  Then, take all
    # matching files from below that directory.
    if os.path.isdir(infname):
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        if 'mindepth' not in find_kwargs:
            find_kwargs['mindepth'] = 1
        file_paths = set(find(infname, pat, **find_kwargs))
    # Second option: the infile is a file listing all paths to check for
    # matching files.
    else:
        file_paths = set()
        find_kwargs = _build_find_kwargs(find_kwargs,
                                         mindepth=1,
                                         maxdepth=1,
                                         ignore_globs=ignore_globs,
                                         ignore_paths=ignore_paths)
        with open(infname, 'r') as inlist:
            for line in inlist:
                line = line.rstrip('\n')
                # If the line contains directories:
                if os.path.isdir(line):
                    file_paths.update(find(line, pat, **find_kwargs))
                # If it is not a directory name, treat the line as a file glob.
                else:
                    new_paths = [os.path.abspath(f) for f in glob.glob(line)]
                    file_paths.update(new_paths)

    # Find all files in ignore paths and remove them from the returned files,
    # to be sure that symlinks from other, not ignored paths did not add them.
    for ignore_path in ignore_paths:
        file_paths.difference_update(
            find(ignore_path, pat, mindepth=1, maxdepth=1))
    for ignore_glob in ignore_globs:
        file_paths.difference_update(
            os.path.abspath(fname) for fname in glob.glob(ignore_glob))

    return file_paths