def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sessions",
                        help="list of sessions to process (defaults to all)")
    parser.add_argument("--chans",
                        help="list of channels to process (defaults to all)")
    parser.add_argument(
        "--refine",
        help="path to output of 1st pass (runs a refinement pass if provided)")

    parser.add_argument("audiopath", help="path to audio data")
    parser.add_argument("outpath", help="path to output alignment data")
    args = parser.parse_args()

    if args.sessions is None:
        # Iterating the session metadata dict yields all session names
        sessions = tu.chime_data()
    else:
        sessions = args.sessions.split()

    if args.chans is None:
        chans = None
    else:
        chans = args.chans.split()

    if args.refine:
        # The alignment refinement pass.
        for session in sessions:
            refine_session(session, args.audiopath, args.refine, args.outpath)
    else:
        # The initial alignment pass.
        for session in sessions:
            print(session, chans)
            align_session(session, args.audiopath, args.outpath, chans=chans)
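# Hypothetical invocations of the two passes (the script name and all paths
# below are illustrative, not from the source):
#
#   python align_audio.py --sessions "S03 S09" /data/audio /data/align_pass1
#   python align_audio.py --refine /data/align_pass1 /data/audio /data/align_pass2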
def process_all_devices(session, linear_fit_data, inpath, outpath, sox_path):
    """Process all devices."""

    chime_data = tu.chime_data()
    dataset = chime_data[session]['dataset']
    devices = chime_data[session]['pids'] + chime_data[session]['kinects']

    session_fits = linear_fit_data[session]

    # Note: the first pid in the devices list is the reference device to which
    # all others are aligned, so it is expected to be absent from session_fits
    # and is skipped by the check below. (Alternatively, a pad of 0 and a scale
    # of 1 could be inserted into the json so that it is still processed
    # through sox.)
    for device in devices:
        if device not in session_fits:
            print(f'WARNING: device {device} missing for session {session}')
            continue

        linear_fit = session_fits[device]
        
        if device[0] == 'P':
            # Process a binaural mic signal (a single file)
            name = f'{session}_{device}'
            process_device(session, device, linear_fit, inpath, outpath,
                           dataset, name, sox_path)
        elif device[0] == 'U':
            # Process a kinect signal (one file per channel)
            for chan in (1, 2, 3, 4):
                name = f'{session}_{device}.CH{chan}'
                process_device(session, device, linear_fit, inpath, outpath,
                               dataset, name, sox_path)
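# A minimal sketch of the kind of sox call process_device() might wrap, based
# on the pad/scale comment above. The function name, parameters and effect
# values here are illustrative assumptions, not the actual implementation.
import subprocess

def process_device_sketch(in_fn, out_fn, pad_seconds, speed_scale, sox_path='.'):
    """Illustrative only: pad the start of a channel and rescale its clock."""
    # 'pad' prepends silence to absorb a start offset; 'speed' applies a small
    # rate factor (e.g. 1.00001) modelling linear clock drift.
    subprocess.run([f'{sox_path}/sox', in_fn, out_fn,
                    'pad', str(pad_seconds),
                    'speed', str(speed_scale)],
                   check=True)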
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sessions", help="list of sessions to process (defaults to all)")   
    parser.add_argument("--sox_path", help="path for sox command (defaults to .)")  
    parser.add_argument("clock_drift_data", help="json file storing clock drift data") 
    parser.add_argument("inpath", help="path to input audio")  
    parser.add_argument("outpath", help="path to output audio")   
 
    args = parser.parse_args()

    if args.sox_path is None:
        sox_path = "."
    else:
        sox_path = args.sox_path

    if args.sessions is None:
        sessions = tu.chime_data()
    else:
        sessions = args.sessions.split()

    with open(args.clock_drift_data) as f:
        linear_fit_data = json.load(f)

    for session in sessions:
        process_all_devices(session, linear_fit_data, args.inpath, args.outpath, sox_path)
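# Hypothetical usage (script, file, and directory names are illustrative):
#
#   python correct_signals_for_clock_drift.py --sox_path /usr/local/bin \
#       clock_drift.json /data/audio_raw /data/audio_synced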
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sessions",
                        help="list of sessions to process (defaults to all)")
    parser.add_argument("--clock_drift_data", help="json file storing clock drift data")  
    parser.add_argument('--chime5', action='store_true', help="write transcript file in older CHiME-5 format") 
    parser.add_argument("in_path", help="path for the input transcription file")
    parser.add_argument("out_path", help="path for the output transcription files")
    
    args = parser.parse_args()
    if args.sessions is None:
        sessions = tu.chime_data()
    else:
        sessions = args.sessions.split()

    print(f'CHiME-5 format: {args.chime5}')
    with open(args.clock_drift_data) as f:
        linear_fit_data = json.load(f)

    for session in sessions:
        try:
            print(session)
            correct_transcription_for_clock_drift(session, linear_fit_data, args.in_path, args.out_path, 
                                                  chime5_mode=args.chime5)
        except Exception:
            traceback.print_exc()
def refine_session(session, audiopath, inpath, outpath):
    """Refine alignment of all channels within a given session."""
    chime_data = tu.chime_data()

    ref = chime_data[session]['pids'][0]
    pids = chime_data[session]['pids'][1:]
    kinects = chime_data[session]['kinects']
    with open(f'{inpath}/align.{session}.p', 'rb') as f:
        all_results = pickle.load(f)
    kinects = sorted(set(kinects).intersection(all_results.keys()))
    print(session)

    # Merge the results of the left and right channel alignments
    for channel in pids + kinects:
        results = all_results[channel]
        lag = down_mix_lags(results)
        results['lag'] = scipy.signal.medfilt(lag, 9)

    # Compute the linear fit for modelling clock drift
    for channel in pids:
        results = all_results[channel]
        results['linear_fit'] = clock_drift_linear_fit(results)

    # Refine kinect alignments - i.e. reanalyse on finer time
    # scale in regions where big jumps in offset occur and
    # apply a bit of smoothing to remove spurious estimates
    for channel in kinects:
        results = all_results[channel]
        refine_kinect_lags(results,
                           audiopath,
                           session=session,
                           target_chan=channel,
                           ref_chan=ref)
        results['lag'] = scipy.signal.medfilt(results['lag'], 7)

    with open(f'{outpath}/align.{session}.p', 'wb') as f:
        pickle.dump(all_results, f)
def refine_kinect_lags(results, audiopath, session, target_chan, ref_chan):
    """Refine alignment around big jumps in lag.
    
    The initial alignment is computed at 10 second intervals. If the alignment
    changes by a large amount (>50 ms) during a single 10 second step then the
    alignment is recomputed at a resolution of 1 second intervals.

    Arguments:
    results -- the alignment returned by align_channels()
    audiopath -- the directory containing the audio data
    session -- the name of the session to process (e.g. 'S10')
    target_chan -- the name of the kinect channel to process (e.g. 'U01')
    ref_chan -- the name of the reference binaural recorder (e.g. 'P34')

    Returns:
    None. The function updates the contents of results in place rather than
    returning a new results dictionary.
    """
    threshold = 0.05
    search_duration = KINECT_SEARCH_DURATION
    template_duration = KINECT_TEMPLATE_DURATION
    chime_data = tu.chime_data()

    times = np.array(results['times'])
    lag = np.array(results['lag'])
    if len(times) != len(lag):
        # This happens for the one case where a kinect was turned off early
        # and 15 minutes of audio got lost
        print('WARNING: missing lags')
        times = times[:len(lag)]
    dlag = np.diff(lag)
    jump_times = times[1:][dlag > threshold]
    analysis_times = set()

    for time in jump_times:
        analysis_times |= set(range(time - 10, time + 10))
    analysis_times = sorted(analysis_times)
    print(f'{len(analysis_times)} analysis times to refine')

    if len(analysis_times) > 0:
        missing = None
        if ('missing' in chime_data[session]
                and target_chan in chime_data[session]['missing']):
            missing = chime_data[session]['missing'][target_chan]

        ref_fn = f'{audiopath}/{session}_{ref_chan}.wav'
        target_fn = f'{audiopath}/{session}_{target_chan}.CH1.wav'

        new_results = align_channels(ref_fn,
                                     target_fn,
                                     analysis_times,
                                     search_duration,
                                     template_duration,
                                     missing=missing)
        new_results['lag'] = down_mix_lags(new_results)
        merge_results(results, new_results)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sessions",
                        help="list of sessions to process (defaults to all)")
    parser.add_argument("align_path", help="path for the alignment pickle files")
    parser.add_argument("in_path", help="path for the input transcription file")
    parser.add_argument("out_path", help="path for the output transcription files")
    args = parser.parse_args()
    if args.sessions is None:
        sessions = tu.chime_data()
    else:
        sessions = args.sessions.split()

    for session in sessions:
        try:
            print(session)
            align_transcription(session, args.align_path, args.in_path, args.out_path)
        except Exception:
            traceback.print_exc()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sessions",
                        help="list of sessions to process (defaults to all)")
    parser.add_argument("--save",
                        help="path of directory in which to save plots")
    parser.add_argument("--no_plot",
                        action='store_true',
                        help="suppress display of plot (defaults to false)")
    parser.add_argument("path", help="path to alignment data")
    args = parser.parse_args()
    if args.sessions is None:
        sessions = tu.chime_data()
    else:
        sessions = args.sessions.split()

    for session in sessions:
        print(session)
        try:
            plot_session(session, args.path, not args.no_plot, args.save)
        except Exception:
            traceback.print_exc()
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
align_transcription.py

Apply the alignments to the transcription file
"""

import pickle
import argparse
import traceback
import numpy as np

import transcript_utils as tu

CHIME_DATA = tu.chime_data()
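# The session metadata returned by tu.chime_data() is used throughout this
# codebase roughly as sketched below (structure inferred from usage; the
# values shown are illustrative, not real entries):
#
#   {
#       'S02': {
#           'dataset': 'dev',                # train / dev / eval split
#           'pids': ['P05', 'P06', ...],     # binaural recorders; pids[0] is
#                                            # the alignment reference
#           'kinects': ['U01', 'U02', ...],  # kinect device names
#           'missing': {'U03': (..., offset)},   # optional; two-element
#                                                # lost-audio record per device
#       },
#       ...
#   }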


def correct_time_linear(time_to_correct, linear_fit):
    """Adjust the time using a linear fit of time to lag."""
    corrected = time_to_correct - linear_fit * time_to_correct
    return round_to_sample(corrected)
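# For example, with a drift slope linear_fit = 1e-5 (10 ppm), a nominal time
# of 1000.0 s is corrected to 1000.0 - 1e-5 * 1000.0 = 999.99 s before being
# rounded to the nearest sample by round_to_sample().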


def correct_time_mapping(time_to_correct, linear_fit, times, lags):
    """Adjust the time using a linear fit + a mapping from time to lag."""
    corrected = np.interp(time_to_correct + linear_fit * time_to_correct,
                          np.array(times) + lags, np.array(times))
    return round_to_sample(corrected)
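# For example, with times = [0, 10] and lags = [0.0, 0.1], a drift-adjusted
# time of 5.05 s sits exactly halfway along the (times + lags) axis
# [0.0, 10.1], so np.interp maps it back to 5.0 s on the original time axis.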

def align_session(session, audiopath, outpath, chans=None):
    """Align all channels within a given session."""
    chime_data = tu.chime_data()

    # The first binaural recorder is taken as the reference
    ref_chan = chime_data[session]['pids'][0]

    # If chans not specified then use all channels available
    if chans is None:
        pids = chime_data[session]['pids']
        kinects = chime_data[session]['kinects']
        chans = pids[1:] + kinects

    all_results = dict()  # Empty dictionary for storing results

    for target_chan in chans:
        print(target_chan)

        # For dealing with channels with big missing audio segments
        missing = None
        if ('missing' in chime_data[session]
                and target_chan in chime_data[session]['missing']):
            missing = chime_data[session]['missing'][target_chan]

        # Parameters for alignment depend on whether target is
        # a binaural mic ('P') or a kinect mic
        if target_chan[0] == 'P':
            search_duration = BINAURAL_SEARCH_DURATION
            template_duration = BINAURAL_TEMPLATE_DURATION
            alignment_resolution = BINAURAL_RESOLUTION
            target_chan_name = target_chan
        else:
            search_duration = KINECT_SEARCH_DURATION
            template_duration = KINECT_TEMPLATE_DURATION
            alignment_resolution = KINECT_RESOLUTION
            target_chan_name = target_chan + '.CH1'

        # Wrap the alignment in a try-except block so that processing can
        # continue if a channel fails. This shouldn't happen unless there
        # is some problem reading the audio data.
        try:
            offset = 0
            if missing is not None:
                _, offset = missing

            ref_fn = f'{audiopath}/{session}_{ref_chan}.wav'
            target_fn = f'{audiopath}/{session}_{target_chan_name}.wav'

            # Will analyse the alignment offset at regular intervals
            session_duration = int(
                min(wavfile_duration(ref_fn) - offset,
                    wavfile_duration(target_fn))
                - template_duration - search_duration)
            analysis_times = range(alignment_resolution, session_duration,
                                   alignment_resolution)

            # Run the alignment code and store results in dictionary
            all_results[target_chan] = \
                align_channels(ref_fn,
                               target_fn,
                               analysis_times,
                               search_duration,
                               template_duration,
                               missing=missing)
        except Exception:
            traceback.print_exc()

    with open(f'{outpath}/align.{session}.p', 'wb') as f:
        pickle.dump(all_results, f)
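# To inspect a saved alignment afterwards (the session name is illustrative):
#
#   with open(f'{outpath}/align.S02.p', 'rb') as f:
#       all_results = pickle.load(f)
#   # each per-channel dict holds 'times' and 'lag' arrays, plus 'linear_fit'
#   # for binaural channels once the refinement pass has run
#   print(all_results['U01']['lag'][:10])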