def main(): parser = argparse.ArgumentParser() parser.add_argument("--sessions", help="list of sessions to process (defaults to all)") parser.add_argument("--chans", help="list of channels to process (defaults to all)") parser.add_argument( "--refine", help="path to output of 1st pass (runs a refinement pass if provided)") parser.add_argument("audiopath", help="path to audio data") parser.add_argument("outpath", help="path to output alignment data") args = parser.parse_args() if args.sessions is None: sessions = tu.chime_data() else: sessions = args.sessions.split() if args.chans is None: chans = None else: chans = args.chans.split() chime_data = tu.chime_data() if args.refine: # The alignment refinement pass. for session in sessions: refine_session(session, args.audiopath, args.refine, args.outpath) else: # The initial alignment pass. for session in sessions: print(session, chans) align_session(session, args.audiopath, args.outpath, chans=chans)
def process_all_devices(session, linear_fit_data, inpath, outpath, sox_path):
    """Process all devices in a session."""
    chime_data = tu.chime_data()
    dataset = chime_data[session]['dataset']
    devices = chime_data[session]['pids'] + chime_data[session]['kinects']
    session_fits = linear_fit_data[session]
    # Note: the first pid is the reference device to which all others are
    # aligned, so it has no entry in session_fits and is skipped by the
    # check below. (Alternatively, a pad of 0 and scale of 1 could be
    # inserted into the json so that it is still processed through sox.)
    for device in devices:
        if device not in session_fits:
            print(f'WARNING: device {device} missing for session {session}')
            continue
        linear_fit = session_fits[device]
        if device[0] == 'P':
            # Process a binaural mic signal.
            name = session + '_' + device
            process_device(session, device, linear_fit, inpath, outpath,
                           dataset, name, sox_path)
        elif device[0] == 'U':
            # Process a kinect signal (one file per channel).
            for chan in [1, 2, 3, 4]:
                name = session + '_' + device + '.CH' + str(chan)
                process_device(session, device, linear_fit, inpath, outpath,
                               dataset, name, sox_path)

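# A minimal sketch of the file-naming convention assumed by the code above.
# device_names() is a hypothetical helper, and 'S02'/'P05'/'U01' are
# illustrative identifiers rather than real session metadata:
def device_names(session, device):
    """Return the audio file basenames written for one device."""
    if device[0] == 'P':
        # Binaural recorders ('P') produce a single file.
        return [f'{session}_{device}']
    # Kinects ('U') produce one file per channel, CH1..CH4.
    return [f'{session}_{device}.CH{chan}' for chan in range(1, 5)]

print(device_names('S02', 'P05'))  # -> ['S02_P05']
print(device_names('S02', 'U01'))  # -> ['S02_U01.CH1', ..., 'S02_U01.CH4']
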
def main(): parser = argparse.ArgumentParser() parser.add_argument("--sessions", help="list of sessions to process (defaults to all)") parser.add_argument("--sox_path", help="path for sox command (defaults to .)") parser.add_argument("clock_drift_data", help="json file storing clock drift data") parser.add_argument("inpath", help="path to input audio") parser.add_argument("outpath", help="path to output audio") args = parser.parse_args() if args.sox_path is None: sox_path = "." else: sox_path = args.sox_path if args.sessions is None: sessions = tu.chime_data() else: sessions = args.sessions.split() with open(args.clock_drift_data) as f: linear_fit_data = json.load(f) for session in sessions: process_all_devices(session, linear_fit_data, args.inpath, args.outpath, sox_path)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--sessions", help="list of sessions to process (defaults to all)") parser.add_argument("--clock_drift_data", help="json file storing clock drift data") parser.add_argument('--chime5', action='store_true', help="write transcript file in older CHiME-5 format") parser.add_argument("in_path", help="path for the input transcription file") parser.add_argument("out_path", help="path for the output transcription files") args = parser.parse_args() if args.sessions is None: sessions = tu.chime_data() else: sessions = args.sessions.split() print(args.chime5) with open(args.clock_drift_data) as f: linear_fit_data = json.load(f) for session in sessions: try: print(session) correct_transcription_for_clock_drift(session, linear_fit_data, args.in_path, args.out_path, chime5_mode=args.chime5) except: traceback.print_exc()
def refine_session(session, audiopath, inpath, outpath):
    """Refine alignment of all channels within a given session."""
    chime_data = tu.chime_data()
    ref = chime_data[session]['pids'][0]
    pids = chime_data[session]['pids'][1:]
    kinects = chime_data[session]['kinects']
    with open(f'{inpath}/align.{session}.p', "rb") as f:
        all_results = pickle.load(f)
    # Only refine kinects for which first-pass results exist.
    kinects = sorted(set(kinects).intersection(all_results.keys()))
    print(session)

    # Merge the results of the left and right channel alignments.
    for channel in pids + kinects:
        results = all_results[channel]
        lag = down_mix_lags(results)
        results['lag'] = scipy.signal.medfilt(lag, 9)

    # Compute the linear fit for modelling clock drift.
    for channel in pids:
        results = all_results[channel]
        results['linear_fit'] = clock_drift_linear_fit(results)

    # Refine kinect alignments, i.e. reanalyse on a finer time scale in
    # regions where big jumps in offset occur, and apply a little smoothing
    # to remove spurious estimates.
    for channel in kinects:
        results = all_results[channel]
        refine_kinect_lags(results, audiopath, session=session,
                           target_chan=channel, ref_chan=ref)
        results['lag'] = scipy.signal.medfilt(results['lag'], 7)

    with open(f'{outpath}/align.{session}.p', "wb") as f:
        pickle.dump(all_results, f)

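# To illustrate the smoothing above: a width-9 median filter discards an
# isolated spurious lag estimate while leaving the surrounding values
# untouched (toy values, not real alignment output):
import numpy as np
import scipy.signal

toy_lag = np.array([0.01] * 4 + [0.90] + [0.01] * 4)  # one bad estimate
print(scipy.signal.medfilt(toy_lag, 9))               # -> all 0.01
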
def refine_kinect_lags(results, audiopath, session, target_chan, ref_chan):
    """Refine alignment around big jumps in lag.

    The initial alignment is computed at 10 second intervals. If the
    alignment changes by a large amount (>50 ms) during a single 10 second
    step then the alignment is recomputed at a resolution of 1 second
    intervals.

    Arguments:
    results -- the alignment returned by align_channels()
    audiopath -- the directory containing the audio data
    session -- the name of the session to process (e.g. 'S10')
    target_chan -- the name of the kinect channel to process (e.g. 'U01')
    ref_chan -- the name of the reference binaural recorder (e.g. 'P34')

    Returns:
    None. The function updates the contents of results in place rather
    than returning them explicitly.
    """
    threshold = 0.05  # i.e. 50 ms
    search_duration = KINECT_SEARCH_DURATION
    template_duration = KINECT_TEMPLATE_DURATION
    chime_data = tu.chime_data()
    times = np.array(results['times'])
    lag = np.array(results['lag'])
    if len(times) != len(lag):
        # This happens for the one case where a kinect was turned off early
        # and 15 minutes of audio got lost.
        print('WARNING: missing lags')
        times = times[:len(lag)]

    # Find the analysis times at which the lag jumps by more than the
    # threshold, and build a 1 second resolution grid around each jump.
    dlag = np.diff(lag)
    jump_times = times[1:][dlag > threshold]
    analysis_times = set()
    for time in jump_times:
        analysis_times |= set(range(time - 10, time + 10))
    analysis_times = list(analysis_times)
    print(f'{len(analysis_times)} times to reanalyse')

    if len(analysis_times) > 0:
        missing = None
        if ('missing' in chime_data[session]
                and target_chan in chime_data[session]['missing']):
            missing = chime_data[session]['missing'][target_chan]
        ref_fn = f'{audiopath}/{session}_{ref_chan}.wav'
        target_fn = f'{audiopath}/{session}_{target_chan}.CH1.wav'
        new_results = align_channels(ref_fn, target_fn, analysis_times,
                                     search_duration, template_duration,
                                     missing=missing)
        new_results['lag'] = down_mix_lags(new_results)
        merge_results(results, new_results)

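# A worked toy example of the jump detection above (illustrative values
# only): a 70 ms jump between t=40 and t=50 exceeds the 50 ms threshold
# and triggers reanalysis at 1 s resolution over the surrounding window.
import numpy as np

times = np.arange(0, 100, 10)                # first-pass grid, 10 s steps
lag = np.array([0.01] * 5 + [0.08] * 5)      # lag jumps by 70 ms at t=50
jump_times = times[1:][np.diff(lag) > 0.05]  # -> array([50])
analysis_times = sorted(set().union(
    *(range(t - 10, t + 10) for t in jump_times)))
print(analysis_times)                        # -> 40, 41, ..., 59
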
def main(): parser = argparse.ArgumentParser() parser.add_argument("--sessions", help="list of sessions to process (defaults to all)") parser.add_argument("align_path", help="path for the alignment pickle files") parser.add_argument("in_path", help="path for the input transcription file") parser.add_argument("out_path", help="path for the output transcription files") args = parser.parse_args() if args.sessions is None: sessions = tu.chime_data() else: sessions = args.sessions.split() for session in sessions: try: print(session) align_transcription(session, args.align_path, args.in_path, args.out_path) except: traceback.print_exc()
def main(): parser = argparse.ArgumentParser() parser.add_argument("--sessions", help="list of sessions to process (defaults to all)") parser.add_argument("--save", help="path of directory in which to save plots") parser.add_argument("--no_plot", action='store_true', help="suppress display of plot (defaults to false)") parser.add_argument("path", help="path to alignment data") args = parser.parse_args() if args.sessions is None: sessions = tu.chime_data() else: sessions = args.sessions.split() for session in sessions: print(session) try: plot_session(session, args.path, not args.no_plot, args.save) except: traceback.print_exc()
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
align_transcription.py

Apply the alignments to the transcription file.
"""

import pickle
import argparse
import traceback

import numpy as np

import transcript_utils as tu

CHIME_DATA = tu.chime_data()


def correct_time_linear(time_to_correct, linear_fit):
    """Adjust the time using a linear fit of time to lag."""
    corrected = time_to_correct - linear_fit * time_to_correct
    return round_to_sample(corrected)


def correct_time_mapping(time_to_correct, linear_fit, times, lags):
    """Adjust the time using a linear fit plus a mapping from time to lag."""
    corrected = np.interp(time_to_correct + linear_fit * time_to_correct,
                          np.array(times) + lags, np.array(times))
    return round_to_sample(corrected)

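# A worked toy example of the interpolation in correct_time_mapping(),
# using numpy as imported above: np.interp maps a drifted time back
# through the measured (time + lag) curve onto the reference timeline.
# round_to_sample() is defined elsewhere in this file; the stand-in below,
# which rounds to the nearest sample at an assumed 16 kHz rate, is
# illustrative only.
def _round_to_sample(time, rate=16000):
    return round(time * rate) / rate

_times = [0.0, 100.0, 200.0]        # analysis times on the reference clock
_lags = np.array([0.0, 0.1, 0.2])   # measured lag grows by 1 ms per second
# A device-clock time of 150.15 s lies halfway along the mapped curve:
print(_round_to_sample(np.interp(150.15, np.array(_times) + _lags, _times)))
# -> 150.0
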
def align_session(session, audiopath, outpath, chans=None):
    """Align all channels within a given session."""
    chime_data = tu.chime_data()
    # The first binaural recorder is taken as the reference.
    ref_chan = chime_data[session]['pids'][0]
    # If chans is not specified then use all available channels.
    if chans is None:
        pids = chime_data[session]['pids']
        kinects = chime_data[session]['kinects']
        chans = pids[1:] + kinects

    all_results = dict()  # Empty dictionary for storing results.
    for target_chan in chans:
        print(target_chan)

        # For dealing with channels that have big missing audio segments.
        missing = None
        if ('missing' in chime_data[session]
                and target_chan in chime_data[session]['missing']):
            missing = chime_data[session]['missing'][target_chan]

        # Parameters for alignment depend on whether the target is a
        # binaural mic ('P') or a kinect mic ('U').
        if target_chan[0] == 'P':
            search_duration = BINAURAL_SEARCH_DURATION
            template_duration = BINAURAL_TEMPLATE_DURATION
            alignment_resolution = BINAURAL_RESOLUTION
            target_chan_name = target_chan
        else:
            search_duration = KINECT_SEARCH_DURATION
            template_duration = KINECT_TEMPLATE_DURATION
            alignment_resolution = KINECT_RESOLUTION
            target_chan_name = target_chan + '.CH1'

        # Wrapped in a try-except block so that processing can continue if
        # a channel fails. This shouldn't happen unless there is some
        # problem reading the audio data.
        try:
            offset = 0
            if missing is not None:
                _, offset = missing
            ref_fn = f'{audiopath}/{session}_{ref_chan}.wav'
            target_fn = f'{audiopath}/{session}_{target_chan_name}.wav'
            # The alignment offset is analysed at regular intervals.
            session_duration = int(
                min(wavfile_duration(ref_fn) - offset,
                    wavfile_duration(target_fn))
                - template_duration - search_duration)
            analysis_times = range(alignment_resolution, session_duration,
                                   alignment_resolution)
            # Run the alignment code and store the results.
            all_results[target_chan] = \
                align_channels(ref_fn, target_fn, analysis_times,
                               search_duration, template_duration,
                               missing=missing)
        except Exception:
            traceback.print_exc()

    with open(f'{outpath}/align.{session}.p', "wb") as f:
        pickle.dump(all_results, f)