def collect_swc_features(processed_sub_dir, collection_name, word_list, feature_func=audio2lmfe):
    """Run ``swc2features`` over ``corpus_dir`` for a selected set of words.

    The output name is ``<processed_data_dir>/<processed_sub_dir>/SWC_<collection_name>``.
    The sub-directory is handed to ``util.ensure_exists`` before anything is written.

    Args:
        processed_sub_dir: Sub-directory of ``processed_data_dir`` receiving the output.
        collection_name: Suffix appended to ``SWC_`` to form the output name.
        word_list: Passed to ``select_words`` as ``words_to_keep``.
        feature_func: Feature extraction callable forwarded to ``swc2features``.
    """
    target_dir = os.path.join(processed_data_dir, processed_sub_dir)
    util.ensure_exists(target_dir)

    keep_listed_words = partial(select_words, words_to_keep=word_list)
    collection_path = os.path.join(target_dir, 'SWC_{0}'.format(collection_name))
    swc2features(collection_path, corpus_dir, feature_func,
                 word_filter=keep_listed_words, verbose=True)
def print_synonym_stats(ratings):
    """Print and export every rating that was answered with a synonym.

    For each rating for which ``response_missing`` is false and
    ``response_with_synonym`` is true, a ``<index>: <word> -> <synonym>`` line
    is printed and the stretch ``p_delay .. p_delay + duration`` of the first
    channel of the recording is written into the ``test`` directory under the
    zero-padded rating index. Finally the synonym count and its share of all
    ratings are printed.

    Args:
        ratings: Sized sequence of rating objects exposing ``word``,
            ``synonym``, ``wav_path``, ``p_delay`` and ``duration``.
    """
    synonyms = 0
    util.ensure_exists('test')
    for i, rating in enumerate(ratings):
        if response_missing(rating) or not response_with_synonym(rating):
            continue
        synonyms += 1
        print("{i:04d}: {0} -> {1}".format(rating.word, rating.synonym, i=i))
        # always_2d keeps the channel axis so that mono files can be indexed with [:, 0] too
        data, rate = soundfile.read(rating.wav_path, always_2d=True)
        first_sample = time2sample(rating.p_delay, rate)
        last_sample = time2sample(rating.p_delay + rating.duration, rate)
        segment = data[first_sample:last_sample, 0]
        sound_util.write_array_to_wav(
            os.path.join('test', "{i:04d}".format(i=i)), segment, rate)

    # An empty rating list has no meaningful share; report 0.0 instead of
    # failing with ZeroDivisionError.
    total = len(ratings)
    share = synonyms / total if total else 0.0
    print(synonyms, share)
def net_annotate_vad_segmentation(run_dir, run_epoch, vad_aggressiveness, ratings_file, load_cleaned=True,
                                  skip_starting=0):
    """Annotate every segment yielded by ``segment_generator`` with the network and pickle the result.

    The network is loaded with ``load_net`` for ``run_dir``/``run_epoch``, and the
    ratings and training vectors with ``load_common_rating_data``. Each segment
    yielded by ``segment_generator`` gets one ``NetAnnotatedSegment`` appended to
    the list of its rating; segments without audio get a placeholder entry. The
    nested list is pickled into the ``output`` directory, and ``net.beta`` is
    saved next to it when the network has that attribute.

    Args:
        run_dir: Directory of the training run; its basename is used in file names.
        run_epoch: Epoch passed to ``load_net`` and used in file names.
        vad_aggressiveness: Forwarded to ``segment_generator``.
        ratings_file: Ratings file passed to ``load_common_rating_data``.
        load_cleaned: Forwarded to ``segment_generator``; also selects the file name tag.
        skip_starting: Forwarded to ``segment_generator``; also part of the file name.
    """
    loaded = load_net(run_dir, epoch=run_epoch, logger=None, train=False)
    net, config, _, _, _, _, train_scp, feature_mean, feature_std, word2id, mean_sub, var_norm = loaded
    ratings, vecs_train, word_idxs_train = load_common_rating_data(ratings_file, run_dir, run_epoch)

    util.ensure_exists('output')
    run_name = os.path.basename(run_dir)
    vad_tag = '-noise_vad' if load_cleaned else 'raw_vad'
    output_file = 'output/{0}_epoch_{1}_{2}_full_{3}_skip{4:.3f}.netrating'.format(
        run_name, run_epoch, os.path.basename(ratings_file), vad_tag, skip_starting)

    # One independent list per rating; [[]] * n would alias a single list object.
    output: List[List[NetAnnotatedSegment]] = [[] for _ in range(len(ratings))]

    segments = segment_generator(ratings, vad_aggressiveness,
                                 load_cleaned=load_cleaned, skip_starting=skip_starting)
    for rating, rating_idx, segment_idx, start_sec, end_sec, segment_audio, sample_rate, num_segments in segments:
        if segment_audio.shape[0] != 0:
            dists, features = evaluate_segment_stepwise(
                net, config, feature_mean, feature_std, rating, segment_audio,
                sample_rate, vecs_train, word_idxs_train, mean_sub, var_norm)
            annotated = NetAnnotatedSegment(
                start_sec, end_sec, segment_idx, dists, features[:, 0, :].mean(axis=1),
                rating.word, rating.vp, rating.date, rating.wav_path)
        else:
            # No audio in this segment: store a placeholder entry.
            annotated = NetAnnotatedSegment(
                0, 0, 0, np.array([1000]), np.array([1000]),
                rating.word, rating.vp, rating.date, rating.wav_path)
        output[rating_idx].append(annotated)

        if segment_idx == num_segments - 1:
            print('Finished rating {0}'.format(rating_idx))

    save_pickled(output, output_file)

    if hasattr(net, 'beta'):
        beta = net.beta.detach().cpu().numpy()
        beta_path = "output/{0}_epoch_{1}.beta".format(run_name, run_epoch)
        with open(beta_path, 'wb') as f:
            np.save(f, beta)
def _process_emu_db(db, sub_dir, name, words):
    """Process one EMU database into ``<processed_data_dir>/<sub_dir>/<db>_<name>``.

    Reads ``debug`` and ``feature_func`` from the enclosing scope: with ``debug``
    set, ``emu2wav_segments`` is called; otherwise ``emu2features`` is called with
    ``feature_func``. In both cases the word filter is ``select_words`` with
    ``words_to_keep=words``.

    Args:
        db: Database name; also its directory name and ``<db>.rds`` stem under ``raw_data_dir``.
        sub_dir: Sub-directory of ``processed_data_dir`` receiving the output.
        name: Suffix of the output name.
        words: Words handed to ``select_words``.
    """
    target_dir = os.path.join(processed_data_dir, sub_dir)
    util.ensure_exists(target_dir)

    output_base = os.path.join(target_dir, '{0}_{1}'.format(db, name))
    db_path = os.path.join(raw_data_dir, db)
    seq_rds_path = os.path.join(raw_data_dir, '{0}.rds'.format(db))
    keep_listed_words = partial(select_words, words_to_keep=words)

    if not debug:
        emu2features(output_base, db, db_path, seq_rds_path, feature_func,
                     keep_listed_words, verbose=True)
        return
    emu2wav_segments(output_base, db, db_path, seq_rds_path,
                     keep_listed_words, verbose=True)
def setup_training_run(model_name):
    """Prepare directories, config copy and logger for a new training run.

    The run is named ``<model_name>_<k>_<dd_mm_YYYY>``, where ``k`` is the number
    of directories already present in ``awe_runs_dir`` whose name contains
    ``model_name``. The run directory and its ``checkpoints`` sub-directory are
    passed to ``util.ensure_exists``, the config file given on the command line
    is copied into the run directory as ``conf.ini``, and all parsed arguments
    are logged.

    Optional training settings are read from ``config.general_training``. Each
    of them, and the section itself, may be absent, in which case the default
    is used: ``use_gru=False``, ``noise_multiplier=0``, ``noise_prob=0``,
    ``mean_subtraction=True``, ``variance_normalization=False``.

    Args:
        model_name: Prefix of the run name and name of the logger.

    Returns:
        Tuple ``(args, config, logger, checkpoint_dir, log_dir, use_gru,
        noise_mult, noise_prob, mean_sub, var_norm)``.
    """
    args = parse_training_args()
    config = Settings(args.config)

    util.ensure_exists(awe_runs_dir)
    previous_runs = [path for path in os.listdir(awe_runs_dir)
                     if os.path.isdir(os.path.join(awe_runs_dir, path)) and model_name in path]
    run_name = '{0}_{1}_{2}'.format(
        model_name,
        len(previous_runs),
        datetime.datetime.now().strftime('%d_%m_%Y')
    )

    log_dir = os.path.join(awe_runs_dir, run_name)
    checkpoint_dir = os.path.join(log_dir, 'checkpoints')
    util.ensure_exists(log_dir)
    util.ensure_exists(checkpoint_dir)

    copyfile(args.config, os.path.join(log_dir, 'conf.ini'))

    logger = create_logger(model_name, os.path.join(log_dir, 'log'))
    logger.info('Running with args:')
    for var in vars(args):
        logger.info('{0}: {1}'.format(var, getattr(args, var)))

    # getattr on None falls through to the default, so a config without a
    # general_training section yields the defaults instead of AttributeError.
    general = getattr(config, 'general_training', None)
    use_gru = getattr(general, 'use_gru', False)
    noise_mult = getattr(general, 'noise_multiplier', 0)
    noise_prob = getattr(general, 'noise_prob', 0)
    mean_sub = getattr(general, 'mean_subtraction', True)
    var_norm = getattr(general, 'variance_normalization', False)

    return args, config, logger, checkpoint_dir, log_dir, use_gru, noise_mult, noise_prob, mean_sub, var_norm
def patient_adaptation_test_on_dev(args):
    """Run the per-session adaptation experiment for one patient on dev ratings.

    For every session (distinct ``date``) of the patient, the ratings of all
    *other* sessions are annotated with ``net_annotate_sliding_window_framewise``:
    once with the default reference vectors, and once per adaptation method with
    reference vectors built from the training embeddings and the embeddings
    collected for that session. All results are saved into
    ``patient_adaptation_test_output_<patient>``.

    Args:
        args: Object with attributes ``run_dir``, ``run_epoch``, ``ratings_file``
            and ``patient``.

    Raises:
        RuntimeError: If ``ratings_file`` does not contain ``_dev_``. The check
            runs before any directory is created or data is loaded.
    """
    run_dir = args.run_dir
    run_epoch = args.run_epoch
    ratings_file = args.ratings_file
    patient = args.patient

    # Validate first so that an unsupported ratings file leaves no empty
    # output directory behind.
    if '_dev_' not in ratings_file:
        raise RuntimeError(
            'Only ratings available in the dev dataset currently supported')

    out_dir = 'patient_adaptation_test_output_{0}'.format(patient)
    util.ensure_exists(out_dir)

    train_epoch_embeddings, dev_epoch_embeddings, _ = \
        get_or_generate_embeddings(run_dir, run_epoch, dev_needed=True, test_needed=False)
    _, _, vecs_train, _, word_idxs_train = load_embeddings(
        train_epoch_embeddings[run_epoch])
    _, _, vecs_dev, _, _, keys_dev = load_embeddings(
        dev_epoch_embeddings[run_epoch], return_keys=True)

    all_ratings: List[SnodgrassWordRating] = load_pickled(
        os.path.join(processed_data_dir, ratings_file))
    ratings_patient = [r for r in all_ratings if r.vp == patient]
    all_sessions = np.unique([r.date for r in ratings_patient])
    print('{0} sessions for patient {1}'.format(len(all_sessions), patient))

    adaptation_functions = {
        'only_new_session': adaptation_only_session_examples_if_available,
        'add_new_session': adaptation_add_session_examples,
        'average_with_new_session': adaptation_average_with_session_examples
    }

    for session in all_sessions:
        # Evaluate on every session except the one providing the adaptation data.
        fold_ratings = [r for r in ratings_patient if r.date != session]
        sessions_vecs, session_word_idxs = collect_session_embeddings_data(
            session, vecs_dev, keys_dev)

        # Run without adaptation: default reference vectors.
        ratings_name = '{0}_patient_{1}_except_{2}'.format(
            os.path.basename(ratings_file), patient, session)
        net_annotate_sliding_window_framewise(
            run_dir=run_dir, run_epoch=run_epoch,
            ratings_file_or_object=fold_ratings, skip_starting=0.3,
            save=True, ratings_name=ratings_name, output_dir=out_dir)

        for adaptation_type, method in adaptation_functions.items():
            reference_vecs, reference_word_idxs = method(
                vecs_train, word_idxs_train, sessions_vecs, session_word_idxs)
            ratings_name = '{0}_patient_{1}_except_{2}_adaptation_{3}'.format(
                os.path.basename(ratings_file), patient, session, adaptation_type)
            net_annotate_sliding_window_framewise(
                run_dir=run_dir, run_epoch=run_epoch,
                ratings_file_or_object=fold_ratings, skip_starting=0.3,
                reference_vecs_override=reference_vecs,
                reference_word_idxs_override=reference_word_idxs,
                save=True, ratings_name=ratings_name, output_dir=out_dir)
def net_annotate_sliding_window_framewise(run_dir, run_epoch, ratings_file_or_object, skip_starting=0,
                                          reference_vecs_override=None, reference_word_idxs_override=None,
                                          save=True, ratings_name=None, output_dir=None, plot_mode=False):
    """Annotate each rated recording with the best word segment found by a sliding window.

    The network is loaded with ``load_net``, the ratings and training vectors with
    ``load_common_rating_data`` and the per-word length table with
    ``load_pickled(scp2word_lengths_file(train_scp))``. For every recording yielded by
    ``plain_audio_generator`` features are computed once for the whole audio, windows
    of ``time2frames(mean_length / 2)`` frames are cut out at the positions given by
    ``subsegment_starts``, the stacked windows are scored by
    ``evaluate_stacked_features`` and ``select_best_segmentation`` picks one
    segmentation. That segment is then evaluated with ``evaluate_stepwise`` and stored
    as the single ``NetAnnotatedSegment`` of the rating. Recordings with no audio get
    a placeholder segment instead.

    Args:
        run_dir: Training run directory; its basename is used in output file names.
        run_epoch: Epoch passed to ``load_net`` and used in output file names.
        ratings_file_or_object: Forwarded to ``load_common_rating_data``. When
            ``save`` is true and ``ratings_name`` is None it is also passed to
            ``os.path.basename``, so it has to be a path in that case.
        skip_starting: Forwarded to ``plain_audio_generator`` and
            ``select_best_segmentation``; also part of the output file name.
            Presumably seconds skipped at the start of each recording (TODO confirm).
        reference_vecs_override: Used instead of the training vectors when not None.
        reference_word_idxs_override: Used instead of the training word indices when not None.
        save: Pickle the output (and save ``net.beta`` if present) unless ``plot_mode`` is set.
        ratings_name: Name used in the output file name; defaults to the ratings file basename.
        output_dir: Output directory; defaults to ``'output'`` when ``save`` is true.
        plot_mode: Save distance plots to ``plots_output/`` for ratings that neither
            miss a response nor use a synonym, and stop once ``rating_idx >= 10``.
            Nothing is pickled in this mode.

    Returns:
        Tuple ``(output, beta)``: ``output`` holds one list of ``NetAnnotatedSegment``
        per rating, ``beta`` is ``net.beta`` as a numpy array or None if the network
        has no ``beta`` attribute.
    """
    time_start = time.time()

    if save:
        if ratings_name is None:
            ratings_name = os.path.basename(ratings_file_or_object)

        if output_dir is None:
            output_dir = 'output'
        util.ensure_exists(output_dir)

        output_file = '{0}_epoch_{1}_{2}_full{3}_skip{4:.3f}.netrating_faster' \
            .format(os.path.basename(run_dir), run_epoch, ratings_name, 'own_segmentation', skip_starting)
        output_file = os.path.join(output_dir, output_file)

    net, config, _, _, _, _, train_scp, feature_mean, feature_std, word2id, mean_sub, var_norm = \
        load_net(run_dir, epoch=run_epoch, logger=None, train=False)

    ratings, vecs_train, word_idxs_train = load_common_rating_data(
        ratings_file_or_object, run_dir, run_epoch)
    # maps a word to a (mean_length, max_length) pair, see the unpacking in the loop below
    word_lengths = load_pickled(scp2word_lengths_file(train_scp))

    # the caller may substitute its own reference examples for the training ones
    reference_vecs = reference_vecs_override if reference_vecs_override is not None else vecs_train
    reference_word_idxs = reference_word_idxs_override if reference_word_idxs_override is not None else word_idxs_train

    # one independent list per rating
    output: List[List[NetAnnotatedSegment]] = [[] for _ in range(len(ratings))]
    for rating, rating_idx, start_sec, end_sec, audio, sample_rate in \
            plain_audio_generator(ratings, skip_starting=skip_starting):
        if audio.shape[0] == 0:
            # no audio for this rating: store a placeholder segment
            output[rating_idx].append(
                NetAnnotatedSegment(0, 0, 0, np.array([1000]), np.array([1000]), rating.word, rating.vp,
                                    rating.date, rating.wav_path))
        else:
            mean_length, max_length = word_lengths[rating.word]
            # passed to subsegment_starts as the spacing between window starts
            spacing_frames = 5
            # TODO: half of mean duration may not be the best choice for every word
            duration_frames = time2frames(mean_length / 2)

            full_features = new_features(audio, sample_rate, feature_mean, feature_std, mean_sub, var_norm)
            starts = subsegment_starts(full_features.shape[0], duration_frames, spacing_frames)

            # much faster than segmenting first and then getting the features
            # of each small segment
            features = [(full_features[s:s + duration_frames]) for s in starts]
            num_segments = len(features)

            subsegment_portions = [0.75, 0.5, 0.25]
            # one entry per portion plus one more (presumably the full window - TODO confirm)
            size_multiplier = len(subsegment_portions) + 1
            features_plus_extra, lengths = stack_features(
                features, num_segments, duration_frames, subsegment_portions, size_multiplier)
            stacked_dists = evaluate_stacked_features(
                net, config, features_plus_extra, lengths, rating, reference_vecs,
                reference_word_idxs, num_segments, size_multiplier)

            bs_idx, best_start, best_end, best_duration = \
                select_best_segmentation(stacked_dists, starts, end_sec, max_length, skip_starting)
            best_duration_frames = time2frames(best_duration)

            def plot_dists(savefig=False):
                # Plot the last column of stacked_dists over the window start times,
                # together with the rated word boundaries (when the rating has a
                # non-synonym response) and the guessed word start.
                import matplotlib.pyplot as plt
                from matplotlib import rc
                rc('text', usetex=True)
                rc('font', size=12)
                rc('legend', fontsize=12)
                font = {'family': 'serif', 'serif': ['cmr10']}
                rc('font', **font)

                if not response_missing(rating) and not response_with_synonym(rating):
                    p_delay_adjusted = rating.p_delay
                    plt.axvline(p_delay_adjusted, color='xkcd:bright lavender', dashes=[5, 5], zorder=2,
                                label='Word start')
                    plt.axvline(p_delay_adjusted + rating.duration, color='xkcd:light grass green',
                                dashes=[1, 1], zorder=2, label='Word end')

                # window starts are shifted by skip_starting on the time axis
                plt.plot([frames2time(x) + skip_starting for x in starts], stacked_dists[:, -1], zorder=1,
                         color='xkcd:cobalt blue')
                plt.axvline(best_start, color='xkcd:lightish red', dashes=[1, 0], zorder=2,
                            label='Word start guess')
                plt.xlabel('Time (s)')
                plt.ylabel('Avg distance to reference examples')
                plt.legend()
                if savefig:
                    # NOTE(review): the plots_output directory is not created here - confirm it exists
                    plt.savefig(
                        'plots_output/recording_dists_{0:04}.pdf'.format(rating_idx),
                        dpi=300, bbox_inches='tight', pad_inches=0)
                else:
                    plt.show()
                plt.clf()

            if plot_mode:
                if not response_missing(rating) and not response_with_synonym(rating):
                    plot_dists(savefig=True)
                # plot mode stops once rating index 10 is reached
                if rating_idx >= 10:
                    break

            # evaluate the selected segment on its own and keep it as the rating's only segment
            dists_best_guess, features_best_guess = \
                evaluate_stepwise(net, config,
                                  full_features[starts[bs_idx]:starts[bs_idx] + best_duration_frames], rating,
                                  reference_vecs, reference_word_idxs)

            output[rating_idx].append(
                NetAnnotatedSegment(best_start, best_end, 0, dists_best_guess,
                                    features_best_guess[:, 0, :].mean(axis=1), rating.word, rating.vp,
                                    rating.date, rating.wav_path))

        print('Finished rating number {0}'.format(rating_idx + 1))

    if save and not plot_mode:
        save_pickled(output, output_file)

        if hasattr(net, 'beta'):
            beta = net.beta.detach().cpu().numpy()
            beta_out_file = os.path.join(
                output_dir, "{0}_epoch_{1}.beta".format(os.path.basename(run_dir), run_epoch))
            with open(beta_out_file, 'wb') as f:
                np.save(f, beta)

    print('Elapsed sec: {0:.3f}'.format(time.time() - time_start))

    return output, net.beta.detach().cpu().numpy() if hasattr(net, 'beta') else None
def _process_snodgrass(input_path, sub_dir, feature_func=audio2lmfe):
    """Run ``snodgrass2features`` into ``<processed_data_dir>/<sub_dir>/snodgrass_data_v3``.

    Args:
        input_path: Input location forwarded to ``snodgrass2features``.
        sub_dir: Sub-directory of ``processed_data_dir`` receiving the output;
            it is handed to ``util.ensure_exists`` first.
        feature_func: Feature extraction callable forwarded to ``snodgrass2features``.
    """
    target_dir = os.path.join(processed_data_dir, sub_dir)
    util.ensure_exists(target_dir)
    snodgrass2features(
        input_path,
        os.path.join(target_dir, 'snodgrass_data_v3'),
        feature_func,
        compress=False,
        verbose=True,
    )
def update_ratings_with_fixed_audio(ratings_file):
    """Collect the audio of each rating into one file and rewrite the ratings to point at it.

    Collects the sound data for words where the response spills to the next
    recording (``wav_path`` is then a list of files, which are concatenated), and
    also resamples the first channel to 32000 Hz for VAD (which can only work with
    sampling rates in multiples of 8000). Both files are written to
    ``<processed_data_dir>/<basename of ratings_file>_audio`` as
    ``<word>_<order>_<vp>_<date>.wav`` and ``..._32.wav``.

    The ratings file itself is overwritten in place with ratings whose
    ``wav_path`` is the new single-file path.

    After resampling, the audio for VAD can be additionally cleaned with sox:
    find . -name '*_32.wav' | parallel 'sox {} {.}-clean.wav noisered ~/noise.profile 0.2'

    Args:
        ratings_file: Name of the pickled ratings file, relative to ``processed_data_dir``.

    Raises:
        SystemExit: If the recordings belonging to one rating have different sampling rates.
    """
    fixed_audio_output_dir = os.path.join(
        processed_data_dir, os.path.basename(ratings_file) + '_audio')
    util.ensure_exists(fixed_audio_output_dir)

    ratings_path = os.path.join(processed_data_dir, ratings_file)
    # NOTE: pickle.load executes arbitrary code; only use on ratings files produced by this project.
    with open(ratings_path, 'rb') as f:
        ratings: List[SnodgrassWordRating] = pickle.load(f)

    ratings_updated = []
    for rating in ratings:
        new_name = '{0}_{1}_{2}_{3}.wav'.format(rating.word, rating.order, rating.vp, rating.date)
        new_name_32 = '{0}_{1}_{2}_{3}_32.wav'.format(rating.word, rating.order, rating.vp, rating.date)
        target_path = os.path.join(fixed_audio_output_dir, new_name)
        target_path_32 = os.path.join(fixed_audio_output_dir, new_name_32)

        if not isinstance(rating.wav_path, list):
            copy(rating.wav_path, target_path)
            # always_2d keeps the channel axis, so [:, 0] below also works for mono files
            audio, rate = soundfile.read(rating.wav_path, always_2d=True)
        else:
            chunks = []
            rates = []
            for wav_path in rating.wav_path:
                chunk, chunk_rate = soundfile.read(wav_path, always_2d=True)
                chunks.append(chunk)
                rates.append(chunk_rate)

            if not all(x == rates[0] for x in rates):
                print('Unequal sampling rates for snodgrass files not supported')
                sys.exit(-1)

            rate = rates[0]
            audio = np.concatenate(chunks, axis=0)
            soundfile.write(target_path, audio, rate)

        # VAD copy: first channel only, resampled to 32 kHz
        data_32 = resampy.resample(audio[:, 0], rate, 32000, filter='kaiser_best')
        soundfile.write(target_path_32, data_32, 32000)

        ratings_updated.append(
            SnodgrassWordRating(rating.word, rating.order, rating.p_score, rating.p_delay,
                                rating.duration, rating.synonym, rating.comment,
                                rating.vp, rating.date, target_path))

    with open(ratings_path, 'wb') as f:
        pickle.dump(ratings_updated, f, protocol=pickle.HIGHEST_PROTOCOL)
def emu2wav_segments(output_path, emu_name, emu_dir, seq_rds_path, word_filter=None, alphabetic_order=True,
                     compress=False, verbose=False):
    """Write the selected word segments of an EMU database as individual wav files.

    The segment list is read from the RDS file at ``seq_rds_path`` and sorted by
    label. It is then reduced to the labels returned by ``word_filter``
    (``basic_word_filter`` when None). Segments rejected by
    ``reject_by_duration_sec``, recorded below 16000 Hz, or rejected by
    ``reject_by_frame_count`` are skipped. Every remaining segment is written to
    ``output_path`` under the key ``<word>_<row index>_<emu_name prefix>_<session>_<bundle>``.
    The loop stops once more than 1000 segments have been written.

    Args:
        output_path: Directory receiving the segment files.
        emu_name: Database name; the part before the first ``_`` goes into the keys.
        emu_dir: Root directory holding ``<session>_ses/<bundle>_bndl/<bundle>.wav``.
        seq_rds_path: Path to the RDS segment list; the function returns early if it is missing.
        word_filter: Callable mapping the list of labels to the labels to keep.
        alphabetic_order: When false, segments are re-sorted by session and bundle.
        compress: Accepted but not used by this function.
        verbose: Print a progress bar every 100 written segments and at the end.
    """
    wav_files = WavCache(maxsize=1000 * 2**20)

    if not os.path.exists(seq_rds_path):
        print('Skipping {0} - no segment list found'.format(emu_name))
        return

    all_segments = pandas2ri.ri2py(robjects.r.readRDS(seq_rds_path))
    all_segments = all_segments.sort_values(by=['labels'])

    active_filter = basic_word_filter if word_filter is None else word_filter
    kept_words = active_filter(list(all_segments['labels']))
    seq_filtered = all_segments.loc[all_segments['labels'].isin(kept_words)]

    if not alphabetic_order:
        # Session/bundle order means each wav file is visited sequentially,
        # which improves cache behavior.
        seq_filtered = seq_filtered.sort_values(by=['session', 'bundle'])

    total = len(seq_filtered.index)
    if total <= 0:
        print('No matching segments found in {0}, exiting'.format(emu_name))
        return
    print('Selected {0} segments from the {1} dataset'.format(total, emu_name))

    util.ensure_exists(output_path)

    name_prefix = emu_name.split('_')[0]
    written = 0
    for row_index, row in seq_filtered.iterrows():
        if written > 1000:
            break

        word, session, bundle = row['labels'], row['session'], row['bundle']
        key = '{0}_{1}_{2}_{3}_{4}'.format(word, row_index, name_prefix, session, bundle)

        duration_sec = (row['end'] - row['start']) / 1000
        if reject_by_duration_sec(duration_sec, emu_name, key, verbose=True):
            continue

        wav_path = os.path.join(emu_dir,
                                '{0}_ses'.format(session),
                                '{0}_bndl'.format(bundle),
                                '{0}.wav'.format(bundle))
        rate, signal = wav_files[wav_path]
        segment = signal[row['sample_start']:row['sample_end']]

        if rate < 16000:
            print('Too low sampling rate for {0}, skipping'.format(key))
            continue

        # Features are computed only to apply the frame-count check.
        if reject_by_frame_count(audio2lmfe(segment, rate), word, emu_name, key, verbose=True):
            continue

        sound_util.write_array_to_wav(os.path.join(output_path, key), segment, rate)
        written += 1

        if verbose and written % 100 == 0:
            print_progress_bar(written, total, prefix='Progress:', suffix='Complete', length=50)

    if verbose:
        print_progress_bar(total, total, prefix='Progress:', suffix='Complete', length=50)
def gen_and_save_dataset_embeddings(net, config, checkpoint, dataset, get_embeddings, embeddings_dir):
    """Build the embedding dictionary for ``dataset`` and save it to ``embeddings_dir``.

    ``embeddings_dir`` is handed to ``util.ensure_exists`` first. The dictionary
    comes from ``get_embeddings_dict`` and is stored with ``save_embeddings``
    together with ``checkpoint``.
    """
    util.ensure_exists(embeddings_dir)
    save_embeddings(
        get_embeddings_dict(net, config, checkpoint, dataset, get_embeddings),
        embeddings_dir,
        checkpoint,
    )