def test_extract_features(utterances_index, features):
    """Extraction sanity checks: with/without pitch, and on a sub-segment."""
    # baseline: no CMVN, no pitch
    config = pipeline.get_default_config(
        features, with_cmvn=False, with_pitch=False)
    collection = pipeline.extract_features(config, utterances_index)
    baseline = collection[utterances_index[0][0]]
    assert baseline.is_valid()
    assert baseline.shape[0] == 140
    assert baseline.dtype == np.float32

    # pitch enabled: 3 extra columns, same number of frames
    config = pipeline.get_default_config(
        features, with_cmvn=False, with_pitch=True)
    collection = pipeline.extract_features(config, utterances_index)
    pitched = collection[utterances_index[0][0]]
    assert pitched.is_valid()
    assert pitched.shape[0] == 140
    assert pitched.shape[1] == baseline.shape[1] + 3

    # extraction restricted to a 1 second segment: fewer frames, same columns
    utterances_index = [('u1', utterances_index[0][1], 0, 1)]
    config = pipeline.get_default_config(
        features, with_cmvn=False, with_pitch=False)
    collection = pipeline.extract_features(config, utterances_index)
    segment = collection[utterances_index[0][0]]
    assert segment.is_valid()
    assert segment.shape[0] == 98
    assert segment.shape[1] == baseline.shape[1]
def test_cmvn(utterances_index, by_speaker, with_vad):
    """MFCC with CMVN yields valid features of 140 frames and 13 columns,
    whatever the by_speaker / with_vad options."""
    config = pipeline.get_default_config(
        'mfcc', with_cmvn=True, with_pitch=False, with_delta=False)
    config['cmvn']['by_speaker'] = by_speaker
    config['cmvn']['with_vad'] = with_vad

    extracted = pipeline.extract_features(config, utterances_index)
    first = extracted[utterances_index[0][0]]
    assert first.is_valid()
    assert first.shape[0] == 140
    assert first.shape[1] == 13
def command_extract(args):
    """Entry point of the 'extract' command.

    Reads a pipeline configuration and an utterances index from the
    parsed command-line `args`, runs the features extraction pipeline
    and saves the resulting features to `args.output_file`. On any
    invalid argument the error is logged and the function returns
    without raising.

    """
    # setup the logger (level given by -q/-v arguments)
    if args.quiet:
        log = utils.null_logger()
    else:
        if args.verbose == 0:
            level = 'warning'
        elif args.verbose == 1:
            level = 'info'
        else:  # verbose >= 2
            level = 'debug'
        log = utils.get_logger(name='speech-features', level=level)

    # forward the initialized log to shennong
    utils._logger = log

    # make sure the output file is not already existing and have a
    # valid extension
    output_file = args.output_file
    if os.path.exists(output_file):
        log.error('output file already exist: %s', output_file)
        return
    output_ext = os.path.splitext(output_file)[1]
    if output_ext not in supported_extensions().keys():
        log.error(
            'output file has an unsupported extension "%s", must be in %s',
            output_ext, ", ".join(supported_extensions().keys()))
        return

    # make sure the input config and wavs_index exists
    for filename in (args.config, args.utts_index):
        if not os.path.exists(filename):
            log.error('input file not found: %s', filename)
            # BUGFIX: abort here, the original code kept going and
            # crashed below when opening the missing file
            return

    # read the utterances file as a list of lists, ignore empty lines
    # in the file (use a context manager so the file is closed)
    with open(args.utts_index, 'r') as utts:
        utterances = [
            utt.split(' ')
            for utt in (line.strip() for line in utts) if utt]

    # run the pipeline
    features = pipeline.extract_features(
        args.config, utterances, njobs=args.njobs, log=log)

    # save the features
    log.info('saving the features to %s', output_file)
    features.save(output_file)
def main():
    """Extract speech features for a whole corpus.

    Parses the command line, loads the utterances of the selected
    corpus from `<data_directory>/<corpus>.utts`, runs the extraction
    pipeline and saves the features as a h5f file in the 'features'
    subdirectory of the data directory.

    Raises
    ------
    ValueError if the data directory or the configuration file does
        not exist.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('data_directory', help='input/output data directory')
    parser.add_argument('config_file', help='YAML configuration file')
    parser.add_argument(
        'corpus', choices=['english', 'xitsonga'], help='corpus to process')
    parser.add_argument(
        '-j', '--njobs', type=int, default=4, metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not os.path.isdir(data_directory):
        raise ValueError(f'directory not found: {data_directory}')

    config = args.config_file
    if not os.path.isfile(config):
        raise ValueError(f'file not found: {config}')

    # exist_ok replaces the former try/except on FileExistsError
    os.makedirs(os.path.join(data_directory, 'features'), exist_ok=True)

    log = get_logger(level='debug' if args.verbose else 'info')

    # load input utterances. BUGFIX: strip the trailing newline from
    # each line (the last field was previously read as 'xxx\n') and
    # ignore empty lines; close the file with a context manager
    utts_file = os.path.join(data_directory, f'{args.corpus}.utts')
    with open(utts_file, 'r') as utts:
        utterances = [
            line.split(' ')
            for line in (line.strip() for line in utts) if line]

    # extract the features
    features = pipeline.extract_features(
        config, utterances, njobs=args.njobs, log=log)

    # save them
    h5f_file = os.path.join(
        data_directory, 'features',
        f'{args.corpus}_{os.path.basename(config)}'
        .replace('.yaml', '.h5f'))
    features.save(h5f_file)
def test_check_speakers(utterances_index, capsys):
    """The pipeline fails or logs according to speaker/CMVN settings."""
    log = utils.get_logger(level='info')

    # CMVN by speaker requires speaker information in the index
    config = pipeline.get_default_config('mfcc')
    with pytest.raises(ValueError) as err:
        pipeline.extract_features(
            config, [(utterances_index[0][1], )], log=log)
    assert 'no speaker information provided' in str(err)
    capsys.readouterr()  # clean the buffer

    # CMVN disabled: the log says so and never mentions 'cmvn'
    config = pipeline.get_default_config('mfcc', with_cmvn=False)
    pipeline.extract_features(config, utterances_index, log=log)
    captured = capsys.readouterr()
    assert 'cmvn' not in captured.err
    assert '(CMVN disabled)' in captured.err

    # CMVN by utterance instead of by speaker
    config = pipeline.get_default_config('mfcc', with_cmvn=True)
    config['cmvn']['by_speaker'] = False
    pipeline.extract_features(config, utterances_index, log=log)
    err_messages = capsys.readouterr().err
    assert 'cmvn by utterance' in err_messages
    assert '(CMVN by speaker disabled)' in err_messages
def test_config_bad(utterances_index):
    """Invalid configurations are rejected and missing optional entries
    are restored to their default value by _init_config."""
    # unknown features name
    with pytest.raises(ValueError) as err:
        pipeline.get_default_config('bad')
    assert 'invalid features "bad"' in str(err)

    # no features processor at all
    config = pipeline.get_default_config('mfcc')
    del config['mfcc']
    with pytest.raises(ValueError) as err:
        pipeline.extract_features(config, utterances_index)
    assert 'the configuration does not define any features' in str(err)

    # two features processors at once
    config = pipeline.get_default_config('mfcc')
    config['plp'] = config['mfcc']
    with pytest.raises(ValueError) as err:
        pipeline.extract_features(config, utterances_index)
    assert 'more than one features extraction processor' in str(err)

    # unknown entry in the configuration
    config = pipeline.get_default_config('mfcc')
    config['invalid'] = config['mfcc']
    with pytest.raises(ValueError) as err:
        pipeline.extract_features(config, utterances_index)
    assert 'invalid keys in configuration' in str(err)

    # cmvn.with_vad defaults to True when missing
    config = pipeline.get_default_config('mfcc')
    del config['cmvn']['with_vad']
    parsed = pipeline._init_config(config)
    assert 'cmvn' in parsed
    assert parsed['cmvn']['with_vad']

    # cmvn.by_speaker defaults to False when missing
    config = pipeline.get_default_config('mfcc')
    del config['cmvn']['by_speaker']
    parsed = pipeline._init_config(config)
    assert not parsed['cmvn']['by_speaker']

    # pitch.postprocessing defaults to an empty dict when missing
    config = pipeline.get_default_config('mfcc')
    del config['pitch']['postprocessing']
    parsed = pipeline._init_config(config)
    assert parsed['pitch']['postprocessing'] == {}
c for c in pairs.columns if len(c) > 2 and c[-2:] == "_1" ]] items_1.columns = [c[:-2] for c in items_1.columns] items_2 = pairs[[ c for c in pairs.columns if len(c) > 2 and c[-2:] == "_2" ]] items_2.columns = [c[:-2] for c in items_2.columns] if set(items_1.columns) != set(items_2.columns): eprint("""Issue with pair file (<F>): columns don't match""".replace("<F>", str(args.pair_file))) sys.exit(1) if not set(['file', 'onset', 'offset', 'speaker']).issubset( items_1.columns): eprint("""Issue with pair file (<F>): missing 'file', 'speaker', 'onset', or 'offset' column(s)""".replace("<F>", str(args.pair_file))) sys.exit(1) items = pd.concat([items_1, items_2], sort=True).drop_duplicates() file_spk_ = items[['file', 'speaker']].drop_duplicates() utterance_index = [(str(i), ) + tuple(x) for (i, x) in enumerate(file_spk_.values)] utterance_ids = dict(((f, s), uid) for (uid, f, s) in utterance_index) features = snpipeline.extract_features(args.shennong_config_file, utterance_index, njobs=args.njobs) pairs['distance'] = calculate_distances(pairs, features, utterance_ids, njobs=args.njobs) pairs.to_csv(args.output_file, index=False)
def test_extract_features_full(ext, wav_file, wav_file_8k, wav_file_float32,
                               capsys, tmpdir):
    """Full pipeline on a hard case: parallel jobs, different sampling
    rates, two speakers and per-utterance segments."""
    index = [
        ('u1', wav_file, 's1', 0, 1),
        ('u2', wav_file_float32, 's2', 1, 1.2),
        ('u3', wav_file_8k, 's1', 1, 3)]
    config = pipeline.get_default_config('mfcc')

    # disable VAD because it can alter the cmvn result (far from (0,
    # 1) when the signal includes non-voiced frames)
    config['cmvn']['with_vad'] = False

    feats = pipeline.extract_features(
        config, index, njobs=2, log=utils.get_logger())

    # ensure we have the expected log messages
    messages = capsys.readouterr().err
    assert 'INFO - get 3 utterances from 2 speakers in 3 wavs' in messages
    assert 'WARNING - several sample rates found in wav files' in messages

    for utt in ('u1', 'u2', 'u3'):
        assert utt in feats
        assert feats[utt].dtype == np.float32

    # check properies
    p1, p2, p3 = (feats[utt].properties for utt in ('u1', 'u2', 'u3'))
    assert p1['audio']['file'] == wav_file
    assert p1['audio']['duration'] == 1.0
    assert p2['audio']['file'] == wav_file_float32
    assert p2['audio']['duration'] == pytest.approx(0.2)
    assert p3['audio']['file'] == wav_file_8k
    assert p3['audio']['duration'] < 0.5  # ask 3s but get duration-tstart
    assert p1['mfcc'] == p2['mfcc']
    assert p1['mfcc']['sample_rate'] != p3['mfcc']['sample_rate']
    assert p1.keys() == {
        'audio', 'mfcc', 'cmvn', 'pitch', 'delta', 'speaker', 'pipeline'}
    assert p1.keys() == p2.keys() == p3.keys()
    assert p1['pipeline'] == p2['pipeline'] == p3['pipeline']

    # check shape. mfcc*delta + pitch = 13 * 3 + 3 = 42
    assert feats['u1'].shape == (98, 42)
    assert feats['u2'].shape == (18, 42)
    assert feats['u3'].shape == (40, 42)

    # check cmvn: u2 (speaker s2) is normalized alone, u1 and u3
    # (speaker s1) are normalized jointly
    assert feats['u2'].data[:, :13].mean() == pytest.approx(0.0, abs=1e-6)
    assert feats['u2'].data[:, :13].std() == pytest.approx(1.0, abs=1e-6)

    stacked = np.vstack(
        (feats['u1'].data[:, :13], feats['u3'].data[:, :13]))
    assert stacked.mean() == pytest.approx(0.0, abs=1e-6)
    assert stacked.std() == pytest.approx(1.0, abs=1e-6)
    for utt in ('u1', 'u3'):
        part = feats[utt].data[:, :13]
        assert np.abs(stacked.mean()) <= np.abs(part.mean())
        assert np.abs(stacked.std() - 1.0) <= np.abs(part.std() - 1.0)

    # save / load the features
    filename = str(tmpdir.join('feats' + ext))
    feats.save(filename)
    feats2 = FeaturesCollection.load(filename)
    assert feats2 == feats