def _check_vggish_ckpt_exists():
    """Ensure the VGGish model checkpoint and PCA params exist locally.

    Downloads the missing file(s) from the public AudioSet storage bucket
    into ``FLAGS.vggish_ckpt_dir``.
    """
    util.maybe_create_directory(FLAGS.vggish_ckpt_dir)
    # BUG FIX: download into FLAGS.vggish_ckpt_dir — the directory that
    # vggish_ckpt_path / vggish_pca_path are rooted at — instead of
    # params.VGGISH_CHECKPOINT_DIR. When the two differed, the files were
    # fetched into a directory the existence checks never look at, so every
    # run re-downloaded them and the extractor still failed to find them.
    if not util.is_exists(vggish_ckpt_path):
        url = 'https://storage.googleapis.com/audioset/vggish_model.ckpt'
        util.maybe_download(url, FLAGS.vggish_ckpt_dir)
    if not util.is_exists(vggish_pca_path):
        url = 'https://storage.googleapis.com/audioset/vggish_pca_params.npz'
        util.maybe_download(url, FLAGS.vggish_ckpt_dir)
def arange_urban_sound_file_by_class():
    """Arrange UrbanSound8K wav files into per-class sub-directories.

    Scans the 16-bit UrbanSound8K audio tree, derives each file's class id
    via ``urban_labels`` and copies the file into
    ``<dst_dir>/<class_name>/``.
    """
    import glob  # local import: keeps the fix self-contained
    src_dir = '/data1/data/UrbanSound8K-16bit/audio'
    dst_dir = '/data1/data/UrbanSound8K-16bit/audio-classfied'
    CLASSES = [
        'air conditioner', 'car horn', 'children playing', 'dog bark',
        'drilling', 'engine idling', 'gun shot', 'jackhammer', 'siren',
        'street music'
    ]
    CLASSES_STRIPED = [c.replace(' ', '_') for c in CLASSES]
    # BUG FIX: the original assigned the directory *string* to `src_paths`
    # and iterated over it, which walks the characters of the path rather
    # than the wav files inside it. Collect the real file paths instead
    # (recursive: the dataset keeps wavs in fold1..fold10 sub-directories).
    src_paths = glob.glob(os.path.join(src_dir, '**', '*.wav'),
                          recursive=True)
    for src in src_paths:
        lbl = urban_labels([src])[0]
        dst = '{dir}/{label}'.format(dir=dst_dir, label=CLASSES_STRIPED[lbl])
        maybe_create_directory(dst)
        maybe_copy_file(
            src, '{dst}/{name}'.format(dst=dst, name=os.path.split(src)[-1]))
def convert_urban_pcm24_to_pcm16():
    """Convert UrbanSound8K wavs from PCM_24 to PCM_16.

    Mirrors the fold1..fold10 directory layout of the original dataset into
    a parallel ``UrbanSound8K-16bit`` tree and prints how many files were
    converted.
    """
    src_dirs = [
        '/data1/data/UrbanSound8K/audio/fold{:d}'.format(i + 1)
        for i in range(10)
    ]
    dst_dirs = [
        '/data1/data/UrbanSound8K-16bit/audio/fold{:d}'.format(i + 1)
        for i in range(10)
    ]
    converted_wav_paths = []
    for dsrc, ddst in zip(src_dirs, dst_dirs):
        maybe_create_directory(ddst)
        # Idiom: the original used filter() with a lambda returning the
        # path or None (a truthy filter in disguise); a comprehension
        # states the intent directly.
        wav_files = [fn for fn in os.listdir(dsrc) if fn.endswith('.wav')]
        for wav_file in wav_files:
            src_wav = os.path.join(dsrc, wav_file)
            dst_wav = os.path.join(ddst, wav_file)
            convert_wav(src_wav, dst_wav, subtype='PCM_16')
            converted_wav_paths.append(dst_wav)
    print('converted count:', len(converted_wav_paths))
    print(converted_wav_paths, len(converted_wav_paths))
def _create_records():
    """Create audio `train`, `test` and `val` records file."""
    tf.logging.info("Create records..")
    util.maybe_create_directory(FLAGS.records_dir)
    _check_vggish_ckpt_exists()
    wav_files, wav_labels = _wav_files_and_labels()
    tf.logging.info('Possible labels: {}'.format(set(wav_labels)))
    train, test, val = util.train_test_val_split(wav_files, wav_labels)
    # Pair each split with its destination so a single loop handles all
    # three record files identically.
    split_plan = [
        (train_records_path, train),
        (test_records_path, test),
        (val_records_path, val),
    ]
    with VGGishExtractor(vggish_ckpt_path,
                         vggish_pca_path,
                         params.VGGISH_INPUT_TENSOR_NAME,
                         params.VGGISH_OUTPUT_TENSOR_NAME) as ve:
        for record_path, (xs, ys) in split_plan:
            ve.create_records(record_path, xs, ys)
    tf.logging.info('Dataset size: Train-{} Test-{} Val-{}'.format(
        len(train[1]), len(test[1]), len(val[1])))
def create_records(self, record_path, wav_files, wav_labels):
    """Create TF Records from wav files and corresponding labels.

    Each wav is converted to per-second VGGish feature frames; one
    tf.Example is written per frame, all carrying that wav's label.
    Files that yield no features are skipped with a warning.

    Args:
        record_path: destination path of the TFRecord file; its parent
            directory is created if missing.
        wav_files: iterable of wav file paths.
        wav_labels: iterable of integer labels, parallel to `wav_files`.
    """
    record_dir = os.path.dirname(record_path)
    maybe_create_directory(record_dir)
    writer = tf.python_io.TFRecordWriter(record_path)
    # try/finally so the writer is closed (and the record file flushed)
    # even if feature extraction raises mid-way — the original leaked the
    # writer on error.
    try:
        total = len(wav_labels)
        for idx, (wav_file, wav_label) in enumerate(
                zip(wav_files, wav_labels), start=1):
            tf.logging.info('[{}/{}] Extracting VGGish feature:'
                            ' label: {} - {}'.format(idx, total,
                                                     wav_label, wav_file))
            features = self.wavfile_to_features(wav_file)
            num_features = features.shape[0]  # one feature for one second
            if num_features == 0:
                tf.logging.warning('No vggish features:'
                                   ' label: {} - {}'.format(wav_label,
                                                            wav_file))
                continue
            # Every frame from one wav shares its label — no need to build
            # the parallel label list the original zipped against.
            for f in features:
                example = encodes_example(np.float64(f), np.int64(wav_label))
                writer.write(example.SerializeToString())
    finally:
        writer.close()
test_records_path = os.path.join(FLAGS.records_dir, params.TF_RECORDS_TEST_NAME) val_records_path = os.path.join(FLAGS.records_dir, params.TF_RECORDS_VAL_NAME) vggish_ckpt_path = os.path.join(FLAGS.vggish_ckpt_dir, params.VGGISH_CHECKPOINT_NAME) vggish_pca_path = os.path.join(FLAGS.vggish_ckpt_dir, params.VGGISH_PCA_PARAMS_NAME) tensorboard_dir = os.path.join(params.TENSORBOARD_DIR, FLAGS.train_name) audio_ckpt_dir = os.path.join(FLAGS.audio_ckpt_dir, FLAGS.train_name) util.maybe_create_directory(tensorboard_dir) util.maybe_create_directory(audio_ckpt_dir) def _add_triaining_graph(): with tf.Graph().as_default() as graph: logits = define_audio_slim(training=True) tf.summary.histogram('logits', logits) # define training subgraph with tf.variable_scope('train'): labels = tf.placeholder(tf.float32, shape=[None, params.NUM_CLASSES], name='labels') cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits, labels=labels, name='cross_entropy') loss = tf.reduce_mean(cross_entropy, name='loss_op')