def extract_audioset_features(paths, path2gt):
    """Extracts AudioSet features and their corresponding ground truth and
    identifiers (the path).

    AudioSet features are extracted from non-overlapping audio patches of
    0.96 seconds, where each audio patch covers 64 mel bands and 96 frames
    of 10 ms each. We repeat the ground truth and identifiers to match the
    number of extracted AudioSet features.
    """
    # 1) Extract log-mel spectrograms.
    first_audio = True
    for p in paths:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(
                config['audio_folder'] + p)
            ground_truth = np.repeat(path2gt[p], input_data.shape[0], axis=0)
            identifiers = np.repeat(p, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(config['audio_folder'] + p)
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(path2gt[p], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # 2) Load the TensorFlow model to extract AudioSet features.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))

    return [feature, ground_truth, identifiers]
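# A minimal usage sketch for extract_audioset_features(). The module-level
# `config` dict, the file names, and the label mapping below are hypothetical
# placeholders, not part of the original code.
config = {'audio_folder': 'audio/'}
paths = ['dog_bark.wav', 'siren.wav']          # assumed example files
path2gt = {'dog_bark.wav': 0, 'siren.wav': 1}  # assumed label mapping

feature, ground_truth, identifiers = extract_audioset_features(paths, path2gt)
# One 128-D embedding per 0.96 s patch, with labels/ids repeated to match.
print(feature.shape, ground_truth.shape, identifiers.shape)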
def embedding(self, input_paths, output_paths):
    """Run VGGish embedding."""
    paths = list(zip(input_paths, output_paths))
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, self.model_checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        func = partial(
            self._embed,
            sess=sess,
            features_tensor=features_tensor,
            embedding_tensor=embedding_tensor,
        )
        self.single_process(func, paths)
def CreateVGGishNetwork(sess, hop_size=0.96):  # Hop size is in seconds.
    """Define the VGGish model, load the checkpoint, and return a dictionary
    that points to the different tensors defined by the model.
    """
    vggish_slim.define_vggish_slim()
    checkpoint_path = 'vggish_model.ckpt'
    vggish_params.EXAMPLE_HOP_SECONDS = hop_size
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    layers = {
        'conv1': 'vggish/conv1/Relu',
        'pool1': 'vggish/pool1/MaxPool',
        'conv2': 'vggish/conv2/Relu',
        'pool2': 'vggish/pool2/MaxPool',
        'conv3': 'vggish/conv3/conv3_2/Relu',
        'pool3': 'vggish/pool3/MaxPool',
        'conv4': 'vggish/conv4/conv4_2/Relu',
        'pool4': 'vggish/pool4/MaxPool',
        'fc1': 'vggish/fc1/fc1_2/Relu',
        'fc2': 'vggish/fc2/Relu',
        'embedding': 'vggish/embedding',
        'features': 'vggish/input_features',
    }
    g = tf.get_default_graph()
    for k in layers:
        layers[k] = g.get_tensor_by_name(layers[k] + ':0')
    return {
        'features': features_tensor,
        'embedding': embedding_tensor,
        'layers': layers,
    }
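# A minimal usage sketch for CreateVGGishNetwork(). The WAV path is a
# hypothetical placeholder; the tensor handles come from the dict above, and
# the 'layers' entries expose intermediate activations such as 'conv4'.
with tf.Graph().as_default(), tf.Session() as sess:
    vgg = CreateVGGishNetwork(sess, hop_size=0.96)
    examples = vggish_input.wavfile_to_examples('example.wav')  # assumed file
    [embedding, conv4] = sess.run(
        [vgg['embedding'], vgg['layers']['conv4']],
        feed_dict={vgg['features']: examples})
    print(embedding.shape, conv4.shape)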
def OutputAudioEmbeddings(wav_file_path, save_path):
    # Run the examples from a single audio file through the model, skipping
    # files that were already processed (i.e. whose .npy output exists).
    if os.path.isfile(wav_file_path) and not os.path.isfile(save_path + '.npy'):
        wav_file = wav_file_path
        print(wav_file_path)
        print(save_path + '.npy')
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed embeddings.
        writer = tf.python_io.TFRecordWriter(
            FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            np.save(save_path, postprocessed_batch)
def extract_wav_features(f_dir):
    examples_batch = vggish_input.wavfile_to_examples(f_dir)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        return postprocessed_batch
def generate_embedding(filePath):
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
    output = None
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        examples_batch = vggish_input_mod.wavfile_to_examples(filePath)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        output = postprocessed_batch
    return output
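# A quick usage sketch for generate_embedding(). 'clip.wav' is a hypothetical
# path. Each row of the output is one ~0.96 s example: a 128-D embedding,
# PCA-whitened and quantized to uint8 by the postprocessor.
emb = generate_embedding('clip.wav')
print(emb.shape)  # (num_examples, 128)
print(emb.dtype)  # uint8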
def main(wav_file, flag_for_data, data, model_type):
    """Specify the path of the downloaded or recorded audio files, and also
    the path for writing the embeddings or pickle files.
    """
    if flag_for_data == 0:
        if wav_file:
            pkl = wav_file[:-4] + '.pkl'
            # print(pkl)
            examples_batch = vggish_input.wavfile_to_examples(wav_file)

            # Prepare a postprocessor to munge the model embeddings.
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)
                return postprocessed_batch
    elif flag_for_data == 1:
        predict_prob, predictions = \
            model_function_binary_relevance.predictions_wavfile(data, model_type)
        K.clear_session()
        return predict_prob, predictions
def setup(self):
    # Paths to downloaded VGGish files.
    self.checkpoint_path = 'vggish_model.ckpt'
    self.pca_params_path = 'vggish_pca_params.npz'
    self.batch_size = 60

    # If we can't find the trained model files, download them.
    if not os.path.exists(self.checkpoint_path):
        print('AudiosetAnalysis: Downloading model file {} (please wait - this may take a while)'.format(self.checkpoint_path))
        urllib.urlretrieve('https://storage.googleapis.com/audioset/vggish_model.ckpt', self.checkpoint_path)
    if not os.path.exists(self.pca_params_path):
        print('AudiosetAnalysis: Downloading params file {} (please wait - this may take a while)'.format(self.pca_params_path))
        urllib.urlretrieve('https://storage.googleapis.com/audioset/vggish_pca_params.npz', self.pca_params_path)

    # Define VGGish in its own graph and keep a persistent session on that
    # graph, so the tensors located below belong to the session's graph.
    graph = tf.Graph()
    config = tf.ConfigProto(device_count={'CPU': 4})
    self.sess = tf.Session(graph=graph, config=config)

    # Load the checkpoint and locate input and output tensors.
    with graph.as_default():
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(self.sess, self.checkpoint_path)
        self.features_tensor = self.sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        self.embedding_tensor = self.sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
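# A minimal sketch of using the persistent session that setup() creates. The
# class name AudiosetAnalysis is assumed from the log messages above, and the
# WAV path is a hypothetical placeholder.
analysis = AudiosetAnalysis()
analysis.setup()
examples = vggish_input.wavfile_to_examples('recording.wav')  # assumed file
raw_embeddings = analysis.sess.run(
    analysis.embedding_tensor,
    feed_dict={analysis.features_tensor: examples})
print(raw_embeddings.shape)  # (num_examples, 128)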
def loadVGGish(sess, number_of_classes, lr=vggish_params.LEARNING_RATE):
    embeddings = vggish_slim.define_vggish_slim(True)  # Do we train VGGish?

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
        # Add a fully connected layer with 100 units.
        num_units = 100
        fc = slim.fully_connected(embeddings, num_units)

        # Add a classifier layer at the end, consisting of parallel logistic
        # classifiers, one per class. This allows for multi-class tasks.
        logits = slim.fully_connected(
            fc, number_of_classes, activation_fn=None, scope='logits')
        pred = tf.sigmoid(logits, name='prediction')

        # Add training ops.
        with tf.variable_scope('train'):
            global_step = tf.Variable(
                0, name='global_step', trainable=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                             tf.GraphKeys.GLOBAL_STEP])

            # Labels are assumed to be fed as a batch of multi-hot vectors,
            # with a 1 in the position of each positive class label, and 0
            # elsewhere.
            labels = tf.placeholder(
                tf.float32, shape=(None, number_of_classes), name='labels')

            # Cross-entropy label loss.
            xent = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xent')
            loss = tf.reduce_mean(xent, name='loss_op')
            tf.summary.scalar('loss', loss)

            # We use the same optimizer and hyperparameters as used to train
            # VGGish.
            optimizer = tf.train.AdamOptimizer(
                learning_rate=lr, epsilon=vggish_params.ADAM_EPSILON)
            optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, './vggish_model.ckpt')

    return logits, pred
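# A minimal training-step sketch for loadVGGish(). The batch arrays are
# hypothetical placeholders (8 dummy examples, 3 classes); the tensor names
# follow the 'mymodel/train' scopes defined above.
with tf.Graph().as_default(), tf.Session() as sess:
    logits, pred = loadVGGish(sess, number_of_classes=3)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

    batch_x = np.random.rand(8, vggish_params.NUM_FRAMES,
                             vggish_params.NUM_BANDS)   # assumed dummy inputs
    batch_y = np.eye(3)[np.random.randint(0, 3, size=8)]  # one-hot labels
    _, loss = sess.run([train_op, loss_tensor],
                       feed_dict={features_tensor: batch_x,
                                  labels_tensor: batch_y})
    print('loss:', loss)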
def define_model(self, sess):
    # Define VGGish.
    embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
        # Add a fully connected layer with 100 units.
        num_units = 100
        fc = slim.fully_connected(embeddings, num_units)

        # Add a classifier layer at the end, consisting of parallel logistic
        # classifiers, one per class. This allows for multi-class tasks.
        self.logits = slim.fully_connected(
            fc, self._NUM_CLASSES, activation_fn=None, scope='logits')
        self.prediction = tf.sigmoid(self.logits, name='prediction')

    if self.isTrain:
        self.add_training_op(sess)
def main(_):
    parser = argparse.ArgumentParser(description="Options")
    parser.add_argument("-i", "--input", help="Input catalog", default="./")
    parser.add_argument("-s", "--save_serialized_file",
                        default="../serialized_file.pickle",
                        help="Save serialized file")
    parser.add_argument("-l", "--load_serialized_files",
                        help="Load serialized files from catalog")
    parser.add_argument("-e", "--etalon_class_file",
                        help="Load etalon file of class")
    parser.add_argument("-t", "--etalon_class_id",
                        help="Load etalon file with content of class id")
    parser.add_argument("-m", "--merge", help="Number of seconds to merge",
                        default=1)
    parser.add_argument("-v", "--vad", help="Use VAD optimization",
                        default=True)
    parser.add_argument("--save_list_files", help="Serialize list of files",
                        default=False)
    parser.add_argument("--layer", help="Layer of serialized data",
                        default='embedding')
    parser.add_argument("--load_test_data", help="Load test data from catalog")
    parser.add_argument("--models",
                        help="Catalog for save/load checkpoint of models")
    parser.add_argument("--folds", help="Number of folds", default=int(3))
    parser.add_argument("--lr", help="Learning rate", default=float(0.0004))
    parser.add_argument("--lr_lim", help="Limit of learning rate",
                        default=float(0.0001))
    parser.add_argument("--factor", help="new_lr = factor * lr",
                        default=float(0.8))
    parser.add_argument("--tensorboard", help="TensorBoard log catalog",
                        default="tensorboard")
    parser.add_argument("--batch_size", help="Batch size", default=int(32))
    parser.add_argument("--scheduler_mode", help="Scheduler of learning rate",
                        default=None)
    parser.add_argument("--optimizer", help="Optimizer of learning rate",
                        default='SGD')
    args = parser.parse_args()

    checkPath = os.path.normpath(str(args.models))
    loadSerializeFiles = args.load_serialized_files
    loadTestData = args.load_test_data
    etalon_class_file = args.etalon_class_file
    etalon_class_id = args.etalon_class_id
    merge_sec = args.merge
    vad_optimization = bool(args.vad)
    save_list_files = bool(args.save_list_files)
    saveSerilizationFile = args.save_serialized_file
    inputCatalog = args.input
    layer = args.layer
    folds = int(args.folds)
    lr = float(args.lr)
    lr_lim = float(args.lr_lim)
    factor = float(args.factor)
    tboard = str(args.tensorboard)
    batch_size = int(args.batch_size)
    scheduler_mode = args.scheduler_mode
    optimizer = str(args.optimizer)

    if checkPath == "None":
        checkPath = getExecPath()
        t, h = os.path.split(checkPath)
        dtn = str(dt.datetime.now()).split(" ")
        dtn = str(dtn[1]).split(".")
        dtn = str(dtn[0]).replace(":", "_")
        checkPath = t + '//nnmodels_' + dtn
        checkPath = os.path.normpath(checkPath)

    t, h = os.path.split(checkPath)
    tboard = t + '//' + tboard + "//" + h
    if not os.path.exists(checkPath):
        os.makedirs(checkPath)
    checkPath = checkPath + '//' + str(layer) + '_lr_' + str(
        lr) + '_factor_' + str(factor) + '_folds_' + str(folds)
    checkPath = os.path.normpath(checkPath)
    _, h = os.path.split(checkPath)
    tboard = os.path.normpath(tboard + '//' + h)

    if save_list_files:
        serializeData = loadSerializedListFiles(loadTestData)
        with open(os.path.normpath(
                "D:\\repo\\ML\\test_post_competition.csv")) as f:
            et_files = list()
            for line in f.readlines():
                block = line.split(",")
                et_files.append([block[0], block[1]])
        new_merge_data = list()
        count = len(serializeData)
        pos = 0
        for sdata in serializeData:
            fname = sdata['file_name']
            features = sdata['features']
            for label_et in et_files:
                if (str(fname) == str(label_et[0])
                        and str(label_et[1]) != "None"):
                    label_id, _ = find_type_class(etalon_class_id,
                                                  label_et[1], pos=1)
                    features_data = {
                        "file_name": fname,
                        "label_id": int(label_id),
                        "features": features
                    }
                    new_merge_data.append(features_data)
                    break
            pos = pos + 1
            status_string = str(pos) + "/" + str(count)
            print(termcolor.colored(status_string, "green"))
        t, h = os.path.split(loadTestData)
        h = str(h).split(".")
        save_list = os.path.normpath(str(t + h[0] + "_label.features"))
        with open(save_list, "wb") as f2:
            pickle.dump(new_merge_data, f2)
        print(termcolor.colored("save: " + save_list, "green"))
        return

    if loadSerializeFiles:
        serializeData = loadSerializedListFiles(loadSerializeFiles)
        if not serializeData:
            raise Exception("Can not unpack serialized data")
        skf = StratifiedKFold(n_splits=folds)
        y = np.asarray([labels['label_id'] for labels in serializeData])
        modelsPath, _ = os.path.split(checkPath)
        modelsPath = modelsPath + "//*.hdf5"
        models = glob.glob(modelsPath)
        if len(models) > 0:
            for i in models:
                print(termcolor.colored(str(i), "green"))
        else:
            print(termcolor.colored(
                str("Did not find models in catalog: " + checkPath), "red"))

        # Train one model per fold if no checkpoints were found.
        if len(models) == 0:
            i = int(0)
            for train_index, valid_index in skf.split(serializeData, y):
                train_data = DataGenerator(serializeData,
                                           train_index,
                                           count_class=41,
                                           batch_size=batch_size,
                                           koeff_merge=int(merge_sec),
                                           layer=layer)
                valid_data = DataGenerator(serializeData,
                                           valid_index,
                                           count_class=41,
                                           batch_size=batch_size,
                                           koeff_merge=int(merge_sec),
                                           layer=layer)
                shape = train_data.get_shape()
                dnn = VggDNN(input_shape=shape, lr=lr, optimizer=optimizer)
                postfix = '_' + str(i) + '.hdf5'
                dnn_model_path = os.path.normpath(checkPath + postfix)
                tboard = os.path.normpath(tboard + postfix)
                dnn.train(train_data,
                          valid_data,
                          checkPath=dnn_model_path,
                          batch_size=batch_size,
                          factor=float(factor),
                          tensorboardPath=tboard,
                          lim_lr=lr_lim,
                          scheduler_mode=scheduler_mode,
                          iteration=train_data.__len__())
                i = i + 1
            models = glob.glob(modelsPath)

        dnn_models = list()
        for m in models:
            dnn_models.append(VggDNN(path=m))

        # Predict on the test data with an ensemble of the fold models,
        # averaging predictions and keeping the top-3 classes.
        pos = int(0)
        result_predict_string = list()
        result_predict_string.append(str("fname,label"))
        actual = list()
        predicts = list()
        if loadTestData and os.path.isdir(loadTestData):
            test_data = loadSerializedListFiles(loadTestData)
            batch_data = Batcher(test_data,
                                 layer=layer,
                                 koeff_merge=int(merge_sec),
                                 shuffle=True,
                                 n_splits=2)
            for soundfiles, labels, names in batch_data.get_soundfiles():
                count = batch_data.get_countfiles()
                for sound, label, name in zip(soundfiles, labels, names):
                    predict_merge = np.empty(shape=(0, 41))
                    for model in dnn_models:
                        predict = model.predict_on_batch(np.asarray(sound))
                        predict_merge = np.concatenate(
                            (predict_merge, predict), axis=0)
                    mean = np.mean(np.asarray(predict_merge), axis=0)
                    amax = np.argsort(mean, axis=0)
                    amax = amax[::-1]
                    predict_string = name + str(",")
                    if etalon_class_id:
                        for r in amax[0:3]:
                            predict_string = predict_string + str(" ") + str(
                                find_type_class(etalon_class_id, r)[1])
                    result_predict_string.append(predict_string)
                    actual.append([np.argmax(label)])
                    predicts.append(list(amax))
                    pos = pos + 1
                    status_string = "Calculate predict: " + str(
                        pos) + "/" + str(count)
                    print(termcolor.colored(status_string, "green"))
            met = average_precision.mapk(actual, predicts, k=3)
            result_string = "Predict test data: " + str(met)
            print(termcolor.colored(result_string, "green"))
            rp = getExecPath()
            t, h = os.path.split(rp)
            t = t + "//test_result.log"
            print_result(t, result_predict_string)
        return

    # Default path: extract VGGish features for every wav file in the catalog.
    Catalogs = inputCatalog + "/*/"
    Catalogs = glob.glob(Catalogs)
    Catalogs = Catalogs + [inputCatalog]
    listWavFiles, countFiles = makeWaveFilesList(Catalogs)
    if not listWavFiles:
        print(inputCatalog + ": this catalog has no wav files")
        return

    processedFiles = 0
    for wav_file in listWavFiles:
        processedFiles = processedFiles + 1
        error_string = "Input file : " + wav_file + " - "
        if os.path.getsize(wav_file) == 0:
            print(termcolor.colored(error_string, "red"))
            continue
        sample_rate, wav_data = wavfile.read(wav_file)
        if vad_optimization:
            wav_data = vad.apply_vad(wav_data, sample_rate)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        samples = wav_data / 32768.0  # Convert to [-1.0, +1.0].
        if np.shape(samples)[0] < sample_rate:
            # Pad clips shorter than one second with silence.
            append_size = sample_rate - np.shape(samples)[0]
            samples = np.append(samples, np.full(append_size, float(0)))
        examples_batch = vggish_input.waveform_to_examples(
            samples, sample_rate)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
        try:
            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(
                    sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)
                flatten_result = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_FL_NAME)

                # Run inference and postprocessing.
                [embedding_batch, flatten_batch] = sess.run(
                    [embedding_tensor, flatten_result],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)
                wav_file_name = os.path.basename(wav_file)
                label_id, _ = find_id_class(etalon_class_file,
                                            etalon_class_id, wav_file_name)
                if label_id == -1:
                    label_id = getCatalogName(wav_file)
                features_batch = {
                    "embedding": embedding_batch,
                    "flatten": flatten_batch,
                    "postprocessing": postprocessed_batch
                }
                features_data = {
                    "file_name": wav_file_name,
                    "label_id": int(label_id),
                    "features": features_batch
                }
                if saveSerilizationFile:
                    with open(saveSerilizationFile + "." +
                              str(processedFiles) + ".features", "wb") as f:
                        pickle.dump(features_data, f)
                error_string = error_string + "successful"
                color = "green"
                if label_id == -1:
                    color = "yellow"
                    error_string = error_string + ". File is not classified"
                error_string = error_string + " (" + str(
                    processedFiles) + "/" + str(countFiles) + ")"
                print(termcolor.colored(error_string, color))
        except Exception:
            error_string = error_string + "failed"
            error_string = error_string + " (" + str(
                processedFiles) + "/" + str(countFiles) + ")"
            print(termcolor.colored(error_string, "red"))
            continue

    # Merge the per-file feature pickles into a single file.
    if saveSerilizationFile:
        tail, head = os.path.split(saveSerilizationFile)
        serilizationFilesList = tail + "//*.features"
        serilizationFilesList = glob.glob(serilizationFilesList)
        features_data_merge = list()
        for sfeature in serilizationFilesList:
            with open(sfeature, "rb") as fd:
                features_data = pickle.load(fd)
            features_data_merge.append(features_data)
            os.remove(sfeature)
        with open(saveSerilizationFile + ".features", "wb") as f:
            pickle.dump(features_data_merge, f)
def preprocess_data():
    audio_root_dir = Path(r'C:\Users\zhanglichuan\Desktop\ECE496\data')
    audio_file_pattern = Path(r'**/*.wav')

    # Takes about 6-8 min on my machine.
    counter = 0
    for audio_file in glob.iglob(str(audio_root_dir / audio_file_pattern),
                                 recursive=True):
        # Load the log-mel examples and the matching spectrogram image.
        sample = wavfile_to_examples(audio_file)
        image_path = re.sub('.wav', '.jpg', os.path.split(audio_file)[1])
        image_path = os.path.join(image_dir, image_path)
        input_image = load_img(image_path,
                               target_size=(image_size, image_size),
                               color_mode='grayscale')
        input_image = img_to_array(input_image)
        input_image = preprocess_input(input_image)
        if sample.shape[0] == 0 or get_emotion_label(
                audio_file) == 0 or get_emotion_label(audio_file) == 1:
            continue
        else:
            labels.append(get_emotion_label(audio_file) - 2)
            temp_dict[counter] = sample
            image_list.append(input_image)
            if counter % 100 == 0:
                print('Processing the {}th file: {}'.format(counter, audio_file))
            counter += 1

    oldm, oldn = 0, 0
    check = temp_dict
    print("start to construct embedding feature from input")
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess, r'C:\Users\zhanglichuan\Desktop\ECE496\lstm\vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference, tracking the largest embedding shape seen so far.
        counter = 0
        for key in temp_dict:
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: temp_dict[key]})
            embedding_dict[key] = embedding_batch
            m, n = embedding_batch.shape[0], embedding_batch.shape[1]
            if m > oldm:
                oldm = m
            if n > oldn:
                oldn = n
            if counter % 100 == 0:
                print('Processing the {}th file'.format(counter))
            counter += 1

    # Flatten each embedding and zero-pad it to the maximum length.
    maxLen = oldm * oldn
    pproc = vggish_postprocess.Postprocessor(
        r'C:\Users\zhanglichuan\Desktop\ECE496\lstm\vggish_pca_params.npz')
    train_set = []
    counter = 0
    for key in embedding_dict:
        print(key)
        embed_sample = embedding_dict[key].flatten()
        tempOne = np.pad(embed_sample, (0, maxLen - embed_sample.shape[0]),
                         mode='constant', constant_values=0)
        temp_embed = np.reshape(tempOne, (1, oldm, oldn))
        if counter == 0:
            train_set = temp_embed
        else:
            train_set = np.concatenate((train_set, temp_embed), axis=0)
        if counter % 100 == 0:
            print('Processing the {}th file'.format(counter))
        counter += 1

    print("preprocess finished")
    with open(os.path.join(script_path, 'labels.txt'), 'wb') as tfp:
        pickle.dump(labels, tfp)
    with open(os.path.join(script_path, 'train_set.txt'), 'wb') as tdfp:
        pickle.dump(train_set, tdfp)
    with open(os.path.join(script_path, 'image.txt'), 'wb') as imfp:
        pickle.dump(image_list, imfp)
def main(_):
    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    # In this simple example, we run the examples from all audio files in a
    # directory through the model.
    for infile in os.listdir(path=FLAGS.wav_dir):
        wav_file = str(FLAGS.wav_dir) + infile

        # Parse the file name for the video id, depending on the dataset.
        if FLAGS.dataset == "EC50":
            # ESC-50 dataset. Format: cross-validation group - sound file id
            # from freesound - sound segment - label (0 to 49).wav
            namegps = infile.split('-')
            videoid = namegps[0] + "-" + namegps[1] + "-" + namegps[2]
            label = namegps[3].split('.')[0]
        elif FLAGS.dataset == "UrbanSound":
            # UrbanSound dataset. Format: sound file id from freesound -
            # label (0 to 9) - occurrence id - sound segment id.wav
            namegps = infile.split('-')
            videoid = namegps[0] + "-" + namegps[2] + "-" + \
                namegps[3].split('.')[0]
            label = namegps[1]
        else:
            print("Please specify one of the supported datasets.")
            continue

        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        print(examples_batch)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features,
        # where each feature value contains the 128 bytes of the whitened
        # quantized embedding.
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'labels':
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[int(label)])),
                    'video_id':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[bytes(videoid, 'utf-8')]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    [embedding_batch] = sess.run(
        [embedding_tensor],
        feed_dict={features_tensor: input_batch})
    print('VGGish embedding: ', embedding_batch[0])
    expected_embedding_mean = 0.131
    expected_embedding_std = 0.238
    np.testing.assert_allclose(
        [np.mean(embedding_batch), np.std(embedding_batch)],
        [expected_embedding_mean, expected_embedding_std],
        rtol=rel_error)
for audio in os.listdir(
        os.path.join(audio_path, folder, sentence, scenario)):
    if audio[-4:] == '.wav':
        wav_name = os.path.join(audio_path, folder, sentence, scenario, audio)
        wav_rate, wav_samples = wavfile.read(wav_name)
        if len(wav_samples) < wav_rate:
            # Pad clips shorter than one second with zeros.
            wav_samples = numpy.pad(
                wav_samples, (0, wav_rate - len(wav_samples)), 'constant')
        samples = vggish_input.waveform_to_examples(wav_samples, wav_rate)
        with tensorflow.Graph().as_default(), tensorflow.Session() as session:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(session, args.model_file)
            samples_tensor = session.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            features_tensor = session.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            [features] = session.run(
                [features_tensor],
                feed_dict={samples_tensor: samples})
        output_file = os.path.join(acoustic_features_path, folder, sentence,
                                   scenario)
        os.makedirs(output_file, exist_ok=True)
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features,
        # where each feature value contains the 128 bytes of the whitened
        # quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

    # CHANGED CODE: writing a SequenceExample is not needed here, so that part
    # of the original demo is suppressed. Instead, featurize into a processdir.
    processdir = os.path.join(os.getcwd(), 'processdir')
    if not os.path.isdir(processdir):
        os.mkdir(processdir)
    os.chdir(processdir)

    filepath = sys.argv[2]
    i1 = filepath[::-1].find('/')
    jsonfilename = filepath[-1 * i1:][0:-4] + '.json'
    print('writing data to ' + jsonfilename)
    with open(jsonfilename, 'w') as jsonfile:
        data = {
            'features': postprocessed_batch.tolist(),
        }
        json.dump(data, jsonfile)
def main(_):
    # Run all given wav files through the model. The input may be a single
    # file, a file with a list of paths, or an input directory.
    vggish_params.VERBOSE = FLAGS.verbose

    if FLAGS.wav_file_inputdir:
        wav_file_list = glob.glob(
            os.path.join(FLAGS.wav_file_inputdir, "*.wav"))
    elif FLAGS.wav_file_list:
        wav_file_list = [
            x for x in map(lambda x: x.strip('\n'),
                           open(FLAGS.wav_file_list, 'r').readlines())
        ]
    elif FLAGS.wav_file:
        wav_file_list = [FLAGS.wav_file]
    else:
        print("must supply wave file path, file with list of paths, "
              "or input directory")
        return

    for wav_file in wav_file_list:
        print('RAW WAV FILE: {}'.format(wav_file))
        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        vprint('examples_batch shape')
        vprint(str(examples_batch.shape))

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # Prepare a record writer to store the postprocessed embeddings.
        outputfile_tf = os.path.join(
            FLAGS.output, "{}.tfrecord".format(os.path.basename(wav_file)))
        print('TF FILE output : {}'.format(outputfile_tf))
        writer = tf.python_io.TFRecordWriter(outputfile_tf)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            vprint('embedding_batch shape')
            vprint(str(embedding_batch.shape))
            postprocessed_batch = pproc.postprocess(embedding_batch,
                                                    FLAGS.clip_and_quantize)
            vprint('postprocessed_batch shape')
            vprint(str(postprocessed_batch.shape))

            # Calculate per-dimension means and medians over time.
            postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
            postprocessed_batch_median = np.median(postprocessed_batch, axis=0)
            vprint('postprocessed_batch_mean shape')
            vprint(str(postprocessed_batch_mean.shape))

            # Write the postprocessed embeddings as a SequenceExample, in a
            # similar format as the features released in AudioSet. Each row of
            # the batch of embeddings corresponds to roughly a second of audio
            # (96 10ms frames), and the rows are written as a sequence of
            # bytes-valued features, where each feature value contains the 128
            # bytes of the whitened quantized embedding.
            context_features = {
                'mean_audio':
                    tf.train.Feature(float_list=tf.train.FloatList(
                        value=postprocessed_batch_mean)),
                'median_audio':
                    tf.train.Feature(float_list=tf.train.FloatList(
                        value=postprocessed_batch_median))
            }
            seq_example = tf.train.SequenceExample(
                context=tf.train.Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ])
                    }))
            if writer:
                writer.write(seq_example.SerializeToString())

            jsout = {
                'filename': os.path.basename(wav_file),
                'mean_audio': postprocessed_batch_mean.tolist(),
                'median_audio': postprocessed_batch_median.tolist(),
                'audio': postprocessed_batch.tolist()
            }
            outputfile = os.path.join(
                FLAGS.output, "{}.json".format(os.path.basename(wav_file)))
            print('JSON FILE output : {}'.format(outputfile))
            with open(outputfile, 'w') as outfile:
                json.dump(jsout, outfile)

        if writer:
            writer.close()
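# A hedged sketch of reading these records back, not part of the original
# script. It assumes the layout written above (context means plus byte-valued
# embedding frames); 'some.wav.tfrecord' is a hypothetical path.
def read_embeddings(tfrecord_path):
    for record in tf.python_io.tf_record_iterator(tfrecord_path):
        seq = tf.train.SequenceExample()
        seq.ParseFromString(record)
        mean_audio = list(seq.context.feature['mean_audio'].float_list.value)
        frames = seq.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
        # Each frame holds 128 uint8 bytes: one quantized embedding per ~1 s.
        embeddings = np.stack([
            np.frombuffer(f.bytes_list.value[0], dtype=np.uint8)
            for f in frames
        ])
        yield mean_audio, embeddings

for mean_audio, embeddings in read_embeddings('some.wav.tfrecord'):
    print(len(mean_audio), embeddings.shape)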
def extract_features(dirname, label):
    pproc = vggish_postprocess.Postprocessor(
        os.path.join(SELF_DIR, "vggish_pca_params.npz"))
    for wav_file in glob.glob(dirname + "*.wav"):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except Exception:
            continue
        tfrecord_path = os.path.join(
            FLAGS.dest, os.path.basename(wav_file)[:-3] + "tfrecord")
        writer = tf.python_io.TFRecordWriter(tfrecord_path)
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(
                sess, os.path.join(SELF_DIR, "vggish_model.ckpt"))
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            try:
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
            except Exception:
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Write the embeddings in chunks of 10 one-second frames.
            nBatches = len(postprocessed_batch)
            if nBatches < 10:
                nBatches = 1
            else:
                nBatches = int(nBatches / 10)
            for i in range(nBatches):
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "labels":
                                tf.train.Feature(int64_list=tf.train.Int64List(
                                    value=[label]))
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in
                                    postprocessed_batch[i * 10:i * 10 + 10]
                                ])
                        }))
                if writer:
                    writer.write(seq_example.SerializeToString())
        if writer:
            writer.close()
def main(_): print("please speak a word into the microphone") record_to_file('demo.wav') y, sr = librosa.load('demo.wav') print("sampling rate:", sr) print("Recorded sound wave: ") print(sr) wav_file = 'demo.wav' examples_batch = vggish_input.wavfile_to_examples(wav_file) # print(examples_batch) # Prepare a postprocessor to munge the model embeddings. pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params) # If needed, prepare a record writer to store the postprocessed embeddings. writer = tf.python_io.TFRecordWriter( FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None with tf.Graph().as_default(), tf.Session() as sess: # Define the model in inference mode, load the checkpoint, and # locate input and output tensors. vggish_slim.define_vggish_slim(training=False) vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) features_tensor = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) embedding_tensor = sess.graph.get_tensor_by_name( vggish_params.OUTPUT_TENSOR_NAME) # Run inference and postprocessing. [embedding_batch ] = sess.run([embedding_tensor], feed_dict={features_tensor: examples_batch}) # print(embedding_batch) postprocessed_batch = pproc.postprocess(embedding_batch) # print(postprocessed_batch) # Write the postprocessed embeddings as a SequenceExample, in a similar # format as the features released in AudioSet. Each row of the batch of # embeddings corresponds to roughly a second of audio (96 10ms frames), and # the rows are written as a sequence of bytes-valued features, where each # feature value contains the 128 bytes of the whitened quantized embedding. tf_seq_example = tf.train.SequenceExample( feature_lists=tf.train.FeatureLists( feature_list={ vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: tf.train.FeatureList(feature=[ tf.train.Feature(bytes_list=tf.train.BytesList( value=[embedding.tobytes()])) for embedding in postprocessed_batch ]) })) # print(tf_seq_example) if writer: writer.write(tf_seq_example.SerializeToString()) if writer: writer.close() X = [] max_len = 10 n_frames = len( tf_seq_example.feature_lists.feature_list['audio_embedding'].feature) # print("number of frames = ", n_frames) audio_frame = [] for i in range(n_frames): audio_frame.append( np.frombuffer( tf_seq_example.feature_lists.feature_list['audio_embedding']. feature[i].bytes_list.value[0], np.uint8).astype(np.float32)) pad = [np.zeros([128], np.float32) for i in range(max_len - n_frames)] audio_frame += pad X.append(audio_frame) X = np.array(X) # print("Dimension before adding newaxis", X.shape) # X = X[newaxis,:,:] # print("Dimension after adding newaxis", X.shape) #Loading LSTM model m4 = load_model('src/models/1LayerLSTM__Loss=BinCE_20Epochs_july02.h5') p4 = m4.predict(X) print("Gunshot score for inference_sample: ====> ", float(p4 * 100), "percent confidence") if (p4 >= 0.51): print("Gunshot present in the clip") else: print("Gunshot is not present in the clip")
def main(_):
    # Create folders, if necessary.
    for p in (output_dir, log_dir, log_dir_test, log_dir_train, model_dir):
        create_dir(p)

    # allow_soft_placement gives GPU fallback;
    # log_device_placement=True would display device info.
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True)) as sess:
        now = datetime.datetime.now().isoformat().replace(":", "_")
        fmt = logging.Formatter(
            '%(asctime)s:%(name)s:%(levelname)s:%(message)s', '%Y%m%d-%H%M%S')

        # TF logger.
        tflog = logging.getLogger('tensorflow')
        tflog.setLevel(log_level)
        tflog_fh = logging.FileHandler(
            os.path.join(log_dir,
                         "{}-{}-tf.log".format(FLAGS.model_version, now)))
        tflog_fh.setLevel(log_level)
        tflog_fh.setFormatter(fmt)
        tflog_sh = logging.StreamHandler(sys.stdout)
        tflog_sh.setLevel(log_level)
        tflog_sh.setFormatter(fmt)
        tflog.addHandler(tflog_fh)
        tflog.addHandler(tflog_sh)

        # Root logger.
        log = logging.getLogger()
        log.setLevel(log_level)
        root_fh = logging.FileHandler(
            os.path.join(log_dir,
                         "{}-{}-run.log".format(FLAGS.model_version, now)))
        root_fh.setFormatter(fmt)
        root_fh.setLevel(log_level)
        root_sh = logging.StreamHandler(sys.stdout)
        root_sh.setFormatter(fmt)
        root_sh.setLevel(log_level)
        log.addHandler(root_fh)
        log.addHandler(root_sh)

        start = time.time()
        log.info("Model version: {}".format(FLAGS.model_version))
        log.info("Number of epochs: {}".format(FLAGS.num_batches))
        log.info("Number of classes: {}".format(FLAGS.num_classes))
        log.info("Number of mini batches: {}".format(FLAGS.num_mini_batches))
        log.info("Validation enabled: {}".format(FLAGS.validation))
        log.info("Size of validation set: {}".format(FLAGS.test_size))
        log.info("Saving model after every {}th step".format(FLAGS.save_step))

        run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

        # Define VGGish as our convolutional blocks.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on
        # top of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel
            # logistic classifiers, one per class. This allows for multi-class
            # tasks. Use sigmoid as the activation function.
            logits = slim.fully_connected(fc,
                                          FLAGS.num_classes,
                                          activation_fn=None,
                                          scope='logits')
            tf.sigmoid(logits, name='prediction')
            log.debug("Logits: {}".format(logits))

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(0,
                                          name='global_step',
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              tf.GraphKeys.GLOBAL_STEP
                                          ])

                # Labels are assumed to be fed as a batch of multi-hot vectors,
                # with a 1 in the position of each positive class label, and 0
                # elsewhere, e.g.:
                #   Accipiter_gentilis --> [1, 0, 0]
                #   Cygnus_olor        --> [0, 1, 0]
                #   Regulus_regulus    --> [0, 0, 1]
                labels = tf.placeholder(tf.float32,
                                        shape=(None, FLAGS.num_classes),
                                        name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                               labels=labels,
                                                               name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)

                # We use the same optimizer and hyperparameters as used to
                # train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name='train_op')

        # Add evaluation ops.
        with tf.variable_scope("evaluation"):
            prediction = tf.argmax(logits, 1)
            correct_prediction = tf.equal(tf.argmax(logits, 1),
                                          tf.argmax(labels, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # Create a summarizer that summarizes loss and accuracy.
        # TODO: Fix validation loss summary.
        tf.summary.scalar("Accuracy", accuracy)
        # Add average loss summary over the entire batch.
        tf.summary.scalar("Loss", tf.reduce_mean(xent))
        # Merge all the summaries.
        summary_op = tf.summary.merge_all()

        # TensorBoard writers.
        train_writer = tf.summary.FileWriter(log_dir_train, sess.graph)
        validation_writer = tf.summary.FileWriter(log_dir_test, sess.graph)

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        output_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

        # Load all input with corresponding labels.
        log.info("Loading data set and mapping birds to training IDs...")
        all_examples, all_labels = load_spectrogram(os.path.join(data_dir), log)

        # Create training and test sets.
        X_train_entire, X_validation_entire, y_train_entire, y_validation_entire = \
            sk.train_test_split(all_examples, all_labels,
                                test_size=FLAGS.test_size)

        # The test set stays the same throughout all epochs.
        (X_validation, y_validation) = get_random_batches(
            X_validation_entire, y_validation_entire, log)

        # Start training.
        for step in range(FLAGS.num_batches):
            log.info("######## Epoch {}/{} started ########".format(
                step + 1, FLAGS.num_batches))

            # Shuffle the order of input examples to foster generalization.
            (X_train, y_train) = get_random_batches(X_train_entire,
                                                    y_train_entire, log)

            # Train on n minibatches per epoch. Integer division keeps the
            # minibatch size usable as a slice bound and range step.
            minibatch_n = FLAGS.num_mini_batches
            minibatch_size = len(X_train) // minibatch_n
            if minibatch_size <= 0:
                log.error(
                    "Size of minibatch too small ({}), choose a smaller "
                    "number of minibatches or use more classes!".format(
                        minibatch_size))
                sys.exit(1)

            counter = 1
            for i in range(0, len(X_train), minibatch_size):
                log.info("(Epoch {}/{}) ==> Minibatch {} started ...".format(
                    step + 1, FLAGS.num_batches, counter))

                # Get the (X, y) pair of the current minibatch/chunk.
                X_train_mini = X_train[i:i + minibatch_size]
                y_train_mini = y_train[i:i + minibatch_size]
                log.info("Size of mini batch (features): {}".format(
                    len(X_train_mini)))
                log.info("Size of mini batch (labels): {}".format(
                    len(y_train_mini)))

                # Actual execution of the graph.
                [summary, num_steps, loss, _, train_acc, temp] = sess.run(
                    [summary_op, global_step_tensor, loss_tensor, train_op,
                     accuracy, prediction],
                    feed_dict={
                        features_tensor: X_train_mini,
                        labels_tensor: y_train_mini
                    },
                    options=run_options)
                train_writer.add_summary(summary, step * minibatch_size + i)
                log.info("Loss in minibatch: {} ".format(loss))
                log.info(
                    "Training accuracy in minibatch: {}".format(train_acc))
                log.info(
                    "(Epoch {}/{}) ==> Minibatch {} finished ...\n".format(
                        step + 1, FLAGS.num_batches, counter))
                counter += 1

            # Validation set minibatching. Count the minibatches so the
            # average accuracy is computed correctly.
            minibatch_valid_size = 4
            val_acc_entire = 0.
            num_val_batches = 0
            for j in range(0, len(X_validation), minibatch_valid_size):
                X_validation_mini = X_validation[j:j + minibatch_valid_size]
                y_validation_mini = y_validation[j:j + minibatch_valid_size]
                summary, val_acc, pred, corr_pred = sess.run(
                    [summary_op, accuracy, prediction, correct_prediction],
                    feed_dict={
                        features_tensor: X_validation_mini,
                        labels_tensor: y_validation_mini
                    },
                    options=run_options)
                val_acc_entire += val_acc
                num_val_batches += 1
                validation_writer.add_summary(summary,
                                              step * minibatch_valid_size + j)

            average_val_acc = val_acc_entire / num_val_batches
            log.info("Epoch {} -- Validation Accuracy: {}".format(
                step + 1, average_val_acc))
            log.debug("Correct prediction: {}".format(corr_pred))

            # Save the model to disk.
            saver = tf.train.Saver()
            if step % FLAGS.save_step == 0:
                save_path = saver.save(
                    sess,
                    os.path.join(
                        model_dir,
                        "jibjib_model-{}.ckpt".format(FLAGS.model_version)),
                    global_step=step)
                log.info("Model saved to {}".format(save_path))

        now = datetime.datetime.now().isoformat().replace(":", "_").split(".")[0]
        end = time.time()
        out = "Training finished after {}s".format(end - start)
        log.info(out)
# NOTE: this snippet originally began mid-expression; the enclosing batching
# helper below is reconstructed from the surviving slice, and its name and
# signature are assumptions.
def get_labeled_batch(labeled_data, x, batch_size):
    batch_labeled_data = labeled_data[x * batch_size:min(
        (x + 1) * batch_size, len(labeled_data))]
    features = [example for (example, _) in batch_labeled_data]
    labels = [label for (_, label) in batch_labeled_data]
    return (features, labels)

f2 = h5py.File('val.hdf5', 'r')
val_data, val_label = f2['val_data'], f2['val_label']
val_labeled_data = list(zip(val_data, val_label))

f3 = h5py.File('test.hdf5', 'r')
test_data, test_label = f3['test_data'], f3['test_label']
test_labeled_data = list(zip(test_data, test_label))

# Define VGGish.
embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

# Define a shallow classification model and associated training ops on top
# of VGGish: a fully connected layer with FLAGS.num_units units ...
num_units = FLAGS.num_units
fc = slim.fully_connected(embeddings, num_units)

# ... and a classifier layer at the end, consisting of parallel logistic
# classifiers, one per class. This allows for multi-class tasks.
logits = slim.fully_connected(fc, _NUM_CLASSES, activation_fn=None,
                              scope='logits')
# logits = tf.sigmoid(logits, name='prediction')

# Add training ops.
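# A usage sketch for the helper above, assuming a matching 'train.hdf5' file
# with 'train_data'/'train_label' datasets. Those names mirror the val/test
# files but are assumptions, not confirmed by the original snippet.
import h5py

f1 = h5py.File('train.hdf5', 'r')
train_labeled_data = list(zip(f1['train_data'], f1['train_label']))

batch_size = 32
num_batches = (len(train_labeled_data) + batch_size - 1) // batch_size
for x in range(num_batches):
    features, labels = get_labeled_batch(train_labeled_data, x, batch_size)
    # ... feed (features, labels) into the training ops defined above ...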
def main(unused_argv):
    print("Input file: " + FLAGS.input_video_label)
    print("Output tfrecord file: " + FLAGS.tfrecord_file)
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
    for wav_file, st_time, end_time, label in csv.reader(
            open(FLAGS.input_video_label), delimiter='\t'):
        print(wav_file, st_time, end_time, label)
        if os.path.isfile(wav_file):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)

                # Write the postprocessed embeddings as a SequenceExample, in a
                # similar format as the features released in AudioSet. Each row
                # of the batch of embeddings corresponds to roughly a second of
                # audio (96 10ms frames), and the rows are written as a sequence
                # of bytes-valued features, where each feature value contains
                # the 128 bytes of the whitened quantized embedding.
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            vggish_params.LABELS_FEATURE_KEY:
                                # Parse ';'-separated integer labels, as in the
                                # YouTube-8M extractor; the original
                                # sorted(map(int, label)) iterated over the
                                # *characters* of the label string, which breaks
                                # for any label with more than one digit.
                                _int64_list_feature(
                                    sorted(map(int, label.split(';')))),
                            vggish_params.VIDEO_FILE_KEY_FEATURE_KEY:
                                _bytes_feature(_make_bytes(map(ord, wav_file))),
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in postprocessed_batch
                                ])
                        }))
                if writer:
                    writer.write(seq_example.SerializeToString())
            # The graph is rebuilt for every wav file, so reset it between rows.
            tf.reset_default_graph()
    if writer:
        writer.close()
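# Verification sketch for the writer above: iterate the finished TFRecord and
# recover the context labels plus the per-second embedding bytes. Assumes the
# script was run with a valid FLAGS.tfrecord_file.
import numpy as np

for record in tf.python_io.tf_record_iterator(FLAGS.tfrecord_file):
    seq = tf.train.SequenceExample.FromString(record)
    labels = seq.context.feature[
        vggish_params.LABELS_FEATURE_KEY].int64_list.value
    emb_list = seq.feature_lists.feature_list[
        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
    # Postprocessed embeddings are 128 quantized uint8 values per patch.
    first = np.frombuffer(emb_list[0].bytes_list.value[0], dtype=np.uint8)
    print(list(labels), len(emb_list), first.shape)
    break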
def main(_): with tf.Graph().as_default(), tf.Session() as sess: # Define VGGish. embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish) # Define a shallow classification model and associated training ops on top # of VGGish. with tf.variable_scope("mymodel"): # Add a fully connected layer with 100 units. num_units = 100 fc = slim.fully_connected(embeddings, num_units) # Add a classifier layer at the end, consisting of parallel logistic # classifiers, one per class. This allows for multi-class tasks. logits = slim.fully_connected(fc, _NUM_CLASSES, activation_fn=None, scope="logits") tf.sigmoid(logits, name="prediction") # Add training ops. with tf.variable_scope("train"): global_step = tf.Variable( 0, name="global_step", trainable=False, collections=[ tf.GraphKeys.GLOBAL_VARIABLES, tf.GraphKeys.GLOBAL_STEP, ], ) # Labels are assumed to be fed as a batch multi-hot vectors, with # a 1 in the position of each positive class label, and 0 elsewhere. labels = tf.placeholder(tf.float32, shape=(None, _NUM_CLASSES), name="labels") # Cross-entropy label loss. xent = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels, name="xent") loss = tf.reduce_mean(xent, name="loss_op") tf.summary.scalar("loss", loss) # We use the same optimizer and hyperparameters as used to train VGGish. optimizer = tf.train.AdamOptimizer( learning_rate=vggish_params.LEARNING_RATE, epsilon=vggish_params.ADAM_EPSILON, ) optimizer.minimize(loss, global_step=global_step, name="train_op") # Initialize all variables in the model, and then load the pre-trained # VGGish checkpoint. sess.run(tf.global_variables_initializer()) vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) # Locate all the tensors and ops we need for the training loop. features_tensor = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) labels_tensor = sess.graph.get_tensor_by_name("mymodel/train/labels:0") global_step_tensor = sess.graph.get_tensor_by_name( "mymodel/train/global_step:0") loss_tensor = sess.graph.get_tensor_by_name("mymodel/train/loss_op:0") train_op = sess.graph.get_operation_by_name("mymodel/train/train_op") # The training loop. for _ in range(FLAGS.num_batches): (features, labels) = _get_examples_batch() [num_steps, loss, _] = sess.run( [global_step_tensor, loss_tensor, train_op], feed_dict={ features_tensor: features, labels_tensor: labels }, ) print("Step %d: loss %g" % (num_steps, loss))
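# _get_examples_batch() is external to the training demo above. A minimal
# sketch that returns random data in the shapes the graph expects, purely
# illustrative and not the original helper: VGGish consumes batches of
# [96 frames x 64 mel bands] log-mel patches, and the labels are multi-hot rows.
import numpy as np

def _get_examples_batch(batch_size=32):
    features = np.random.rand(batch_size, vggish_params.NUM_FRAMES,
                              vggish_params.NUM_BANDS).astype(np.float32)
    labels = (np.random.rand(batch_size, _NUM_CLASSES) > 0.5).astype(np.float32)
    return (features, labels)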
# Protocol buffers are not directly JSON-serializable, so convert explicitly.
from google.protobuf.json_format import MessageToJson

def main(_):
    # Run every .wav file found in the input directory through the model and
    # write one JSON file of postprocessed embeddings per audio file.
    wav_file_direc = "./audio_input/"
    embedding_direc = "./json_output/"
    checkpoint = "vggish_model.ckpt"
    pca_params = "vggish_pca_params.npz"
    wav_files = listdir(wav_file_direc)

    # Initialize array of batches; keep the matching file names so the output
    # loop below stays aligned even if the directory contains non-wav files.
    batches = []
    wav_names = []
    for wav_file in wav_files:
        if wav_file.endswith(".wav"):
            print(join(wav_file_direc, wav_file))
            examples_batch = vggish_input.wavfile_to_examples(
                join(wav_file_direc, wav_file))
            batches.append(examples_batch)
            wav_names.append(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        output_sequences = []
        # Create a JSON output file for each audio file
        for batch in batches:
            # Run inference and postprocessing.
            [embedding_batch] = sess.run([embedding_tensor],
                                         feed_dict={features_tensor: batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            seq_example = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(
                                    bytes_list=tf.train.BytesList(
                                        # BytesList requires bytes, not a raw
                                        # numpy row.
                                        value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ])
                    }))
            output_sequences.append(seq_example)

    for name, seq_example in zip(wav_names, output_sequences):
        with open(join(embedding_direc, name[:-3]) + "json", 'w') as outfile:
            outfile.write(MessageToJson(seq_example))
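# Sanity-check sketch: decode one embedding row back out of a written JSON
# file. Proto-JSON encodes bytes fields as base64 and camelCases field names;
# the file name below is an assumed example, not produced by the code above.
import base64
import json
import numpy as np

with open("./json_output/example.json") as f:
    seq = json.load(f)
feats = seq["featureLists"]["featureList"]["audio_embedding"]["feature"]
first = base64.b64decode(feats[0]["bytesList"]["value"][0])
print(np.frombuffer(first, dtype=np.uint8).shape)  # expect (128,)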
def main(unused_argv):
    extractor = feature_extractor.YouTube8MFeatureExtractor(FLAGS.model_dir)
    writer = tf.python_io.TFRecordWriter(FLAGS.output_tfrecords_file)
    total_written = 0
    total_error = 0
    for video_file, labels in csv.reader(open(FLAGS.input_videos_csv)):
        rgb_features = []
        for rgb in frame_iterator(video_file,
                                  every_ms=1000.0 / FLAGS.frames_per_second):
            features = extractor.extract_rgb_frame_features(rgb[:, :, ::-1])
            rgb_features.append(_bytes_feature(quantize(features)))

        if not rgb_features:
            # Python 3 form of the original Python 2 `print >> sys.stderr`.
            print('Could not get features for ' + video_file, file=sys.stderr)
            total_error += 1
            continue

        # Create SequenceExample proto and write to output.
        feature_list = {
            FLAGS.image_feature_key: tf.train.FeatureList(feature=rgb_features),
        }
        if FLAGS.insert_zero_audio_features:
            try:
                wav_file = video_file + '.wav'
                examples_batch = vggish_input.wavfile_to_examples(wav_file)
                pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
                with tf.Graph().as_default(), tf.Session() as sess:
                    # Define the model in inference mode, load the checkpoint,
                    # and locate input and output tensors.
                    vggish_slim.define_vggish_slim(training=False)
                    vggish_slim.load_vggish_slim_checkpoint(
                        sess, 'vggish_model.ckpt')
                    features_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.INPUT_TENSOR_NAME)
                    embedding_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_NAME)
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                    postprocessed_batch = pproc.postprocess(embedding_batch)
                feature_list['audio'] = tf.train.FeatureList(feature=[
                    tf.train.Feature(bytes_list=tf.train.BytesList(
                        value=[embedding.tobytes()]))
                    for embedding in postprocessed_batch
                ])
            except Exception:
                # Avoid a bare `except:`, which would also swallow
                # KeyboardInterrupt. Fall back to all-zero audio features,
                # one per video frame.
                feature_list['audio'] = tf.train.FeatureList(
                    feature=[_bytes_feature(_make_bytes([0] * 128))] *
                    len(rgb_features))

        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    FLAGS.labels_feature_key:
                        _int64_list_feature(sorted(map(int, labels.split(';')))),
                    FLAGS.video_file_key_feature_key:
                        _bytes_feature(_make_bytes(map(ord, video_file))),
                }),
            feature_lists=tf.train.FeatureLists(feature_list=feature_list))
        writer.write(example.SerializeToString())
        total_written += 1

    writer.close()
    print('Successfully encoded %i out of %i videos' %
          (total_written, total_written + total_error))
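# frame_iterator() is an external helper in the snippet above. A plausible
# sketch of what it might look like with OpenCV, assuming frames are sampled
# at most once per `every_ms` milliseconds; this is an assumption, not the
# original implementation.
import sys
import cv2

def frame_iterator(filename, every_ms=1000.0, max_num_frames=300):
    """Yield BGR frames from a video file, at most one per `every_ms` ms."""
    video_capture = cv2.VideoCapture()
    if not video_capture.open(filename):
        print('Error: cannot open video file %s' % filename, file=sys.stderr)
        return
    last_ts = -99999  # timestamp (ms) of the last yielded frame
    num_retrieved = 0
    while num_retrieved < max_num_frames:
        # Skip ahead until at least every_ms has elapsed since the last yield.
        while video_capture.get(cv2.CAP_PROP_POS_MSEC) < every_ms + last_ts:
            if not video_capture.read()[0]:
                return  # end of video
        last_ts = video_capture.get(cv2.CAP_PROP_POS_MSEC)
        has_frame, frame = video_capture.read()
        if not has_frame:
            break
        yield frame
        num_retrieved += 1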
def main(wav_file=None, checkpoint='audioset/vggish_model.ckpt', pca_params='audioset/vggish_pca_params.npz', tfrecord_file=None): # In this simple example, we run the examples from a single audio file through # the model. If none is provided, we generate a synthetic input. if not wav_file: # Write a WAV of a sine wav into an in-memory file object. num_secs = 5 freq = 1000 sr = 44100 t = np.linspace(0, num_secs, int(num_secs * sr)) x = np.sin(2 * np.pi * freq * t) # Convert to signed 16-bit samples. samples = np.clip(x * 32768, -32768, 32767).astype(np.int16) wav_file = six.BytesIO() wavfile.write(wav_file, sr, samples) wav_file.seek(0) examples_batch = vggish_input.wavfile_to_examples(wav_file) # print(examples_batch.shape) # Prepare a postprocessor to munge the model embeddings. pproc = vggish_postprocess.Postprocessor(pca_params) # If needed, prepare a record writer to store the postprocessed embeddings. # writer = tf.python_io.TFRecordWriter( # tfrecord_file) if tfrecord_file else None # with tf.Graph().as_default(), tf.Session() as sess: with tf.Graph().as_default(): # config = tf.ConfigProto() # restrict tensorflow memory usage config = tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2),\ allow_soft_placement=True) config.gpu_options.allow_growth = True sess = tf.Session(config=config) # Define the model in inference mode, load the checkpoint, and # locate input and output tensors. vggish_slim.define_vggish_slim(training=False) vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint) features_tensor = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) embedding_tensor = sess.graph.get_tensor_by_name( vggish_params.OUTPUT_TENSOR_NAME) # Run inference and postprocessing. [embedding_batch ] = sess.run([embedding_tensor], feed_dict={features_tensor: examples_batch}) # print(embedding_batch.shape) sess.close() postprocessed_batch = pproc.postprocess(embedding_batch) # print(postprocessed_batch.shape) # Write the postprocessed embeddings as a SequenceExample, in a similar # format as the features released in AudioSet. Each row of the batch of # embeddings corresponds to roughly a second of audio (96 10ms frames), and # the rows are written as a sequence of bytes-valued features, where each # feature value contains the 128 bytes of the whitened quantized embedding. # seq_example = tf.train.SequenceExample( # feature_lists=tf.train.FeatureLists( # feature_list={ # vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: # tf.train.FeatureList( # feature=[ # tf.train.Feature( # bytes_list=tf.train.BytesList( # value=[embedding.tobytes()])) # for embedding in postprocessed_batch # ] # ) # } # ) # ) # print(seq_example) # if writer: # writer.write(seq_example.SerializeToString()) # if writer: # writer.close() tf.reset_default_graph() return embedding_batch, postprocessed_batch
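# Usage sketch for main() above: with no wav_file argument it embeds the
# synthetic 1 kHz sine and returns both the raw and the PCA-whitened,
# quantized embeddings, one 128-D row per ~0.96 s patch. The exact shapes
# below are illustrative for the 5-second synthetic input.
raw, post = main()
print(raw.shape, raw.dtype)    # e.g. (5, 128) float32
print(post.shape, post.dtype)  # e.g. (5, 128) uint8 after postprocessing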
def main(_):
    (train_addrs, train_labels, val_addrs, val_labels, test_addrs,
     test_labels) = utils.adressLabelSort('sortedTestAudio2')
    addr = train_addrs
    embedding_labels = train_labels
    print('number of addr: ', len(addr))
    print('number of labels: ', len(embedding_labels))
    (examples_batch, embedding_labels) = utils._get_batch(addr, embedding_labels)
    tfrecords_filename = 'Evalval1.tfrecords'
    writer = tf.python_io.TFRecordWriter(tfrecords_filename)

    # Restrict memory usage; TensorFlow is greedy and will use all GPU memory
    # otherwise.
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    config.gpu_options.per_process_gpu_memory_fraction = 0.90

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        # Define the VGGish TensorFlow model and load a pre-trained
        # VGGish-compatible checkpoint.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

        # Locate input and output tensors.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        feed_dict = {features_tensor: examples_batch}
        [embedding_batch] = sess.run([embedding_tensor], feed_dict=feed_dict)
        print('example_batch shape: ', examples_batch.shape)
        print('embedding_batch shape: ', embedding_batch.shape)
        print('labels_batch shape: ', len(embedding_labels))

        # Store the data to the TFRecords file.
        for i in range(len(embedding_batch)):
            embedding = embedding_batch[i]
            # The class label for this embedding; the original named this
            # `embedding_length`, which it is not.
            embedding_label = embedding_labels[i]
            embedding_raw = embedding.tobytes()  # tostring() is a deprecated alias
            # Create a feature
            feature = {
                'Evalval1/labels': utils._int64_feature(embedding_label),
                'Evalval1/embedding': utils._bytes_feature(embedding_raw)
            }
            # Create an example protocol buffer
            example = tf.train.Example(
                features=tf.train.Features(feature=feature))
            # Serialize to string and write to the file
            writer.write(example.SerializeToString())

    writer.close()
    sys.stdout.flush()
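# Read-back sketch for the TFRecord written above: recover the float32
# embedding rows and their integer labels, assuming the same 'Evalval1/...'
# feature keys.
import numpy as np

for record in tf.python_io.tf_record_iterator('Evalval1.tfrecords'):
    example = tf.train.Example.FromString(record)
    label = example.features.feature['Evalval1/labels'].int64_list.value[0]
    raw = example.features.feature['Evalval1/embedding'].bytes_list.value[0]
    embedding = np.frombuffer(raw, dtype=np.float32)  # 128-D VGGish row
    print(label, embedding.shape)
    break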
def define_vggish(waveform): with tf.variable_creator_scope(var_tracker): features = waveform_to_features(waveform) return vggish_slim.define_vggish_slim(features, training=False)
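# var_tracker is undefined in the fragment above. A plausible sketch: a
# tf.variable_creator_scope hook that records every variable VGGish creates,
# e.g. for later checkpoint mapping. The list name and behavior are
# assumptions, not the original helper.
tracked_variables = []

def var_tracker(next_creator, **kwargs):
    variable = next_creator(**kwargs)
    tracked_variables.append(variable)
    return variable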
def main(_): with tf.Graph().as_default(), tf.Session() as sess: # Define VGGish. embeddings = vggish_slim.define_vggish_slim( training=FLAGS.train_vggish) # Define a shallow classification model and associated training ops on top # of VGGish. with tf.variable_scope('mymodel'): # Add a fully connected layer with 100 units. Add an activation function # to the embeddings since they are pre-activation. num_units = 100 fc = slim.fully_connected(tf.nn.relu(embeddings), num_units) # Add a classifier layer at the end, consisting of parallel logistic # classifiers, one per class. This allows for multi-class tasks. logits = slim.fully_connected(fc, _NUM_CLASSES, activation_fn=None, scope='logits') tf.sigmoid(logits, name='prediction') # Add training ops. with tf.variable_scope('train'): global_step = tf.train.create_global_step() # Labels are assumed to be fed as a batch multi-hot vectors, with # a 1 in the position of each positive class label, and 0 elsewhere. labels_input = tf.placeholder(tf.float32, shape=(None, _NUM_CLASSES), name='labels') # Cross-entropy label loss. xent = tf.nn.sigmoid_cross_entropy_with_logits( logits=logits, labels=labels_input, name='xent') loss = tf.reduce_mean(xent, name='loss_op') tf.summary.scalar('loss', loss) # We use the same optimizer and hyperparameters as used to train VGGish. optimizer = tf.train.AdamOptimizer( learning_rate=vggish_params.LEARNING_RATE, epsilon=vggish_params.ADAM_EPSILON) train_op = optimizer.minimize(loss, global_step=global_step) # Initialize all variables in the model, and then load the pre-trained # VGGish checkpoint. sess.run(tf.global_variables_initializer()) vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) # The training loop. features_input = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) for _ in range(FLAGS.num_batches): (features, labels) = _get_examples_batch() [num_steps, loss_value, _] = sess.run([global_step, loss, train_op], feed_dict={ features_input: features, labels_input: labels }) print('Step %d: loss %g' % (num_steps, loss_value))
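# The training loop above never persists the fine-tuned weights. A minimal
# saving sketch, to run inside the same session after the loop finishes; the
# checkpoint directory is an assumed example path.
saver = tf.train.Saver()
save_path = saver.save(sess, './vggish_finetuned/model.ckpt',
                       global_step=global_step)
print('Fine-tuned model written to %s' % save_path)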
def main(_): with tf.Graph().as_default(), tf.Session() as sess: # Define VGGish. embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish) # Define a shallow classification model and associated training ops on top # of VGGish. with tf.variable_scope('mymodel'): # Add a fully connected layer with 100 units. num_units = 100 fc = slim.fully_connected(embeddings, num_units) # Add a classifier layer at the end, consisting of parallel logistic # classifiers, one per class. This allows for multi-class tasks. logits = slim.fully_connected( fc, _NUM_CLASSES, activation_fn=None, scope='logits') tf.sigmoid(logits, name='prediction') # Add training ops. with tf.variable_scope('train'): global_step = tf.Variable( 0, name='global_step', trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES, tf.GraphKeys.GLOBAL_STEP]) # Labels are assumed to be fed as a batch multi-hot vectors, with # a 1 in the position of each positive class label, and 0 elsewhere. labels = tf.placeholder( tf.float32, shape=(None, _NUM_CLASSES), name='labels') # Cross-entropy label loss. xent = tf.nn.sigmoid_cross_entropy_with_logits( logits=logits, labels=labels, name='xent') loss = tf.reduce_mean(xent, name='loss_op') tf.summary.scalar('loss', loss) # We use the same optimizer and hyperparameters as used to train VGGish. optimizer = tf.train.AdamOptimizer( learning_rate=vggish_params.LEARNING_RATE, epsilon=vggish_params.ADAM_EPSILON) optimizer.minimize(loss, global_step=global_step, name='train_op') # Initialize all variables in the model, and then load the pre-trained # VGGish checkpoint. sess.run(tf.global_variables_initializer()) vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) # Locate all the tensors and ops we need for the training loop. features_tensor = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0') global_step_tensor = sess.graph.get_tensor_by_name( 'mymodel/train/global_step:0') loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0') train_op = sess.graph.get_operation_by_name('mymodel/train/train_op') # The training loop. for _ in range(FLAGS.num_batches): (features, labels) = _get_examples_batch() [num_steps, loss, _] = sess.run( [global_step_tensor, loss_tensor, train_op], feed_dict={features_tensor: features, labels_tensor: labels}) print('Step %d: loss %g' % (num_steps, loss))
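# Inference sketch against the graph defined above: the sigmoid was named
# 'prediction' inside the 'mymodel' scope, so per-class probabilities can be
# fetched after training, still inside the same session. _get_examples_batch()
# stands in for whatever batch source the script uses.
prediction_tensor = sess.graph.get_tensor_by_name('mymodel/prediction:0')
(features, _) = _get_examples_batch()
probs = sess.run(prediction_tensor, feed_dict={features_tensor: features})
print('batch probabilities shape:', probs.shape)  # (batch_size, _NUM_CLASSES)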