def get_frame_input_feature(input_file):
    features = []
    record_iterator = tf.python_io.tf_record_iterator(path=input_file)
    for i, string_record in enumerate(record_iterator):
        example = tf.train.SequenceExample()
        example.ParseFromString(string_record)

        # Traverse the SequenceExample format to get the context and frame data.
        video_id = example.context.feature['video_id'].bytes_list.value[0]
        label = example.context.feature['labels'].int64_list.value[:]

        rgbs = []
        audios = []
        rgb_feature = example.feature_lists.feature_list['rgb'].feature
        for j in range(len(rgb_feature)):
            rgb = np.fromstring(rgb_feature[j].bytes_list.value[0],
                                dtype=np.uint8).astype(np.float32)
            rgb = utils.Dequantize(rgb, 2, -2)
            rgbs.append(rgb)

        audio_feature = example.feature_lists.feature_list['audio'].feature
        for j in range(len(audio_feature)):
            audio = np.fromstring(audio_feature[j].bytes_list.value[0],
                                  dtype=np.uint8).astype(np.float32)
            audio = utils.Dequantize(audio, 2, -2)
            audios.append(audio)

        rgbs = np.array(rgbs)
        audios = np.array(audios)
        features.append((video_id, label, rgbs, audios))
    return features
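For reference, the `utils.Dequantize` helper used throughout these snippets is not shown; the sketch below follows the YouTube-8M starter-code style of dequantization (assumed, not the verbatim project code) and works on both NumPy arrays and TF tensors.

def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
    """Sketch (assumption): maps 0..255 quantized values back into the
    [min_quantized_value, max_quantized_value] float range."""
    assert max_quantized_value > min_quantized_value
    quantized_range = max_quantized_value - min_quantized_value
    scalar = quantized_range / 255.0
    bias = (quantized_range / 512.0) + min_quantized_value
    return feat_vector * scalar + bias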
def get_video_matrix(self, features, feature_size, max_frames,
                     max_quantized_value, min_quantized_value):
    """Decodes features from an input string and dequantizes them.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
      (when self.prepare_distill is set, the raw resized uint8 matrix is also
       returned)
    """
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    feature_matrix = resize_axis(feature_matrix, 0, max_frames)

    if self.prepare_distill:
        # Keep an un-dequantized copy of the features for distillation.
        def_feature_matrix = tf.reshape(tf.decode_raw(features, tf.uint8),
                                        [-1, feature_size])
        def_feature_matrix = resize_axis(def_feature_matrix, 0, max_frames)
        return feature_matrix, num_frames, def_feature_matrix

    return feature_matrix, num_frames
def do_pca(self, input, max_quantized_value=2.0, min_quantized_value=-2.0):
    reduce_dim = 1024

    # Load the PCA parameters (mean, components, singular values) learned offline.
    load_file = open("model_pca_tag_category_100w.pickle", "rb")
    mean_block3 = pickle.load(load_file)
    component_block3 = pickle.load(load_file)
    component_block3 = component_block3[:, 0:reduce_dim]
    singular_values_ = pickle.load(load_file)

    singular_block3 = tf.constant(singular_values_, dtype=tf.float32,
                                  name='pac_singular_block3')
    mean_block3 = tf.constant(mean_block3, dtype=tf.float32,
                              name='pac_mean_block3')
    component_block3 = tf.constant(component_block3, dtype=tf.float32,
                                   name='pac_component_block3')

    # Project onto the PCA basis and whiten by the singular values.
    res_fea_pca = tf.matmul(input - mean_block3, component_block3) / tf.sqrt(
        singular_block3[0:reduce_dim] + 1e-4)

    # Quantize and immediately dequantize to mimic the precision loss of the
    # uint8 storage format.
    res_fea = utils.quantize(res_fea_pca,
                             max_quantized_value=max_quantized_value,
                             min_quantized_value=min_quantized_value)
    res_fea = utils.Dequantize(res_fea,
                               max_quantized_value=max_quantized_value,
                               min_quantized_value=min_quantized_value)
    # res_fea_pca = tf.reshape(res_fea_pca, [-1, frams, reduce_dim])
    # res_fea = tf.reshape(res_fea_pca, tf.shape(res_fea_pca))
    return res_fea
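The `utils.quantize` call above is the inverse mapping of Dequantize; its definition is not included in these snippets. A hypothetical TensorFlow sketch, consistent with the Dequantize helper (names and rounding behavior are assumptions):

def quantize(features, max_quantized_value=2.0, min_quantized_value=-2.0):
    """Hypothetical inverse of Dequantize: clip to the float range and map it
    onto 0..255 (assumed behavior, not the project's actual helper)."""
    assert max_quantized_value > min_quantized_value
    quantized_range = max_quantized_value - min_quantized_value
    features = tf.clip_by_value(features, min_quantized_value, max_quantized_value)
    return tf.round((features - min_quantized_value) * (255.0 / quantized_range))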
def get_video_matrix(self, features, feature_size, max_frames,
                     max_quantized_value, min_quantized_value):
    """Decodes features from an input string and dequantizes them.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)

    if feature_size == 1024:
        # Invert the PCA + whitening applied to the 1024-d RGB features.
        feature_matrix = feature_matrix * tf.transpose(
            tf.sqrt(self.pca_eigenvals + 1e-4))
        feature_matrix = tf.reduce_sum(
            tf.multiply(tf.expand_dims(feature_matrix, 1), self.pca_eigenvecs), 2)
        feature_matrix += np.transpose(self.pca_mean)

    feature_matrix = resize_axis(feature_matrix, 0, max_frames)
    return feature_matrix, num_frames
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file,
                      train_dir, output_file):
    print("Input file: " + input_wav_file)
    if os.path.isfile(input_wav_file):
        examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
        # print(examples_batch)
        pproc = vggish_postprocess.Postprocessor(pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            # print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # print(postprocessed_batch)

            # Pad the dequantized embeddings into a fixed 300-frame batch.
            num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                            dtype=np.int32)
            video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
            video_batch_val[0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
                postprocessed_batch.astype(float), 2, -2)

            predicted_class = inference(video_batch_val, num_frames_batch_val,
                                        checkpoint_file, train_dir, output_file)

        # Reset the default graph so repeated calls start from a clean state.
        tf.reset_default_graph()
        return predicted_class
def get_video_matrix(self, features, feature_size, max_frames,
                     max_quantized_value, min_quantized_value):
    """Decodes features from an input string and dequantizes them.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    interval = FLAGS.crop_interval
    if FLAGS.crop:
        # Pick a random start offset in [0, interval) and keep every
        # `interval`-th frame from there.
        ind = tf.multinomial(tf.log([[1.] * interval]), 1)[0, 0]
        length_local = tf.shape(decoded_features, out_type=tf.int64)[0]
        start_idx = tf.minimum(ind, length_local - 1)
        index = tf.range(start_idx, length_local, interval)
        decoded_features = tf.reshape(tf.gather(decoded_features, index),
                                      [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    feature_matrix = resize_axis(feature_matrix, 0, max_frames)
    return feature_matrix, num_frames
def build_graph():
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]
    max_quantized_value = 2
    min_quantized_value = -2

    seq_example_bytes = tf.placeholder(tf.string)
    contexts, features = tf.parse_single_sequence_example(
        seq_example_bytes,
        context_features={
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64)
        },
        sequence_features={
            feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
            for feature_name in feature_names
        })

    decoded_features = {
        name: tf.reshape(
            tf.cast(tf.decode_raw(features[name], tf.uint8), tf.float32),
            [-1, size])
        for name, size in zip(feature_names, feature_sizes)
    }
    feature_matrices = {
        name: utils.Dequantize(decoded_features[name],
                               max_quantized_value, min_quantized_value)
        for name in feature_names
    }

    # Expose the tensors via named collections so a separate session can fetch them.
    tf.add_to_collection("vid_tsr", contexts['video_id'])
    tf.add_to_collection("labs_tsr", contexts['labels'].values)
    tf.add_to_collection("rgb_tsr", feature_matrices['rgb'])
    tf.add_to_collection("audio_tsr", feature_matrices['audio'])
    tf.add_to_collection("seq_example_bytes", seq_example_bytes)
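A minimal usage sketch for build_graph(), assuming `input_file` points at a frame-level YouTube-8M TFRecord: fetch the collection tensors and feed one serialized SequenceExample at a time through the placeholder.

with tf.Graph().as_default():
    build_graph()
    # Retrieve the tensors that build_graph() registered in collections.
    seq_example_bytes = tf.get_collection("seq_example_bytes")[0]
    rgb_tsr = tf.get_collection("rgb_tsr")[0]
    audio_tsr = tf.get_collection("audio_tsr")[0]
    with tf.Session() as sess:
        for serialized in tf.python_io.tf_record_iterator(path=input_file):
            rgb, audio = sess.run([rgb_tsr, audio_tsr],
                                  feed_dict={seq_example_bytes: serialized})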
def get_video_matrix(self, features, feature_size, max_frames,
                     max_quantized_value, min_quantized_value):
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    feature_matrix = resize_axis(feature_matrix, 0, max_frames)
    return feature_matrix, num_frames
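The `resize_axis` helper these get_video_matrix variants rely on pads or truncates a tensor along one axis so every video yields exactly `max_frames` rows. A sketch along the lines of the YouTube-8M starter code (assumed, not verbatim):

def resize_axis(tensor, axis, new_size, fill_value=0):
    """Truncates or pads `tensor` to `new_size` along `axis` (sketch)."""
    tensor = tf.convert_to_tensor(tensor)
    shape = tf.unstack(tf.shape(tensor))

    # How much padding is needed (zero if the tensor is already long enough).
    pad_shape = shape[:]
    pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
    shape[axis] = tf.minimum(shape[axis], new_size)
    shape = tf.stack(shape)

    resized = tf.concat([
        tf.slice(tensor, tf.zeros_like(shape), shape),
        tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
    ], axis)

    # Set the static shape so downstream code sees a fixed frame count.
    new_shape = tensor.get_shape().as_list()
    new_shape[axis] = new_size
    resized.set_shape(new_shape)
    return resized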
def prepare_reader(self, filename_queue, max_quantized_value=2, min_quantized_value=-2):
    """Creates a single reader thread for YouTube8M SequenceExamples.

    Args:
      filename_queue: A tensorflow queue of filename locations.
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      A tuple of video indexes, video features, labels, and padding data.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    contexts, features = tf.parse_single_sequence_example(
        serialized_example,
        context_features={"video_id": tf.FixedLenFeature([], tf.string),
                          "labels": tf.VarLenFeature(tf.int64)},
        sequence_features={
            "rgb": tf.FixedLenSequenceFeature([], dtype=tf.string),
            "audio": tf.FixedLenSequenceFeature([], dtype=tf.string)
        })

    # Read ground-truth labels as a dense multi-hot vector.
    labels = tf.cast(
        tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1,
                           validate_indices=False), tf.int32)

    rgb = tf.reshape(
        tf.cast(tf.decode_raw(features["rgb"], tf.uint8), tf.float32),
        [-1, FLAGS.rgb_size])
    audio = tf.reshape(
        tf.cast(tf.decode_raw(features["audio"], tf.uint8), tf.float32),
        [-1, FLAGS.audio_size])

    num_frames = tf.minimum(tf.shape(rgb)[0], self.max_frames)
    # Note: tf.assert_equal only takes effect if wired in as a control dependency.
    tf.assert_equal(tf.shape(rgb)[0], tf.shape(audio)[0])

    rgb = resize_axis(
        utils.Dequantize(rgb, max_quantized_value, min_quantized_value),
        0, self.max_frames)
    audio = resize_axis(
        utils.Dequantize(audio, max_quantized_value, min_quantized_value),
        0, self.max_frames)

    return contexts["video_id"], labels, rgb, audio, num_frames
def main(unused_argv):
    print("Input file: " + FLAGS.input_video_label)
    for wav_file, st_time, end_time, label in csv.reader(
            open(FLAGS.input_video_label), delimiter='\t'):
        print(wav_file, st_time, end_time, label)
        if os.path.isfile(wav_file):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            # print(examples_batch)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                # print(embedding_batch)
                postprocessed_batch = pproc.postprocess(embedding_batch)
                # print(postprocessed_batch)

                num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                                dtype=np.int32)
                video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
                video_batch_val[0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
                    postprocessed_batch.astype(float), 2, -2)

                inference(video_batch_val, num_frames_batch_val,
                          FLAGS.checkpoint_file, FLAGS.train_dir, FLAGS.output_file)

            tf.reset_default_graph()
def frame_example_2_np(seq_example_bytes,
                       max_quantized_value=2,
                       min_quantized_value=-2):
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]

    with tf.Graph().as_default():
        contexts, features = tf.parse_single_sequence_example(
            seq_example_bytes,
            context_features={
                "video_id": tf.FixedLenFeature([], tf.string),
                "labels": tf.VarLenFeature(tf.int64)
            },
            sequence_features={
                feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
                for feature_name in feature_names
            })

        decoded_features = {
            name: tf.reshape(
                tf.cast(tf.decode_raw(features[name], tf.uint8), tf.float32),
                [-1, size])
            for name, size in zip(feature_names, feature_sizes)
        }
        feature_matrices = {
            name: utils.Dequantize(decoded_features[name],
                                   max_quantized_value, min_quantized_value)
            for name in feature_names
        }

        with tf.Session() as sess:
            # Fetch all four values in a single run for efficiency.
            vid, labs, rgb, audio = sess.run(
                [contexts['video_id'], contexts['labels'].values,
                 feature_matrices['rgb'], feature_matrices['audio']])

    return vid, labs, rgb, audio
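A minimal usage sketch, assuming `input_file` is a frame-level TFRecord path: iterate over the serialized SequenceExamples and decode them one at a time.

for serialized in tf.python_io.tf_record_iterator(path=input_file):
    vid, labs, rgb, audio = frame_example_2_np(serialized)
    # rgb is (num_frames, 1024) and audio is (num_frames, 128) after dequantization.
    print(vid, labs, rgb.shape, audio.shape)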
def _process_videos(thread_index, ranges, name, videos, num_shards):
    """Processes and saves a subset of video metadata as TFRecord files in one thread.

    Each thread produces N shards where N = num_shards / num_threads. For
    instance, if num_shards = 128, and num_threads = 2, then the first thread
    would produce shards [0, 64).

    Args:
      thread_index: Integer thread identifier within [0, len(ranges)].
      ranges: A list of pairs of integers specifying the ranges of the dataset
        to process in parallel.
      name: Unique identifier specifying the dataset.
      videos: List of VideoMetadata.
      num_shards: Integer number of shards for the output files.
    """
    for i in range(len(videos)):
        vid = videos[i]
        filename_queue = tf.train.string_input_producer([vid], num_epochs=1,
                                                         shuffle=True)
        reader = tf.TFRecordReader()
        _, serialized_examples = reader.read(filename_queue)

        context_features = {
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64)
        }
        sequence_features = {
            "rgb": tf.FixedLenSequenceFeature([], dtype=tf.string),
            "audio": tf.FixedLenSequenceFeature([], dtype=tf.string)
        }
        contexts, features = tf.parse_single_sequence_example(
            serialized_examples,
            context_features=context_features,
            sequence_features=sequence_features)

        output_file = os.path.join(FLAGS.output_dir, vid.split('/')[-1])
        writer = tf.python_io.TFRecordWriter(output_file)

        # Build the decode op once, outside the read loop.
        decoded_rgb = tf.reshape(
            tf.cast(tf.decode_raw(features['rgb'], tf.uint8), tf.float32),
            shape=[-1, 1024])

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            vis_dic = tf.contrib.learn.KMeansClustering(
                num_clusters=256, relative_tolerance=0.0001,
                model_dir='/data1/yj/kmeans/')

            counter = 0
            while not coord.should_stop():
                # Fetch everything in one run so the context and frame data come
                # from the same record (separate sess.run calls would each
                # dequeue a new example).
                feat, cont, temp = sess.run([features, contexts, decoded_rgb])

                # Dequantize, VLAD-encode against the visual dictionary, and re-serialize.
                vlad = VLAD_tf(utils.Dequantize(temp), vis_dic)
                sequence_example = _to_sequence_example(cont, vlad)
                if sequence_example is not None:
                    counter += 1
                    writer.write(sequence_example.SerializeToString())

            writer.close()
            print("%s [thread %d]: Wrote %d %s working data to %s." %
                  (datetime.now(), thread_index, counter, FLAGS.type, output_file))
            sys.stdout.flush()

    print("%s [thread %d]: Wrote %d %s working data to %d shards." %
          (datetime.now(), thread_index, counter, FLAGS.type, num_shards))
    sys.stdout.flush()
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a logistic classifier over the average of the
    frame-level features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    # ----- Original logistic model (kept for reference) -----
    # logging.info("model_input_shape: %s.", str(model_input))
    # # (1, 300, 1024): padded to 300 frames even if the true num_frames is not 300.
    # # If audio information is used, the vector becomes (?, 300, 1152), since 1152 = 1024 + 128.
    # num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    # feature_size = model_input.get_shape().as_list()[2]
    # denominators = tf.reshape(
    #     tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    # logging.info("denominators: %s.", str(denominators))
    # # (1, 1024): average-pooled frame features.
    # avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
    # output = slim.fully_connected(
    #     avg_pooled, vocab_size, activation_fn=tf.nn.sigmoid,
    #     weights_regularizer=slim.l2_regularizer(1e-8))
    # return {"predictions": output}
    # ---------------------------------------------------------

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    extrac_frames = 100
    model_input = utils.SampleFramesOrdered(model_input, num_frames, extrac_frames)
    # model_input = tf.expand_dims(model_input, -1)
    logging.info("model_input_after_shape: %s.", str(model_input))

    # batch_size x extrac_frames x feature_size
    filters = [16, 64, 256, 1024, 4096]

    # Dequantize before the temporal convolution.
    model_input = tools.Dequantize(model_input)

    x = self._conv('conv1', model_input, time_stride=30, in_filters=1,
                   out_filters=400, feature_size=feature_size,
                   strides=[1, 10, 1, 1], padding='VALID')
    logging.info("after_conv1: %s.", str(x))  # 8
    bias = tf.get_variable('bias1', [400], tf.float32,
                           initializer=tf.zeros_initializer())
    x = self._relu(x + bias, 0.0)
    x = tf.nn.max_pool(x, ksize=[1, 8, 1, 1], strides=[1, 8, 1, 1],
                       padding='VALID', name="max1")  # 42
    # logging.info("x_after_maxpool1: %s.", str(x))

    # x = self._conv('conv2', x, time_stride=3, in_filters=filters[0],
    #                out_filters=filters[2], feature_size=1,
    #                strides=[1, 1, 1, 1], padding='SAME')
    # bias = tf.get_variable('bias2', [filters[2]], tf.float32,
    #                        initializer=tf.zeros_initializer())
    # x = self._relu(x + bias, 0.0)
    # x = tf.nn.max_pool(x, ksize=[1, 41, 1, 1], strides=[1, 41, 1, 1],
    #                    padding='VALID', name="max2")  # 21
    # x = self.group_conv(name='group', x=x, time_stride=21,
    #                     in_filters=filters[1], out_filters=filters[2],
    #                     strides=[1, 1, 1, 1])
    # x = tf.nn.relu6(x, name='relu6')

    x = tf.contrib.layers.flatten(x)
    # x = tf.nn.dropout(x, keep_prob=0.5)
    logging.info("output: %s.", x)

    # hidden = slim.fully_connected(
    #     x, 8196, activation_fn=None,
    #     weights_regularizer=slim.l2_regularizer(1e-8))
    # drop = tf.nn.dropout(hidden, keep_prob=0.5)
    # hidden = tf.nn.relu(hidden, 'relu6')

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    # logging.info("DBoF_activitions:%s", str(activation))  # leftover from the DBoF variant; `activation` is not defined here
    return aggregated_model().create_model(model_input=x,
                                           vocab_size=vocab_size,
                                           **unused_params)
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    # Dequantize the uint8-range features back to floats before clustering.
    logging.info("preprocessed input:%s", str(reshaped_input))
    reshaped_input = tools.Dequantize(reshaped_input)
    logging.info("dequantized input:%s", str(reshaped_input))
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(reshaped_input,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="input_bn")

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="cluster_bn")
    else:
        cluster_biases = tf.get_variable(
            "cluster_biases", [cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    # Dropout (note: applied unconditionally, even at eval time).
    activation = tf.nn.dropout(activation, 0.5)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    logging.info("DBoF activations:%s", str(activation))
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)
vid_batch = tf.train.batch_join([[decoded_rgb]],
                                batch_size=batch_size,
                                capacity=batch_size * 2,
                                dynamic_pad=True)

vis_dic = tf.contrib.learn.KMeansClustering(num_clusters=256,
                                            relative_tolerance=0.0001,
                                            model_dir='/data1/yj/kmeans/')

# Sample every 20th audio frame from each record, dequantize, and dump the
# accumulated features for offline clustering once enough have been collected.
while not coord.should_stop():
    # print(sess.run(features))
    # print(type(sess.run(features)))
    # print(len(sess.run(features)))
    # print(sess.run(features).keys())
    # print(type(sess.run(features)['rgb']))
    # drgb = utils.Dequantize(sess.run(decoded_rgb))
    # pudb.set_trace()
    # rgb_VLAD = VLAD_tf(drgb, vis_dic)
    drgb = sess.run(decoded_audio)
    drgb = drgb[::20]
    drgb = utils.Dequantize(drgb)
    rgbs.append(drgb)
    if len(rgbs) % 10000 == 0:
        print('doing...')
    if len(rgbs) > 80000:
        rgb_stack = np.concatenate(rgbs, axis=0)
        pkl.dump(rgb_stack, open(cluster_dir, 'wb'))
        break