# Standard-library and third-party imports required by this module.
import csv
import logging
import os
import time

import numpy as np
import tensorflow as tf

# Project-specific helpers (SourceFactory, DataPreprocessing,
# NormalizationMode, NNDiscriminator, SequenceToSequenceGenerator,
# Skeleton2D, nn) are assumed to be importable from the surrounding package;
# their exact module paths are not shown here.


class SequenceBodyReader(object):
    '''
    Prepare batches of body sequences, one sequence per clip.
    '''

    def __init__(self,
                 map_file,
                 sequence_length,
                 dataset,
                 skip_frame=0,
                 data_preprocessing=None,
                 random_sequence=False,
                 label_count=None,
                 in_memory=True,
                 camera_data_file=None,
                 is_training=True,
                 seed=None):
        '''
        Initialize a SequenceBodyReader, which returns batches of Body
        sequences packed in a numpy array.

        Args:
            map_file(str): path to the CSV file that contains the list of
                clips.
            sequence_length(int): the number of frames in the sequence.
            dataset(str): the name of the dataset.
            skip_frame(int): how many frames to skip.
            data_preprocessing(DataPreprocessing): responsible for
                normalizing the input data.
            random_sequence(bool): pick a random sequence from the clip.
            label_count(optional, int): labels are assumed to range from 0 to
                label_count - 1; None means no labels are provided.
            in_memory(bool): whether to load the entire dataset in memory.
            camera_data_file(str): contains camera calibration data.
            is_training(bool): True means the input is shuffled.
            seed(int): seed for the random number generator; can be None.
        '''
        self._source = SourceFactory(dataset, camera_data_file)
        self._map_file = map_file
        self._label_count = label_count
        self._sequence_length = sequence_length
        self._data_preprocessing = data_preprocessing
        self._files = []
        self._targets = []
        self._batch_start = 0
        self._skip_frame = skip_frame
        self._random_sequence = random_sequence
        self._in_memory = in_memory
        self._sensor = self._source.create_sensor()
        self._body = self._source.create_body()
        self._feature_shape = (self._body.joint_count, 3)

        assert self._skip_frame >= 0

        with open(map_file) as csv_file:
            data = csv.reader(csv_file)
            for row in data:
                # Each row holds: file path, activity id, subject id.
                filename_or_object = self._source.create_file_reader(row[0]) \
                    if self._in_memory else row[0]
                self._files.append(
                    [filename_or_object, int(row[1]), int(row[2])])
                if (self._label_count is not None) and (len(row) > 1):
                    # One-hot encode the activity label.
                    target = [0.0] * self._label_count
                    target[int(row[1])] = 1.0
                    self._targets.append(target)

        self._indices = np.arange(len(self._files))
        if is_training:
            if seed is not None:
                np.random.seed(seed)
            np.random.shuffle(self._indices)

    def size(self):
        return len(self._files)

    @property
    def element_shape(self):
        return self._feature_shape

    def has_more(self):
        return self._batch_start < self.size()

    def reset(self):
        self._batch_start = 0

    def next_minibatch(self, batch_size):
        '''
        Return a mini-batch of sequences and their ground truth.

        Args:
            batch_size(int): mini-batch size.
        '''
        batch_end = min(self._batch_start + batch_size, self.size())
        current_batch_size = batch_end - self._batch_start
        if current_batch_size < 0:
            raise Exception('Reached the end of the training data.')

        inputs = np.empty(
            shape=(current_batch_size, self._sequence_length) +
            self._feature_shape,
            dtype=np.float32)
        activities = np.zeros(shape=(current_batch_size), dtype=np.int32)
        subjects = np.zeros(shape=(current_batch_size), dtype=np.int32)
        targets = None
        if self._label_count is not None:
            targets = np.empty(
                shape=(current_batch_size, self._label_count),
                dtype=np.float32)

        for idx in range(self._batch_start, batch_end):
            index = self._indices[idx]
            # When the dataset is kept in memory, the file reader was created
            # up front; otherwise open the clip lazily.
            frames = self._files[index][0] if self._in_memory \
                else self._source.create_file_reader(self._files[index][0])
            inputs[idx - self._batch_start, :, :, :] = \
                self._select_frames(frames)
            activities[idx - self._batch_start] = self._files[index][1]
            subjects[idx - self._batch_start] = self._files[index][2]
            if self._label_count is not None:
                targets[idx - self._batch_start, :] = self._targets[index]

        self._batch_start += current_batch_size
        return inputs, targets, current_batch_size, activities, subjects

    def _select_frames(self, frames):
        '''
        Return a fixed-length sequence of frames from the provided clip.

        Args:
            frames: the clip frames loaded from the skeleton file.
        '''
        assert self._skip_frame >= 0
        num_frames = len(frames)
        multiplier = self._skip_frame + 1
        needed_frames = multiplier * self._sequence_length
        if num_frames < needed_frames:
            raise ValueError(
                "Clip is too small, it has {} frames only.".format(
                    num_frames))

        if not self._random_sequence:
            # Take the window centered in the clip.
            start = int(num_frames / 2 - needed_frames / 2)
        else:
            # Take a random window from the clip.
            start = np.random.randint(0, num_frames - needed_frames + 1)

        features = []
        for index in range(needed_frames):
            # Keep one frame out of every `multiplier` frames.
            if (index % multiplier) == 0:
                features.append(
                    self._from_body_to_feature(frames[start + index]))
        return np.stack(features, axis=0)

    def _from_body_to_feature(self, frame):
        '''
        Convert body joints to a numpy array and apply the needed
        normalization.

        Args:
            frame: contains one or more Body objects.
        '''
        if len(frame) > 0:
            body = frame[0]
            return self._data_preprocessing.normalize(body.as_numpy())
        return None
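# Illustrative usage of SequenceBodyReader (a minimal sketch; the map file
# name, sequence length, and label count below are assumptions, not values
# taken from this module):
#
#     reader = SequenceBodyReader('train_map.csv', sequence_length=20,
#                                 dataset='nturgbd', label_count=60)
#     while reader.has_more():
#         inputs, targets, size, activities, subjects = \
#             reader.next_minibatch(16)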
def main(args):
    '''
    Main entry point that drives GAN training for body and skeleton data.

    Args:
        args: argument parser object; contains all arguments provided by
            the user.
    '''
    # Set up paths and log information.
    base_folder = args.output_folder
    output_path = os.path.join(base_folder, R'output')
    output_folder = os.path.join(output_path, "")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_models_folder = os.path.join(output_folder, "models")
    if not os.path.exists(output_models_folder):
        os.makedirs(output_models_folder)
    output_videos_folder = os.path.join(output_folder, "videos")
    if not os.path.exists(output_videos_folder):
        os.makedirs(output_videos_folder)
    output_tensorboard_folder = os.path.join(output_folder, "tensorboard")
    if not os.path.exists(output_tensorboard_folder):
        os.makedirs(output_tensorboard_folder)

    # Create the logging file.
    logging.basicConfig(
        filename=os.path.join(output_folder, "train.log"),
        filemode='w',
        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())

    max_epochs = args.max_epochs
    dataset = args.dataset_name

    # Return sensor and body information for the specific dataset.
    source = SourceFactory(dataset, args.camera_calibration_file)
    sensor = source.create_sensor()
    body_inf = source.create_body()

    # Log training information.
    # logging.info("Dataset: {}, Generator: {}".format(dataset, args.gan_type))

    # Create the visualizer.
    skeleton2D = Skeleton2D(sensor, body_inf)

    # Input and output information.
    input_sequence_length = 10
    output_sequence_length = 10
    sequence_length = input_sequence_length + output_sequence_length
    inputs_depth = 128
    z_size = 128  # Latent value that controls the predicted poses.

    data_preprocessing = None

    # Prepare the data.
    logging.info("Preprocessing data...")
    if args.data_normalization_file is not None:
        data_preprocessing = DataPreprocessing(
            args.data_normalization_file,
            normalization_mode=NormalizationMode.MeanAndStd2)

    logging.info("Loading data...")
    if dataset == 'nturgbd':
        train_data_reader = SequenceBodyReader(
            args.train_file,
            sequence_length,
            dataset,
            skip_frame=0,
            data_preprocessing=data_preprocessing,
            random_sequence=False)
    elif dataset == 'human36m':
        train_data_reader = SequenceBodyReader(
            args.train_file,
            sequence_length,
            dataset,
            skip_frame=1,
            data_preprocessing=data_preprocessing,
            random_sequence=True)
    else:
        raise ValueError("Invalid dataset value.")

    # Set up the model.
    logging.info("Setting up the model...")
    minibatch_size = 16
    lr_init = 5e-5
    d_lr = lr_init
    g_lr = lr_init
    epoch = 0

    d_inputs = tf.placeholder(
        dtype=tf.float32,
        shape=[None, sequence_length] +
        list(train_data_reader.element_shape),
        name="d_inputs")
    g_inputs = tf.placeholder(
        dtype=tf.float32,
        shape=[None, input_sequence_length] +
        list(train_data_reader.element_shape),
        name="g_inputs")
    g_z = tf.placeholder(dtype=tf.float32, shape=[None, z_size], name="g_z")

    # Define the model.
    logging.info("Defining the model...")
    d_real = NNDiscriminator(d_inputs, inputs_depth, sequence_length)
    g = SequenceToSequenceGenerator(
        g_inputs,
        inputs_depth,
        g_z,
        input_sequence_length,
        output_sequence_length,
        reverse_input=False)
    d_fake_inputs = tf.concat([g_inputs, g.output], axis=1)
    d_fake = NNDiscriminator(
        d_fake_inputs, inputs_depth, sequence_length, reuse=True)
    # The two discriminators below only learn the sequence probability,
    # since WGAN, LS-GAN, and their families do not output one directly.
    d_real_prob = NNDiscriminator(
        d_inputs, inputs_depth, sequence_length, scope="prob")
    d_fake_prob = NNDiscriminator(
        d_fake_inputs, inputs_depth, sequence_length, reuse=True,
        scope="prob")

    # Skeleton-specific losses: penalize jitter between consecutive frames
    # and bone-length changes.
    g_prev = g_inputs[:, input_sequence_length - 1:input_sequence_length, :, :]
    if output_sequence_length > 1:
        g_prev = tf.concat(
            [g_prev, g.output[:, 0:output_sequence_length - 1, :, :]], axis=1)
    g_next = g.output
    g_consistency_loss = tf.maximum(
        0.0001,
        tf.norm(g_next - g_prev, ord=2) /
        (minibatch_size * output_sequence_length))
    tf.summary.scalar("consistency_loss", g_consistency_loss)
    g_bone_loss = (nn.bone_loss(g_prev, g_next, body_inf) /
                   (minibatch_size * output_sequence_length))
    tf.summary.scalar("bone_loss", g_bone_loss)

    # Gradient penalty (WGAN-GP): push the critic's gradient norm on
    # interpolated samples towards 1.
    def gradient_penalty():
        alpha = tf.random_uniform([], 0.0, 1.0)
        d_inputs_hat = alpha * d_inputs + (1 - alpha) * d_fake_inputs
        d_outputs_hat = NNDiscriminator(
            d_inputs_hat, inputs_depth, sequence_length, reuse=True).output
        gradients = tf.gradients(d_outputs_hat, d_inputs_hat)[0]
        gradients_l2 = tf.sqrt(
            tf.reduce_sum(tf.square(gradients), axis=[2, 3]))
        return tf.reduce_mean(tf.square(gradients_l2 - 1.))

    gradient_penalty_loss = 10.0 * gradient_penalty()
    tf.summary.scalar("gradient_penalty_loss", gradient_penalty_loss)

    # Discriminator (critic) and generator loss functions.
    d_loss = tf.reduce_mean(d_fake.output -
                            d_real.output) + gradient_penalty_loss
    g_gan_loss = -tf.reduce_mean(d_fake.output)
    d_loss_prob = -tf.reduce_mean(
        tf.log(d_real_prob.prob) + tf.log(1. - d_fake_prob.prob))
    d_loss += 0.001 * tf.add_n([tf.nn.l2_loss(p) for p in d_real.weights])
    tf.summary.scalar("discriminator_or_critic_loss", d_loss)
    tf.summary.scalar("gan_loss", g_gan_loss)
    g_loss = g_gan_loss + 0.001 * g_consistency_loss + 0.01 * g_bone_loss
    tf.summary.scalar("generator_loss", g_loss)
    d_loss_prob += 0.001 * tf.add_n(
        [tf.nn.l2_loss(p) for p in d_real_prob.weights])
    tf.summary.scalar("discriminator_loss", d_loss_prob)

    # Optimizers.
    logging.info("Optimizers...")
    d_op = tf.train.AdamOptimizer(learning_rate=d_lr).minimize(
        d_loss, var_list=d_real.parameters)
    g_op = tf.train.AdamOptimizer(learning_rate=g_lr).minimize(
        g_loss, var_list=g.parameters)
    d_op_prob = tf.train.AdamOptimizer(learning_rate=d_lr / 2.0).minimize(
        d_loss_prob, var_list=d_real_prob.parameters)

    # TensorBoard log.
    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(
        output_tensorboard_folder, graph=tf.get_default_graph())

    # Must come after the optimizers so their slot variables are
    # initialized too.
    init_op = tf.global_variables_initializer()

    # Draw all the random z vectors used in visualization.
    logging.info("Drawing z vectors values...")
    z_rand_type = 'uniform'
    z_rand_params = {'low': -0.1, 'high': 0.1, 'mean': 0.0, 'std': 0.2}
    z_data_p = []
    for _ in range(10):
        z_data_p.append(
            nn.generate_random(
                z_rand_type, z_rand_params, shape=[minibatch_size, z_size]))

    logging.info("Start training, training clip count {}.".format(
        train_data_reader.size()))

    g_best_loss = float('inf')
    g_best_epoch = -1
    g_best_pos_loss = float('inf')
    g_best_pos_epoch = -1
    g_best_prob = 0
    g_best_prob_epoch = -1
    d_losses = []
    g_losses = []
    d_losses_prob = []

    model_saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        d_loss_val = 0.
        g_loss_val = 0.
        d_loss_val_prob = 0.
        d_per_mb_iterations = 10
        g_per_mb_iterations = 2
        tensorboard_index = 0
        generate_skeletons = nn.generate_skeletons_with_prob(
            sess, g, d_real_prob, d_fake_prob, data_preprocessing, d_inputs,
            g_inputs, g_z)
        while epoch < max_epochs:
            train_data_reader.reset()

            # Training.
            start_time = time.time()
            d_training_loss = 0.
            g_training_loss = 0.
            d_training_loss_prob = 0.
            d_is_sequence_probs = []
            k = 0
            while train_data_reader.has_more():
                input_data, _, current_batch_size, activities, subjects = \
                    train_data_reader.next_minibatch(minibatch_size)
                input_past_data = input_data[:, 0:input_sequence_length, :, :]
                if minibatch_size != current_batch_size:
                    continue
                if k == 0:
                    # Visualize predictions for the first mini-batch of
                    # every epoch.
                    subject_id = subjects[0]
                    skeleton_data, d_is_sequence_probs = \
                        generate_skeletons(input_data, input_past_data,
                                           z_data_p)
                    skeleton2D.draw_to_file(
                        skeleton_data, subject_id,
                        os.path.join(output_folder,
                                     "pred_{}.png".format(epoch)))

                z_data = nn.generate_random(
                    z_rand_type, z_rand_params,
                    shape=[minibatch_size, z_size])
                # Train the critic and the probability discriminator.
                for _ in range(d_per_mb_iterations - 1):
                    _, d_loss_val = sess.run(
                        [d_op, d_loss],
                        feed_dict={
                            d_inputs: input_data,
                            g_inputs: input_past_data,
                            g_z: z_data
                        })
                _, d_loss_val_prob = sess.run(
                    [d_op_prob, d_loss_prob],
                    feed_dict={
                        d_inputs: input_data,
                        g_inputs: input_past_data,
                        g_z: z_data
                    })
                # Train the generator.
                for _ in range(g_per_mb_iterations - 1):
                    _, g_loss_val = sess.run(
                        [g_op, g_loss],
                        feed_dict={
                            g_inputs: input_past_data,
                            d_inputs: input_data,
                            g_z: z_data
                        })
                summary, _, g_loss_val = sess.run(
                    [summary_op, g_op, g_loss],
                    feed_dict={
                        g_inputs: input_past_data,
                        d_inputs: input_data,
                        g_z: z_data
                    })
                writer.add_summary(summary, tensorboard_index)

                d_training_loss += d_loss_val
                g_training_loss += g_loss_val
                d_training_loss_prob += d_loss_val_prob
                tensorboard_index += 1
                k += 1

            if k > 0:
                d_losses.append(d_training_loss / current_batch_size)
                g_losses.append(g_training_loss / current_batch_size)
                d_losses_prob.append(d_training_loss_prob /
                                     current_batch_size)

            # Keep track of the best epoch.
            if epoch > 20:  # Ignore the first couple of epochs.
                save_model_and_video = False
                prob_count = -1.  # Don't count the ground truth.
                for z_prob in d_is_sequence_probs:
                    if z_prob >= 0.5:
                        prob_count += 1.
                current_prob = prob_count / (len(d_is_sequence_probs) - 1)
                if (current_prob >= g_best_prob) and (current_prob > 0.):
                    save_model_and_video = True
                    g_best_prob = current_prob
                    g_best_prob_epoch = epoch
                if g_training_loss < g_best_loss:
                    save_model_and_video = True
                    g_best_loss = g_training_loss
                    g_best_epoch = epoch
                if (g_training_loss > 0) and \
                        (g_training_loss < g_best_pos_loss):
                    save_model_and_video = True
                    g_best_pos_loss = g_training_loss
                    g_best_pos_epoch = epoch

                # Save the current model's trained parameters and a video
                # per z value.
                if save_model_and_video:
                    model_saver.save(
                        sess,
                        os.path.join(output_models_folder, "models"),
                        global_step=epoch + 1)
                    video_index = 0
                    if args.record_clip:
                        for sequence in skeleton_data:
                            skeleton2D.draw_to_video_file(
                                sequence,
                                os.path.join(
                                    output_videos_folder,
                                    "pred_{}_z{}.mp4".format(
                                        epoch, video_index)))
                            video_index += 1

            logging.info("Epoch {}: took {:.3f}s".format(
                epoch, time.time() - start_time))
            logging.info(
                "  discriminator training loss:\t{:e}".format(
                    d_training_loss))
            logging.info(
                "  generative training loss:\t{:e}".format(g_training_loss))
            logging.info("  discriminator prob training loss:\t{:e}".format(
                d_training_loss_prob))
            logging.info("  is sequence: {}".format(d_is_sequence_probs))
            if epoch > 20:
                logging.info(
                    "  generative best loss:\t{:e}, for epoch {}".format(
                        g_best_loss, g_best_epoch))
                logging.info(
                    "  generative best pos loss:\t{:e}, for epoch {}".format(
                        g_best_pos_loss, g_best_pos_epoch))
                logging.info(
                    "  best motion prob:\t{:.1%}, for epoch {}".format(
                        g_best_prob, g_best_prob_epoch))
            epoch += 1
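
# Hedged entry-point sketch: the flag spellings and defaults below are
# assumptions inferred from the attributes main() reads (args.train_file,
# args.dataset_name, args.output_folder, ...); the project's real CLI may
# differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='GAN training for body and skeleton sequence data.')
    parser.add_argument('--train_file', required=True,
                        help='CSV map file listing the training clips.')
    parser.add_argument('--dataset_name', required=True,
                        choices=['nturgbd', 'human36m'])
    parser.add_argument('--output_folder', required=True)
    parser.add_argument('--max_epochs', type=int, default=100)
    parser.add_argument('--camera_calibration_file', default=None)
    parser.add_argument('--data_normalization_file', default=None)
    parser.add_argument('--record_clip', action='store_true',
                        help='Record a video per z value for the best '
                             'epochs.')
    main(parser.parse_args())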