def prepare_inputs(self, inputs):
  """Prepares inputs on device to be fed to model in eval mode."""
  params = self.params.eval.input
  video_shape = processing.get_video_shape(params)
  audio_shape = processing.get_audio_shape(params, REF_FPS, REF_SR)

  # Fall back to zero tensors for any modality absent from this batch, so
  # the multimodal model always receives all three streams.
  if FeatureNames.VISION in inputs:
    images = inputs[FeatureNames.VISION]
  else:
    images = tf.zeros([1] + video_shape, dtype=tf.float32)

  if FeatureNames.AUDIO_MEL in inputs or FeatureNames.AUDIO in inputs:
    if params.raw_audio:
      audio = inputs[FeatureNames.AUDIO]
    else:
      audio = inputs[FeatureNames.AUDIO_MEL]
  else:
    audio = tf.zeros([1] + audio_shape, dtype=tf.float32)

  if FeatureNames.TEXT_INDEX in inputs:
    words = inputs[FeatureNames.TEXT_INDEX]
  else:
    words = tf.zeros([1, params.max_num_words], dtype=tf.int32)

  # Fold any extra clip dimension into the batch axis.
  audio = tf.reshape(audio, [-1] + audio_shape)
  words = tf.reshape(words, [-1, words.shape.as_list()[-1]])

  labels_onehot = inputs.get(FeatureNames.LABEL_INDEX, None)
  labels = {'one_hot': labels_onehot}
  inputs = {'video': images, 'audio': audio, 'text': words}

  return inputs, labels
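# A minimal standalone sketch of the zero-filling fallback above, with
# illustrative shapes (the real ones come from processing.get_video_shape);
# the 'vision' key and the shape values here are assumptions, not the
# configured ones:
#
#   import tensorflow as tf
#   video_shape = [32, 224, 224, 3]   # (frames, height, width, channels)
#   batch = {}                        # a batch that lacks the vision feature
#   images = batch.get('vision',
#                      tf.zeros([1] + video_shape, dtype=tf.float32))
#   # -> shape (1, 32, 224, 224, 3); the model still receives a valid tensor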
def prepare_eval_inputs(self, inputs):
  """Prepares inputs on device to be fed to model in eval mode."""
  params = self.params.eval.input
  images = inputs[FeatureNames.VISION]
  labels_onehot = inputs[FeatureNames.LABEL_INDEX]

  if params.linearize_vision:
    img_shape = [params.frame_size, params.frame_size, 3]
    if params.name in dataloaders.VID_CLS_DS:
      space_to_depth = params.space_to_depth
      img_shape = processing.get_video_shape(
          params, is_space_to_depth=space_to_depth)
    else:
      # Single-frame inputs get a temporal axis of length 1.
      img_shape = [1] + img_shape
    img_shape = [-1] + img_shape
    images = tf.reshape(images, img_shape)

  if params.name in dataloaders.IMG_CLS_DS:
    # Replicate the frame along time so image inputs fill one temporal patch.
    num_replica = self.params.model_config.temporal_patch_size
    images = tf.tile(images, [1, num_replica, 1, 1, 1])

  labels = {'one_hot': labels_onehot}
  inputs = {'images': images}

  return inputs, labels
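# A hedged sketch of the image-to-video tiling above: a single frame is
# repeated along the temporal axis so an image batch matches the video
# stem's temporal patch size. The shapes and the patch size below are
# assumptions, not the configured values:
#
#   import tensorflow as tf
#   images = tf.zeros([8, 1, 224, 224, 3])   # (batch, 1 frame, H, W, C)
#   temporal_patch_size = 4                   # hypothetical config value
#   video_like = tf.tile(images, [1, temporal_patch_size, 1, 1, 1])
#   # -> shape (8, 4, 224, 224, 3)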
def prepare_inputs(self, inputs):
  """Prepares inputs on device to be fed to model in train mode."""
  params = self.params.train.input
  images = inputs[FeatureNames.VISION]
  space_to_depth = params.space_to_depth
  if params.linearize_vision:
    vid_shape = processing.get_video_shape(
        params, is_space_to_depth=space_to_depth)
    images = tf.reshape(images, [-1] + vid_shape)

  if params.raw_audio:
    audio = inputs[FeatureNames.AUDIO]
  else:
    audio = inputs[FeatureNames.AUDIO_MEL]

  words = inputs[FeatureNames.TEXT_INDEX]
  words = tf.reshape(words, [-1, words.shape.as_list()[-1]])

  audio_mask = inputs[FeatureNames.AUDIO_MASK]
  text_mask = inputs[FeatureNames.TEXT_MASK]
  labels = {
      FeatureNames.AUDIO_MASK: audio_mask,
      FeatureNames.TEXT_MASK: text_mask,
  }
  inputs = {'video': images, 'audio': audio, 'text': words}

  return inputs, labels
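# Sketch of the text reshape above, under assumed shapes: any clip dimension
# sampled per video is folded into the batch axis so the model sees a flat
# batch of token-index rows (the sizes here are illustrative):
#
#   import tensorflow as tf
#   words = tf.zeros([8, 2, 16], dtype=tf.int32)  # (batch, clips, max words)
#   words = tf.reshape(words, [-1, words.shape.as_list()[-1]])
#   # -> shape (16, 16): batch and clip axes merged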
def construct_model(self, params):
  """Build models for train/eval."""
  if params.mode == 'train':
    input_params = params.train.input
  else:
    input_params = params.eval.input
  space_to_depth = input_params.space_to_depth

  video_shape = processing.get_video_shape(
      input_params, is_space_to_depth=space_to_depth)
  audio_shape = processing.get_audio_shape(input_params, REF_FPS, REF_SR)
  text_shape = (input_params.max_num_words,)

  # Trace the model once with symbolic inputs to get a functional Keras model.
  inputs = {
      'video': tf.keras.Input(shape=video_shape),
      'audio': tf.keras.Input(shape=audio_shape),
      'text': tf.keras.Input(shape=text_shape),
  }
  model = model_factory.build_model(params.model_config)
  outputs = model(inputs, None)
  keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
  keras_model.loss_fn = model.loss_fn

  # Restoring word embeddings.
  self.restore_text_embeddings(keras_model, params)

  logging.info('Number of parameters in model: %f M.',
               keras_model.count_params() / 10.**6)

  learning_rate = schedules.get_learning_rate(
      params.train.optimizer.learning_rate)
  keras_model.optimizer = optimizers.get_optimizer(
      learning_rate, params.train.optimizer)

  return keras_model
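# A minimal analogue of the construction pattern above (a toy model; the real
# one comes from model_factory.build_model): symbolic tf.keras.Input tensors
# trace the network once, and the resulting functional tf.keras.Model is
# handed to the train/eval loops with the optimizer attached as an attribute
# rather than via compile(). All layer choices below are illustrative:
#
#   import tensorflow as tf
#   video_in = tf.keras.Input(shape=(32, 224, 224, 3))
#   pooled = tf.keras.layers.GlobalAveragePooling3D()(video_in)
#   outputs = tf.keras.layers.Dense(64)(pooled)
#   keras_model = tf.keras.Model(inputs={'video': video_in}, outputs=outputs)
#   keras_model.optimizer = tf.keras.optimizers.Adam(1e-4)  # attached, not compiled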
def construct_model(self, params):
  """Build models for train/eval."""
  num_test_samples = 1
  if params.mode == 'train':
    input_params = params.train.input
  elif params.mode == 'eval':
    input_params = params.eval.input
  else:
    raise ValueError('Invalid mode!')

  # Classify the target dataset into one of the supported task families.
  ds_name = input_params.name
  is_vid_cls = ds_name in dataloaders.VID_CLS_DS
  is_img_cls = ds_name in dataloaders.IMG_CLS_DS
  is_aud_cls = ds_name in dataloaders.AUD_CLS_DS

  if params.mode == 'eval':
    if not is_img_cls:
      num_test_samples = params.eval.input.num_windows_test
    if params.eval.input.multi_crop and not is_aud_cls:
      num_test_samples *= 3

  if is_aud_cls:
    input_shape = processing.get_audio_shape(input_params, REF_FPS, REF_SR)
  elif is_vid_cls:
    space_to_depth = input_params.space_to_depth
    input_shape = processing.get_video_shape(
        input_params, is_space_to_depth=space_to_depth)
  elif is_img_cls:
    input_shape = processing.get_video_shape(input_params)

  if is_img_cls:
    # Image inputs are replicated along time to fill one temporal patch.
    input_shape[0] = params.model_config.temporal_patch_size

  num_classes = dataloaders.CLS_DS[ds_name]['num_classes']
  model_kwargs = {
      'num_classes': num_classes,
      'num_test_samples': num_test_samples,
  }

  if is_aud_cls:
    inputs = {'audio': tf.keras.Input(shape=input_shape)}
    model_factory = aud_factory
  else:
    inputs = {'images': tf.keras.Input(shape=input_shape)}
    model_factory = vid_factory

  model = model_factory.build_model(params=params.model_config,
                                    override_params=model_kwargs,
                                    mode='predict')
  outputs = model(inputs, None)
  keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
  keras_model.loss_fn = model.loss_fn

  if params.mode == 'train':
    self.partial_restore(params, keras_model)

  logging.info('Number of parameters in model: %f M.',
               keras_model.count_params() / 10.**6)

  learning_rate = schedules.get_learning_rate(
      params.train.optimizer.learning_rate)
  keras_model.optimizer = optimizers.get_optimizer(
      learning_rate, params.train.optimizer)

  return keras_model
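# A worked example of the num_test_samples bookkeeping above, under assumed
# eval settings (the values are hypothetical, not from any config): on a
# video-classification dataset with num_windows_test = 4 and multi_crop on,
# each example is scored over 4 temporal windows x 3 spatial crops:
#
#   num_test_samples = 1
#   num_test_samples = 4    # num_windows_test (non-image dataset)
#   num_test_samples *= 3   # multi_crop adds 3 spatial crops (non-audio)
#   # -> 12 clips per eval example, whose predictions the model aggregates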