def __init__(self, model_file, mean_pose_file, host, port, using_docker):
    """
    This service generates gestures for an input speech segment by passing
    the audio and the text transcription (as received from the 3D agent)
    to the trained Gesticulator model.

    The generated gestures are first saved into .csv files, then the paths to
    those files are sent to the standalone ActiveMQ server, which forwards
    them to the 3D agent.

    Args:
        model_file:      The path to the pretrained Gesticulator model checkpoint.
        mean_pose_file:  The path to the .npy file that contains the mean pose of the dataset.
        host:            The hostname of the ActiveMQ connection.
        port:            The port of the ActiveMQ connection.
        using_docker:    See 'on_message()' for details.
    """
    self.using_docker = using_docker
    self.connection = MessagingServer(listener=self, host=host, port=port)

    print("Loading Blenderbot model")
    self.chatbot = Blenderbot()

    print("Loading text-to-speech model")
    self.tts_model = GlowTTS()

    print("Loading pretrained Gesticulator model")
    self.model = GesticulatorModel.load_from_checkpoint(
        model_file, inference_mode=True)

    print("Creating GesturePredictor interface")
    feature_type = check_feature_type(self.model)
    self.predictor = GesturePredictor(self.model, feature_type=feature_type)

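# ------------------------------------------------------------------
# Sketch (not from the repository): how a service built around the
# constructor above might be started. `GestureGenerationService` is a
# placeholder class name, and the paths and connection details are
# placeholders as well (61613 is ActiveMQ's default STOMP port).
# ------------------------------------------------------------------
service = GestureGenerationService(
    model_file="trained_model.ckpt",
    mean_pose_file="mean_pose.npy",
    host="localhost",
    port=61613,
    using_docker=False,
)
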
def main(args):
    # 0. Check the feature type based on the model
    feature_type, audio_dim = check_feature_type(args.model_file)

    # 1. Load the model
    model = GesticulatorModel.load_from_checkpoint(
        args.model_file, inference_mode=True)
    # This interface is a wrapper around the model for predicting new gestures conveniently
    gp = GesturePredictor(model, feature_type)

    # 2. Predict the gestures with the loaded model
    motion = gp.predict_gestures(args.audio, args.text)

    # 3. Visualize the results
    motion_length_sec = int(motion.shape[1] / 20)

    visualize(motion.detach(), "temp.bvh", "temp.npy", "temp.mp4",
              start_t=0, end_t=motion_length_sec,
              data_pipe_dir='../gesticulator/utils/data_pipe.sav')

    # Add the audio to the video
    command = f"ffmpeg -y -i {args.audio} -i temp.mp4 -c:v libx264 -c:a libvorbis -loglevel quiet -shortest {args.video_out}"
    subprocess.call(command.split())

    print("\nGenerated video:", args.video_out)

    # Remove the temporary files
    for ext in ["bvh", "npy", "mp4"]:
        os.remove("temp." + ext)

def main(test_params):
    model = GesticulatorModel.load_from_checkpoint(
        test_params.model_file, inference_mode=True)

    create_save_dirs(model)

    model.generate_evaluation_videos(
        semantic=test_params.use_semantic_input,
        random=test_params.use_random_input)

def _setup(self, config): self.hparams = config["hparams"] # Namespace() for key, val in config.items(): if key == "hparams": continue try: val = val.item() except AttributeError: pass setattr(self.hparams, key, val) self.model = GesticulatorModel(self.hparams) checkpoint_callback = ModelCheckpoint( filepath=os.path.join(self.logdir, "checkpoint"), save_best_only=True, verbose=True, monitor="avg_val_loss", mode="min", ) try: gpus = len(ray.get_gpu_ids()) except: print("failed to get gpus") gpus = 1 self.trainer = Trainer( gpus=gpus, distributed_backend="dp", max_nb_epochs=1, checkpoint_callback=checkpoint_callback, nb_sanity_val_steps=2, log_gpu_memory="all", weights_summary=None, early_stop_callback=None, # show_progress_bar=False, train_percent_check=0.00001 if self.hparams.dev_test else 1, ) self.val_loss = float("inf")
def main(hparams):
    model = GesticulatorModel(hparams)
    logger = create_logger(model.save_dir)

    callbacks = [ModelSavingCallback()] if hparams.save_model_every_n_epochs > 0 else []

    trainer = Trainer.from_argparse_args(
        hparams, logger=logger, callbacks=callbacks,
        checkpoint_callback=False, early_stop_callback=False)

    trainer.fit(model)
    trainer.save_checkpoint(os.path.join(model.save_dir, "trained_model.ckpt"))

def __init__(self, model: GesticulatorModel, feature_type: str):
    """An interface for generating gestures with a saved GesticulatorModel.

    Args:
        model:         the trained Gesticulator model
        feature_type:  the feature type in the input data
                       (must be the same as it was in the training dataset!)
    """
    if feature_type not in self.supported_features:
        print(f"ERROR: unknown feature type '{feature_type}'!")
        print(f"Possible values: {self.supported_features}")
        exit(-1)

    self.feature_type = feature_type
    self.model = model.eval()  # Put the model into 'testing' mode
    self.embedding = self._create_embedding(model.text_dim)

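# ------------------------------------------------------------------
# Sketch (not from the repository): minimal usage of the interface
# above. The checkpoint, audio, and transcription paths are
# placeholders; check_feature_type() is used the same way as in the
# inference script earlier to infer the feature type from the checkpoint.
# ------------------------------------------------------------------
model = GesticulatorModel.load_from_checkpoint(
    "trained_model.ckpt", inference_mode=True)

feature_type, _ = check_feature_type("trained_model.ckpt")
predictor = GesturePredictor(model, feature_type)

# The result is the predicted motion for the given audio + transcription
gestures = predictor.predict_gestures("speech.wav", "speech.json")
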
def load_model(test_params):
    """Load the model and enable the test datasets that were selected by the user."""
    model = GesticulatorModel.load_from_checkpoint(
        test_params.model_file, inference_mode=True)

    # Make sure that at least one of the two test datasets is enabled
    if not test_params.use_semantic_input and not test_params.use_random_input:
        print("ERROR: Please provide at least one of the following two flags:")
        print("    python test.py --use_semantic_input (to use the semantic test input segments)")
        print("    python test.py --use_random_input (to use the random test input segments)")
        exit(-1)

    model.hparams.generate_semantic_test_predictions = test_params.use_semantic_input
    model.hparams.generate_random_test_predictions = test_params.use_random_input

    return model

def main(config):
    hparams = Namespace()
    for key, val in config.items():
        setattr(hparams, key, val)

    model = GesticulatorModel(hparams)

    checkpoint_callback = ModelCheckpoint(
        filepath=hparams.model_path,
        save_best_only=True,
        verbose=True,
        monitor="avg_val_loss",
        mode="min",
    )

    trainer = Trainer(
        gpus=len(ray.get_gpu_ids()),
        distributed_backend=hparams.distributed_backend,
        max_nb_epochs=20,
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)

def profile_with_clipping(model_file, feature_type, mean_pose_file, input, duration):
    """Profile the inference phase and the conversion from exp. map to joint angles."""
    model = GesticulatorModel.load_from_checkpoint(
        model_file, inference_mode=True,
        mean_pose_file=mean_pose_file, audio_dim=4)
    predictor = GesturePredictor(model, feature_type)

    truncate_audio(input, duration)
    audio = f"{input}_{duration}s.wav"
    text = f"{input}_{duration}s.json"

    print("Profiling gesture prediction...")
    profiler = cProfile.Profile()
    profiler.enable()

    gestures = predictor.predict_gestures(audio, text)

    out_file = "/home/work/Desktop/repositories/gesticulator/gesticulator/interface/profiling/predicted_rotations_{}.csv"
    np.savetxt(out_file.format('_DATASET_INPUT_x'), gestures[:, :, 0], delimiter=',')
    np.savetxt(out_file.format('_DATASET_INPUT_y'), gestures[:, :, 1], delimiter=',')
    np.savetxt(out_file.format('_DATASET_INPUT_z'), gestures[:, :, 2], delimiter=',')

    profiler.disable()
    profiler.print_stats(sort='cumtime')

def main(hparams):
    if hparams.model_checkpoint is None:
        model = GesticulatorModel(hparams)
    else:
        model = GesticulatorModel.load_from_checkpoint(
            hparams.model_checkpoint, model_checkpoint=hparams.model_checkpoint)

    logger = create_logger(model.save_dir)
    callbacks = [ModelSavingCallback()] if hparams.save_model_every_n_epochs > 0 else []

    if hparams.model_checkpoint is None:
        trainer = Trainer.from_argparse_args(
            hparams, logger=logger, callbacks=callbacks,
            checkpoint_callback=False)
    else:
        # Workaround: when resuming, manually run the initialization
        # that normally happens at the start of training
        model.init_prediction_saving_params()
        model.on_train_start()

        trainer = Trainer.from_argparse_args(
            hparams, resume_from_checkpoint=hparams.model_checkpoint,
            logger=logger, callbacks=callbacks,
            checkpoint_callback=False, num_sanity_val_steps=0)

    trainer.fit(model)
    trainer.save_checkpoint(
        os.path.join(model.save_dir, f"trained_model_{model.current_epoch+1}epochs.ckpt"))

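# ------------------------------------------------------------------
# Sketch (not from the repository): one way the `hparams` namespace
# consumed by main() above is typically assembled. It assumes a
# PyTorch Lightning version that provides Trainer.add_argparse_args
# (from_argparse_args is used above, so this is likely available) and
# that the model-specific flags do not clash with the Trainer flags.
# The --model_checkpoint flag mirrors the attribute read in main().
# ------------------------------------------------------------------
from argparse import ArgumentParser
from pytorch_lightning import Trainer

parser = ArgumentParser(add_help=False)
parser.add_argument("--model_checkpoint", default=None,
                    help="Optional checkpoint to resume training from")
parser = Trainer.add_argparse_args(parser)
parser = GesticulatorModel.add_model_specific_args(parser)

hparams = parser.parse_args()
main(hparams)
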
    # denormalize
    gestures = gestures_norm * model.max_val + model.mean_pose[np.newaxis]
    print(gestures.shape)

    np.save(gesture_file, gestures)


if __name__ == "__main__":
    args = parser.parse_args()

    mean_pose = np.array([0 for i in range(45)])
    max_val = np.array([0 for i in range(45)])

    the_model = GesticulatorModel(args, mean_pose, max_val)
    the_model.load_state_dict(torch.load(args.model_file))

    train_dataset = SpeechGestureDataset(args.data_dir, train=True, apply_PCA=args.pca)

    # Produce gestures
    print("Generating gestures ...")
    gesture_file = "temp_ges.npy"
    predict(the_model, args.test_audio, args.test_text, gesture_file)

    """print("Making a video ... ")
    epoch = args.curr_epoch
    # define files

if __name__ == "__main__": parent_parser = ArgumentParser(add_help=False) parent_parser.add_argument("--gpus", default=[0, 1, 2, 3, 4, 5], help="how many gpus") parent_parser.add_argument( "--distributed_backend", type=str, default="dp", help="supports three options dp, ddp, ddp2", ) parser = GesticulatorModel.add_model_specific_args(parent_parser) hyperparams = parser.parse_args() config = {} for hparam, val in vars(hyperparams).items(): if isinstance(val, list): config[hparam] = tune.sample_from(val) else: config[hparam] = val class MyAsyncHyperBandScheduler(AsyncHyperBandScheduler): def on_trial_error(self, trial_runner, trial): if trial.resources.gpu < 4: trial.resources = Resources(cpu=trial.resources.cpu * 2, gpu=trial.resources.gpu * 2) super().on_trial_error(trial_runner, trial)
class TrainableTrainer(tune.Trainable):
    def _setup(self, config):
        self.hparams = config["hparams"]  # Namespace()

        for key, val in config.items():
            if key == "hparams":
                continue
            try:
                val = val.item()
            except AttributeError:
                pass
            setattr(self.hparams, key, val)

        self.model = GesticulatorModel(self.hparams)

        checkpoint_callback = ModelCheckpoint(
            filepath=os.path.join(self.logdir, "checkpoint"),
            save_best_only=True,
            verbose=True,
            monitor="avg_val_loss",
            mode="min",
        )

        try:
            gpus = len(ray.get_gpu_ids())
        except Exception:
            print("failed to get gpus")
            gpus = 1

        self.trainer = Trainer(
            gpus=gpus,
            distributed_backend="dp",
            max_nb_epochs=1,
            checkpoint_callback=checkpoint_callback,
            nb_sanity_val_steps=2,
            log_gpu_memory="all",
            weights_summary=None,
            early_stop_callback=None,
            # show_progress_bar=False,
            train_percent_check=0.00001 if self.hparams.dev_test else 1,
        )
        self.val_loss = float("inf")

    def _train(self):
        self.trainer.fit(self.model)
        self.val_loss = self.trainer.callback_metrics["avg_val_loss"]
        return {"mean_loss": self.val_loss}

    def _generate_video(self):
        print("generating video!")
        seq_len = 300
        text_len = int(seq_len / 2)

        # read the data
        dev_dir = "/home/tarask/Documents/storage/SpeechToMotion/Irish/WithTextV5/dev_inputs"
        speech_data = np.load(dev_dir + "/X_test_NaturalTalking_01.npy")[:seq_len]
        text = np.load(dev_dir + "/T_test_NaturalTalking_01.npy")[:text_len]

        # upsample the text to get the same sampling rate as the audio
        cols = np.linspace(0, text.shape[0], endpoint=False,
                           num=text.shape[0] * 2, dtype=int)
        text_data = text[cols, :]

        # Convert to float tensors and put on GPU
        speech = torch.tensor([speech_data]).float().cuda()
        text = torch.tensor([text_data]).float().cuda()

        # Test on validation sequences without teacher forcing
        predicted_gesture = self.model.forward(speech, text, condition=True,
                                               motion=None, teacher=False)

        """if self.hparams.pca_model:
            pca_output = pca.inverse_transform(val.reshape(-1, self.hparams.pose_dim))
            output = pca_output.reshape(val.shape[0], val.shape[1], -1)
        else:
            output = val"""

        gen_dir = "/home/tarask/Documents/Code/CVPR2020/gesticulator/log/gestures/"
        ges_file = gen_dir + self.logdir[88:95] + ".npy"
        np.save(ges_file, predicted_gesture.detach().cpu().numpy())
        print("Writing into: ", ges_file)

    def _stop(self):
        if self.val_loss < 0.3 and self.iteration >= 4:
            self._generate_video()

    def _save(self, tmp_checkpoint_dir):
        print("Saving!")
        return {}

    def _restore(self, checkpoint):
        print("Restoring!")

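# ------------------------------------------------------------------
# Sketch (not from the repository): launching the Trainable above with
# the `config` and scheduler class built in the __main__ block earlier.
# It assumes the legacy Ray Tune API that TrainableTrainer is written
# against; the exact scheduler/tune.run keyword arguments depend on the
# Ray version, and the numeric values are placeholders.
# ------------------------------------------------------------------
scheduler = MyAsyncHyperBandScheduler(
    time_attr="training_iteration",
    metric="mean_loss",   # matches the key returned by TrainableTrainer._train()
    mode="min",
    max_t=20,
)

tune.run(
    TrainableTrainer,
    config=config,
    scheduler=scheduler,
    num_samples=10,
    resources_per_trial={"cpu": 4, "gpu": 1},
)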