                          num_total_train_speakers)
# NOTE(review): this fragment begins mid-call — the matching `if args.num_gpus == 1:`
# branch (cf. the single-GPU/multi-GPU split visible elsewhere in this paste) lies
# outside the visible chunk. Indentation below is reconstructed — confirm against
# the original file.
else:
    # Multi-GPU training: the per-step batch (speakers x segments per speaker)
    # must split evenly across the GPUs.
    if (params.num_speakers_per_batch * params.num_segments_per_speaker) % args.num_gpus != 0:
        sys.exit(
            "To use multiple GPUs, the batch size should divide num_gpus.")
    params.dict["ps"] = args.ps
    if args.ps == "cpu":
        tf.logging.warn("[Warning] The parameters will be placed on CPU.")
    trainer = TrainerMGPU(params, args.finetune_model, dim,
                          num_total_train_speakers, num_gpus=args.num_gpus)

# Build the training graph; variables listed in noupdate_var_list are excluded
# from updates during fine-tuning. Then build the validation graph.
trainer.build("train", noupdate_var_list=params.noupdate_var_list)
trainer.build("valid")

# Fill in training-control defaults when the config does not set them.
if "early_stop_epochs" not in params.dict:
    params.dict["early_stop_epochs"] = 5
if "min_learning_rate" not in params.dict:
    params.dict["min_learning_rate"] = 1e-5
if "lr_start_decay_epoch" not in params.dict:
    params.dict["lr_start_decay_epoch"] = 0
if "learning_rate_reduce_factor" not in params.dict:
    params.dict["learning_rate_reduce_factor"] = 2

if start_epoch == 0:
    # Load the pre-trained model and transfer to the current model, skipping
    # any variables named in noload_var_list.
    trainer.get_finetune_model(params.noload_var_list)
# Determine the feature dimension from the training data and persist it so
# later extraction runs can rebuild the graph with the same input size.
dim = FeatureReader(args.train_dir).get_dim()
with open(os.path.join(model_dir, "feature_dim"), "w") as f:
    f.write("%d\n" % dim)
num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

# Load the history valid loss (resumes early-stopping state across restarts).
min_valid_loss = ValidLoss()
if os.path.isfile(os.path.join(model_dir, "valid_loss")):
    min_valid_loss = load_valid_loss(os.path.join(model_dir, "valid_loss"))

# The trainer is used to control the training process.
trainer = Trainer(params, args.model)
trainer.build("train", dim=dim, loss_type=params.loss_func, num_speakers=num_total_train_speakers)
trainer.build("valid", dim=dim, loss_type=params.loss_func, num_speakers=num_total_train_speakers)

# Fill in training-control defaults when the config does not set them.
if "early_stop_epochs" not in params.dict:
    params.dict["early_stop_epochs"] = 10
if "min_learning_rate" not in params.dict:
    params.dict["min_learning_rate"] = 1e-5

# Main epoch loop: one training pass, then a validation pass.
# NOTE(review): this fragment is truncated — the trainer.valid(...) call below
# is cut off mid-argument-list in the visible chunk.
for epoch in range(start_epoch, params.num_epochs):
    trainer.train(args.train_dir, args.train_spklist, learning_rate_array[epoch])
    valid_loss, valid_embeddings, valid_labels = trainer.valid(args.valid_dir, args.valid_spklist,
                                                               batch_type=params.batch_type,
# params.dict["triplet_center"] = "average" # params.dict["triplet_center_momentum"] = 0.9 # params.dict["loss_compute"] = "softplus" # params.dict["margin"] = 0.1 num_total_train_speakers = KaldiDataRandomQueue( args.data_dir, args.data_spklist).num_total_speakers dim = FeatureReader(args.data_dir).get_dim() if "selected_dim" in params.dict: dim = params.selected_dim trainer = Trainer(params, args.model_dir, dim, num_total_train_speakers, single_cpu=True) trainer.build("valid") # Load the model and output embeddings trainer.sess.run(tf.global_variables_initializer()) trainer.sess.run(tf.local_variables_initializer()) # load the weights curr_step = trainer.load() with tf.variable_scope("softmax", reuse=True): kernel = tf.get_variable("output/kernel", shape=[ trainer.embeddings.get_shape()[-1], num_total_train_speakers ]) kernel_val = trainer.sess.run(kernel) weights = np.transpose(kernel_val)
# Load network configuration from the trained model directory.
config_json = os.path.join(args.model_dir, "nnet/config.json")
if not os.path.isfile(config_json):
    sys.exit("Cannot find params.json in %s" % config_json)
params = Params(config_json)

# Change the output node if necessary (lets the caller extract from a
# different layer than the configured default).
if len(args.node) != 0:
    params.embedding_node = args.node
tf.logging.info("Extract embedding from %s" % params.embedding_node)

# Build the inference graph with the feature dim recorded at training time.
trainer = Trainer(params, args.model_dir, single_cpu=True)
with open(os.path.join(nnet_dir, "feature_dim"), "r") as f:
    dim = int(f.readline().strip())
trainer.build("predict", dim=dim)

if args.rspecifier.rsplit(".", 1)[1] == "scp":
    # The rspecifier cannot be scp.
    sys.exit("The rspecifier must be ark or input pipe")

# Stream utterances from the Kaldi archive and write embeddings out.
fp_out = open_or_fd(args.wspecifier, "wb")
for index, (key, feature) in enumerate(read_mat_ark(args.rspecifier)):
    # Skip utterances too short to produce a reliable embedding.
    if feature.shape[0] < args.min_chunk_size:
        tf.logging.info("[INFO] Key %s length too short, %d < %d, skip." % (key, feature.shape[0], args.min_chunk_size))
        continue
    # Long utterances are split into chunks.
    # NOTE(review): this fragment is truncated — the chunking computation below
    # is cut off mid-expression in the visible chunk.
    if feature.shape[0] > args.chunk_size:
        feature_array = []
        feature_length = []
        num_chunks = int(
# Seed every RNG in use so runs are reproducible.
torch.manual_seed(params.random_seed)
np.random.seed(params.random_seed)
random.seed(params.random_seed)

# Record the feature dimension for later extraction runs.
dim = FeatureReader(args.train_dir).get_dim()
with open(os.path.join(model_dir, "feature_dim"), 'w') as f:
    f.write("%d\n" % dim)
num_total_train_speakers = KaldiDataRandomQueue(
    args.train_dir, args.train_spklist).num_total_speakers
# Number of speakers in the training set.
with open(os.path.join(model_dir, "num_speakers"), 'w') as f:
    f.write("%d\n" % num_total_train_speakers)

trainer = Trainer(params, args.model, num_total_train_speakers)
trainer.build(loss_type=params.loss_func)

# Resume from the saved checkpoint (epoch counter, optimizer and network
# state) or start fresh from epoch 0.
if args.continue_training:
    checkpoint = torch.load(os.path.join(trainer.model, 'net.pth'))
    start_epoch = checkpoint['epoch'] + 1
    trainer.optimizer.load_state_dict(checkpoint['optimizer'])
    trainer.network.load_state_dict(checkpoint['state_dict'])
else:
    start_epoch = 0

# Step-decay schedule: multiply the LR by reduce_lr_gamma every
# reduce_lr_epochs epochs.
learning_scheduler = lr_scheduler.StepLR(trainer.optimizer,
                                         step_size=params.reduce_lr_epochs,
                                         gamma=params.reduce_lr_gamma)
# NOTE(review): this fragment is truncated — the trainer.train(...) call below
# is cut off mid-argument-list in the visible chunk.
for epoch in range(start_epoch, params.num_epochs):
    trainer.train(epoch=epoch, data=args.train_dir,
if __name__ == '__main__':
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)

    # Load network configuration from the trained model directory.
    nnet_dir = os.path.join(args.model_dir, "nnet")
    config_json = os.path.join(args.model_dir, "nnet/config.json")
    if not os.path.isfile(config_json):
        sys.exit("Cannot find params.json in %s" % config_json)
    params = Params(config_json)

    # Attention weights: force the output node so predict() emits the
    # attention weights rather than the embedding.
    params.embedding_node = "attention_weights"

    # Build the inference graph with the feature dim recorded at training time.
    with open(os.path.join(nnet_dir, "feature_dim"), "r") as f:
        dim = int(f.readline().strip())
    trainer = Trainer(params, args.model_dir, dim, single_cpu=True)
    trainer.build("predict")

    if args.rspecifier.rsplit(".", 1)[1] == "scp":
        # The rspecifier cannot be scp.
        sys.exit("The rspecifier must be ark or input pipe")

    # Stream utterances from the Kaldi archive.
    fp_out = open_or_fd(args.wspecifier, "wb")
    for index, (key, feature) in enumerate(read_mat_ark(args.rspecifier)):
        # Skip utterances too short to process.
        if feature.shape[0] < args.min_chunk_size:
            tf.logging.info("[INFO] Key %s length too short, %d < %d, skip." % (key, feature.shape[0], args.min_chunk_size))
            continue
        if feature.shape[0] > args.chunk_size:
            # We only extract the first segment.
            feature = feature[:args.chunk_size]
        attention_weights = trainer.predict(feature)
# Persist the feature dimension and log basic dataset stats.
with open(os.path.join(model_dir, "feature_dim"), "w") as f:
    f.write("%d\n" % dim)
num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

# Load the history valid loss.
min_valid_loss = ValidLoss()

# The trainer is used to control the training process.
if args.num_gpus == 1:
    trainer = Trainer(params, args.model, dim, num_total_train_speakers)
else:
    # Multi-GPU: the per-step batch must split evenly across the GPUs.
    if (params.num_speakers_per_batch * params.num_segments_per_speaker) % args.num_gpus != 0:
        sys.exit("To use multiple GPUs, the batch size should divide num_gpus.")
    params.dict["ps"] = args.ps
    if args.ps == "cpu":
        tf.logging.warn("[Warning] The parameters will be placed on CPU.")
    trainer = TrainerMGPU(params, args.model, dim, num_total_train_speakers, num_gpus=args.num_gpus)

# Build the training and validation graphs.
trainer.build("train")
trainer.build("valid")

# You can tune the learning rate using the following function.
# After training, you should plot the loss v.s. the learning rate and pick a
# learning rate that decreases the loss fastest.
trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period)
trainer.close()
tf.logging.info("Finish tuning.")
# Determine the feature dimension from the training data and persist it.
dim = FeatureReader(args.train_dir).get_dim()
with open(os.path.join(model_dir, "feature_dim"), "w") as f:
    f.write("%d\n" % dim)
num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

# Load the history valid loss (resumes early-stopping state across restarts).
min_valid_loss = ValidLoss()
if os.path.isfile(os.path.join(model_dir, "valid_loss")):
    min_valid_loss = load_valid_loss(os.path.join(model_dir, "valid_loss"))

# The trainer is used to control the training process. Variables named in
# noupdate_var_list are frozen during fine-tuning.
trainer = Trainer(params, args.finetune_model)
trainer.build("train", dim=dim, loss_type=params.loss_func, num_speakers=num_total_train_speakers,
              noupdate_var_list=params.noupdate_var_list)
trainer.build("valid", dim=dim, loss_type=params.loss_func, num_speakers=num_total_train_speakers)

# Fill in training-control defaults when the config does not set them.
if "early_stop_epochs" not in params.dict:
    params.dict["early_stop_epochs"] = 5
if "min_learning_rate" not in params.dict:
    params.dict["min_learning_rate"] = 1e-5

if start_epoch == 0:
    # Load the pre-trained model and transfer to the current model, skipping
    # any variables named in noload_var_list.
    trainer.get_finetune_model(params.noload_var_list)
# Load network configuration; abort if the model directory has no config.
if not os.path.isfile(config_json):
    sys.exit("Cannot find params.json in %s" % config_json)
params = Params(config_json)

# First, we need to extract the weights. Speaker count and feature dimension
# come from the directory containing the spklist; the config may override the
# dim when a feature-selection front-end is used.
num_total_train_speakers = KaldiDataRandomQueue(
    os.path.dirname(args.spklist), args.spklist).num_total_speakers
dim = FeatureReader(os.path.dirname(args.spklist)).get_dim()
if "selected_dim" in params.dict:
    dim = params.selected_dim

trainer = Trainer(params, args.model_dir, dim, num_total_train_speakers, single_cpu=True)
trainer.build("valid")
trainer.sess.run(tf.global_variables_initializer())
trainer.sess.run(tf.local_variables_initializer())

# Either restore trained weights from a checkpoint, or keep the random
# initialization when --init is given.
if not args.init:
    curr_step = trainer.load()
else:
    # Hack: mark the trainer as loaded so later code does not try to restore.
    tf.logging.info("Use random initialization")
    trainer.is_loaded = True

# Fetch the softmax output kernel from the existing variable scope.
# NOTE(review): this fragment is truncated — the shape list below is cut off
# mid-expression in the visible chunk.
with tf.variable_scope("softmax", reuse=True):
    kernel = tf.get_variable("output/kernel", shape=[
        trainer.embeddings.get_shape()[-1], num_total_train_speakers