Example #1
    def __init__(self, params, model, name="loss"):

        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if "dtype" not in self._params:
            if self._model:
                self._params["dtype"] = self._model.get_tf_dtype()
            else:
                self._params["dtype"] = tf.float32

        self._name = name
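All of these constructors validate their params dict with a check_params helper that is not shown in the examples. A minimal sketch of what such a validator could look like (the name exists in the snippets, but the exact behaviour sketched here is an assumption, not taken from the original project):

def check_params(config, required_dict, optional_dict):
    # Hypothetical sketch: every required key must be present with the right
    # type (or be one of the allowed values when a list is given), and no key
    # may fall outside the required/optional sets.
    for key, req in required_dict.items():
        if key not in config:
            raise ValueError("Required parameter '{}' is missing".format(key))
        if isinstance(req, list):
            if config[key] not in req:
                raise ValueError("'{}' must be one of {}".format(key, req))
        elif req is not None and not isinstance(config[key], req):
            raise ValueError("'{}' must be of type {}".format(key, req))
    for key in config:
        if key not in required_dict and key not in optional_dict:
            raise ValueError("Unknown parameter '{}'".format(key))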
Example #2
    def __init__(self, params, model, name = "encoder", mode = "train"):
    
        check_params(params, self.get_required_params(), self.get_optional_params())
        self._params = params
        self._model = model

        if "dtype" not in self._params:
            if self._model:
                self._params["dtype"] = self._model.params["dtype"]
            else:
                self._params["dtype"] = tf.float32

        self._name = name
        self._mode = mode
        self._compiled = False
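To show how this constructor is meant to be used, here is a hypothetical subclass; the base-class name Encoder, the parameter names, and the get_required_params/get_optional_params return values are illustrative assumptions:

import tensorflow as tf

class MyEncoder(Encoder):  # "Encoder" is the assumed name of the class above
    @staticmethod
    def get_required_params():
        return {"hidden_size": int}

    @staticmethod
    def get_optional_params():
        return {"dtype": None}  # any dtype value is accepted

# with model=None the constructor falls back to tf.float32 for "dtype"
enc = MyEncoder({"hidden_size": 512}, model=None, name="my_encoder", mode="train")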
Example #3
    def __init__(self, params, model, num_workers, worker_id):

        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if "dtype" not in self._params:
            if self._model:
                self._params["dtype"] = self._model.get_tf_dtype()
            else:
                self._params["dtype"] = tf.float32

        if "shuffle" not in self._params:
            self._params["dtype"] = (self._params["mode"] == "train")

        if self._params["mode"] != "train" and self._params["shuffle"]:
            raise ValueError("Shuffle should not be performed in eval mode")

        self._num_workers = num_workers
        self._worker_id = worker_id
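The num_workers/worker_id pair suggests that each data-layer instance only reads its own shard of the dataset. A minimal sketch of such sharding (the helper name and the interleaved slicing scheme are assumptions):

def _split_for_worker(samples, num_workers, worker_id):
    # hypothetical helper: give each worker an interleaved shard of the data
    return samples[worker_id::num_workers]

# worker 1 of 4 sees every 4th sample starting at index 1
print(_split_for_worker(list(range(10)), num_workers=4, worker_id=1))  # [1, 5, 9]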
Example #4
def post_process_gradients(grads_and_vars, summaries, lr,
                           clip_gradients, larc_params):
    """Apply post-processing to gradients, i.e. clipping, LARC, summaries."""
    if "global_gradient_norm" in summaries:
        tf.summary.scalar(
                "global_gradient_norm",
                _global_norm_with_cast(grads_and_vars))

    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients)
    
    # Add histograms for variables, gradients and gradient norms
    for gradient, variable in grads_and_vars:
        if isinstance(gradient, tf.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if isinstance(variable, tf.IndexedSlices):
            var_values = variable.values
        else:
            var_values = variable

        if grad_values is not None:
            var_name = variable.name.replace(":", "_")
            if "gradients" in summaries:
                tf.summary.histogram("gradients%s" % var_name, mask_nans(grad_values))
            if "gradient_norm" in summaries:
                tf.summary.scalar("gradient_norm%s" % var_name, tf.norm(grad_values))
            if "variabels" in summaries:
                tf.summary.histogram("variabels%s" % var_name, var_values)
            if "variable_norm" in summaries:
                tf.summary.scalar("varibale_norm%s" % var_name, tf.norm(var_values))

    if clip_gradients is not None and "global_gradient_norm" in summaries:
        tf.summary.scalar(
                "global_clipped_gradient_norm",
                _global_norm_with_cast(grads_and_vars))
    
    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
                config=larc_params,
                required_dict={"larc_eta": float},
                optional_dict={
                    "larc_mode": ["clip", "scale"],
                    "min_update": float,
                    "epsilon": float
                },
        )

        larc_eta = larc_params["larc_eta"]
        larc_mode = larc_params.get("larc_mode", "clip")
        min_update = larc_params.get("min_update", 1e-7)
        eps = larc_params.get("epsilon", 1e-7)

        grads_and_vars_larc = [None] * len(grads_and_vars)
        for idx, (g, v) in enumerate(grads_and_vars):
            var_dtype = v.dtype
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == "clip":
                larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (lr * (g_norm + eps)),
                        min_update)

                if "larc_summaries" in summaries:
                    tf.summary.scalar("larc_clip_on/{}".format(v.name),
                                      tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))

                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                # "scale" mode: layer-wise scaling that does not depend on lr
                larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (g_norm + eps),
                        min_update)

            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc[idx] = (larc_grad_update * g, v)

            if "larc_summaries" in summaries:
                tf.summary.scalar("larc_grad_update/{}".format(v.name), larc_grad_update)
                tf.summary.scalar("larc_final_lr/{}".format(v.name), tf.cast(lr, var_dtype) * larc_grad_update)

        grads_and_vars = grads_and_vars_larc

    return grads_and_vars
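A sketch of how post_process_gradients might be wired between compute_gradients and apply_gradients in TF1-style graph code; it assumes the helpers the function calls (check_params, mask_nans, _global_norm_with_cast, _clip_gradients_by_norm) are available from the same module, and the variable and hyper-parameter values are illustrative only:

import tensorflow as tf

w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))

opt = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
grads_and_vars = opt.compute_gradients(loss)

grads_and_vars = post_process_gradients(
    grads_and_vars,
    summaries=["global_gradient_norm", "gradient_norm"],
    lr=0.01,
    clip_gradients=5.0,                # clip gradients by global norm at 5.0
    larc_params={"larc_eta": 0.002},   # LARC in its default "clip" mode
)
train_op = opt.apply_gradients(grads_and_vars)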
Example #5
    def __init__(self, params, mode="train"):
        """
        tf的图不在这里创建,而是在self.compile()中

        params: dict
        
        mode: train - all parts of the graph will be built (model loss optimizer)
              eval - (model loss)
        """

        check_params(params, self.get_required_params(),
                     self.get_optional_params())

        self._params = copy.deepcopy(params)

        # parameter checks
        self._mode = mode
        self._interactive = False

        if self._mode == "interactive_infer":
            self._mode = "infer"
            self._interactive = True

        if self._mode not in ["train", "eval", "infer"]:
            raise ValueError("Mode has to be one of [train, eval, infer]")

        if "max_steps" in params and "num_epochs" in params:
            raise ValueError(
                "You can't provide both max_steps and num_epochs")

        if mode == "train":
            if "max_steps" not in params and "num_epochs" not in params:
                raise ValueError(
                    "For the training mode, either max_steps or num_epochs has to be provided"
                )

        none_list = [
            "print_samples_steps", "print_loss_steps", "save_checkpoint_steps",
            "save_summaries_steps"
        ]
        for param in none_list:
            if param not in self._params:
                self._params[param] = None

        # num_checkpoints defaults to 5 when it is not provided
        self._params["num_checkpoints"] = self._params.get(
            "num_checkpoints", 5)
        self._params["finetune"] = self._params.get("finetune", False)
        self._params["load_model"] = self._params.get("load_model", None)
        self._params["eval_batch_size_per_gpu"] = self._params.get(
            "eval_batch_size_per_gpu", self._params["batch_size_per_gpu"])

        # checking that freq of samples and loss are aligned
        s_fr = self._params["print_samples_steps"]
        l_fr = self._params["print_loss_steps"]

        if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
            raise ValueError(
                "print_samples_steps has to be a multiple of print_loss_steps"
            )

        if "gpu_ids" in self._params:
            self._gpu_ids = self._params["gpu_ids"]
        elif "num_gpus" in self._params:
            self._gpu_ids = range(self._params["num_gpus"])
        else:
            raise ValueError(
                "Either gpu_ids or num_gpus has to be specified in the config")

        if self._interactive and len(self._gpu_ids) > 1:
            raise ValueError(
                "Interactive infer is meant to be used with 1 gpu")

        # setting random seed
        rs = self._params.get("random_seed", int(time.time()))
        tf.set_random_seed(rs)
        np.random.seed(rs)

        if "data_type" not in self._params:
            self._params["data_type"] = tf.float32

        dl_params = self._params.get("data_layer_params", {})
        """
        data_layer_params里面原来没有定义batch_size
        """
        if mode == "train":
            dl_params["batch_size"] = self._params["batch_size_per_gpu"]
        else:
            dl_params["batch_size"] = self._params["eval_batch_size_per_gpu"]

        if "lm_vocal_file" in self._params:
            dl_params["lm_vocal_file"] = self._params["lm_vocal_file"]

        if "processed_data_folder" in self._params:
            dl_params["processed_data_folder"] = self._params[
                "processed_data_folder"]

        dl_params["mode"] = self._mode
        dl_params["interactive"] = self._interactive

        self._data_layers = []
        """
        多GPU运算的话,每个GPU对应一个Speech2TextDataLayer
        Speech2TextDataLayer(params, model, num_workers, work_id)
        """
        for worker_id in range(self.num_gpus):
            self._data_layers.append(self._params["data_layer"](
                params=dl_params,
                model=self,
                num_workers=self.num_gpus,
                worker_id=worker_id))

        if self._mode == "train":

            if "max_steps" in self._params:
                slef._last_step = self._params["max_steps"]
                self._steps_in_epoch = None
            else:
                # doing a few fewer steps if the data size is not divisible by the batch size
                self._steps_in_epoch = (
                    self.get_data_layer().get_size_in_samples() //
                    self.get_data_layer().params["batch_size"])

                if self._steps_in_epoch is None:
                    raise ValueError(
                        "The data_layer is not compatible with epoch execution"
                    )
                """ 
                多GPU计算中,在一个epoch中每个GPU各执行一部分steps
                batch_size超过samples时steps_in_epoch为0
                """
                self._steps_in_epoch //= self.num_gpus
                self._steps_in_epoch //= self._params.get("iter_size", 1)

                if self._steps_in_epoch == 0:
                    raise ValueError(
                        "Overall batch size is too big for this dataset")
                self._last_step = self._params[
                    "num_epochs"] * self._steps_in_epoch

        self._outputs = [None] * self.num_gpus

        self.loss = None
        self.train_op = None
        self.eval_losses = None
        self._num_objects_per_step = None
        self.skip_update_ph = None
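For reference, a hypothetical minimal params dict that would satisfy this constructor in "train" mode; the concrete data-layer class, its parameters, and all numeric values are assumptions rather than values from the original project:

base_params = {
    "batch_size_per_gpu": 32,
    "num_gpus": 2,                      # alternatively pass "gpu_ids"
    "num_epochs": 50,                   # mutually exclusive with "max_steps"
    "print_loss_steps": 10,
    "print_samples_steps": 100,         # must be a multiple of print_loss_steps
    "save_checkpoint_steps": 1000,
    "save_summaries_steps": 100,
    "random_seed": 0,
    # a data-layer class, instantiated once per GPU; Speech2TextDataLayer is
    # the class named in the comments above and is assumed to be importable
    "data_layer": Speech2TextDataLayer,
    "data_layer_params": {},            # keys depend on the chosen data layer
}
# model = SomeModelSubclass(base_params, mode="train")  # hypothetical subclass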