def __init__(self,
                 input_dim,
                 output_dim,
                 context=[0],
                 affine_type="tdnn",
                 **options):
        super(ReluBatchNormTdnnLayerR, self).__init__()

        affine_options = {
            "bias": True,
            "groups": 1,
            "norm_w": False,
            "norm_f": False
        }

        affine_options = utils.assign_params_dict(affine_options, options)

        self.add_relu_bn(input_dim, options=options)

        if affine_type == "tdnn":
            self.affine = TdnnAffine(input_dim,
                                     output_dim,
                                     context=context,
                                     **affine_options)
        else:
            self.affine = ChunkSeparationAffine(input_dim,
                                                output_dim,
                                                context=context,
                                                **affine_options)
Example #2
    def __init__(self,
                 input_dim,
                 output_dim,
                 context=[0],
                 affine_type="tdnn",
                 **options):
        super(ReluBatchNormTdnnLayer, self).__init__()

        affine_options = {
            "bias": True,
            "groups": 1,
            "norm_w": False,
            "norm_f": False
        }

        affine_options = utils.assign_params_dict(affine_options, options)

        # Keep the order affine -> layers.insert -> add_relu_bn so that the submodule order
        # is correct when the model is printed, e.g.:
        # (tdnn1): ReluBatchNormTdnnLayer(
        #          (affine): TdnnAffine()
        #          (activation): ReLU()
        #          (batchnorm): BatchNorm1d(512, eps=1e-05, momentum=0.5, affine=False, track_running_stats=True)
        if affine_type == "tdnn":
            self.affine = TdnnAffine(input_dim,
                                     output_dim,
                                     context=context,
                                     **affine_options)
        else:
            self.affine = ChunkSeparationAffine(input_dim,
                                                output_dim,
                                                context=context,
                                                **affine_options)

        self.add_relu_bn(output_dim, options=options)
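
        # Hedged usage sketch: a 512-unit layer over a +/-2 frame context, as the x-vector
        # examples further below build it; extra keys in **options (e.g. bn_params) pass through
        # to add_relu_bn. Illustrative only.
        # tdnn1 = ReluBatchNormTdnnLayer(40, 512, [-2, -1, 0, 1, 2],
        #                                bn_params={"momentum": 0.5, "affine": False})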
Example #3
def get_augmentation(aug=None, aug_params={}):
    default_aug_params = {
        "frequency": 0.2,
        "frame": 0.,
        "rows": 1,
        "cols": 0,
        "random_rows": False,
        "random_cols": False
    }

    aug_params = utils.assign_params_dict(default_aug_params, aug_params)

    if aug is None or aug == "" or aug == False:
        return None
    elif aug == "specaugment":
        return SpecAugment(frequency=aug_params["frequency"],
                           frame=aug_params["frame"],
                           rows=aug_params["rows"],
                           cols=aug_params["cols"],
                           random_rows=aug_params["random_rows"],
                           random_cols=aug_params["random_cols"])
    elif aug == "cutout":
        raise NotImplementedError
    else:
        raise TypeError("Do not support {} augmentation.".format(aug))
Example #4
    def __init__(self, package, stop_early=False):
        default_elements = {
            "data": None,
            "model": None,
            "optimizer": None,
            "lr_scheduler": None
        }
        default_params = {
            "model_dir": "",
            "model_blueprint": "",
            "exist_model": "",
            "start_epoch": 0,
            "epochs": 10,
            "use_gpu": True,
            "gpu_id": "",
            "benchmark": True,
            "max_change": 10.0,
            "compute_accuracy": True,
            "compute_valid_accuracy": True,
            "compute_one_batch_valid": False,
            "suffix": "params",
            "nan_debug": False,
            "use_tensorboard": True,
            "mixed_prec": False
        }

        elements, params = package
        self.elements = utils.assign_params_dict(default_elements, elements)
        self.params = utils.assign_params_dict(default_params,
                                               params,
                                               support_unknow=True)

        assert self.elements["data"] is not None
        assert self.elements["model"] is not None
        assert self.elements["optimizer"] is not None

        assert self.params["model_dir"] != ""
        assert self.params["model_blueprint"] != ""

        self.elements["model_forward"] = self.elements["model"]
        self.params["start_epoch"] = max(0, self.params["start_epoch"])

        if self.params["mixed_prec"] is True: self.scaler = GradScaler()

        self.stop_early = stop_early  # To do.
        self.training_point = (self.params["start_epoch"], 0,
                               self.elements["data"].num_batch_train)
    def add_relu_bn(self, output_dim=None, options: dict = {}):
        default_params = {
            "bn-relu": False,
            "nonlinearity": 'relu',
            "nonlinearity_params": {
                "inplace": True,
                "negative_slope": 0.01
            },
            "bn": True,
            "bn_params": {
                "momentum": 0.1,
                "affine": True,
                "track_running_stats": True
            },
            "special_init": True,
            "mode": 'fan_out'
        }

        default_params = utils.assign_params_dict(default_params, options)

        # This if/else keeps the submodule order correct when the model is printed.
        # torch.nn.Sequential is not used here to avoid extra layer wrappers, so the structure
        # stays as tdnn1.affine rather than tdnn1.layers.affine or tdnn1.layers[0], etc.
        if not default_params["bn-relu"]:
            # ReLU-BN order.
            # For speaker recognition, relu-bn seems to work better than bn-relu, and BatchNorm
            # without its affine (scale and shift) parameters tends to work better than with them.
            self.after_forward = self._relu_bn_forward
            self.activation = Nonlinearity(
                default_params["nonlinearity"],
                **default_params["nonlinearity_params"])
            if default_params["bn"]:
                self.batchnorm = torch.nn.BatchNorm1d(
                    output_dim, **default_params["bn_params"])
        else:
            # BN-ReLU
            self.after_forward = self._bn_relu_forward
            if default_params["bn"]:
                self.batchnorm = torch.nn.BatchNorm1d(
                    output_dim, **default_params["bn_params"])
            self.activation = Nonlinearity(
                default_params["nonlinearity"],
                **default_params["nonlinearity_params"])

        if default_params["special_init"] and self.affine is not None:
            if default_params["nonlinearity"] in [
                    "relu", "leaky_relu", "tanh", "sigmoid"
            ]:
                # Before special_init, TdnnAffine has already applied its own initialization,
                # which is equivalent to torch.nn.init.normal_(self.affine.weight, 0., 0.01) here.
                torch.nn.init.kaiming_uniform_(
                    self.affine.weight,
                    a=0,
                    mode=default_params["mode"],
                    nonlinearity=default_params["nonlinearity"])
            else:
                torch.nn.init.xavier_normal_(self.affine.weight, gain=1.0)
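
    # Hedged sketch: the two forward helpers selected by after_forward above are defined
    # elsewhere in the repo and are not part of this snippet; minimal versions consistent
    # with the bn-relu flag could look like this.
    def _relu_bn_forward(self, x):
        if self.activation is not None:
            x = self.activation(x)
        if getattr(self, "batchnorm", None) is not None:
            x = self.batchnorm(x)
        return x

    def _bn_relu_forward(self, x):
        if getattr(self, "batchnorm", None) is not None:
            x = self.batchnorm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x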
    def __init__(self, optimizer, params:dict={}):
        # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and 
        #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.)
        default_params = {
            "name":"warmR",
            "1cycle.learn_rate":0.001,
            "warmR.T_max":10,
            "warmR.T_mult":1,
            "warmR.factor":1.0,
            "warmR.eta_min":4e-8,
            "warmR.log_decay":False,
            "warmR.lr_decay_step":1,
            "reduceP.metric":'valid_acc',
            "reduceP.check_interval":0, 
            "reduceP.factor":0.1, 
            "reduceP.patience":10, 
            "reduceP.threshold":0.0001, 
            "reduceP.cooldown":0, 
            "reduceP.min_lr":0
        }

        used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
        split_params = utils.split_params(used_params)

        if isinstance(optimizer, Lookahead):
            base_optimizer = optimizer.optimizer
        else:
            base_optimizer = optimizer

        self.name = split_params["public"]["name"]
        if self.name == "1cycle":
            # To do.
            self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, **split_params["1cycle"])
        elif self.name == "warmR":
            T_max = split_params["warmR"].pop("T_max")
            self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
            self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
        elif self.name == "reduceP":
            self.check_interval = split_params["reduceP"].pop("check_interval")
            self.metric = split_params["reduceP"].pop("metric")
            if self.metric == "valid_acc":
                mode = "max"
            elif self.metric == "valid_loss":
                mode = "min"
            else:
                raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
            self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode, **split_params["reduceP"])
            self.init = False
            if utils.use_horovod():
                raise TypeError("Do not support ReduceLROnPlateau for multi-gpu of Horovod now.")
        else:
            raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
Example #7
    def init(self,
             inputs_dim,
             num_targets,
             aug_dropout=0.2,
             training=True,
             extracted_embedding="far",
             tdnn_layer_params={}):

        default_tdnn_layer_params = {
            "nonlinearity": 'relu',
            "bn-relu": False,
            "bn": True,
            "bn_params": {
                "momentum": 0.1,
                "affine": True,
                "track_running_stats": True
            }
        }
        tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params,
                                                     tdnn_layer_params)

        # Var
        self.extracted_embedding = extracted_embedding

        # Nnet
        self.aug_dropout = torch.nn.Dropout2d(
            p=aug_dropout) if aug_dropout > 0 else None

        self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim, 512, [-2, -1, 0, 1, 2],
                                            **tdnn_layer_params)
        self.tdnn2 = ReluBatchNormTdnnLayer(512, 512, [-2, 0, 2],
                                            **tdnn_layer_params)
        self.tdnn3 = ReluBatchNormTdnnLayer(512, 512, [-3, 0, 3],
                                            **tdnn_layer_params)
        self.tdnn4 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)
        self.tdnn5 = ReluBatchNormTdnnLayer(512, 1500, **tdnn_layer_params)
        self.stats = StatisticsPooling(1500, stddev=True)
        self.tdnn6 = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), 512,
                                            **tdnn_layer_params)
        self.tdnn7 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

        # Do not need when extracting embedding.
        if training:
            self.loss = SoftmaxLoss(512, num_targets)

            # An example of transfer learning without initializing the loss.affine parameters.
            self.transform_keys = [
                "tdnn1", "tdnn2", "tdnn3", "tdnn4", "tdnn5", "stats", "tdnn6",
                "tdnn7"
            ]
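
        # Hedged note: the matching forward is not shown in this snippet; it presumably follows
        # the layer order built above, i.e. tdnn1 -> ... -> tdnn5 -> stats -> tdnn6 -> tdnn7 -> loss,
        # with `extracted_embedding` selecting which segment-level output is used as the x-vector.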
Example #8
def get_dropout_from_wrapper(p=0., dropout_params={}):

    assert 0. <= p < 1.

    default_dropout_params = {
        "type": "default",  # default | random
        "start_p": 0.,
        "dim": 2,
        "method": "normal",
        "continuous": False,
        "inplace": True,
        "frequency": 0.2,
        "frame": 0.2,
        "rows": 1,
        "cols": 1,
        "random_rows": False,
        "random_cols": False
    }

    dropout_params = utils.assign_params_dict(default_dropout_params,
                                              dropout_params)
    name = dropout_params["type"]

    if p == 0:
        return None

    if name == "default":
        return get_default_dropout(p=p,
                                   dim=dropout_params["dim"],
                                   inplace=dropout_params["inplace"])
    elif name == "random":
        return RandomDropout(p=p,
                             start_p=dropout_params["start_p"],
                             dim=dropout_params["dim"],
                             method=dropout_params["method"],
                             inplace=dropout_params["inplace"])
    elif name == "alpha":
        return torch.nn.AlphaDropout(p=p, inplace=dropout_params["inplace"])
    elif name == "context":
        return ContextDropout(p=p)
    elif name == "noise":
        return NoiseDropout(p=p,
                            dim=dropout_params["dim"],
                            method=dropout_params["method"],
                            continuous=dropout_params["continuous"],
                            inplace=dropout_params["inplace"])
    else:
        raise TypeError(
            "Do not support {} dropout in current wrapper.".format(name))
Example #9
    def __init__(self, channels, context=[0], bias=False, scale=4, inplace=True,
                 affine_type="tdnn-affine", bn_params={}):
        super().__init__()
        default_bn_params = {"momentum": 0.1, "affine": True, "track_running_stats": True}
        bn_params = utils.assign_params_dict(default_bn_params, bn_params)

        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        self.convs = []
        self.bns = []
        for i in range(self.nums):
            self.convs.append(ReluBatchNormTdnnLayer(self.width, self.width, context, affine_type,
                                                     bias=bias, nonlinearity="", bn=False))
            self.bns.append(nn.BatchNorm1d(self.width, **bn_params))
        self.convs = nn.ModuleList(self.convs)
        self.bns = nn.ModuleList(self.bns)
        self.relu = nn.ReLU(inplace=inplace)
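
    def forward(self, x):
        # Hedged sketch: the real forward is not shown in this snippet; the version below follows
        # the usual Res2Net split-and-concatenate pattern implied by width/convs/bns built above.
        # x: [batch, channels, frames]
        spx = torch.split(x, self.width, dim=1)
        out = []
        for i in range(self.nums):
            sp = spx[i] if i == 0 else sp + spx[i]
            sp = self.relu(self.bns[i](self.convs[i](sp)))
            out.append(sp)
        if self.scale != 1:
            # The last split passes through unprocessed, as in the standard Res2Net block.
            out.append(spx[self.nums])
        return torch.cat(out, dim=1)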
    def __init__(self, optimizer, params: dict = {}):
        # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
        #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.)
        default_params = {
            "name": "warmR",
            "1cycle.learn_rate": 0.001,
            "warmR.T_max": 10,
            "warmR.T_mult": 1,
            "warmR.factor": 1.0,
            "warmR.eta_min": 4e-8,
            "warmR.log_decay": False,
            "warmR.lr_decay_step": 1
        }

        used_params = utils.assign_params_dict(default_params,
                                               params,
                                               force_check=False,
                                               support_unknow=True)
        split_params = utils.split_params(used_params)

        if isinstance(optimizer, Lookahead):
            base_optimizer = optimizer.optimizer
        else:
            base_optimizer = optimizer

        self.name = split_params["public"]["name"]
        if self.name == "1cycle":
            # To do.
            self.lr_scheduler = optim.lr_scheduler.OneCycleLR(
                base_optimizer, **split_params["1cycle"])
        elif self.name == "warmR":
            T_max = split_params["warmR"].pop("T_max")
            self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
            self.lr_scheduler = CosineAnnealingWarmRestarts(
                base_optimizer, T_max, **split_params["warmR"])
        else:
            raise ValueError(
                "Do not support {0} lr_scheduler now.".format(name))
Example #11
    def __init__(self, trainer):
        default_params = {
            "report_times_every_epoch": None,
            "report_interval_iters": 100,
            "record_file": "train.csv",
            "use_tensorboard": False
        }
        self.trainer = trainer
        default_params = utils.assign_params_dict(default_params,
                                                  self.trainer.params)

        if default_params["report_times_every_epoch"] is not None:
            self.report_interval_iters = max(
                1, self.trainer.training_point[2] //
                default_params["report_times_every_epoch"])
        else:
            self.report_interval_iters = default_params[
                "report_interval_iters"]

        if not self.trainer.params["debug"] and default_params[
                "use_tensorboard"]:
            # from tensorboardX import SummaryWriter
            from torch.utils.tensorboard import SummaryWriter
            model_name = os.path.basename(self.trainer.params["model_dir"])
            # time_string = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
            # time_string = self.trainer.params["time_string"]
            # self.board_writer = SummaryWriter("{}/log/{}-{}-tensorboard".format(self.trainer.params["model_dir"], model_name, time_string))
            # self.board_writer = SummaryWriter("{}/log/{}-{}-tensorboard".format(
            #     self.trainer.params["model_dir"], time_string, model_name))
            self.board_writer = SummaryWriter("{}/log/tensorboard".format(
                self.trainer.params["model_dir"]))
        else:
            self.board_writer = None

        self.epochs = self.trainer.params["epochs"]

        self.optimizer = self.trainer.elements["optimizer"]

        # For optimizer wrapper such as lookahead.
        # "None" is the default value
        if getattr(self.optimizer, "optimizer", None) is not None:
            self.optimizer = self.optimizer.optimizer

        self.device = "[{0}]".format(
            utils.get_device(self.trainer.elements["model"]))

        self.record_value = []

        self.start_write_log = False
        if not self.trainer.params["debug"] and default_params[
                "record_file"] != "" and default_params[
                    "record_file"] is not None:
            self.record_file = "{0}/log/{1}".format(
                self.trainer.params["model_dir"],
                default_params["record_file"])

            # The case to recover training
            if self.trainer.params["start_epoch"] > 0:
                # train.csv using append mode
                self.start_write_log = True
            elif os.path.exists(self.record_file):
                # Back up the old file to avoid clearing the loss log when re-running the same launcher.
                bk_file = "{0}.backup.{1}".format(
                    self.record_file,
                    time.strftime('%Y-%m-%d_%H:%M:%S',
                                  time.localtime(time.time())))
                shutil.move(self.record_file, bk_file)
        else:
            self.record_file = None

        # A format to show progress.
        # Do not use progressbar.Bar(marker="\x1b[32m█\x1b[39m") or progressbar.SimpleProgress(format='%(value_s)s/%(max_value_s)s'), to avoid an overly long status string.
        widgets = [
            progressbar.Percentage(format='%(percentage)3.2f%%'), " | ",
            "Epoch:",
            progressbar.Variable('current_epoch',
                                 format='{formatted_value}',
                                 width=0,
                                 precision=0), "/{0}, ".format(self.epochs),
            "Iter:",
            progressbar.Variable('current_iter',
                                 format='{formatted_value}',
                                 width=0,
                                 precision=0),
            "/{0}".format(self.trainer.training_point[2]), " (",
            progressbar.Timer(format='ELA: %(elapsed)s'), ", ",
            progressbar.AdaptiveETA(), ")"
        ]

        # total num of iter
        max_value = self.trainer.params[
            "epochs"] * self.trainer.training_point[2]

        self.bar = progressbar.ProgressBar(max_value=max_value,
                                           widgets=widgets,
                                           redirect_stdout=True)

        # Use multi-process for update.
        self.queue = Queue()
        self.process = Process(target=self._update, daemon=True)
        self.process.start()
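
        # Hedged note: self._update is defined elsewhere in the repo and is not part of this
        # snippet; it presumably consumes items from self.queue, refreshes self.bar and appends
        # rows to self.record_file when logging is enabled.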
    def init(self, inputs_dim, num_targets, num_phones, extend=False, skip_connection=False, 
             mixup=False, mixup_alpha=1.0,
             specaugment=False, specaugment_params={},
             aug_dropout=0., context_dropout=0., hidden_dropout=0., dropout_params={},
             SE=False, se_ratio=4,
             tdnn_layer_params={},
             tdnn6=True, tdnn7_params={},
             pooling="statistics", pooling_params={},
             margin_loss=False, margin_loss_params={},
             use_step=False, step_params={},
             transfer_from="softmax_loss",
             training=True, extracted_embedding="far",mt_alpha=0.1):

        ## Params.
        default_dropout_params = {
            "type":"default", # default | random
            "start_p":0.,
            "dim":2,
            "method":"uniform", # uniform | normals
            "continuous":False,
            "inplace":True
        }

        default_tdnn_layer_params = {
            "nonlinearity":'relu', "nonlinearity_params":{"inplace":True},
            "bn-relu":False, "bn":True, "bn_params":{"momentum":0.5, "affine":False, "track_running_stats":True}
        }

        default_pooling_params = {
            "num_nodes":1500,
            "num_head":1,
            "share":True,
            "affine_layers":1,
            "hidden_size":64,
            "context":[0],
            "stddev":True,
            "temperature":False, 
            "fixed":True,
            "stddev":True
        }

        default_margin_loss_params = {
            "method":"am", "m":0.2, 
            "feature_normalize":True, "s":30, 
            "double":False,
            "mhe_loss":False, "mhe_w":0.01,
            "inter_loss":0.,
            "ring_loss":0.,
            "curricular":False
        }

        default_step_params = {
            "T":None,
            "m":False, "lambda_0":0, "lambda_b":1000, "alpha":5, "gamma":1e-4,
            "s":False, "s_tuple":(30, 12), "s_list":None,
            "t":False, "t_tuple":(0.5, 1.2), 
            "p":False, "p_tuple":(0.5, 0.1)
        }

        dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
        tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
        # If a param is not specified, default it to the corresponding value in tdnn_layer_params.
        tdnn7_params = utils.assign_params_dict(tdnn_layer_params, tdnn7_params)
        pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
        margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params, step_params)

        ## Var.
        self.skip_connection = skip_connection
        self.use_step = use_step
        self.step_params = step_params

        self.extracted_embedding = extracted_embedding # For extract.

        self.mt_alpha = mt_alpha
        
        ## Nnet.
        # Head
        self.mixup = Mixup(alpha=mixup_alpha) if mixup else None
        self.specaugment = SpecAugment(**specaugment_params) if specaugment else None
        self.aug_dropout = get_dropout_from_wrapper(aug_dropout, dropout_params)
        self.context_dropout = ContextDropout(p=context_dropout) if context_dropout > 0 else None
        self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout, dropout_params)

        # Frame level
        self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim,512,[-2,-1,0,1,2], **tdnn_layer_params)
        self.se1 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn1 = ReluBatchNormTdnnLayer(512,512, **tdnn_layer_params) if extend else None
        self.tdnn2 = ReluBatchNormTdnnLayer(512,512,[-2,0,2], **tdnn_layer_params)
        self.se2 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn2 = ReluBatchNormTdnnLayer(512,512, **tdnn_layer_params) if extend else None
        self.tdnn3 = ReluBatchNormTdnnLayer(512,512,[-3,0,3], **tdnn_layer_params)
        self.se3 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn3 = ReluBatchNormTdnnLayer(512,512, **tdnn_layer_params) if extend else None
        self.ex_tdnn4 = ReluBatchNormTdnnLayer(512,512,[-4,0,4], **tdnn_layer_params) if extend else None
        self.se4 = SEBlock(512, ratio=se_ratio) if SE and extend else None
        self.ex_tdnn5 = ReluBatchNormTdnnLayer(512,512, **tdnn_layer_params) if extend else None
        self.tdnn4 = ReluBatchNormTdnnLayer(512,512, **tdnn_layer_params)

        num_nodes = pooling_params.pop("num_nodes")

        self.tdnn5 = ReluBatchNormTdnnLayer(512, num_nodes, **tdnn_layer_params)
        
        #Zheng Li 2021-06-08
        self.phonetic_tdnn5 = ReluBatchNormTdnnLayer(512,512,**tdnn_layer_params)
        self.phonetic_tdnn6 = ReluBatchNormTdnnLayer(512,512,**tdnn_layer_params)
        self.phonetic_tdnn7 = ReluBatchNormTdnnLayer(512,512,**tdnn_layer_params)

        # Pooling
        stddev = pooling_params.pop("stddev")
        if pooling == "lde":
            self.stats = LDEPooling(num_nodes, c_num=pooling_params["num_head"])
        elif pooling == "attentive":
            self.stats = AttentiveStatisticsPooling(num_nodes, affine_layers=pooling_params["affine_layers"], 
                                                    hidden_size=pooling_params["hidden_size"], 
                                                    context=pooling_params["context"], stddev=stddev)
        elif pooling == "multi-head":
            self.stats = MultiHeadAttentionPooling(num_nodes, stddev=stddev, **pooling_params)
        elif pooling == "multi-resolution":
            self.stats = MultiResolutionMultiHeadAttentionPooling(num_nodes, **pooling_params)
        else:
            self.stats = StatisticsPooling(num_nodes, stddev=stddev)

        stats_dim = self.stats.get_output_dim()

        # Segment level
        if tdnn6:
            self.tdnn6 = ReluBatchNormTdnnLayer(stats_dim, 512, **tdnn_layer_params)
            tdnn7_dim = 512
        else:
            self.tdnn6 = None
            tdnn7_dim = stats_dim

        if tdnn7_params["nonlinearity"] == "default":
            tdnn7_params["nonlinearity"] = tdnn_layer_params["nonlinearity"]

        self.tdnn7 = ReluBatchNormTdnnLayer(tdnn7_dim,512, **tdnn7_params)

        # Loss
        # Do not need when extracting embedding.
        if training :
            if margin_loss:
                self.loss = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
                # Zheng Li 2021-06-08
                self.loss_spk = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
                self.loss_phone = SoftmaxLoss_frame_phone_fix(512, num_phones)
            else:
                # Zheng Li 2021-06-08
                self.loss_spk = SoftmaxLoss(512, num_targets)
                self.loss_phone = SoftmaxLoss_frame_phone_fix(512, num_phones)

            self.wrapper_loss = MixupLoss(self.loss, self.mixup) if mixup else None

            # An example of transfer learning without initializing the loss.affine parameters.
            self.transform_keys = ["tdnn1","tdnn2","tdnn3","tdnn4","tdnn5","stats","tdnn6","tdnn7",
                                   "ex_tdnn1","ex_tdnn2","ex_tdnn3","ex_tdnn4","ex_tdnn5",
                                   "se1","se2","se3","se4","loss"]

            if margin_loss and transfer_from == "softmax_loss":
                # For softmax_loss to am_softmax_loss
                self.rename_transform_keys = {"loss.affine.weight":"loss.weight"} 
    def init(self, inputs_dim, num_targets, aug_dropout=0., tail_dropout=0., training=True, extracted_embedding="near", 
             resnet_params={}, pooling="statistics", pooling_params={}, fc1=False, fc1_params={}, fc2_params={}, margin_loss=False, margin_loss_params={},
             use_step=False, step_params={}, transfer_from="softmax_loss"):

        ## Params.
        default_resnet_params = {
            "head_conv":True, "head_conv_params":{"kernel_size":3, "stride":1, "padding":1},
            "head_maxpool":False, "head_maxpool_params":{"kernel_size":3, "stride":1, "padding":1},
            "block":"BasicBlock",
            "layers":[3, 4, 6, 3],
            "planes":[32, 64, 128, 256], # a.k.a channels.
            "convXd":2,
            "norm_layer_params":{"momentum":0.5, "affine":True},
            "full_pre_activation":True,
            "zero_init_residual":False
            }

        default_pooling_params = {
            "num_head":1,
            "hidden_size":64,
            "share":True,
            "affine_layers":1,
            "context":[0],
            "stddev":True,
            "temperature":False, 
            "fixed":True
        }
        
        default_fc_params = {
            "nonlinearity":'relu', "nonlinearity_params":{"inplace":True},
            "bn-relu":False, 
            "bn":True, 
            "bn_params":{"momentum":0.5, "affine":True, "track_running_stats":True}
            }

        default_margin_loss_params = {
            "method":"am", "m":0.2, "feature_normalize":True, 
            "s":30, "mhe_loss":False, "mhe_w":0.01
            }
        
        default_step_params = {
            "T":None,
            "m":False, "lambda_0":0, "lambda_b":1000, "alpha":5, "gamma":1e-4,
            "s":False, "s_tuple":(30, 12), "s_list":None,
            "t":False, "t_tuple":(0.5, 1.2), 
            "p":False, "p_tuple":(0.5, 0.1)
            }

        resnet_params = utils.assign_params_dict(default_resnet_params, resnet_params)
        pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
        fc1_params = utils.assign_params_dict(default_fc_params, fc1_params)
        fc2_params = utils.assign_params_dict(default_fc_params, fc2_params)
        margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params, step_params)

        ## Var.
        self.extracted_embedding = extracted_embedding # Only the "near" position is used here.
        self.use_step = use_step
        self.step_params = step_params
        self.convXd = resnet_params["convXd"]
        
        ## Nnet.
        self.aug_dropout = torch.nn.Dropout2d(p=aug_dropout) if aug_dropout > 0 else None

        # [batch, 1, feats-dim, frames] for 2d and [batch, feats-dim, frames] for 1d.
        # Keep the channel/plane axis at dim 1 of the tensor (0-indexed).
        inplanes = 1 if self.convXd == 2 else inputs_dim
        self.resnet = ResNet(inplanes, **resnet_params)

        # This is a ceiling division: ceil(inputs_dim / downsample_multiple) * output_planes for convXd == 2.
        resnet_output_dim = (inputs_dim + self.resnet.get_downsample_multiple() - 1) // self.resnet.get_downsample_multiple() \
                            * self.resnet.get_output_planes() if self.convXd == 2 else self.resnet.get_output_planes()
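        # Hedged worked example (values assumed for illustration only): with inputs_dim=80,
        # a downsample multiple of 8 and 256 output planes for convXd == 2,
        # resnet_output_dim = (80 + 8 - 1) // 8 * 256 = 10 * 256 = 2560.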

        # Pooling
        stddev = pooling_params.pop("stddev")
        if pooling == "lde":
            self.stats = LDEPooling(resnet_output_dim, c_num=pooling_params["num_head"])
        elif pooling == "attentive":
            self.stats = AttentiveStatisticsPooling(resnet_output_dim, hidden_size=pooling_params["hidden_size"], 
                                                    context=pooling_params["context"], stddev=stddev)
        elif pooling == "multi-head":
            self.stats = MultiHeadAttentionPooling(resnet_output_dim, stddev=stddev, **pooling_params)
        elif pooling == "multi-resolution":
            self.stats = MultiResolutionMultiHeadAttentionPooling(resnet_output_dim, **pooling_params)
        else:
            self.stats = StatisticsPooling(resnet_output_dim, stddev=stddev)

        self.fc1 = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), resnet_params["planes"][3], **fc1_params) if fc1 else None

        if fc1:
            fc2_in_dim = resnet_params["planes"][3]
        else:
            fc2_in_dim = self.stats.get_output_dim()

        self.fc2 = ReluBatchNormTdnnLayer(fc2_in_dim, resnet_params["planes"][3], **fc2_params)

        self.tail_dropout = torch.nn.Dropout2d(p=tail_dropout) if tail_dropout > 0 else None

        ## Do not need when extracting embedding.
        if training :
            if margin_loss:
                self.loss = MarginSoftmaxLoss(resnet_params["planes"][3], num_targets, **margin_loss_params)
            else:
                self.loss = SoftmaxLoss(resnet_params["planes"][3], num_targets)

            # An example of transfer learning without initializing the loss.affine parameters.
            self.transform_keys = ["resnet", "stats", "fc1", "fc2"]

            if margin_loss and transfer_from == "softmax_loss":
                # For softmax_loss to am_softmax_loss
                self.rename_transform_keys = {"loss.affine.weight":"loss.weight"} 
Example #14
def get_optimizer(model, params: dict = {}):
    # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
    #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.)
    default_params = {
        "name": "adamW",
        "learn_rate": 0.001,
        "beta1": 0.9,
        "beta2": 0.999,
        "beta3": 0.999,
        "weight_decay": 1e-4,
        "lookahead.k": 5,
        "lookahead.alpha": 0.,
        "gc": False
    }

    used_params = utils.assign_params_dict(default_params, params)

    # Base params
    name = used_params["name"]
    learn_rate = used_params["learn_rate"]
    beta1 = used_params["beta1"]
    beta2 = used_params["beta2"]
    beta3 = used_params["beta3"]
    weight_decay = used_params["weight_decay"]
    gc = used_params["gc"]

    extra_params = {}

    # Gradient centralization:
    # Yong, H., Huang, J., Hua, X., & Zhang, L. (2020). Gradient Centralization:
    #     A New Optimization Technique for Deep Neural Networks. arXiv e-prints, arXiv:2004.01461.
    #     Retrieved from https://ui.adsabs.harvard.edu/abs/2020arXiv200401461Y
    # Github: https://github.com/Yonghongwei/Gradient-Centralization
    if gc:
        # Specify this list by developer.
        default_support_gc_list = ["adamW", "ralamb"]

        if name not in default_support_gc_list:
            raise TypeError(
                "Optimizer {} does not support gradient centralization (GC) now."
                .format(name))

        extra_params["gc"] = True

    # Select optimizer
    if name == "sgd":
        base_optimizer = optim.SGD(model.parameters(),
                                   lr=learn_rate,
                                   momentum=beta1,
                                   weight_decay=weight_decay)
    elif name == "sgdW":
        base_optimizer = SGDW(model.parameters(),
                              lr=learn_rate,
                              momentum=beta1,
                              weight_decay=weight_decay)
    elif name == "adam":
        base_optimizer = optim.Adam(model.parameters(),
                                    lr=learn_rate,
                                    betas=(beta1, beta2),
                                    weight_decay=weight_decay)
    elif name == "adamW":
        base_optimizer = AdamW(model.parameters(),
                               lr=learn_rate,
                               betas=(beta1, beta2),
                               weight_decay=weight_decay,
                               **extra_params)
    elif name == "radam":
        base_optimizer = RAdam(model.parameters(),
                               lr=learn_rate,
                               betas=(beta1, beta2),
                               weight_decay=weight_decay)
    elif name == "ralamb":
        base_optimizer = Ralamb(model.parameters(),
                                lr=learn_rate,
                                betas=(beta1, beta2),
                                weight_decay=weight_decay,
                                **extra_params)
    elif name == "adamod":
        base_optimizer = AdaMod(model.parameters(),
                                lr=learn_rate,
                                betas=(beta1, beta2),
                                beta3=beta3,
                                weight_decay=weight_decay)
    elif name == "novograd":
        base_optimizer = Novograd(model.parameters(),
                                  lr=learn_rate,
                                  betas=(beta1, beta2),
                                  weight_decay=weight_decay)
    else:
        raise ValueError("Do not support {0} optimizer now.".format(name))

    # Using alpha to decide whether to use lookahead
    if used_params["lookahead.alpha"] > 0:
        logger.info("Use lookahead optimizer with alpha={} and k={}".format(
            used_params["lookahead.alpha"], used_params["lookahead.k"]))
        optimizer = Lookahead(base_optimizer,
                              k=used_params["lookahead.k"],
                              alpha=used_params["lookahead.alpha"])
    else:
        optimizer = base_optimizer

    return optimizer
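
# Hedged usage sketch: an AdamW optimizer with lookahead disabled (lookahead.alpha defaults to 0);
# `model` is any torch.nn.Module and the key names follow default_params above.
# optimizer = get_optimizer(model, {"name": "adamW", "learn_rate": 1e-3, "weight_decay": 1e-1})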
Example #15
    def __init__(self, optimizer, params:dict={}):
        # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and 
        #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.)
        default_params = {
            "name":"warmR",

            "cyclic.max_lr":1e-3,
            "cyclic.base_lr":1e-8,
            "cyclic.step_size_up":2e4,
            "cyclic.step_size_down":None,
            "cyclic.mode":'triangular2', 
            "cyclic.gamma":1.0, 
            "cyclic.scale_fn":None, 
            "cyclic.scale_mode":'cycle', 
            "cyclic.cycle_momentum":False, 
            "cyclic.base_momentum":0.8, 
            "cyclic.max_momentum":0.9,

            "1cycle.learn_rate":0.001,
            "1cycle.total_steps":None,
            "1cycle.epochs":None,
            "1cycle.steps_per_epoch":None,
            "1cycle.pct_start":0.3,
            "1cycle.anneal_strategy":'linear',
            "1cycle.cycle_momentum":False,
            "1cycle.base_momentum":0.85,
            "1cycle.max_momentum":0.95,
            "1cycle.div_factor":25.0,
            "1cycle.final_div_factor":10000.0,

            "warmR.T_max":10,
            "warmR.T_mult":1,
            "warmR.factor":1.0,
            "warmR.eta_min":4e-8,
            "warmR.log_decay":False,
            "warmR.lr_decay_step":1,

            "reduceP.metric":'valid_acc',
            "reduceP.check_interval":0, 
            "reduceP.factor":0.5, 
            "reduceP.patience":10, 
            "reduceP.threshold":0.0001, 
            "reduceP.cooldown":0, 
            "reduceP.min_lr":0.
        }

        used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
        split_params = utils.split_params(used_params)

        if isinstance(optimizer, Lookahead):
            base_optimizer = optimizer.optimizer
        else:
            base_optimizer = optimizer

        self.name = split_params["public"]["name"]
        if self.name == "cyclic":
            base_lr = split_params["cyclic"].pop("base_lr")
            max_lr = split_params["cyclic"].pop("max_lr")
            self.lr_scheduler = torch.optim.lr_scheduler.CyclicLR(base_optimizer, base_lr, max_lr, **split_params["cyclic"])
        elif self.name == "1cycle":
            max_lr = split_params["1cycle"].pop("learn_rate")
            self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, max_lr, **split_params["1cycle"])
        elif self.name == "warmR":
            T_max = split_params["warmR"].pop("T_max")
            self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
            self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
        elif self.name == "reduceP":
            self.check_interval = split_params["reduceP"].pop("check_interval")
            self.metric = split_params["reduceP"].pop("metric")
            self.min_lr = split_params["reduceP"]["min_lr"]
            if self.metric == "valid_acc":
                mode = "max"
            elif self.metric == "valid_loss":
                mode = "min"
            else:
                raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
            self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode, **split_params["reduceP"])
            self.init = False
            if utils.use_horovod():
                raise TypeError("Do not support ReduceLROnPlateau for multi-gpu of Horovod now.")
        else:
            raise ValueError("Do not support {0} lr_scheduler now.".format(name))
    def init(self,
             inputs_dim,
             num_targets,
             mixup=False,
             mixup_alpha=1.0,
             specaugment=False,
             specaugment_params={},
             aug_dropout=0.,
             context_dropout=0.,
             hidden_dropout=0.,
             dropout_params={},
             xvector_params={},
             pooling="statistics",
             pooling_params={},
             fc_params={},
             margin_loss=False,
             margin_loss_params={},
             use_step=False,
             step_params={},
             transfer_from="softmax_loss",
             training=True):

        ## Params.
        default_dropout_params = {
            "type": "default",  # default | random
            "start_p": 0.,
            "dim": 2,
            "method": "uniform",  # uniform | normals
            "continuous": False,
            "inplace": True
        }

        default_xvector_params = {
            "init_dim": 128,
            "layers": [6, 12],
            "growth_rate": 64,
            "bn_scale": 2,
            "nonlinearity": "relu",
            "memory_efficient": True
        }

        default_pooling_params = {
            "num_head": 1,
            "hidden_size": 64,
            "share": True,
            "affine_layers": 1,
            "context": [0],
            "stddev": True,
            "temperature": False,
            "fixed": True
        }

        default_fc_params = {
            "nonlinearity": 'relu',
            "nonlinearity_params": {
                "inplace": True
            },
            "bn-relu": False,
            "bn": True,
            "bn_params": {
                "momentum": 0.5,
                "affine": True,
                "track_running_stats": True
            }
        }

        default_margin_loss_params = {
            "method": "am",
            "m": 0.2,
            "feature_normalize": True,
            "s": 30,
            "double": False,
            "mhe_loss": False,
            "mhe_w": 0.01,
            "inter_loss": 0.,
            "ring_loss": 0.,
            "curricular": False
        }

        default_step_params = {
            "T": None,
            "m": False,
            "lambda_0": 0,
            "lambda_b": 1000,
            "alpha": 5,
            "gamma": 1e-4,
            "s": False,
            "s_tuple": (30, 12),
            "s_list": None,
            "t": False,
            "t_tuple": (0.5, 1.2),
            "p": False,
            "p_tuple": (0.5, 0.1)
        }

        dropout_params = utils.assign_params_dict(default_dropout_params,
                                                  dropout_params)
        xvector_params = utils.assign_params_dict(default_xvector_params,
                                                  xvector_params)
        pooling_params = utils.assign_params_dict(default_pooling_params,
                                                  pooling_params)
        fc_params = utils.assign_params_dict(default_fc_params, fc_params)
        margin_loss_params = utils.assign_params_dict(
            default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params,
                                               step_params)

        ## Var.
        self.use_step = use_step
        self.step_params = step_params

        ## Nnet
        # Head
        self.mixup = Mixup(alpha=mixup_alpha) if mixup else None
        self.specaugment = SpecAugment(
            **specaugment_params) if specaugment else None
        self.aug_dropout = get_dropout_from_wrapper(aug_dropout,
                                                    dropout_params)
        self.context_dropout = ContextDropout(
            p=context_dropout) if context_dropout > 0 else None
        self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout,
                                                       dropout_params)

        # Frame level
        in_dim = xvector_params["init_dim"]
        layers = xvector_params["layers"]
        out_dim = xvector_params["growth_rate"]
        bn_dim = out_dim * xvector_params["bn_scale"]
        nonlinearity = xvector_params["nonlinearity"]
        memory_efficient = xvector_params["memory_efficient"]
        options = {"bias": False, "bn-relu": True}
        self.tdnn = ReluBatchNormTdnnLayer(inputs_dim,
                                           in_dim, [-2, -1, 0, 1, 2],
                                           nonlinearity=nonlinearity,
                                           **options)
        self.dense_block1 = DTdnnBlock(layers[0],
                                       in_dim,
                                       out_dim,
                                       bn_dim, [-1, 0, 1],
                                       memory_efficient,
                                       nonlinearity=nonlinearity,
                                       **options)
        in_dim += layers[0] * out_dim
        self.transit1 = ReluBatchNormTdnnLayerR(in_dim,
                                                in_dim // 2,
                                                nonlinearity=nonlinearity,
                                                **options)
        in_dim //= 2
        self.dense_block2 = DTdnnBlock(layers[1],
                                       in_dim,
                                       out_dim,
                                       bn_dim, [-3, 0, 3],
                                       memory_efficient,
                                       nonlinearity=nonlinearity,
                                       **options)
        in_dim += layers[1] * out_dim
        self.transit2 = ReluBatchNormTdnnLayerR(in_dim,
                                                in_dim // 2,
                                                nonlinearity=nonlinearity,
                                                **options)
        in_dim //= 2
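        # Worked example with the defaults above (init_dim=128, layers=[6, 12], growth_rate=64):
        # 128 -> 128 + 6*64 = 512 -> 256 after transit1 -> 256 + 12*64 = 1024 -> 512 after transit2.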

        # Pooling
        stddev = pooling_params.pop("stddev")
        if pooling == "lde":
            self.stats = LDEPooling(in_dim, c_num=pooling_params["num_head"])
        elif pooling == "attentive":
            self.stats = AttentiveStatisticsPooling(
                in_dim,
                affine_layers=pooling_params["affine_layers"],
                hidden_size=pooling_params["hidden_size"],
                context=pooling_params["context"],
                stddev=stddev)
        elif pooling == "multi-head":
            self.stats = MultiHeadAttentionPooling(in_dim,
                                                   stddev=stddev,
                                                   **pooling_params)
        elif pooling == "multi-resolution":
            self.stats = MultiResolutionMultiHeadAttentionPooling(
                in_dim, **pooling_params)
        else:
            self.stats = StatisticsPooling(in_dim, stddev=stddev)

        # Segment level
        self.fc = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), 512,
                                         **fc_params)

        # Loss
        # Do not need when extracting embedding.
        if training:
            if margin_loss:
                self.loss = MarginSoftmaxLoss(512, num_targets,
                                              **margin_loss_params)
            else:
                self.loss = SoftmaxLoss(512, num_targets)

            self.wrapper_loss = MixupLoss(self.loss,
                                          self.mixup) if mixup else None

            # An example of transfer learning without initializing the loss.affine parameters.
            self.transform_keys = [
                "tdnn", "block1", "transit1", "block2", "transit2", "stats",
                "fc", "loss"
            ]

            if margin_loss and transfer_from == "softmax_loss":
                # For softmax_loss to am_softmax_loss
                self.rename_transform_keys = {
                    "loss.affine.weight": "loss.weight"
                }
    def init(self, inputs_dim, num_targets, channels=512, embd_dim=192, 
             aug_dropout=0., tail_dropout=0., training=True,
             extracted_embedding="near", mixup=False, mixup_alpha=1.0,
             pooling="ecpa-attentive", pooling_params={}, fc1=False, fc1_params={}, fc2_params={},
             margin_loss=True, margin_loss_params={}, use_step=False, step_params={}, transfer_from="softmax_loss"):
        

        default_pooling_params = {
            "num_head":1,
            "hidden_size":64,
            "share":True,
            "affine_layers":1,
            "context":[0],
            "stddev":True,
            "temperature":False, 
            "fixed":True
        }

        default_fc_params = {
            "nonlinearity":'relu', "nonlinearity_params":{"inplace":True},
            "bn-relu":False, 
            "bn":True, 
            "bn_params":{"momentum":0.5, "affine":True, "track_running_stats":True}
            }


        default_margin_loss_params = {
            "method":"am", "m":0.2, 
            "feature_normalize":True, "s":30, 
            "double":False,
            "mhe_loss":False, "mhe_w":0.01,
            "inter_loss":0.,
            "ring_loss":0.,
            "curricular":False}

        default_step_params = {
            "T":None,
            "m":False, "lambda_0":0, "lambda_b":1000, "alpha":5, "gamma":1e-4,
            "s":False, "s_tuple":(30, 12), "s_list":None,
            "t":False, "t_tuple":(0.5, 1.2), 
            "p":False, "p_tuple":(0.5, 0.1)
        }

        self.use_step = use_step
        self.step_params = step_params
        self.extracted_embedding = extracted_embedding 

        pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
        fc1_params = utils.assign_params_dict(default_fc_params, fc1_params)
        fc2_params = utils.assign_params_dict(default_fc_params, fc2_params)
        margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params, step_params)


        self.mixup = Mixup(alpha=mixup_alpha) if mixup else None

        self.layer1 = Conv1dReluBn(inputs_dim, channels, kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
        self.bn_conv = nn.BatchNorm1d(cat_channels)
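        # Hedged note: with the default channels=512, cat_channels = 1536; the forward pass
        # (not shown here) presumably concatenates the outputs of layer2-4 before this 1x1 conv,
        # in the ECAPA-TDNN style.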

        # Pooling
        stddev = pooling_params.pop("stddev")
        if pooling == "attentive":
            self.stats = AttentiveStatisticsPooling(cat_channels, hidden_size=pooling_params["hidden_size"],context=pooling_params["context"], stddev=stddev)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
        elif pooling == "ecpa-attentive":
            self.stats = AttentiveStatsPool(cat_channels,128)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
        elif pooling == "multi-head":
            self.stats = MultiHeadAttentionPooling(cat_channels, stddev=stddev, **pooling_params)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
        elif pooling == "global-multi":
            self.stats = GlobalMultiHeadAttentionPooling(cat_channels,stddev=stddev, **pooling_params)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2* pooling_params["num_head"])
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2* pooling_params["num_head"], embd_dim, **fc1_params) if fc1 else None
        elif pooling == "multi-resolution":
            self.stats = MultiResolutionMultiHeadAttentionPooling(cat_channels, **pooling_params)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2* pooling_params["num_head"])
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2* pooling_params["num_head"], embd_dim, **fc1_params) if fc1 else None

        else:
            self.stats = StatisticsPooling(cat_channels, stddev=stddev)
            self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
            self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None

        if fc1:
            fc2_in_dim = embd_dim
        else:
            fc2_in_dim = cat_channels * 2
        self.fc2 = ReluBatchNormTdnnLayer(fc2_in_dim, embd_dim, **fc2_params)
        self.tail_dropout = torch.nn.Dropout2d(p=tail_dropout) if tail_dropout > 0 else None

        # Loss
        # Do not need when extracting embedding.
        if training :
            if margin_loss:
                self.loss = MarginSoftmaxLoss(embd_dim, num_targets, **margin_loss_params)
            else:
                self.loss = SoftmaxLoss(embd_dim, num_targets)
                # self.loss = AngleLoss(embd_dim,num_targets)
            self.wrapper_loss = MixupLoss(self.loss, self.mixup) if mixup else None
            # An example of transfer learning without initializing the loss.affine parameters.
            self.transform_keys = ["layer2","layer3","layer4","conv","stats","fc1","fc2"]

            if margin_loss and transfer_from == "softmax_loss":
                # For softmax_loss to am_softmax_loss
                self.rename_transform_keys = {"loss.affine.weight":"loss.weight"}
Example #18
    def init(self, inputs_dim, num_targets, channels=512, emb_dim=192,
             tdnn_layer_params={}, layer5_params={}, layer6=False, layer7_params={},
             margin_loss=False, margin_loss_params={}, pooling="statistics",
             use_step=False, step_params={}, training=True, extracted_embedding="near"):
        default_tdnn_layer_params = {
            "affine_type": 'tdnn-affine',
            "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
            "bn-relu": False, "bn": True, "bn_params": {"momentum": 0.5, "affine": False, "track_running_stats": True}
        }
        default_layer5_params = {"nonlinearity": 'relu', "bn": False}
        default_layer7_params = {"nonlinearity": '', "bn": True}
        default_margin_loss_params = {
            "method": "am", "m": 0.2,
            "feature_normalize": True, "s": 30,
            "double": False,
            "mhe_loss": False, "mhe_w": 0.01,
            "inter_loss": 0.,
            "ring_loss": 0.,
            "curricular": False
        }

        default_step_params = {
            "T": None,
            "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
            "s": False, "s_tuple": (30, 12), "s_list": None,
            "t": False, "t_tuple": (0.5, 1.2),
            "p": False, "p_tuple": (0.5, 0.1)
        }

        tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
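        # Layer-specific params fill their own defaults first and then fall back to the
        # shared tdnn defaults for any keys still missing (see the merging sketch after this example).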
        layer5_params = utils.assign_params_dict(default_layer5_params, layer5_params)
        layer5_params = utils.assign_params_dict(default_tdnn_layer_params, layer5_params)
        layer7_params = utils.assign_params_dict(default_layer7_params, layer7_params)
        layer7_params = utils.assign_params_dict(default_tdnn_layer_params, layer7_params)
        margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params, step_params)

        self.use_step = use_step
        self.step_params = step_params
        self.extracted_embedding = extracted_embedding  # For extract.

        self.layer1 = ReluBatchNormTdnnLayer(inputs_dim, channels, [-2, -1, 0, 1, 2], **tdnn_layer_params)
        # SE_Res2Block(channels, context, scale, tdnn_layer_params); the context list replaces
        # the kernel_size/stride/padding/dilation arguments of the reference ECAPA block.
        self.layer2 = SE_Res2Block(channels, [-2, 0, 2], 8, tdnn_layer_params)
        self.layer3 = SE_Res2Block(channels, [-3, 0, 3], 8, tdnn_layer_params)
        self.layer4 = SE_Res2Block(channels, [-4, 0, 4], 8, tdnn_layer_params)

        cat_channels = channels * 3
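        # channels * 3: the frame-level outputs of layer2/3/4 are presumably concatenated
        # before layer5 in the forward pass (not shown in this excerpt).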
        self.layer5 = ReluBatchNormTdnnLayer(cat_channels, cat_channels, **layer5_params)

        if pooling == "attention":
            self.pooling = AttentiveStatsPool(cat_channels, 128, tdnn_layer_params["affine_type"])
        else:
            self.pooling = StatisticsPooling(cat_channels, stddev=True)

        # self.bn1 = nn.BatchNorm1d(cat_channels * 2, **tdnn_layer_params["bn_params"])

        # Segment level
        if layer6:
            self.layer6 = ReluBatchNormTdnnLayer(cat_channels * 2, 512, **tdnn_layer_params)
            layer7_dim = 512
        else:
            self.layer6 = None
            layer7_dim = cat_channels * 2
        self.layer7 = ReluBatchNormTdnnLayer(layer7_dim, emb_dim, **layer7_params)

        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')

        if training:
            if margin_loss:
                self.loss = MarginSoftmaxLoss(emb_dim, num_targets, **margin_loss_params)
            else:
                self.loss = SoftmaxLoss(emb_dim, num_targets, affine_type=tdnn_layer_params["affine_type"])
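
The repeated utils.assign_params_dict(...) calls above merge caller-supplied options over a dict of defaults. The helper itself is not shown in this excerpt, but its behaviour appears to be roughly the following sketch (not the actual implementation):

def assign_params_dict(default_params, params):
    # Sketch only: return `params` with any missing keys filled from `default_params`
    # (caller-supplied values always win). The real helper may also merge nested dicts.
    merged = dict(default_params)
    merged.update(params)
    return merged

Applied twice, as for layer5_params and layer7_params above, this first fills the layer-specific defaults and then any keys still missing from the shared tdnn defaults.
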
Exemple #19
0
    def init(self, inputs_dim, num_targets, extend=False, skip_connection=False,
             aug_dropout=0., context_dropout=0., hidden_dropout=0., dropout_params={},
             SE=False, se_ratio=4,
             tdnn_layer_params={},
             tdnn6=True, tdnn7_params={},
             attentive_pooling=False, attentive_pooling_params={"hidden_size":64, "stddev_attention":False},
             LDE_pooling=False, LDE_pooling_params={"c_num":64, "nodes":128},
             focal_loss=False, focal_loss_params={"gamma":2},
             margin_loss=False, margin_loss_params={},
             use_step=False, step_params={},
             transfer_from="softmax_loss",
             training=True, extracted_embedding="far"):

        ## Params.
        default_dropout_params = {
            "type":"default", # default | random
            "start_p":0.,
            "dim":2,
            "method":"uniform", # uniform | normals
            "continuous":False,
            "inplace":True
        }

        default_tdnn_layer_params = {
            "nonlinearity":'relu', "nonlinearity_params":{"inplace":True},
            "bn-relu":False, "bn":True, "bn_params":{"momentum":0.5, "affine":False, "track_running_stats":True}
        }

        default_margin_loss_params = {
            "method":"am", "m":0.2, 
            "feature_normalize":True, "s":30, 
            "double":False,
            "mhe_loss":False, "mhe_w":0.01,
            "inter_loss":0.,
            "ring_loss":0.,
            "curricular":False
        }

        default_step_params = {
            "T":None,
            "m":False, "lambda_0":0, "lambda_b":1000, "alpha":5, "gamma":1e-4,
            "s":False, "s_tuple":(30, 12), "s_list":None,
            "t":False, "t_tuple":(0.5, 1.2), 
            "p":False, "p_tuple":(0.5, 0.1)
        }

        dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
        tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
        # If a param is not specified, default it according to tdnn_layer_params.
        tdnn7_params = utils.assign_params_dict(tdnn_layer_params, tdnn7_params) 
        margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
        step_params = utils.assign_params_dict(default_step_params, step_params)

        ## Var.
        self.skip_connection = skip_connection
        self.use_step = use_step
        self.step_params = step_params

        self.extracted_embedding = extracted_embedding # For extract.
        
        ## Nnet.
        # Head
        self.aug_dropout = get_dropout_from_wrapper(aug_dropout, dropout_params)
        self.context_dropout = ContextDropout(p=context_dropout) if context_dropout > 0 else None
        self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout, dropout_params)

        # Frame level
        self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim, 512, [-2, -1, 0, 1, 2], **tdnn_layer_params)
        self.se1 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn1 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
        self.tdnn2 = ReluBatchNormTdnnLayer(512, 512, [-2, 0, 2], **tdnn_layer_params)
        self.se2 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn2 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
        self.tdnn3 = ReluBatchNormTdnnLayer(512, 512, [-3, 0, 3], **tdnn_layer_params)
        self.se3 = SEBlock(512, ratio=se_ratio) if SE else None
        self.ex_tdnn3 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
        self.ex_tdnn4 = ReluBatchNormTdnnLayer(512, 512, [-4, 0, 4], **tdnn_layer_params) if extend else None
        self.se4 = SEBlock(512, ratio=se_ratio) if SE and extend else None
        self.ex_tdnn5 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
        self.tdnn4 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

        nodes = LDE_pooling_params.pop("nodes") if LDE_pooling else 1500

        self.tdnn5 = ReluBatchNormTdnnLayer(512, nodes, **tdnn_layer_params)

        # Pooling
        if LDE_pooling:
            self.stats = LDEPooling(nodes, **LDE_pooling_params)
        elif attentive_pooling:
            self.stats = AttentiveStatisticsPooling(nodes, **attentive_pooling_params, stddev=True)
        else:
            self.stats = StatisticsPooling(nodes, stddev=True)

        stats_dim = self.stats.get_output_dim()
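        # get_output_dim() reflects the chosen pooling, e.g. roughly 2 * nodes for
        # statistics pooling with stddev=True; LDE pooling reports its own dimension.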

        # Segment level
        if tdnn6:
            self.tdnn6 = ReluBatchNormTdnnLayer(stats_dim, 512, **tdnn_layer_params)
            tdnn7_dim = 512
        else:
            self.tdnn6 = None
            tdnn7_dim = stats_dim

        if tdnn7_params["nonlinearity"] == "default":
            tdnn7_params["nonlinearity"] = tdnn_layer_params["nonlinearity"]

        self.tdnn7 = ReluBatchNormTdnnLayer(tdnn7_dim, 512, **tdnn7_params)

        # Loss
        # Not needed when only extracting embeddings.
        if training:
            if margin_loss:
                self.loss = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
            elif focal_loss:
                self.loss = FocalLoss(512, num_targets, **focal_loss_params)
            else:
                self.loss = SoftmaxLoss(512, num_targets)

            # An example of using transfer learning without initializing the loss.affine parameters.
            self.transform_keys = ["tdnn1","tdnn2","tdnn3","tdnn4","tdnn5","stats","tdnn6","tdnn7",
                                   "ex_tdnn1","ex_tdnn2","ex_tdnn3","ex_tdnn4","ex_tdnn5",
                                   "se1","se2","se3","se4","loss"]

            if margin_loss and transfer_from == "softmax_loss":
                # For transferring from softmax_loss to am_softmax_loss.
                self.rename_transform_keys = {"loss.affine.weight":"loss.weight"}
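
For reference, the default margin settings above (method "am", m=0.2, s=30, feature_normalize=True) correspond to additive-margin softmax. A minimal, self-contained sketch of that logit adjustment (not the MarginSoftmaxLoss class used here) looks roughly like:

import torch
import torch.nn.functional as F

def am_softmax_logits(embeddings, weight, targets, m=0.2, s=30.0):
    # Cosine similarity between L2-normalized embeddings and class weights
    # (this is what feature_normalize=True implies).
    cosine = F.linear(F.normalize(embeddings), F.normalize(weight))
    # Subtract the additive margin m only from the target-class cosine,
    # then scale by s before the usual cross-entropy.
    one_hot = torch.zeros_like(cosine).scatter_(1, targets.unsqueeze(1), 1.0)
    return s * (cosine - m * one_hot)

# Usage sketch: loss = F.cross_entropy(am_softmax_logits(x, W, y), y)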