Ejemplo n.º 1
0
    def set_model(self):
        """Build the masked-acoustic Transformer in training mode and create its optimizer.

        Reads ``self.config``, ``self.input_dim``, ``self.output_dim``,
        ``self.device``, ``self.args.multi_gpu``, ``self.apex``,
        ``self.learning_rate``, ``self.warmup_proportion`` and
        ``self.total_steps``.

        Sets ``self.dr``, ``self.model`` and ``self.optimizer``; on the apex
        path it additionally sets ``self.warmup_linear``.

        Raises:
            ImportError: if ``self.apex`` is truthy but apex cannot be imported.
        """
        print('[Runner] - Initializing Transformer model...')

        # Transformer with the speech prediction head on top.
        transformer_cfg = TransformerConfig(self.config)
        self.dr = transformer_cfg.downsample_rate

        network = TransformerForMaskedAcousticModel(transformer_cfg, self.input_dim, self.output_dim)
        self.model = network.to(self.device)
        self.model.train()

        if self.args.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
            gpu_count = torch.cuda.device_count()
            print('[Runner] - Multi-GPU training Enabled: ' + str(gpu_count))

        # Only parameters that require gradients are counted.
        trainable = 0
        for weight in self.model.parameters():
            if weight.requires_grad:
                trainable += weight.numel()
        print('[Runner] - Number of parameters: ' + str(trainable))

        # Split the parameters in one pass: any parameter whose name contains
        # one of these markers gets no weight decay, the rest decay at 0.01.
        exempt_markers = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
        decayed = []
        undecayed = []
        for name, weight in self.model.named_parameters():
            if any(marker in name for marker in exempt_markers):
                undecayed.append(weight)
            else:
                decayed.append(weight)
        param_groups = [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ]

        # Default path: BertAdam receives the warmup settings directly.
        if not self.apex:
            self.optimizer = BertAdam(
                param_groups,
                lr=self.learning_rate,
                warmup=self.warmup_proportion,
                t_total=self.total_steps,
            )
            return

        # apex path: FusedAdam wrapped in FP16_Optimizer, with the warmup
        # schedule kept as a separate object on the runner.
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        fused = FusedAdam(
            param_groups,
            lr=self.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        # A configured loss scale of 0 selects dynamic loss scaling.
        loss_scale = self.config['optimizer']['loss_scale']
        if loss_scale == 0:
            self.optimizer = FP16_Optimizer(fused, dynamic_loss_scale=True)
        else:
            self.optimizer = FP16_Optimizer(fused, static_loss_scale=loss_scale)
        self.warmup_linear = WarmupLinearSchedule(
            warmup=self.warmup_proportion,
            t_total=self.total_steps,
        )
Ejemplo n.º 2
0
    def set_model(self,
                  inference=False,
                  with_head=False,
                  from_path=None,
                  output_attention=False):
        """Build the Transformer for the requested mode and optionally load weights.

        Three modes are selected by ``inference`` and ``with_head``:

        * ``inference`` falsy: build the model with the speech prediction
          head, put it in training mode and create the optimizer.
        * ``inference`` and ``with_head`` truthy: build the model with the
          head and put it in eval mode.
        * ``inference`` truthy, ``with_head`` falsy: build only the bare
          ``TransformerModel`` into ``self.transformer`` and put it in eval
          mode; ``self.model`` is not assigned on this path.

        Args:
            inference: Selects eval mode and skips optimizer creation.
            with_head: During inference, whether to build the prediction head.
                Stored on ``self.with_head``.
            from_path: Forwarded to ``self.load_model`` when ``self.load`` is
                truthy.
            output_attention: Passed to the model constructor and stored on
                ``self.output_attention``.

        Raises:
            ImportError: if training with ``self.apex`` truthy and apex cannot
                be imported.
        """
        self.verbose('Initializing Transformer model.')

        # Build the Transformer model configuration shared by every mode.
        self.model_config = TransformerConfig(self.config)
        self.dr = self.model_config.downsample_rate
        self.hidden_size = self.model_config.hidden_size
        self.with_head = with_head
        self.output_attention = output_attention

        multi_gpu = self.paras.multi_gpu

        # `inference and not with_head` is the exact complement of
        # `not inference or with_head`, so one if/else covers construction.
        if inference and not with_head:
            self.transformer = TransformerModel(
                self.model_config, self.input_dim,
                self.output_attention).to(self.device)
            if multi_gpu:
                self.transformer = torch.nn.DataParallel(self.transformer)
            active = self.transformer
        else:
            self.model = TransformerForMaskedAcousticModel(
                self.model_config, self.input_dim, self.output_dim,
                self.output_attention).to(self.device)
            self.transformer = self.model.Transformer
            if multi_gpu:
                self.model = torch.nn.DataParallel(self.model)
                self.transformer = torch.nn.DataParallel(self.transformer)
            active = self.model

        if multi_gpu:
            self.verbose('Multi-GPU training Enabled: ' +
                         str(torch.cuda.device_count()))
        self.verbose('Number of parameters: ' + str(
            sum(p.numel() for p in active.parameters() if p.requires_grad)))

        if inference:
            active.eval()
        else:
            # Training always goes through the model with the head.
            self.model.train()

            # Setup optimizer: any parameter whose name contains one of these
            # markers gets no weight decay, the rest decay at 0.01.
            no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
            decayed = []
            undecayed = []
            for n, p in self.model.named_parameters():
                if any(nd in n for nd in no_decay):
                    undecayed.append(p)
                else:
                    decayed.append(p)
            optimizer_grouped_parameters = [
                {'params': decayed, 'weight_decay': 0.01},
                {'params': undecayed, 'weight_decay': 0.0},
            ]

            if self.apex:
                try:
                    from apex.optimizers import FP16_Optimizer
                    from apex.optimizers import FusedAdam
                except ImportError as err:
                    # Keep the original failure attached as the cause.
                    raise ImportError(
                        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                    ) from err

                optimizer = FusedAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      bias_correction=False,
                                      max_grad_norm=1.0)
                # A configured loss scale of 0 selects dynamic loss scaling.
                loss_scale = self.config['optimizer']['loss_scale']
                if loss_scale == 0:
                    self.optimizer = FP16_Optimizer(optimizer,
                                                    dynamic_loss_scale=True)
                else:
                    self.optimizer = FP16_Optimizer(
                        optimizer, static_loss_scale=loss_scale)
                self.warmup_linear = WarmupLinearSchedule(
                    warmup=self.warmup_proportion, t_total=self.total_steps)
            else:
                self.optimizer = BertAdam(optimizer_grouped_parameters,
                                          lr=self.learning_rate,
                                          warmup=self.warmup_proportion,
                                          t_total=self.total_steps)

        if self.load:  # This will be set to True by default when Tester is running set_model()
            self.load_model(inference=inference,
                            with_head=with_head,
                            from_path=from_path)