def __init__(self, options, inp_dim, config=None):
        """Build a pretrained TRANSFORMER feature-extractor wrapper.

        Args:
            options (dict): string-valued options; keys used here are
                'ckpt_file', 'load_pretrain', 'no_grad', 'dropout',
                'spec_aug', 'spec_aug_prev', 'weighted_sum', 'select_layer'.
            inp_dim (int): input acoustic feature dimension.
            config (str, optional): path to a YAML model config. When None,
                the config is recovered from the checkpoint in
                options['ckpt_file'].

        Raises:
            RuntimeError: if both 'no_grad' and 'spec_aug_prev' are False,
                or if 'select_layer' is outside [-1, num_layers).
        """
        super(TRANSFORMER, self).__init__()

        # Read the model config either from a YAML file or from the checkpoint.
        all_states = None
        if config is not None:
            self.config = yaml.load(open(config, 'r'), Loader=yaml.FullLoader)
        else:
            all_states = torch.load(options["ckpt_file"], map_location='cpu')
            self.config = all_states['Settings']['Config']

        self.no_grad = bool(strtobool(options['no_grad']))
        self.spec_aug = bool(strtobool(options['spec_aug']))
        self.spec_aug_prev = bool(strtobool(options['spec_aug_prev']))
        self.weighted_sum = bool(strtobool(options['weighted_sum']))
        self.select_layer = int(options['select_layer'])
        # spec_aug applied inside the model (spec_aug_prev=False) needs
        # gradients flowing, hence no_grad must stay True in that case.
        if (not self.no_grad) and (not self.spec_aug_prev):
            raise RuntimeError('Only one of them can be set False!')

        # increase dropout if explicitly specified
        if str(options['dropout']) != 'default':
            self.config['transformer']['hidden_dropout_prob'] = float(
                options['dropout'])
            self.config['transformer']['attention_probs_dropout_prob'] = float(
                options['dropout'])

        # Model Config
        self.model_config = TransformerConfig(self.config)
        self.dr = self.model_config.downsample_rate
        self.hidden_size = self.model_config.hidden_size
        self.num_layers = self.model_config.num_hidden_layers
        # 0 means "no maximum input length" (checkpoint may predate the key).
        self.max_input_length = self.config['transformer'].get(
            'max_input_length', 0)
        if self.max_input_length > 0:
            print('[Transformer] - Maximum input length: ',
                  self.max_input_length)
        # -1 selects the last layer; 0..num_layers-1 select a specific layer.
        if self.select_layer not in range(-1, self.num_layers):
            raise RuntimeError('Out of range int for \'select_layer\'!')

        # use weighted sum from all layers
        if self.weighted_sum:
            self.weight = nn.Parameter(
                torch.ones(self.num_layers) / self.num_layers)

        # Build model
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model = TransformerModel(self.model_config,
                                      inp_dim).to(self.device)
        self.model.eval() if self.no_grad else self.model.train()

        # Load from a PyTorch state_dict
        load = bool(strtobool(options["load_pretrain"]))
        if load:
            # BUG FIX: when `config` was supplied, the checkpoint was never
            # read above and `all_states` used to be undefined here
            # (NameError). Load it on demand instead.
            if all_states is None:
                all_states = torch.load(options["ckpt_file"],
                                        map_location='cpu')
            self.load_model(all_states['Transformer'])
            print('[Transformer] - Number of parameters: ' + str(
                sum(p.numel()
                    for p in self.model.parameters() if p.requires_grad)))

        self.out_dim = self.hidden_size  # 768, This attribute is for pytorch-kaldi and downstream runner
        self.permute_input = True  # If True, input/output is shaped (T, B, D); if False, (B, T, D)
    def __init__(self, options, inp_dim, config=None, online_config=None):
        """Base wrapper around a pretrained Transformer feature extractor.

        Args:
            options (dict): string-valued options; keys used here are
                'ckpt_file', 'load_pretrain', 'no_grad', 'dropout',
                'spec_aug', 'spec_aug_prev', 'weighted_sum', 'select_layer',
                'permute_input'.
            inp_dim (int): input feature dimension; values <= 0 fall back to
                the config's transformer input_dim.
            config (str, optional): path to a YAML config; when None the
                config is recovered from the checkpoint in
                options['ckpt_file'].
            online_config (dict, optional): overrides self.config['online']
                and triggers building an online preprocessor.

        Raises:
            RuntimeError: if both 'no_grad' and 'spec_aug_prev' are False,
                or if 'select_layer' is outside [-1, num_layers).
        """
        super(TransformerBaseWrapper, self).__init__()

        # read config
        if config is not None:
            self.config = yaml.load(open(config, 'r'), Loader=yaml.FullLoader)
        else:
            # keep the full checkpoint around so weights can be loaded later
            self.all_states = torch.load(options["ckpt_file"],
                                         map_location='cpu')
            self.config = self.all_states['Settings']['Config']

        # parse the options dict
        self.load = bool(strtobool(options["load_pretrain"]))
        self.no_grad = bool(strtobool(options['no_grad']))
        self.spec_aug = bool(strtobool(options['spec_aug']))
        self.spec_aug_prev = bool(strtobool(options['spec_aug_prev']))
        self.weighted_sum = bool(strtobool(options['weighted_sum']))
        self.select_layer = int(options['select_layer'])
        self.permute_input = bool(strtobool(options['permute_input']))
        # spec_aug applied inside the model requires gradients to flow,
        # hence no_grad must stay True when spec_aug_prev is False
        if (not self.no_grad) and (not self.spec_aug_prev):
            raise RuntimeError('Only one of them can be set False!')
        if str(options['dropout']
               ) != 'default':  # increase dropout if specified
            self.config['transformer']['hidden_dropout_prob'] = float(
                options['dropout'])
            self.config['transformer']['attention_probs_dropout_prob'] = float(
                options['dropout'])

        # Set model config
        self.model_config = TransformerConfig(self.config)
        self.dr = self.model_config.downsample_rate
        self.hidden_size = self.model_config.hidden_size
        self.num_layers = self.model_config.num_hidden_layers
        # 0 means "no maximum input length" (older checkpoints lack the key)
        self.max_input_length = self.config['transformer'][
            'max_input_length'] if 'max_input_length' in self.config[
                'transformer'] else 0
        # build an online preprocessor when configured; it may redefine inp_dim
        if online_config is not None: self.config['online'] = online_config
        if 'online' in self.config:
            preprocessor, inp_dim = self.get_preprocessor(
                self.config['online'])
            self.preprocessor = preprocessor
        self.inp_dim = inp_dim if inp_dim > 0 else self.config['transformer'][
            'input_dim']
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        if self.max_input_length > 0:
            print('[Transformer] - Maximum input length: ',
                  self.max_input_length)
        # -1 selects the last layer; 0..num_layers-1 select a specific layer
        if not (self.select_layer in list(range(-1, self.num_layers))):
            raise RuntimeError('Out of range int for \'select_layer\'!')
        # learnable layer-mixing weights, initialized uniform over layers
        if self.weighted_sum:
            self.weight = nn.Parameter(
                torch.ones(self.num_layers) / self.num_layers)
# Beispiel #3
    def set_model(self):
        """Build the Transformer pretraining model and its optimizer.

        Instantiates TransformerForMaskedAcousticModel in train mode
        (optionally wrapped in DataParallel), then sets up either an apex
        FP16 optimizer pair (FusedAdam + FP16_Optimizer with a warmup
        schedule) or a plain BertAdam, with weight decay disabled for
        biases and LayerNorm parameters.
        """
        print('[Runner] - Initializing Transformer model...')

        # build the Transformer model with speech prediction head
        transformer_cfg = TransformerConfig(self.config)
        self.dr = transformer_cfg.downsample_rate

        self.model = TransformerForMaskedAcousticModel(
            transformer_cfg, self.input_dim, self.output_dim).to(self.device)
        self.model.train()

        if self.args.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
            print('[Runner] - Multi-GPU training Enabled: ' + str(torch.cuda.device_count()))
        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print('[Runner] - Number of parameters: ' + str(trainable))

        # Setup optimizer: no weight decay on biases / LayerNorm parameters
        named_params = list(self.model.named_parameters())
        decay_free = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        def skips_decay(name):
            return any(tag in name for tag in decay_free)

        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_params if not skips_decay(n)], 'weight_decay': 0.01},
            {'params': [p for n, p in named_params if skips_decay(n)], 'weight_decay': 0.0},
        ]

        if self.apex:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            base_optimizer = FusedAdam(optimizer_grouped_parameters,
                                       lr=self.learning_rate,
                                       bias_correction=False,
                                       max_grad_norm=1.0)
            loss_scale = self.config['optimizer']['loss_scale']
            if loss_scale == 0:
                # 0 requests dynamic loss scaling
                self.optimizer = FP16_Optimizer(base_optimizer, dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(base_optimizer, static_loss_scale=loss_scale)
            self.warmup_linear = WarmupLinearSchedule(warmup=self.warmup_proportion,
                                                      t_total=self.total_steps)
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      warmup=self.warmup_proportion,
                                      t_total=self.total_steps)
# Beispiel #4
    def set_model(self,
                  inference=False,
                  with_head=False,
                  from_path=None,
                  output_attention=False):
        """Build the Transformer model for training or inference.

        Args:
            inference (bool): if True, build in eval mode and skip the
                optimizer setup.
            with_head (bool): if True, keep the masked-acoustic prediction
                head; if False at inference time, build only the encoder.
            from_path (str, optional): checkpoint path forwarded to
                load_model().
            output_attention (bool): request attention outputs from the model.

        Raises:
            NotImplementedError: declared for invalid argument combinations
                (NOTE(review): the final `else` is unreachable — the three
                branches above cover all inference/with_head combinations).
        """
        self.verbose('Initializing Transformer model.')

        # Build the Transformer model with speech prediction head
        self.model_config = TransformerConfig(self.config)
        self.dr = self.model_config.downsample_rate
        self.hidden_size = self.model_config.hidden_size
        self.with_head = with_head
        self.output_attention = output_attention

        # training, or inference with the prediction head: build the full
        # model and alias its encoder as self.transformer
        if not inference or with_head:
            self.model = TransformerForMaskedAcousticModel(
                self.model_config, self.input_dim, self.output_dim,
                self.output_attention).to(self.device)
            self.transformer = self.model.Transformer
            if self.paras.multi_gpu:
                self.model = torch.nn.DataParallel(self.model)
                self.transformer = torch.nn.DataParallel(self.transformer)
                self.verbose('Multi-GPU training Enabled: ' +
                             str(torch.cuda.device_count()))
            self.verbose('Number of parameters: ' + str(
                sum(p.numel()
                    for p in self.model.parameters() if p.requires_grad)))

        # inference without head: build only the encoder, in eval mode
        if inference and not with_head:
            self.transformer = TransformerModel(
                self.model_config, self.input_dim,
                self.output_attention).to(self.device)
            if self.paras.multi_gpu:
                self.transformer = torch.nn.DataParallel(self.transformer)
                self.verbose('Multi-GPU training Enabled: ' +
                             str(torch.cuda.device_count()))
            self.verbose('Number of parameters: ' + str(
                sum(p.numel() for p in self.transformer.parameters()
                    if p.requires_grad)))
            self.transformer.eval()
        elif inference and with_head:
            self.model.eval()
        elif not inference:
            self.model.train()

            # Setup optimizer: no weight decay on biases / LayerNorm params
            param_optimizer = list(self.model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            # FP16 training path via NVIDIA apex (optional dependency)
            if self.apex:
                try:
                    from apex.optimizers import FP16_Optimizer
                    from apex.optimizers import FusedAdam
                except ImportError:
                    raise ImportError(
                        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                    )

                optimizer = FusedAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      bias_correction=False,
                                      max_grad_norm=1.0)
                # loss_scale == 0 requests dynamic loss scaling
                if self.config['optimizer']['loss_scale'] == 0:
                    self.optimizer = FP16_Optimizer(optimizer,
                                                    dynamic_loss_scale=True)
                else:
                    self.optimizer = FP16_Optimizer(
                        optimizer,
                        static_loss_scale=self.config['optimizer']
                        ['loss_scale'])
                self.warmup_linear = WarmupLinearSchedule(
                    warmup=self.warmup_proportion, t_total=self.total_steps)
            else:
                self.optimizer = BertAdam(optimizer_grouped_parameters,
                                          lr=self.learning_rate,
                                          warmup=self.warmup_proportion,
                                          t_total=self.total_steps)
        else:
            raise NotImplementedError('Invalid Arguments!')

        if self.load:  # This will be set to True by default when Tester is running set_model()
            self.load_model(inference=inference,
                            with_head=with_head,
                            from_path=from_path)