Example #1
    def __init__(self, trainroot, testroot, alpha=0.85):
        self.trainroot = trainroot
        self.testroot = testroot

        # seed all RNGs so results are reproducible across runs
        random.seed(cfg.manualSeed)
        np.random.seed(cfg.manualSeed)
        torch.manual_seed(cfg.manualSeed)

        self.logger = self.setLogger()
        self.alpha = alpha
        self.train_loss = []
        self.test_loss = []
        self.test_loss_t = []
        self.test_loss_c = []
        self.acc = {'acc': [], 'acc_t': [], 'acc_c': []}
        self.device = torch.device(
            "cuda" if cfg.use_cuda and torch.cuda.is_available() else "cpu")
        self.model = self.net_init().to(self.device)
        if cfg.label_type == "both":
            self.codec = dataset.Codec(cfg.alphabet[0])
            self.codec_color = dataset.Codec(cfg.alphabet[1])
        else:
            self.codec = dataset.Codec(cfg.alphabet)
        self.loss_fn = nn.CTCLoss(zero_infinity=True) if cfg.dealwith_lossnan else nn.CTCLoss()
        self.loss_fn = self.loss_fn.to(self.device)
        if cfg.adam:
            self.optim = optim.Adam(self.model.parameters(), lr=cfg.lr,
                                    betas=(cfg.beta1, 0.999))
        elif cfg.adadelta:
            self.optim = optim.Adadelta(self.model.parameters())
        else:
            self.optim = optim.RMSprop(self.model.parameters(), lr=cfg.lr)
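Note: the dealwith_lossnan switch above toggles zero_infinity, which zeroes out (rather than propagating) the infinite losses CTC produces when a target is too long to align with the input. A minimal sketch of that behaviour (shapes and values illustrative, not from the code above):

import torch
import torch.nn as nn

# A target longer than the input has no valid CTC alignment -> loss is inf.
log_probs = torch.randn(2, 1, 5).log_softmax(-1)  # (T=2, N=1, C=5)
targets = torch.tensor([[1, 2, 3, 4]])            # target length 4 > T
input_lengths = torch.tensor([2])
target_lengths = torch.tensor([4])

print(nn.CTCLoss()(log_probs, targets, input_lengths, target_lengths))                    # inf
print(nn.CTCLoss(zero_infinity=True)(log_probs, targets, input_lengths, target_lengths))  # 0.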
Example #2
    def __init__(self,
                 eos,
                 blank,
                 enc_n_units,
                 vocab,
                 dropout=0.,
                 lsm_prob=0.,
                 fc_list=None,
                 param_init=0.1,
                 backward=False):

        super(CTC, self).__init__()

        self.eos = eos
        self.blank = blank
        self.vocab = vocab
        self.lsm_prob = lsm_prob
        self.bwd = backward

        self.space = -1  # TODO(hirofumi): fix later

        # for cache
        self.prev_spk = ''
        self.lmstate_final = None

        # for posterior plot
        self.prob_dict = {}
        self.data_dict = {}

        # Fully-connected layers before the softmax
        if fc_list is not None and len(fc_list) > 0:
            _fc_list = [int(fc) for fc in fc_list.split('_')]
            fc_layers = OrderedDict()
            for i in range(len(_fc_list)):
                input_dim = enc_n_units if i == 0 else _fc_list[i - 1]
                fc_layers['fc' + str(i)] = nn.Linear(input_dim, _fc_list[i])
                fc_layers['dropout' + str(i)] = nn.Dropout(p=dropout)
            fc_layers['fc' + str(len(_fc_list))] = nn.Linear(
                _fc_list[-1], vocab)
            self.output = nn.Sequential(fc_layers)
        else:
            self.output = nn.Linear(enc_n_units, vocab)

        self.use_warpctc = LooseVersion(
            torch.__version__) < LooseVersion("1.4.0")
        if self.use_warpctc:
            import warpctc_pytorch
            self.ctc_loss = warpctc_pytorch.CTCLoss(size_average=True)
        else:
            if LooseVersion(torch.__version__) < LooseVersion("1.7.0"):
                self.ctc_loss = nn.CTCLoss(reduction="sum")
            else:
                self.ctc_loss = nn.CTCLoss(reduction="sum", zero_infinity=True)

        self.forced_aligner = CTCForcedAligner()
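The version gate above matters because the two backends expect different inputs: the warp-ctc binding consumes raw (pre-softmax) activations, while nn.CTCLoss expects log-probabilities. A minimal sketch of the native path with reduction="sum" (shapes and names illustrative):

import torch
import torch.nn as nn

ctc = nn.CTCLoss(reduction="sum", zero_infinity=True)  # matches the torch >= 1.7 branch above

T, N, C = 50, 4, 30                                # time, batch, vocab incl. blank=0
logits = torch.randn(T, N, C)                      # hypothetical encoder output
log_probs = logits.log_softmax(-1)                 # nn.CTCLoss requires log-probabilities
targets = torch.randint(1, C, (N, 10))             # targets must not contain the blank index
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)

loss = ctc(log_probs, targets, input_lengths, target_lengths) / N  # sum -> per-utterance average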
Example #3
def training_loop(model, kwargs, train_dataset, train_batch_loader, eval_dataset):
    device = 'cuda:0' if torch.cuda.is_available() and kwargs['cuda'] else 'cpu'
    model.to(device)
    greedy_decoder = GreedyDecoder(model.labels)
    criterion = nn.CTCLoss(blank=0, reduction='none')
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=kwargs['lr'], momentum=kwargs['momentum'], nesterov=True, weight_decay=1e-5)
    scaling_factor = model.get_scaling_factor()
    epochs = kwargs['epochs']
    print('Train dataset size: %d' % len(train_dataset))
    batch_count = math.ceil(len(train_dataset) / kwargs['batch_size'])
    for epoch in range(epochs):
        with timing.EpochTimer(epoch,_log_to_tensorboard) as et:
            model.train()
            total_loss = 0
            for idx, data in et.across_epoch('Data Loading time', tqdm.tqdm(enumerate(train_batch_loader), total=batch_count)):
                inputs, input_lengths, targets, target_lengths, file_paths, texts = data
                with et.timed_action('Model execution time'):
                    out = model(torch.FloatTensor(inputs).to(device))
                out = out.transpose(1, 0)  # (batch, time, classes) -> (time, batch, classes) for CTCLoss
                output_lengths = [l // scaling_factor for l in input_lengths]  # account for the model's temporal downsampling
                with et.timed_action('Loss and BP time'):
                    loss = criterion(out, targets.to(device), torch.IntTensor(output_lengths), torch.IntTensor(target_lengths))
                    optimizer.zero_grad()
                    loss.mean().backward()
                    optimizer.step()
                total_loss += loss.mean().item()
            log_loss_to_tensorboard(epoch, total_loss / batch_count)
            evaluate(model, eval_dataset, greedy_decoder, epoch, kwargs)
            if epoch != 0 and epoch % kwargs['epochs_per_save'] == 0:
                save_epoch_model(model, epoch, kwargs['model_dir'])
    if kwargs['model_dir']:
        save_model(model, kwargs['model_dir'] + '/final.pth')
    print('Finished at %s' % time.asctime())
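Because the criterion above uses reduction='none', nn.CTCLoss returns one loss per batch element, which is why the loop reduces explicitly with loss.mean(). A quick illustration of the shape difference (values arbitrary):

import torch
import torch.nn as nn

T, N, C = 30, 4, 10
log_probs = torch.randn(T, N, C).log_softmax(-1)
targets = torch.randint(1, C, (N, 5))
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 5, dtype=torch.long)

per_sample = nn.CTCLoss(reduction='none')(log_probs, targets, input_lengths, target_lengths)
print(per_sample.shape)  # torch.Size([4]): one (unnormalized) loss per sample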
Example #4
    def build_output(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]:
        """
        Builds an output layer.
        """
        pattern = re.compile(r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)')
        m = pattern.match(block)
        if not m:
            return None, None, None
        dim = int(m.group('dim'))
        nl = m.group('type')
        outdim = int(m.group('out'))
        if dim == 0:
            raise ValueError('categorical output not supported, yet.')
        if nl == 'c' and dim == 2:
            raise ValueError('CTC not supported for heatmap output')
        if nl in ['l', 's'] and int(m.group('out')) >= 1:
            self.criterion = nn.BCELoss()
        elif nl == 'c':
            self.criterion = nn.CTCLoss(reduction='sum', zero_infinity=True)
        else:
            raise ValueError('unsupported output specification')
        # heatmap output
        if dim == 2:
            act = 's' if nl == 'l' else 'm'
            fn = layers.ActConv2D(input[1], outdim, (1, 1), (1, 1), act)
            logger.debug('{}\t\tconv\tkernel 1 x 1 filters {} stride 1 activation {}'.format(self.idx + 1, outdim, nl))
            return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn
        else:
            aug = bool(m.group('aug'))
            lin = layers.LinSoftmax(input[1], outdim, aug)
            logger.debug('{}\t\tlinear\taugmented {} out {}'.format(self.idx + 1, aug, m.group('out')))
            return lin.get_shape(input), self.get_layer_name(m.group(1), m.group('name')), lin
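For context, the pattern above parses VGSL-style output block specifiers such as 'O1c57' (1-d output, CTC activation, 57 classes); a quick check of the named groups (the spec string is a hypothetical example):

import re

pattern = re.compile(r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)')
m = pattern.match('O1c57')
print(m.group('dim'), m.group('type'), m.group('out'))  # -> 1 c 57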
Example #5
    def __init__(
        self,
        pad_id=0,
        smoothing_coef=0.0,
        sample_wise=False,
        aux_ctc=False,
        ctc_initial_coef=0.1,
        ctc_blank_id=None,
        eps=1e-5,
    ):
        assert (not aux_ctc) or (ctc_blank_id is not None), \
            "ctc_blank_id must be provided when aux_ctc is enabled"

        super().__init__()

        self.pad_id = pad_id
        self.smoothing_coef = smoothing_coef
        self.sample_wise = sample_wise
        self.aux_ctc = aux_ctc
        self.ctc_coef = ctc_initial_coef
        self.eps = eps

        if aux_ctc:
            self.ctc = nn.CTCLoss(blank=ctc_blank_id,
                                  reduction='none',
                                  zero_infinity=True)
            self.ctc = self.ctc.to(self._device)
Example #6
def main(learning_rate=5e-4, batch_size=20, epochs=10,
        train_url="train-clean-100", test_url="test-clean",
        experiment=Experiment(api_key='dummy_key', disabled=True)):

    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    experiment.log_parameters(hparams)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=hparams['batch_size'],
                                  shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        ).to(device)

    print(model)
    print('Num Model Parameters', sum(param.nelement() for param in model.parameters()))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=28).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')
    
    iter_meter = IterMeter()
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, experiment)
        test(model, device, test_loader, criterion, epoch, iter_meter, experiment)
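Note how blank=28 pairs with n_class=29 here: the blank must be one of the model's output indices and must never appear in the targets (presumably indices 0-27 are characters and 28 is reserved for blank). A one-line sanity check worth keeping around (illustrative):

n_class, blank = 29, 28
assert 0 <= blank < n_class, "CTC blank must be a valid output class index"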
Example #7
    def __init__(self, upstream_dim, upstream_rate, downstream_expert, expdir,
                 **kwargs):
        super(DownstreamExpert, self).__init__()
        self.expdir = expdir
        self.upstream_dim = upstream_dim
        self.corpus = downstream_expert["corpus"]

        # Text tokenizer
        self.tokenizer = load_text_encoder(**downstream_expert["text"])

        modelrc = downstream_expert["model"]
        self.projector = nn.Linear(upstream_dim, modelrc["project_dim"])

        model_select = downstream_expert["model"]["select"]
        self.model = eval(model_select)(
            modelrc["project_dim"],
            self.tokenizer.vocab_size,
            upstream_rate=upstream_rate,
            **modelrc.get(model_select, {}),
        )
        self.objective = nn.CTCLoss(
            blank=self.tokenizer.pad_idx,
            zero_infinity=modelrc["zero_infinity"],
        )
        self.save_best_on = downstream_expert.get("save_best_on", "dev")
        self.metrics = downstream_expert["metric"]
        self.metric_higher_better = downstream_expert["metric_higher_better"]
        self.register_buffer(
            "best_score",
            torch.ones(1) * (0 if self.metric_higher_better else 1 << 31))
Example #8
def get_objective(objective):
    if isinstance(objective, str):
        objective = objective.lower()
        if objective in ['l1', 'l1loss']:
            return nn.L1Loss()
        elif objective in ['nll', 'nllloss']:
            return nn.NLLLoss()
        elif objective in ['nll2d', 'nllloss2d']:
            return nn.NLLLoss2d()
        elif objective in ['poissonnll', 'poissonnllloss']:
            return nn.PoissonNLLLoss()
        elif objective in ['kldiv', 'kldivloss']:
            return nn.KLDivLoss()
        elif objective in ['mse', 'mseloss']:
            return nn.MSELoss()
        elif objective in ['bce', 'bceloss']:
            return nn.BCELoss()
        elif objective in ['smoothl1', 'smoothl1loss']:
            return nn.SmoothL1Loss()
        elif objective in ['crossentropy', 'cross_entropy']:
            return nn.CrossEntropyLoss()
        elif objective in ['ctc', 'ctcloss']:
            return nn.CTCLoss()
        else:
            raise ValueError('unknown objective: {}'.format(objective))
    elif isinstance(objective, _Loss):
        return objective
    else:
        raise ValueError('unknown argument {}'.format(objective))
Example #9
    def __init__(self, file_path, epochs, batch_size=4):
        self.temppath = file_path.split(".")
        self.logpath = self.temppath[0] + "log.txt"
        print("Cuda: " + str(torch.cuda.is_available()))
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')  # select GPU if available, else CPU
        self.file_path = file_path
        self.net = model.SpeechRecognition()
        self.criterion = nn.CTCLoss(blank=28).to(self.device)
        if torch.cuda.is_available():
            self.net.cuda()
        else:
            self.net.cpu()
        if file_path is not None and path.exists(file_path):
            self.load()
            self.net.to(self.device)
        # training waveform transforms (mel spectrogram + SpecAugment-style masking)
        self.train_audio_transforms = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(sample_rate=16000,
                                                 n_mels=128),
            torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
            torchaudio.transforms.TimeMasking(time_mask_param=100))
        # validation waveform transform (no augmentation)
        self.valid_audio_transforms = torchaudio.transforms.MelSpectrogram()
        # start training immediately
        self.train(epochs=epochs, batch_size=batch_size)
Example #10
    def __init__(self, blank=0, reduction='mean', use_baidu=False):
        super(CTCLoss, self).__init__()
        self.use_baidu_implement = use_baidu
        if self.use_baidu_implement:
            self.internal_loss = internalCTC(reduction=reduction, blank=blank)
        else:
            self.internal_loss = nn.CTCLoss(blank=blank, reduction=reduction)
Example #11
    def forward(self, images, targets=None):
        bs, _, _, _ = images.size()
        x = F.relu(self.conv1(images))
        x = self.maxpool1(x)
        x = F.relu(self.conv2(x))
        x = self.maxpool2(x)
        x = x.permute(0, 3, 1, 2)        # (N, C, H, W) -> (N, W, C, H)
        x = x.view(bs, x.size(1), -1)    # flatten channels and height into a feature vector per column
        x = F.relu(self.linear1(x))
        x = self.dropout1(x)

        x, _ = self.gru1(x)              # e.g. (1, 75, 64)
        x = self.output(x)               # e.g. (1, 75, 20)
        x = x.permute(1, 0, 2)           # (T, N, C) for CTC, e.g. (75, 1, 20)
        if targets is not None:
            log_softmax_values = F.log_softmax(x, 2)
            input_lengths = torch.full(
                size = (bs,), fill_value=log_softmax_values.size(0), dtype=torch.int32
            )
            target_lengths = torch.full(
                size=(bs,), fill_value=targets.size(1), dtype=torch.int32
            )
            loss = nn.CTCLoss(blank=0)(log_softmax_values, targets, input_lengths, target_lengths)
            return x, loss
        return x, None
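This forward pass assumes every sample spans the full T time steps and that all targets in the batch share targets.size(1) symbols. With variable-length labels you would pad the target matrix and pass true per-sample lengths instead; a sketch (values hypothetical):

import torch

# Two targets of true lengths 3 and 2, padded to a common width of 4.
targets = torch.tensor([[5, 3, 9, 0],
                        [2, 7, 0, 0]])
target_lengths = torch.tensor([3, 2])  # CTCLoss reads only the first `length` symbols per row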
Example #12
    def __init__(self, num_features, num_classes, base_config, blank_index=0):
        super(Wav2LetterPlus, self).__init__()

        ##### hyper parameters (read before the context heads below are built) #####
        ctc_loss_weight = base_config.get('ctc_loss_weight', 1)
        ct_loss_left_weight = base_config.get('ct_loss_left_weight', [])    # list: one weight per left head
        ct_loss_right_weight = base_config.get('ct_loss_right_weight', [])  # list: one weight per right head
        n_left_context_heads = len(ct_loss_left_weight)
        n_right_context_heads = len(ct_loss_right_weight)
        eval_ct_steps = base_config.get('eval_steps') * base_config.get('eval_ct_steps', 0)

        self.layers = nn.ModuleList([
            Wav2LetterPlusSubBlock(num_features, 256, 11, 2, 1, 0.2, 1),
            Wav2LetterPlusSubBlock(256, 256, 11, 1, 1, 0.2, 3),
            Wav2LetterPlusSubBlock(256, 384, 13, 1, 1, 0.2, 3),
            Wav2LetterPlusSubBlock(384, 512, 17, 1, 1, 0.2, 3),
            Wav2LetterPlusSubBlock(512, 640, 21, 1, 1, 0.3, 3),
            Wav2LetterPlusSubBlock(640, 768, 25, 1, 1, 0.3, 3),
            Wav2LetterPlusSubBlock(768, 896, 29, 1, 2, 0.4, 1),
            Wav2LetterPlusSubBlock(896, 1024, 1, 1, 1, 0.4, 1),
        ])
        self.output_layers = nn.ModuleList([
            Wav2LetterPlusSubBlock(1024, num_classes, 1, 1, 1, activation=False, batch_norm=False)
            for _ in range(1 + n_left_context_heads + n_right_context_heads)
        ])
        self.n_left_context_heads = n_left_context_heads
        self.n_right_context_heads = n_right_context_heads

        self.ctc_criterion = nn.CTCLoss(blank=blank_index, reduction='mean', zero_infinity=True)
        self.ct_criterion = ct_loss.CTLoss(blank_index=blank_index, version='numpy')
Example #13
    def _get_ctc_loss(output, trg, device):
        ctc_batch_size = output.shape[1]
        ctc_input_length = output.shape[0]
        ctc_inputs = output.log_softmax(2)
        ctc_trgs = trg.permute((1, 0))
        ctc_input_lengths = torch.full(size=(ctc_batch_size, ),
                                       fill_value=ctc_input_length,
                                       dtype=torch.long)

        # TODO figure this part out better..
        #ctc_trg_lengths = torch.full(size=(ctc_batch_size,), fill_value=ctc_input_length, dtype=torch.long)
        ctc_trg_list = []
        for i in range(ctc_batch_size):
            nz = (ctc_trgs[i, :] == 0).nonzero()
            if len(nz) == 0 or nz[0] > ctc_input_length / 2:
                ctc_trg_list.append(
                    torch.tensor([int(ctc_input_length / 2)]).to(device))
                #ctc_trg_list.append(ctc_input_lengths[0:1].to(device))
            else:
                ctc_trg_list.append(nz[0].detach())

        try:
            ctc_trg_lengths = torch.cat(ctc_trg_list).to(device)
        except Exception:
            import pdb
            pdb.set_trace()

        loss_ctc = nn.CTCLoss()(ctc_inputs, ctc_trgs, ctc_input_lengths,
                                ctc_trg_lengths)

        loss = loss_ctc
        if torch.isinf(loss):
            import pdb
            pdb.set_trace()
        return loss
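The per-sample loop above derives each target's length by locating the first pad token (index 0). Assuming pads only trail the sequence, a vectorized equivalent is to count non-pad entries (a sketch, with hypothetical values):

import torch

ctc_trgs = torch.tensor([[4, 2, 0, 0],
                         [7, 1, 5, 0]])
target_lengths = (ctc_trgs != 0).sum(dim=1)  # tensor([2, 3]); assumes no interior zeros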
Example #14
    def __init__(
        self,
        num_classes: int,
        ignore_index: int,
        dim: int = -1,
        reduction='mean',
        ctc_weight: float = 0.3,
        cross_entropy_weight: float = 0.7,
        blank_id: int = None,
        smoothing: float = 0.1,
    ) -> None:
        super(JointCTCCrossEntropyLoss, self).__init__()
        self.num_classes = num_classes
        self.dim = dim
        self.ignore_index = ignore_index
        self.reduction = reduction.lower()
        self.ctc_weight = ctc_weight
        self.cross_entropy_weight = cross_entropy_weight
        self.ctc_loss = nn.CTCLoss(blank=blank_id,
                                   reduction=self.reduction,
                                   zero_infinity=True)
        if smoothing > 0.0:
            self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss(
                num_classes=num_classes,
                ignore_index=ignore_index,
                smoothing=smoothing,
                reduction=reduction,
                dim=-1,
            )
        else:
            self.cross_entropy_loss = nn.CrossEntropyLoss(
                reduction=self.reduction, ignore_index=self.ignore_index)
Example #15
    def __init__(self, net_class, net_cfg, train_cfg):
        super().__init__()
        self.net_cfg = net_cfg
        self.train_cfg = train_cfg
        self.loss_fn = nn.CTCLoss(blank=0, zero_infinity=False)

        self.model = net_class(**net_cfg)
Example #16
def train(model, device):
    ctc_loss = nn.CTCLoss()
    model.train()
    optimizer = opt.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)

    for e in range(num_of_epochs):
        loss_sum = 0
        for item, label, real_size, indices in train_loader:
            optimizer.zero_grad()
            item = item.to(device) 
            probes = model(item, device)

            length = probes.size()[0]  # T: number of output time steps
            targets = indices.to(device)

            # use the actual batch size so a smaller final batch is handled correctly
            input_lengths = torch.full(size=(probes.size(1),), fill_value=length, dtype=torch.long).to(device)
            target_lengths = real_size.to(device)

            loss = ctc_loss(probes, targets, input_lengths, target_lengths)
            loss_sum += loss.item()

            loss.backward()

            optimizer.step()

        
        print("finish epoch #{} avg loss {} last loss {}".format(e, (loss_sum / len(train_loader)), loss))
Example #17
    def build_output(
        self, input: Tuple[int, int, int, int], block: str
    ) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str,
                                              Callable]]:
        """
        Builds an output layer.
        """
        pattern = re.compile(
            r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)'
        )
        m = pattern.match(block)
        if not m:
            return None, None, None
        if int(m.group('dim')) != 1:
            raise ValueError('non-1d output not supported, yet')
        nl = m.group('type')
        if nl not in ['s', 'c']:
            raise ValueError('only softmax and ctc supported in output')
        if nl == 'c':
            self.criterion = nn.CTCLoss(reduction='none')
        aug = bool(m.group('aug'))
        lin = layers.LinSoftmax(input[1], int(m.group('out')), aug)
        logger.debug('{}\t\tlinear\taugmented {} out {}'.format(
            self.idx + 1, aug, m.group('out')))
        return lin.get_shape(input), self.get_layer_name(
            m.group(1), m.group('name')), lin
Example #18
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters('config')
        self.hparams = config

        self.transform = ImageTransform(
            augmentation=config.get('augmentation', True),
            scale_height=config['dataset']['scale_height'],
            min_width=config['dataset']['min_width'])

        self.config = config
        self.beam_width = config['beam_width']

        # define model
        self.cnn = initialize(config['cnn'])
        self.vocab = initialize(config['vocab'], add_blank=True)
        self.loss_fn = nn.CTCLoss(blank=self.vocab.BLANK_IDX)

        output_H = config['dataset']['scale_height'] // (
            32 // 2**config['cnn']['args']['droplast'])
        self.blstm = nn.LSTM(input_size=output_H * self.cnn.n_features,
                             hidden_size=config['hidden_size'],
                             num_layers=config['num_layers'],
                             batch_first=True,
                             dropout=config['dropout'],
                             bidirectional=True)
        self.character_distribution = nn.Linear(2 * config['hidden_size'],
                                                self.vocab.size)
        self.ctc_string_tf = CTCStringTransform(self.vocab)
        self.string_tf = StringTransform(self.vocab)
Example #19
    def forward(self, images, targets=None):
        bs, _, _, _ = images.size()
        x = F.relu(self.conv_1(images))
        x = self.pool_1(x)
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        x = x.permute(0, 3, 1, 2)
        x = x.view(bs, x.size(1), -1)
        x = F.relu(self.linear_1(x))
        x = self.drop_1(x)
        x, _ = self.lstm(x)
        x = self.output(x)
        x = x.permute(1, 0, 2)

        if targets is not None:
            log_probs = F.log_softmax(x, 2)
            input_lengths = torch.full(size=(bs, ),
                                       fill_value=log_probs.size(0),
                                       dtype=torch.int32)
            target_lengths = torch.full(size=(bs, ),
                                        fill_value=targets.size(1),
                                        dtype=torch.int32)
            loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths,
                                       target_lengths)
            return x, loss

        return x, None
Example #20
    def __init__(self, input_len, target_len, blank=0):
        super(CTC, self).__init__()
        self.ctc = nn.CTCLoss(blank=blank,
                              reduction='mean',
                              zero_infinity=True)
        self.input_len = input_len
        self.target_len = target_len
Example #21
    def __init__(
            self,
            num_classes: int,  # number of classes
            ignore_index: int,  # indexes ignored when calculating loss
            dim: int = -1,  # dimension along which the loss is calculated
            reduction='mean',  # reduction method [sum, mean]
            ctc_weight: float = 0.3,  # weight of ctc loss
            cross_entropy_weight: float = 0.7,  # weight of cross entropy loss
            blank_id: int = None) -> None:
        super(JointCTCCrossEntropyLoss, self).__init__()
        self.num_classes = num_classes
        self.dim = dim
        self.ignore_index = ignore_index
        self.reduction = reduction.lower()
        self.ctc_weight = ctc_weight
        self.cross_entropy_weight = cross_entropy_weight

        if self.reduction == 'sum':
            self.reduction_method = torch.sum
        elif self.reduction == 'mean':
            self.reduction_method = torch.mean
        else:
            raise ValueError(
                "Unsupported reduction method {0}".format(reduction))

        self.ctc_loss = nn.CTCLoss(blank=blank_id,
                                   reduction=self.reduction,
                                   zero_infinity=True)
        self.cross_entropy_loss = nn.CrossEntropyLoss(
            reduction=self.reduction, ignore_index=self.ignore_index)
Example #22
def train_epoch_packed(model, optimizer, train_loader):
    model.train()
    criterion = nn.CTCLoss()
    criterion = criterion.to(DEVICE)
    batch_id = 0
    avg_loss = 0
    before = time.time()
    print("Training on", len(train_loader), "batches")
    print("Current running lr is:", optimizer.param_groups[0]['lr'])
    for inputs, input_lens, targets, target_lens in train_loader:  # lists, presorted, preloaded on GPU
        batch_id += 1
        inputs, input_lens, targets, target_lens = inputs.to(DEVICE), input_lens.to(DEVICE), targets.to(DEVICE), target_lens.to(DEVICE)
        out, out_lens = model(inputs, input_lens)
        loss = criterion(out, targets, out_lens, target_lens)  # criterion over the batch output
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        if batch_id % 100 == 0:
            after = time.time()
            print("Time elapsed:", after - before)
            print("At batch", batch_id, "Loss", avg_loss / 100)
            avg_loss = 0
            before = after
        del inputs
        del input_lens
        del targets
        del target_lens
        del out
        del out_lens
    return model
Example #23
def main(args):
    alphabet = alphabet_factory()
    device = torch.device('cpu')
    checkpoint = torch.load('model_best.pth', map_location=device)
    in_features = args.n_mfcc * (2 * args.n_context + 1)
    model = build_deepspeech(in_features=in_features,
                             num_classes=len(alphabet))
    model.load_state_dict(checkpoint['state_dict'])
    print_size_of_model(model)
    decoder = GreedyDecoder()
    if args.quantize:
        model = torch.quantization.quantize_dynamic(model, {nn.RNN, nn.Linear},
                                                    dtype=torch.qint8)
        logging.info('quantized model')
        print_size_of_model(model)

    transform = prepare_transformations(args)
    dataset = ProcessedDataset(get_dataset(args.datadir, "dev-clean"),
                               transform, alphabet)
    collate_fn = collate_factory(model_length_function)
    criterion = nn.CTCLoss(blank=alphabet.mapping[alphabet.char_blank])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             collate_fn=collate_fn,
                                             drop_last=False)
    test_loop_fn(dataloader, model, criterion, device, 1, decoder, alphabet)
Example #24
def get_criterion(opt, vocab):
    if opt.architecture == 'deepspeech2':
        criterion = nn.CTCLoss(blank=vocab.blank_id, reduction=opt.reduction, zero_infinity=True)
    elif opt.architecture == 'las' and opt.joint_ctc_attention:
        criterion = JointCTCCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            reduction=opt.reduction,
            ctc_weight=opt.ctc_weight,
            cross_entropy_weight=opt.cross_entropy_weight,
            blank_id=vocab.blank_id,
            dim=-1,
        )
    elif opt.architecture == 'transformer':
        criterion = nn.CrossEntropyLoss(
            ignore_index=vocab.pad_id,
            reduction=opt.reduction
        )
    else:
        criterion = LabelSmoothedCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            smoothing=opt.label_smoothing,
            reduction=opt.reduction,
            architecture=opt.architecture,
            dim=-1
        )

    return criterion
Example #25
    def forward(self, images, labels, lengths):
        r"""
        Overridden.
        """
        images = images.to(self.device)
        labels = labels.to(self.device)
        T_length = 18
        input_lengths, target_lengths = BIM.sparse_tuple_for_ctc(T_length, lengths)
        ctc_loss = nn.CTCLoss(blank=len(CHARS)-1, reduction='mean') # reduction: 'none' | 'mean' | 'sum'

        for _ in range(self.iters):
            images.requires_grad = True
            outputs = self.model(images)
            log_probs = outputs.permute(2, 0, 1)  # for ctc loss: T x N x C, log_probs.shape = (18, 100, 68)
            log_probs = log_probs.log_softmax(2).requires_grad_()
            cost = ctc_loss(log_probs, labels, input_lengths=input_lengths, target_lengths=target_lengths)

            grad = torch.autograd.grad(cost, images,
                                       retain_graph=False, create_graph=False)[0]

            adv_images = images + self.alpha * grad.sign()

            a = torch.clamp(images - self.eps, min=-1, max=1)
            b = (adv_images >= a).float() * adv_images + (a > adv_images).float() * a
            c = (b > images + self.eps).float() * (images + self.eps) + (images + self.eps >= b).float() * b
            images = torch.clamp(c, max=1).detach()

        adv_images = images

        return adv_images
Example #26
    def __init__(self, hparams: Namespace):
        super().__init__()
        self.hparams = hparams

        self.criterion = nn.CTCLoss()

        if Path(self.hparams.tokenizer_path).exists():
            self.tokenizer = WordLevelTokenizer(self.hparams.tokenizer_path)
        else:
            train_turns = preprocess(self.hparams.train_path,
                                     self.hparams.ontology_path)
            self.tokenizer = get_tokenizer(train_turns,
                                           self.hparams.tokenizer_path)

        # embedding
        self.embedding = nn.Embedding(self.tokenizer.get_vocab_size(),
                                      self.hparams.embedding_dim)
        self.pos_embedding = PositionalEncoding(
            d_model=self.hparams.embedding_dim, dropout=self.hparams.dropout)

        # value decoder
        self.value_decoder = nn.ModuleList([
            nn.MultiheadAttention(embed_dim=self.hparams.hidden_dim,
                                  num_heads=self.hparams.num_heads,
                                  dropout=self.hparams.dropout)
            for _ in range(3)
        ])
        self.vocab_proj = nn.Linear(self.hparams.hidden_dim,
                                    self.tokenizer.get_vocab_size())
Example #27
    def __init__(
            self,
            num_classes: int,  # number of classes
            ignore_index: int,  # indexes ignored when calculating loss
            dim: int = -1,  # dimension along which the loss is calculated
            reduction='mean',  # reduction method [sum, mean]
            ctc_weight: float = 0.3,  # weight of ctc loss
            cross_entropy_weight: float = 0.7,  # weight of cross entropy loss
            blank_id: int = 0,  # identification of blank token
            smoothing: float = 0.1,  # ratio of smoothing (confidence = 1.0 - smoothing)
    ) -> None:
        super(JointCTCCrossEntropyLoss, self).__init__()
        self.num_classes = num_classes
        self.dim = dim
        self.ignore_index = ignore_index
        self.reduction = reduction.lower()
        self.ctc_weight = ctc_weight
        self.cross_entropy_weight = cross_entropy_weight
        self.ctc_loss = nn.CTCLoss(blank=blank_id,
                                   reduction=self.reduction,
                                   zero_infinity=True)
        if smoothing > 0.0:
            self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss(
                num_classes=num_classes,
                ignore_index=ignore_index,
                smoothing=smoothing,
                reduction=reduction,
                dim=-1,
            )
        else:
            self.cross_entropy_loss = nn.CrossEntropyLoss(
                reduction=self.reduction, ignore_index=self.ignore_index)
Example #28
def get_criterion(config: DictConfig, vocab: Vocabulary) -> nn.Module:
    if config.model.architecture in ('deepspeech2', 'jasper'):
        criterion = nn.CTCLoss(blank=vocab.blank_id,
                               reduction=config.train.reduction,
                               zero_infinity=True)
    elif config.model.architecture in (
            'las', 'transformer') and config.model.joint_ctc_attention:
        criterion = JointCTCCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            reduction=config.train.reduction,
            ctc_weight=config.model.ctc_weight,
            cross_entropy_weight=config.model.cross_entropy_weight,
            blank_id=vocab.blank_id,
            dim=-1,
            smoothing=config.train.label_smoothing,
        )
    elif config.model.architecture in ('rnnt', 'conformer'):
        criterion = TransducerLoss(blank_id=vocab.blank_id)
    elif config.model.architecture == 'transformer' and config.train.label_smoothing <= 0.0:
        criterion = nn.CrossEntropyLoss(
            ignore_index=vocab.pad_id,
            reduction=config.train.reduction,
        )
    else:
        criterion = LabelSmoothedCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            smoothing=config.train.label_smoothing,
            reduction=config.train.reduction,
            dim=-1,
        )

    return criterion
Example #29
    def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
        super(MIEsitmator, self).__init__()
        self.proj = nn.Sequential(
            LinearNorm(decoder_dim, hidden_size, bias=True,
                       w_init_gain='relu'), nn.ReLU(), nn.Dropout(p=dropout))
        self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
        self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')
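Here the CTC projection emits vocab_size + 1 classes and the blank takes the extra, last index (vocab_size): the blank does not have to be 0, it only has to be a valid class index that never occurs in the targets. Schematically (vocab_size illustrative):

import torch.nn as nn

vocab_size = 100
ctc = nn.CTCLoss(blank=vocab_size, reduction='none')  # blank = last of the vocab_size + 1 outputs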
Example #30
    def __init__(self, upstream_dim, upstream_rate, runner, downstream_expert,
                 expdir, **kwargs):
        super(DownstreamExpert, self).__init__()
        self.expdir = expdir
        self.upstream_dim = upstream_dim
        self.corpus = downstream_expert['corpus']

        # Text tokenizer
        self.tokenizer = load_text_encoder(**downstream_expert['text'])

        modelrc = downstream_expert['model']
        self.projector = nn.Linear(upstream_dim, modelrc['project_dim'])

        model_select = downstream_expert['model']['select']
        self.model = eval(model_select)(
            modelrc['project_dim'],
            self.tokenizer.vocab_size,
            upstream_rate=upstream_rate,
            **modelrc.get(model_select, {}),
        )
        self.objective = nn.CTCLoss(
            blank=self.tokenizer.pad_idx,
            zero_infinity=modelrc['zero_infinity'],
        )
        self.eval_dataloaders = runner['eval_dataloaders']
        self.metrics = downstream_expert['metric']
        self.metric_higher_better = downstream_expert['metric_higher_better']
        self.register_buffer(
            'best_score',
            torch.ones(1) * (0 if self.metric_higher_better else 1 << 31))