def main(opt):
    """Evaluate a saved video-captioning / CTC model on the test split.

    Builds the model named by opt["model"], loads the checkpoint from
    opt["saved_model"], and hands everything to test().
    """
    dataset = VideoDataset(opt, "test")
    # Vocabulary size and max sequence length come from the dataset itself.
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    elif opt["model"] == "CTCmodel":
        # +1 output unit for the CTC blank label.
        model = CTCmodel(opt['vocab_size'] + 1, opt['dim_hidden'])
    # NOTE(review): an unrecognized opt["model"] leaves `model` unbound and
    # the load below raises NameError — confirm callers validate the name.
    # model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    model.cuda()
    crit = CTCLoss()
    crit = crit.cuda()
    # crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def __init__(self, model, loader, val_loader, test_loader, max_epochs=1, run_id='exp'): """ Use this class to train your model """ # feel free to add any other parameters here self.model = model.cuda() if torch.cuda.is_available() else model self.loader = loader self.val_loader = val_loader self.test_loader = test_loader self.train_losses = [] self.val_losses = [] self.predictions = [] self.predictions_test = [] self.generated_logits = [] self.generated = [] self.generated_logits_test = [] self.generated_test = [] self.epochs = 0 self.max_epochs = max_epochs self.run_id = run_id self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6) # self.optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-6, momentum=0.9) self.criterion = CTCLoss()#size_average=True, length_average=False) self.criterion = self.criterion.cuda() if torch.cuda.is_available() else self.criterion self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.1, patience=2) self.LD = Levenshtein(phoneme_list.PHONEME_MAP) self.best_rate = 1e10 self.decoder = CTCBeamDecoder(labels=[' '] + phoneme_list.PHONEME_MAP, blank_id=0, beam_width=150)
def main(args):
    """Train a digits recognizer (LSTM or DenseNet backbone) with CTC loss."""
    # Same preprocessing for train/val; the RNN variant flattens each image
    # column into a per-timestep feature vector.
    if args.rnn:
        transform = transforms.Compose([
            Normalize([0.3956, 0.5763, 0.5616], [0.1535, 0.1278, 0.1299]),
            Resize((204, 32)),
            ToTensorRGBFlatten()
        ])
    else:
        transform = transforms.Compose([
            Normalize([0.3956, 0.5763, 0.5616], [0.1535, 0.1278, 0.1299]),
            Resize((204, 32)),
            ToTensor()
        ])
    train_set = digitsDataset(args.train_root_path, transform=transform)
    val_set = digitsDataset(args.val_root_path, transform=transform)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
    # trainer parameters
    params = EasyDict()
    params.max_epoch = args.max_epoch
    params.print_freq = args.print_freq
    params.validate_interval = args.validate_interval
    params.save_interval = args.save_interval
    params.expr_path = args.expr_path
    params.rnn = args.rnn
    device = torch.device("cuda")
    # train engine
    # +1 output class for the CTC blank label.
    ntoken = len(args.alphabet) + 1
    if args.rnn:
        # NOTE(review): presumably 32 rows x 3 channels per image column — confirm.
        input_dim = 96
        model = LSTMFeatures(input_dim, args.batch_size, ntoken)
    else:
        model = DenseNetFeature(num_classes=ntoken)
    model = model.to(device)
    criterion = CTCLoss()
    criterion = criterion.to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    converter = LabelConverter(args.alphabet)
    solver = SolverWrapper(params)
    # train
    solver.train(train_loader, val_loader, model, criterion, optimizer, device, converter)
def __init__(self, train_path, test_path, model_file, model, img_h=32, img_w=110, batch_size=64, lr=1e-3, use_unicode=True, best_loss=0.2, use_gpu=True, workers=1):
    """Set up a CRNN trainer: model, CTC criterion, data loaders and optimizer.

    :param train_path: directory holding training images and labels_normal.txt
    :param test_path: directory holding test images and labels_normal.txt
    :param model_file: checkpoint path to restore from (must already exist)
    :param model: CRNN network instance to train
    """
    self.model = model
    self.model_file = model_file
    self.use_unicode = use_unicode
    self.img_h = img_h
    self.img_w = img_w
    self.batch_size = batch_size
    self.lr = lr
    self.best_loss = best_loss
    self.best_acc = 0.95
    self.use_gpu = use_gpu
    self.workers = workers
    # `alphabet` is a module-level constant; converter maps text <-> label ids.
    self.converter = utils.strLabelConverter(alphabet)
    self.criterion = CTCLoss()
    if self.use_gpu:
        print("[use gpu] ...")
        self.model = self.model.cuda()
        self.criterion = self.criterion.cuda()
    if torch.cuda.is_available() and not self.use_gpu:
        print("[WARNING] You have a CUDA device, so you should probably run with --cuda")
    # Load a previously saved model checkpoint.
    if os.path.exists(self.model_file):
        self.load(self.model_file)
    else:
        print('[Load model] error !!!')
    self.transform = T.Compose([
        T.Resize((self.img_h, self.img_w)),
        T.ToTensor(),
        # T.Normalize(mean=[.5, .5, .5], std=[.5, .5, .5])
    ])
    train_label = os.path.join(train_path, 'labels_normal.txt')
    train_dataset = my_dataset.MyDataset(root=train_path, label_file=train_label, transform=self.transform, is_train=True, img_h=self.img_h, img_w=self.img_w)
    self.train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=int(self.workers))
    test_label = os.path.join(test_path, 'labels_normal.txt')
    test_dataset = my_dataset.MyDataset(root=test_path, label_file=test_label, transform=self.transform, is_train=False, img_h=self.img_h, img_w=self.img_w)
    self.test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=int(self.workers))
    # setup optimizer
    # if opt.adam:
    #     self.optimizer = optim.Adam(crnn.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
    # elif opt.adadelta:
    #     self.optimizer = optim.Adadelta(crnn.parameters(), lr=opt.lr)
    # else:
    #     self.optimizer = optim.RMSprop(crnn.parameters(), lr=opt.lr)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=1e-5)
def main(model_path, confs):
    """Evaluate a saved MultiTask model on every configured test manifest.

    Chooses the criterion according to which output heads the model was
    trained with (accent classification, transcription, or both), decodes
    with a beam-search CTC decoder when transcripts are produced, and prints
    the per-manifest metrics either as plain text or as a LaTeX table row.

    :param model_path: checkpoint path for MultiTask.load_model
    :param confs: dict of test-time configuration (manifests, decoder params, ...)
    """
    model, __ = MultiTask.load_model(model_path)
    if confs['cuda']:
        model = model.cuda()

    # Criterion matches the heads the model was trained with.
    if not model._meta['use_transcripts_out']:  # only accent classification
        criterion = nn.CrossEntropyLoss()
    elif not model._meta['use_accents_out']:  # only text recognition
        criterion = CTCLoss()
    else:  # both tasks
        criterion = (CTCLoss(), nn.CrossEntropyLoss())

    # Results
    results = {}
    for manifest, lm in confs['testing_manifests']:
        eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')

        # Decoder (only needed when the model emits transcripts)
        if model._meta['use_transcripts_out']:
            decoder = BeamCTCDecoder(confs['labels'], lm_path=lm,
                                     alpha=confs['decoder_alpha'], beta=confs['decoder_beta'],
                                     cutoff_top_n=confs['decoder_cutoff_top_n'],
                                     # BUGFIX: cutoff_prob previously reused the
                                     # cutoff_top_n value; use the dedicated key,
                                     # defaulting to 1.0 (no probability pruning).
                                     cutoff_prob=confs.get('decoder_cutoff_prob', 1.0),
                                     beam_width=confs['decoder_beam_width'],
                                     num_processes=confs['num_workers'])
            target_decoder = GreedyDecoder(confs['labels'])
        else:
            decoder, target_decoder = None, None

        # Test
        results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers'])

    if not PRINT_LATEX_TABLE:
        print(f'Model: {model_path.split("/")[-1]}')
        for name, res in results.items():
            print(f'\nResults for {name}:')
            print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
    else:
        # One LaTeX row: model name followed by each metric value.
        print(' & '.join(['model'] + list([k[:-4] for k in results.keys()])))
        val_dict = {}
        for k in list(results.values())[0].keys():
            val_dict[k] = []
        for res in results.values():
            [val_dict[k].append(f'{v:.1f}') for k, v in res.items()]
        for val in val_dict.values():
            print(' & '.join([Path(model_path).stem.split('_')[0]] + val) + r' \\')
def __init__(self):
    """Build the CRNN training harness: net, data, CTC criterion, optimizer.

    Reads all configuration from the module-level `args`; optionally restores
    a pretrained checkpoint and wraps the net in DataParallel when CUDA is on.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    if args.chars_file == '':
        self.alphabet = alphabetChinese
    else:
        self.alphabet = utils.load_chars(args.chars_file)
    nclass = len(self.alphabet) + 1  # +1 for the CTC blank label
    nc = 1  # single-channel (grayscale) input
    self.net = CRNN(args.imgH, nc, args.nh, nclass)
    self.train_dataloader, self.val_dataloader = self.dataloader(self.alphabet)
    self.criterion = CTCLoss()
    self.optimizer = self.get_optimizer()
    self.converter = utils.strLabelConverter(self.alphabet, ignore_case=False)
    self.best_acc = 0.00001
    model_name = '%s' % (args.dataset_name)
    if not os.path.exists(args.save_prefix):
        os.mkdir(args.save_prefix)
    args.save_prefix += model_name
    if args.pretrained != '':
        print('loading pretrained model from %s' % args.pretrained)
        checkpoint = torch.load(args.pretrained)
        if 'model_state_dict' in checkpoint.keys():
            # self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            args.start_epoch = checkpoint['epoch']
            self.best_acc = checkpoint['best_acc']
            checkpoint = checkpoint['model_state_dict']
        from collections import OrderedDict
        model_dict = OrderedDict()
        # Strip a possible 'module.' prefix left by DataParallel checkpoints.
        for k, v in checkpoint.items():
            if 'module' in k:
                model_dict[k[7:]] = v
            else:
                model_dict[k] = v
        self.net.load_state_dict(model_dict)
    if not args.cuda and torch.cuda.is_available():
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    elif args.cuda and torch.cuda.is_available():
        print('available gpus is ', torch.cuda.device_count())
        # BUGFIX: DataParallel has no 'output_dim' keyword — the original
        # `torch.nn.DataParallel(self.net, output_dim=1)` raised TypeError.
        # Use the default batch-dim (dim=0) scatter/gather.
        self.net = torch.nn.DataParallel(self.net).cuda()
        self.criterion = self.criterion.cuda()
def initiate(hyp_params, train_loader, valid_loader, test_loader):
    """Build model, optimizer, criterion (and optional CTC alignment modules)
    from hyp_params, then launch train_model.

    Unaligned, non-MULT models additionally get audio->language and
    vision->language CTC modules with their own optimizers.
    """
    model = getattr(models, hyp_params['model']+'Model')(hyp_params)
    if hyp_params['use_cuda']:
        model = model.cuda()
    optimizer = getattr(optim, hyp_params['optim'])(model.parameters(), lr=hyp_params['lr'])
    criterion = getattr(nn, hyp_params['criterion'])()# weight=hyp_params['weights']
    if hyp_params['aligned'] or hyp_params['model']=='MULT':
        # Aligned data (or MULT) needs no CTC-based modality alignment.
        ctc_criterion = None
        ctc_a2l_module, ctc_v2l_module = None, None
        ctc_a2l_optimizer, ctc_v2l_optimizer = None, None
    else:
        from warpctc_pytorch import CTCLoss
        ctc_criterion = CTCLoss()
        ctc_a2l_module, ctc_v2l_module = get_CTC_module(hyp_params)
        if hyp_params['use_cuda']:
            ctc_a2l_module, ctc_v2l_module = ctc_a2l_module.cuda(), ctc_v2l_module.cuda()
        ctc_a2l_optimizer = getattr(optim, hyp_params['optim'])(ctc_a2l_module.parameters(), lr=hyp_params['lr'])
        ctc_v2l_optimizer = getattr(optim, hyp_params['optim'])(ctc_v2l_module.parameters(), lr=hyp_params['lr'])
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=hyp_params['when'], factor=0.1, verbose=True)
    settings = {'model': model, 'optimizer': optimizer, 'criterion': criterion, 'ctc_a2l_module': ctc_a2l_module, 'ctc_v2l_module': ctc_v2l_module, 'ctc_a2l_optimizer': ctc_a2l_optimizer, 'ctc_v2l_optimizer': ctc_v2l_optimizer, 'ctc_criterion': ctc_criterion, 'scheduler': scheduler}
    return train_model(settings, hyp_params, train_loader, valid_loader, test_loader)
def test_seg_ctc(use_mine=True, use_log=False):
    """Smoke-test ctc_ent_cost (or warpctc's CTCLoss) by gradient-descending
    random logits for 50 Adam steps; the cost should decrease.

    :param use_mine: use the local ctc_ent_cost instead of warpctc
    :param use_log: forwarded to ctc_ent_cost
    """
    size = 43        # input (time) length
    voca_size = 37   # vocabulary size, excluding the blank
    n = 20           # batch size
    np.random.seed(1234)
    pred_len_np = np.ones([n])*size
    pred_np = np.random.random([size, n, voca_size+1])
    pred_np = np.log(pred_np)
    token_len_np = np.random.randint(low=2, high=10, size=n)
    # Labels are 1-based; 0 is reserved for the CTC blank.
    token_np = np.random.randint(voca_size, size=token_len_np.sum())+1

    pred = Variable(floatX(pred_np), requires_grad=True)
    token = Variable(T.IntTensor(token_np))
    sizes = Variable(T.IntTensor(pred_len_np))
    target_sizes = Variable(T.IntTensor(token_len_np))

    # BUGFIX: build the optimizer (and baseline criterion) once.  Re-creating
    # Adam inside the loop discarded its moment estimates on every step.
    optimizer = T.optim.Adam([pred], lr=3e-1)#, nesterov=True)
    if not use_mine:
        from warpctc_pytorch import CTCLoss
        criterion = CTCLoss().cuda()

    for i in range(50):
        if use_mine:
            H, cost = ctc_ent_cost(pred, token, sizes, target_sizes, use_log=use_log)
            # glog.info('%d, cost: %s, entropy: %s'% (i, cost.data.item(), H.data.item()))
            # cost = 0.9*cost - 0.1*H
        else:
            cost = criterion(pred, token, sizes, target_sizes)
            # glog.info('%d, cost: %s'% (i, cost.data.item()))
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
def test_ctc_loss():
    """Cross-check chainer's CTC loss against warpctc_pytorch on random data.

    warpctc is skipped (via importorskip) when unavailable; losses must agree
    within 5% after normalizing warpctc's sum by the batch size.
    """
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    from warpctc_pytorch import CTCLoss
    from e2e_asr_attctc_th import pad_list
    n_out = 7
    n_batch = 3
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    # Random per-sample predictions and integer targets of the given lengths.
    np_pred = [numpy.random.rand(il, n_out).astype(
        numpy.float32) for il in input_length]
    np_target = [numpy.random.randint(
        0, n_out, size=ol, dtype=numpy.int32) for ol in label_length]
    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr_attctc.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data
    # warpctc wants time-major (T, B, C) predictions and flat targets.
    th_pred = pad_list([torch.autograd.Variable(torch.from_numpy(x))
                        for x in np_pred]).transpose(0, 1)
    th_target = torch.autograd.Variable(
        torch.from_numpy(numpy.concatenate(np_target)))
    th_ilen = torch.autograd.Variable(torch.from_numpy(input_length))
    th_olen = torch.autograd.Variable(torch.from_numpy(label_length))
    # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does
    th_loss = (CTCLoss()(th_pred, th_target, th_ilen, th_olen)
               / n_batch).data.numpy()[0]
    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def train(
    model,
    epochs=110,
    batch_size=128,
    train_index_path=TRAIN_PATH,
    dev_index_path=DEV_PATH,
    labels_path=LABEL_PATH,
    learning_rate=0.6,
    momentum=0.8,
    max_grad_norm=0.2,
    weight_decay=0,
):
    """Train an ASR model with warp-ctc on the MASR datasets.

    Evaluates CER on the dev set each epoch and checkpoints the whole model
    every 5 epochs to pretrained/model_{epoch}.pth.
    """
    train_dataset = data.MASRDataset(train_index_path, labels_path)
    # Number of batches per epoch (ceil division).
    batchs = (len(train_dataset) + batch_size - 1) // batch_size
    dev_dataset = data.MASRDataset(dev_index_path, labels_path)
    train_dataloader = data.MASRDataLoader(train_dataset, batch_size=batch_size, num_workers=0)
    train_dataloader_shuffle = data.MASRDataLoader(train_dataset, batch_size=batch_size, num_workers=0, shuffle=True)
    dev_dataloader = data.MASRDataLoader(dev_dataset, batch_size=batch_size, num_workers=0)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(
        parameters,
        lr=learning_rate,
        momentum=momentum,
        nesterov=True,
        weight_decay=weight_decay,
    )
    ctcloss = CTCLoss(size_average=True)
    # lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.985)
    gstep = 0
    for epoch in range(epochs):
        epoch_loss = 0
        # First epoch runs in dataset order; subsequent epochs shuffle.
        if epoch > 0:
            train_dataloader = train_dataloader_shuffle
        # lr_sched.step()
        for i, (x, y, x_lens, y_lens) in enumerate(train_dataloader):
            x = x.cuda()
            out, out_lens = model(x, x_lens)
            # Rearrange to time-major layout for warp-ctc.
            # NOTE(review): double transpose assumes model output is
            # (batch, class, seq) — confirm against the model.
            out = out.transpose(0, 1).transpose(0, 2)
            loss = ctcloss(out, y, out_lens, y_lens)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            epoch_loss += loss.item()
            gstep += 1
            print("[{}/{}][{}/{}]\tLoss = {}".format(epoch + 1, epochs, i, int(batchs), loss.item()))
        epoch_loss = epoch_loss / batchs
        cer = eval(model, dev_dataloader)
        print("Epoch {}: Loss= {}, CER = {}".format(epoch, epoch_loss, cer))
        if (epoch + 1) % 5 == 0:
            torch.save(model, "pretrained/model_{}.pth".format(epoch))
def __init__(self, odim, eprojs, dropout_rate):
    """CTC head: project encoder features to the output vocabulary.

    :param odim: output vocabulary size
    :param eprojs: encoder projection (feature) dimension
    :param dropout_rate: dropout probability stored for later use
    """
    super(CTC, self).__init__()
    from warpctc_pytorch import CTCLoss
    self.dropout_rate = dropout_rate
    # Most recent loss value; populated by the forward pass.
    self.loss = None
    # Linear projection from encoder space to token logits.
    self.ctc_lo = torch.nn.Linear(eprojs, odim)
    self.loss_fn = CTCLoss()
def __init__(self, classes, class_agnostic):
    """Faster R-CNN with an OCR branch: RPN + RoI heads plus a BiLSTM/CTC
    digit recognizer applied to pooled text regions."""
    super(_fasterRCNN_OCR, self).__init__()
    self.classes = classes
    self.n_classes = len(classes)
    self.class_agnostic = class_agnostic
    # loss
    self.RCNN_loss_cls = 0
    self.RCNN_loss_bbox = 0
    # define rpn
    self.RCNN_rpn = _RPN(self.dout_base_model)
    self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)
    # self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0 / 16.0)
    self.RCNN_roi_align = RoIAlignAvg(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0 / 16.0)
    self.RCNN_ocr_roi_pooling = roi_pooling(2)  # ocr roi_pooling
    self.grid_size = cfg.POOLING_SIZE * 2 if cfg.CROP_RESIZE_WITH_MAX_POOL else cfg.POOLING_SIZE
    self.RCNN_roi_crop = _RoICrop()
    # RNN initialization, 256 hidden units (translated from Chinese).
    nh = 256
    # Recognized alphabet: digits plus '.'; +1 for the CTC blank.
    nclass = len('0123456789.') + 1
    self.rnn = nn.Sequential(BidirectionalLSTM(1024, nh, nh), BidirectionalLSTM(nh, nh, nclass))
    self.ctc_critition = CTCLoss().cuda()
def __init__(self, oracle, alphabet, image_shape, target, file_weights): self.learning_rate = 0.001 # self.learning_rate = 10 self.num_iterations = 5000 # self.num_iterations = 100 self.batch_size = bs = 1 self.phrase_length = len(target) self.o_imW, self.o_imH = image_shape self.i_imW, self.i_imH = imgW, imgH self.oracle = oracle self.weights = file_weights # Variable for adversarial noise, which is added to the image to perturb it if torch.cuda.is_available(): self.delta = Variable(torch.rand( (1, self.o_imH, self.o_imW)).cuda(), requires_grad=True) else: self.delta = Variable(torch.rand((1, self.o_imH, self.o_imW)), requires_grad=True) # Optimize on delta and use ctc as criterion ctcloss = CTCLoss() self.optimizer = optim.Adam([self.delta], lr=self.learning_rate, betas=(0.9, 0.999)) self.loss = ctcloss self.ctcloss = ctcloss self.target = target self.converter = utils.strLabelConverter(alphabet, attention=False)
def eval(model, dataloader):
    """Evaluate the model: return (average CER, average CTC loss).

    Temporarily switches the model to eval mode (restored to train on exit)
    and greedy-decodes each batch to compute character error rate.
    """
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    ctcloss = CTCLoss(size_average=True)
    cer = 0
    epoch_loss = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            # warp-ctc expects time-major (T, B, C) predictions.
            loss = ctcloss(
                outs.transpose(0, 1).transpose(0, 2), y, out_lens, y_lens)
            epoch_loss += loss.item()
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            # `y` is a flat concatenation of all targets; slice per sample.
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                #if len(ref) == 0 : print("ref:", ref, y_strings)
                # NOTE(review): divides by len(ref) — an empty reference
                # would raise ZeroDivisionError (see commented debug line).
                cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    # NOTE(review): `i` is undefined if the dataloader yields no batches.
    epoch_loss /= i + 1
    model.train()
    return cer, epoch_loss
def test_seg_ctc(use_mine=True):
    """Sanity-check ctc_cost (or warpctc's CTCLoss) on a tiny fixed example
    by gradient-descending the logits for 40 SGD steps; the printed cost
    should decrease.

    :param use_mine: use the local ctc_cost instead of warpctc
    """
    # (T, voca_size+1)
    pred_np = np.array([[0.5, 0.4, 0.1], [0.3, 0.1, 0.6], [0.7, 0.2, 0.1], [0.3, 0.5, 0.2]])[:, None]
    pred_np = np.log(np.tile(pred_np, (1, 2, 1)))  # duplicate into a batch of 2
    # pred_np = np.random.random((4,2,3))
    # (U)
    token_np = np.array([2, 2, 1, 2])

    pred = Variable(floatX(pred_np), requires_grad=True)
    token = Variable(T.IntTensor(token_np))
    sizes = Variable(T.IntTensor(np.array([4, 4])))
    target_sizes = Variable(T.IntTensor(np.array([2, 2])))

    # BUGFIX: build the optimizer (and baseline criterion) once — re-creating
    # SGD inside the loop discarded the momentum buffer every iteration.
    # Also converted Python-2 `print x` statements (a SyntaxError on
    # Python 3, which the rest of this file targets) to print() calls.
    optimizer = T.optim.SGD([pred], lr=3e-2, momentum=0.9, nesterov=True)
    if not use_mine:
        from warpctc_pytorch import CTCLoss
        criterion = CTCLoss().cuda()

    for i in range(40):
        if use_mine:
            cost = ctc_cost(pred, token, sizes, target_sizes)
        else:
            cost = criterion(pred, token, sizes, target_sizes)
        print(cost.data.item())
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
def forward(self, model, x, labels_flatten, img_seq_lens, label_lens, batch_size):
    """VAT-style forward pass: estimate the adversarial perturbation direction
    via `self.ip` power iterations, then return the LDS (local distributional
    smoothness) CTC loss at the adversarial point x + r_adv.

    BatchNorm statistics tracking is disabled for the whole procedure.
    """
    with _disable_tracking_bn_stats(model):
        # calc adversarial direction
        # prepare random unit tensor
        d = torch.rand(x.shape).to(
            torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        d = _l2_normalize(d)
        for _ in range(self.ip):
            d.requires_grad_()
            loss_function = CTCLoss()
            preds = model.forward(x + self.xi * d, img_seq_lens)
            adv_loss_ctc = loss_function(
                preds, labels_flatten,
                Variable(torch.IntTensor(np.array(img_seq_lens))),
                label_lens) / batch_size
            adv_loss_ctc.backward()
            # The gradient w.r.t. d becomes the next direction estimate.
            d = d.grad
            model.zero_grad()
        # calc LDS
        # NOTE(review): with self.ip == 0 the loop never runs and
        # `loss_function` below is unbound — confirm ip >= 1 in configs.
        r_adv = torch.sign(d) * self.eps
        pred_hat = model.forward(x + r_adv, img_seq_lens)
        lds = loss_function(
            pred_hat, labels_flatten,
            Variable(torch.IntTensor(np.array(img_seq_lens))),
            label_lens) / batch_size
    return lds
def __init__(self):
    """Bundle together model, CTC criterion, decoder, optimizer and LR schedule."""
    self.model = get_model().cuda()
    self.ctc_loss = CTCLoss(size_average=True)
    self.decoder = Decoder()
    # ASGD is used here (an Adam variant was tried previously).
    self.optimizer = optim.ASGD(
        self.model.parameters(),
        lr=configs.lr,
        weight_decay=configs.l2_weight_decay,
    )
    self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        'min',
        patience=configs.lr_scheduler_patience,
        factor=configs.lr_scheduler_factor,
        verbose=True,
    )
    # Training-progress bookkeeping.
    self.epoch_idx = 0
    self.min_avg_dist = 1000.
def main(opts): # alphabet = '0123456789.' nclass = len(alphabet) + 1 model_name = 'E2E-CRNN' net = OwnModel(attention=True, nclass=nclass) print("Using {0}".format(model_name)) if opts.cuda: net.cuda() learning_rate = opts.base_lr optimizer = torch.optim.Adam(net.parameters(), lr=opts.base_lr, weight_decay=weight_decay) optimizer = optim.Adam(net.parameters(), lr=opts.base_lr, betas=(0.5, 0.999)) step_start = 0 ### 第一种:只修改conv11的维度 # model_dict = net.state_dict() # if os.path.exists(opts.model): # print('loading pretrained model from %s' % opts.model) # pretrained_model = OwnModel(attention=True, nclass=12) # pretrained_model.load_state_dict(torch.load(opts.model)['state_dict']) # pretrained_dict = pretrained_model.state_dict() # # pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict and 'rnn' not in k and 'conv11' not in k} # model_dict.update(pretrained_dict) # net.load_state_dict(model_dict) if os.path.exists(opts.model): print('loading model from %s' % args.model) step_start, learning_rate = net_utils.load_net(args.model, net, optimizer) ## ICDAR2015数据集 e2edata = E2Edataset(train_list=opts.train_list) e2edataloader = torch.utils.data.DataLoader(e2edata, batch_size=opts.batch_size, shuffle=True, collate_fn=E2Ecollate, num_workers=4) net.train() converter = strLabelConverter(alphabet) ctc_loss = CTCLoss() for step in range(step_start, opts.max_iters): for index, date in enumerate(e2edataloader): im_data, gtso, lbso = date im_data = im_data.cuda() try: loss= process_crnn(im_data, gtso, lbso, net, ctc_loss, converter, training=True) net.zero_grad() # optimizer.zero_grad() loss.backward() optimizer.step() except: import sys, traceback traceback.print_exc(file=sys.stdout) pass if index % disp_interval == 0:
def __init__(self):
    """CTC module: GPU-resident warp-ctc criterion plus bookkeeping slots."""
    super(CTC, self).__init__()
    # Criterion lives on the GPU.
    self.criterion = CTCLoss().cuda()
    # Filled in later by the forward/accumulation logic.
    self.softmax = None
    self.label = None
    # Running counters, all starting from zero.
    self.len = 0
    self.times = 0
    self.count = 0
def __init__(self, alphabets=None, nh=256):
    """OCR loss head: two stacked BiLSTMs feeding a GPU warp-ctc criterion.

    :param alphabets: recognized character set (one output class each)
    :param nh: hidden size of both BiLSTM layers (256 by default)
    """
    super(OcrLoss, self).__init__()
    num_classes = len(alphabets) + 1  # extra class for the CTC blank
    self.rnn = nn.Sequential(
        BidirectionalLSTM(1024, nh, nh),
        BidirectionalLSTM(nh, nh, num_classes),
    )
    self.ctc_critition = CTCLoss().cuda()
def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
    """
    Compute the gradient of the loss function w.r.t. `x`.

    :param x: Samples of shape (nb_samples, seq_length). Note that, it is allowable that sequences in the batch
              could have different lengths. A possible example of `x` could be:
              `x = np.array([np.array([0.1, 0.2, 0.1, 0.4]), np.array([0.3, 0.1])])`.
    :param y: Target values of shape (nb_samples). Each sample in `y` is a string and it may possess different
              lengths. A possible example of `y` could be: `y = np.array(['SIXTY ONE', 'HELLO'])`.
    :return: Loss gradients of the same shape as `x`.
    """
    from warpctc_pytorch import CTCLoss

    x_ = x.copy()

    # Put the model in the training mode
    self._model.train()

    # Apply preprocessing
    x_preprocessed, y_preprocessed = self._apply_preprocessing(x_, y, fit=False)

    # Transform data into the model input space
    # (compute_gradient=True makes the inputs track gradients)
    inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
        x=x_preprocessed, y=y_preprocessed, compute_gradient=True
    )

    # Compute real input sizes
    input_sizes = input_rates.mul_(inputs.size()[-1]).int()

    # Call to DeepSpeech model for prediction
    outputs, output_sizes = self._model(inputs.to(self._device), input_sizes.to(self._device))
    outputs = outputs.transpose(0, 1)
    float_outputs = outputs.float()

    # Loss function
    criterion = CTCLoss()
    loss = criterion(float_outputs, targets, output_sizes, target_sizes).to(self._device)
    # Average the summed warp-ctc loss over the batch.
    loss = loss / inputs.size(0)

    # Compute gradients (through apex AMP scaling when mixed precision is on)
    if self._use_amp:
        from apex import amp

        with amp.scale_loss(loss, self._optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    # Get results: per-sample gradients collected from the preprocessed inputs
    results = []
    for i in range(len(x_preprocessed)):
        results.append(x_preprocessed[i].grad.cpu().numpy().copy())

    results = np.array(results)
    results = self._apply_preprocessing_gradient(x_, results)

    return results
def main(opts):
    """Run CRNN digit recognition over a dataset and report exact-match accuracy.

    Loads an optional checkpoint, iterates the whole dataset one image at a
    time in eval mode, and prints correct/total at the end.
    """
    alphabet = '0123456789.'
    nclass = len(alphabet) + 1  # +1 for the CTC blank
    model_name = 'crnn'
    net = CRNN(nclass)
    print("Using {0}".format(model_name))
    if opts.cuda:
        net.cuda()
    learning_rate = opts.base_lr
    optimizer = torch.optim.Adam(net.parameters(), lr=opts.base_lr, weight_decay=weight_decay)
    if os.path.exists(opts.model):
        # BUGFIX: this branch referenced the undefined name `args` instead of
        # the `opts` parameter, raising NameError whenever a checkpoint existed.
        print('loading model from %s' % opts.model)
        step_start, learning_rate = net_utils.load_net(opts.model, net, optimizer)
    ## Dataset
    converter = strLabelConverter(alphabet)
    dataset = ImgDataset(
        root='/home/yangna/deepblue/OCR/mech_demo2/dataset/imgs/image',
        csv_root='/home/yangna/deepblue/OCR/mech_demo2/dataset/imgs/train_list.txt',
        transform=None,
        target_transform=converter.encode
    )
    ocrdataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=False, collate_fn=own_collate
    )
    num_count = 0
    net = net.eval()
    converter = strLabelConverter(alphabet)
    ctc_loss = CTCLoss()
    for step in range(len(dataset)):
        # Restart the iterator transparently when it is exhausted.
        try:
            data = next(data_iter)
        except:
            data_iter = iter(ocrdataloader)
            data = next(data_iter)
        im_data, gt_boxes, text = data
        im_data = im_data.cuda()
        try:
            res = process_crnn(im_data, gt_boxes, text, net, ctc_loss, converter, training=False)
            pred, target = res
            if pred == target[0]:
                num_count += 1
        except:
            # Best effort: log the failure and keep evaluating.
            import sys, traceback
            traceback.print_exc(file=sys.stdout)
            pass
    print('correct/total:%d/%d'%(num_count, len(dataset)))
def test_seg_ctc(use_mine=True, use_log=False):
    """Gradient-descend random logits against ctc_ent_cost (or warpctc) for
    50 Adam steps, logging cost (and entropy) via glog each iteration.

    :param use_mine: use the local ctc_ent_cost instead of warpctc's CTCLoss
    :param use_log: forwarded to ctc_ent_cost
    """
    size = 43        # input (time) length
    voca_size = 37   # vocabulary size, excluding the blank
    n = 20           # batch size
    np.random.seed(1234)
    pred_len_np = np.ones([n]) * size
    pred_np = np.random.random([size, n, voca_size + 1])
    pred_np = np.log(pred_np)
    token_len_np = np.random.randint(low=2, high=10, size=n)
    # Labels are 1-based; 0 is reserved for the CTC blank.
    token_np = np.random.randint(voca_size, size=token_len_np.sum()) + 1
    # pred_np = np.load('/home/jins/CTC/test/preds.npy')
    # pred_len_np = np.load('/home/jins/CTC/test/preds_size.npy')
    # token_len_np = np.load('/home/jins/CTC/test/y_length.npy')
    # token_np = np.load('/home/jins/CTC/test/text.npy')
    # pdb.set_trace()
    pred = Variable(floatX(pred_np), requires_grad=True)
    token = Variable(T.IntTensor(token_np))
    sizes = Variable(T.IntTensor(pred_len_np))
    target_sizes = Variable(T.IntTensor(token_len_np))
    # # (T, voca_size+1)
    # pred_np = np.array([[0.5, 0.4, 0.1], [0.3, 0.1, 0.6], [0.7, 0.2, 0.1], [0.3, 0.5, 0.2]])[:, None]
    # pred_np = np.log(np.tile(pred_np, (1,2,1)))
    ## pred_np = np.random.random((4,2,3))
    # # (U)
    # token_np = np.array([2, 2, 1, 2])
    #
    # pred = Variable(floatX(pred_np), requires_grad=True)
    # token = Variable(T.IntTensor(token_np))
    # sizes = Variable(T.IntTensor(np.array([4, 4])))
    # target_sizes = Variable(T.IntTensor(np.array([2, 2])))
    for i in range(50):
        if use_mine:
            H, cost = ctc_ent_cost(pred, token, sizes, target_sizes, use_log=use_log)
            glog.info('%d, cost: %s, entropy: %s' % (i, cost.data.item(), H.data.item()))
            # cost = 0.9*cost - 0.1*H
        else:
            from warpctc_pytorch import CTCLoss
            criterion = CTCLoss().cuda()
            cost = criterion(pred, token, sizes, target_sizes)
            glog.info('%d, cost: %s' % (i, cost.data.item()))
        # NOTE(review): the optimizer is re-created every iteration, so Adam's
        # moment estimates never accumulate — likely unintended; confirm.
        optimizer = T.optim.Adam([pred], lr=3e-1)  #, nesterov=True)
        optimizer.zero_grad()
        (cost).backward()
        optimizer.step()
def train(self, mode=True):
    """
    Enter (or exit) training mode, lazily building the CTC criterion.

    :param mode: if True, set model up for training
    :return:
    """
    # Build the loss function on first entry into training mode only.
    loss_missing = self.loss_func is None
    if mode and loss_missing:
        self.loss_func = CTCLoss()
    super().train(mode=mode)
def train_ctc(rnn_num_layers = 2, learning_rate = 1e-3):
    """Train a CTC speech model on CommonVoice spectrograms for 200 epochs,
    beam-decoding one sample per epoch as a qualitative check.

    Relies on module-level globals: batch_size, input_dim, hidden_dim,
    output_dim, device.
    """
    dataset = SpectrogramDataset('data/CommonVoice/valid_train.h5', model_ctc = True)
    norm_transform = Normalize(dataset)
    decoder = CTCDecoder(dataset.char_to_ix)
    dataset.set_transform(norm_transform)
    data_loader = DataLoader(dataset, collate_fn = dataset.merge_batches, batch_size = batch_size, shuffle = True)
    print("dataset len")
    print(dataset.__len__())
    print("\nDataset loading completed\n")
    model = CTCModel(input_dim, hidden_dim, output_dim, rnn_num_layers, batch_size)
    model.to(device)
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum = 0.9)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    # The last output class is used as the CTC blank.
    ctc_loss = CTCLoss(blank = output_dim - 1)
    count = 0
    print("Begin training")
    for epoch in range(200):
        print("***************************")
        print("EPOCH NUM %d" % epoch)
        print("***************************")
        cost_epoch_sum = 0
        cost_tstep_sum = 0
        for i_batch, sample_batched in enumerate(data_loader):
            optimizer.zero_grad()
            padded_X, seq_labels, X_lengths, Y_lengths = sample_batched
            # Skip the final, incomplete batch.
            if (len(X_lengths) < batch_size):
                break
            # Get the distributions
            padded_X = padded_X.cuda()
            log_probs = model(padded_X, X_lengths)
            log_probs = log_probs.transpose(0, 1)
            log_probs.requires_grad_(True)
            # NOTE(review): the length formula presumably mirrors the model's
            # convolutional downsampling of the time axis — confirm.
            cost = ctc_loss(log_probs.float(), seq_labels, (((X_lengths - 8) // 2) - 2) // 2, Y_lengths)
            cost.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
            optimizer.step()
            print(cost)
            cost_epoch_sum += float(cost)
        print("***************************")
        print("PREDICTION")
        # Qualitative check: beam-decode the first dataset sample.
        model = model.eval()
        xseq, yseq = dataset[0]
        xseq = torch.FloatTensor([xseq])
        xseq = norm_transform(xseq.cuda())
        log_probs = model(xseq.cuda())
        logprobs_numpy = log_probs[0].data.cpu().numpy()
        decoded_seq, _ = decoder.beam_search_decoding(log_probs[0].data.cpu().numpy(), beam_size = 100)
        model = model.train()
        print("Ground truth: ", yseq)
        print("Prediction: ", decoded_seq)
        # NOTE(review): 4076 looks like a hard-coded batch count — confirm.
        print("Avg cost per epoch: ", cost_epoch_sum / 4076)
        print("***************************")
def __init__(self, imgH, nc, nclass, nh, ngpu, n_rnn=2, leakyRelu=False):
    """CRNN: 7-layer convolutional feature extractor followed by a 2-layer
    bidirectional LSTM, with a GPU warp-ctc criterion attached.

    :param imgH: input image height, must be a multiple of 16
    :param nc: number of input channels
    :param nclass: number of output classes (alphabet + CTC blank)
    :param nh: LSTM hidden size
    :param ngpu: GPU count forwarded to the BiLSTM layers
    """
    super(CRNN, self).__init__()
    self.ngpu = ngpu
    assert imgH % 16 == 0, 'imgH has to be a multiple of 16'
    # Per-conv-layer kernel sizes, paddings, strides and output channels.
    ks = [3, 3, 3, 3, 3, 3, 2]
    ps = [1, 1, 1, 1, 1, 1, 0]
    ss = [1, 1, 1, 1, 1, 1, 1]
    nm = [64, 128, 256, 256, 512, 512, 512]
    cnn = nn.Sequential()
    self.criterion = CTCLoss()
    self.criterion = self.criterion.cuda()

    def convRelu(i, batchNormalization=False):
        # Append conv (+ optional BN) + ReLU block i to the extractor.
        nIn = nc if i == 0 else nm[i - 1]
        nOut = nm[i]
        cnn.add_module('conv{0}'.format(i), nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
        if batchNormalization:
            cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
        if leakyRelu:
            cnn.add_module('relu{0}'.format(i), nn.LeakyReLU(0.2, inplace=True))
        else:
            cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

    convRelu(0)
    cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64
    convRelu(1)
    cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32
    convRelu(2, True)
    convRelu(3)
    # Asymmetric pooling keeps the horizontal resolution for sequence reading.
    cnn.add_module('pooling{0}'.format(2), nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16
    convRelu(4, True)
    convRelu(5)
    cnn.add_module('pooling{0}'.format(3), nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16
    convRelu(6, True)  # 512x1x16
    self.cnn = cnn
    self.rnn = nn.Sequential(BidirectionalLSTM(512, nh, nh, ngpu), BidirectionalLSTM(nh, nh, nclass, ngpu))
def init_model(cls, ninput, nhidden, noutput, codec, normalize=kraken.lib.lstm.normalize_nfkc, cuda=torch.cuda.is_available()):
    """Alternate constructor: build a fresh clstm-style BiLSTM recognizer.

    NOTE(review): the `cuda` default is evaluated once at import time, so
    CUDA availability changes after import are not picked up — confirm intent.
    """
    model = cls()
    model.codec = codec
    model.normalize = normalize
    model.rnn = TBIDILSTM(ninput, nhidden, noutput)
    model.setLearningRate()
    # Fresh model: no trials run yet, clstm-compatible mode.
    model.trial = 0
    model.mode = 'clstm'
    model.criterion = CTCLoss()
    model.cuda_available = cuda
    if model.cuda_available:
        model.cuda()
    return model
def __init__(self, config):
    """Pre-training encoder wrapper: encoder + linear projection + CTC loss."""
    super(Pre_encoder, self).__init__()
    self.config = config
    # define encoder
    # self.encoder = BuildEncoder(config)
    self.encoder = build_encoder(config)
    # Project 800-dim encoder features onto the 2664-way output vocabulary.
    self.project_layer = nn.Linear(800, 2664)
    self.crit = CTCLoss()
def main(args): print('Args:', args) # Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) model = MyModel(args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti) # model = MyModel(args.nhid, args.nlayers) # model.load_state_dict(torch.load("/mnt/part2/e1/010_0.00.w")) if args.cuda: model.cuda() total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters()) print('Model total parameters:', total_params) train_loader, dev_loader = get_data_loaders(args) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay) criterion = CTCLoss() scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1, threshold=0.01, verbose=True) for epoch in range(1, args.epochs + 1): print(datetime.now()) epoch_start_time = time.time() train(epoch, model, optimizer, criterion, train_loader, args) if True: val_loss_utter, val_loss_phoneme, val_cer = evaluate(model, criterion, dev_loader, args, calc_cer=True) scheduler.step(val_loss_phoneme) print('-' * 89) print( '| end of epoch {:3d} | time: {:5.2f}s | loss/utter {:5.2f} | loss/phoneme {:5.4f} | valid cer {:5.4f}' .format(epoch, (time.time() - epoch_start_time), val_loss_utter, val_loss_phoneme, val_cer)) print('-' * 89) if not os.path.exists(args.weights_dir): os.makedirs(args.weights_dir) weight_fname = "{}/{:03d}_{}.w".format( args.weights_dir, epoch, "{:.2f}".format(val_loss_phoneme)) print("saving as", weight_fname) torch.save(model.state_dict(), weight_fname)
def naren_loss(out, labels, input_lens, label_lens, blank_idx):
    """Compute the CTC loss via Sean Naren's warp-ctc bindings.

    The model output is permuted to (seq, batch, class), cast to float and
    moved to the CPU — the layout warp-ctc expects; the `.cpu()` call is also
    required to obtain a non-zero loss value.
    """
    from warpctc_pytorch import CTCLoss
    criterion = CTCLoss(blank=blank_idx, size_average=True, length_average=False)
    seq_first = out.permute(1, 0, 2).float().cpu()
    return criterion(seq_first, labels, input_lens, label_lens)
train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=opt.batchSize, shuffle=True, sampler=sampler, num_workers=int(opt.workers), collate_fn=dataset.alignCollate(imgH=opt.imgH, keep_ratio=opt.keep_ratio)) test_dataset = dataset.lmdbDataset( root=opt.valroot, transform=dataset.resizeNormalize((100, 32))) ngpu = int(opt.ngpu) nh = int(opt.nh) alphabet = opt.alphabet nclass = len(alphabet) + 1 nc = 1 converter = utils.strLabelConverter(alphabet) criterion = CTCLoss() # custom weights initialization called on crnn def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: m.weight.data.normal_(0.0, 0.02) elif classname.find('BatchNorm') != -1: m.weight.data.normal_(1.0, 0.02) m.bias.data.fill_(0) crnn = crnn.CRNN(opt.imgH, nc, nclass, nh, ngpu) crnn.apply(weights_init) if opt.crnn != '': print('loading pretrained model from %s' % opt.crnn)