def build_dataloader(cfg, is_train=True):
    """Build dataloaders from the config.

    The dataset type does not need to distinguish detection vs. recognition;
    only the relevant directory addresses from the config are used.

    :param cfg: project config node (DATASETS / BASE / ADDRESS / MODEL sections)
    :param bool is_train: True  -> return (train_dataloader, val_dataloader)
                          False -> return test_dataloader
    """
    type_name = cfg.DATASETS.TYPE
    model_type = cfg.BASE.TYPE  # a single letter encodes the model type
    # Recognition models ('R') need the character alphabet; others use an empty one.
    if model_type == 'R':
        alphabet = Alphabet(cfg.ADDRESS.ALPHABET)
    else:
        alphabet = Alphabet()
    if is_train:
        train_data_dir = cfg.ADDRESS.TRAIN_DATA_DIR
        train_anno_dir = cfg.ADDRESS.TRAIN_GT_DIR
        val_data_dir = cfg.ADDRESS.VAL_DATA_DIR
        val_anno_dir = cfg.ADDRESS.VAL_GT_DIR
        train_set = get_dataset(cfg, type_name, train_data_dir, train_anno_dir,
                                split='train', alphabet=alphabet)
        val_set = get_dataset(cfg, type_name, val_data_dir, val_anno_dir,
                              split='val', alphabet=alphabet)
        train_dataloader = get_dataloader(cfg, type_name, dataset=train_set, split='train')
        val_dataloader = get_dataloader(cfg, type_name, dataset=val_set, split='val')
        # The global batch size must split evenly across the GPUs in use.
        images_per_batch = cfg.MODEL.BATCH_SIZE
        num_gpus = int(cfg.BASE.NUM_GPUS)
        assert images_per_batch % num_gpus == 0, (
            "IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        if images_per_gpu > 5:
            # Restored warning that was previously commented out (the logger
            # was created but never used).
            logger = logging.getLogger(__name__)
            logger.warning(
                "A high number of images per GPU may cause out-of-memory "
                "errors; if that happens, lower BATCH_SIZE and retune the "
                "learning rate and other settings that may affect accuracy."
            )
        return train_dataloader, val_dataloader
    else:
        test_data_dir = cfg.ADDRESS.TEST_DATA_DIR
        test_anno_dir = cfg.ADDRESS.TEST_GT_DIR
        # BUG FIX: the original passed an undefined name `name` here, which
        # raised NameError on every test-time call; use `type_name` as in the
        # training branch.
        test_set = get_dataset(cfg, type_name, test_data_dir, test_anno_dir,
                               split='test', alphabet=alphabet)
        test_dataloader = get_dataloader(cfg, type_name, dataset=test_set, split='test')
        return test_dataloader
def __init__(self, opt):
    """Capsule-CNN recognizer with an attention decoder and an STN head."""
    nn.Module.__init__(self)
    self.opt = opt
    # The number of output classes comes from the character alphabet file.
    from alphabet.alphabet import Alphabet
    alphabet = Alphabet(opt.ADDRESS.ALPHABET)
    self.n_class = len(alphabet)
    # Backbone and sequence encoder are built by project helper methods.
    self.cnn = self.getCNN_cap()
    self.rnn = self.getEncoder()
    # NOTE(review): positional order appears to be (input_size, hidden_size,
    # num_classes, num_embeddings), matching the keyword call used elsewhere
    # in this file — confirm.
    self.attention = Attention(256, 256, self.n_class, 128)
    # Spatial-transformer localization network: two conv + pool + ReLU stages.
    self.localization = nn.Sequential(
        nn.Conv2d(1, 8, kernel_size=7),
        nn.MaxPool2d(2, stride=2),
        nn.ReLU(True),
        nn.Conv2d(8, 10, kernel_size=5),
        nn.MaxPool2d(2, stride=2),
        nn.ReLU(True),
    )
    # Regressor predicting the 3 x 2 affine matrix.
    self.fc_loc = nn.Sequential(
        nn.Linear(10 * 4 * 21, 32),
        nn.ReLU(True),
        nn.Linear(32, 3 * 2),
    )
    # Initialize the regressor to the identity transform:
    # zero weights, identity bias.
    final_fc = self.fc_loc[2]
    final_fc.weight.data.fill_(0)
    final_fc.bias.data = torch.FloatTensor([1, 0, 0, 0, 1, 0])
def __init__(self, opt):
    """BLSTM encoder followed by an attention decoder over the alphabet."""
    nn.Module.__init__(self)
    # Class count is the number of characters in the alphabet file.
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.blstm = BLSTM(512, 256)
    self.attention = Attention(
        input_size=512,
        hidden_size=256,
        num_classes=self.n_class,
        num_embeddings=128,
    )
def __init__(self, opt):
    """CRNN: CNN feature extractor followed by an RNN sequence head.

    n_class is the alphabet size plus one — presumably the extra slot is
    the CTC blank label; confirm against the loss used by the trainer.
    """
    super(CRNN, self).__init__()
    self.opt = opt
    from alphabet.alphabet import Alphabet
    alphabet_size = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.n_class = alphabet_size + 1
    self.cnn = self.getCNN()
    self.rnn = self.getRNN()
def __init__(self, opt=None):
    """ResNet backbone -> Encoder -> attention decoder."""
    nn.Module.__init__(self)
    self.opt = opt
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    # Backbone; the original author notes its output shape as (BS, 6, 40).
    self.cnn = ResNet(
        num_in=opt.IMAGE.IMG_CHANNEL,
        block=BasicBlock,
        layers=[1, 2, 5, 3],
    )
    self.encoder = Encoder()  # noted as producing (40, BS, 512)
    self.decoder = Attention(opt=opt)
def __init__(self, opt):
    """Capsule-network backbone -> Encoder -> attention decoder."""
    nn.Module.__init__(self)
    self.opt = opt
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.cnn = CapsNet(E=10)  # noted by the original author as (BS, 6, 40)
    self.encoder = Encoder()  # noted as (40, BS, 512)
    self.decoder = Attention(opt)
    # Maps 272-dim capsule features to 512 — presumably to match the
    # encoder's expected input width; confirm.
    self.fc = nn.Linear(272, 512, bias=True)
    self.relu = nn.ReLU()
def __init__(self, opt):
    """Attention decoder head mapping 512-dim features to alphabet classes."""
    super(Attention, self).__init__()
    self.attention_cell = AttentionCell()
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.generator = nn.Linear(512, self.n_class)
    # One 128-dim embedding row per class plus one extra row — presumably a
    # start/EOS token; confirm against the decoding loop.
    self.char_embeddings = Parameter(torch.randn(self.n_class + 1, 128))
    # Layers used for the conv_feats path.
    self.conv = nn.Conv2d(512, 512, 3, 1, 1)
    self.bn = nn.BatchNorm2d(512)
    self.relu = nn.ReLU()
def __init__(self, opt):
    """STN-rectified CNN encoder with an attention decoder."""
    nn.Module.__init__(self)
    self.opt = opt
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.stn = SpatialTransformer(self.opt)
    self.cnn = self.getCNN()
    self.rnn = self.getEncoder()
    # NOTE(review): positional order appears to be (input_size, hidden_size,
    # num_classes, num_embeddings), matching the keyword call used elsewhere
    # in this file — confirm.
    self.attention = Attention(256, 256, self.n_class, 128)
def loadTool(self):
    """Load the components (alphabet + label converter) the model type needs."""
    if self.opt.BASE.TYPE == 'R':
        self.alphabet = Alphabet(self.opt.ADDRESS.ALPHABET)
        # CTC-trained models need the CTC converter; the rest use the
        # attention-style converter.
        if self.opt.BASE.MODEL in ('GRCNN', 'CRNN', 'CAPSOCR2'):
            from utils.strLabelConverterForCTC import strLabelConverterForCTC
            self.converter = strLabelConverterForCTC(self.alphabet.str)
        else:
            from utils.strLabelConverterForAttention import strLabelConverterForAttention
            self.converter = strLabelConverterForAttention(self.alphabet.str)
    # Validation bookkeeping.
    self.highestAcc = 0
    self.val_times = 0
def __init__(self, opt):
    # Capsule-routing variant of the STN/attention recognizer: builds the CNN
    # via getCNN_sr() and stacks self-routing capsule layers on top.
    nn.Module.__init__(self)
    from alphabet.alphabet import Alphabet
    # Output classes = characters in the alphabet file.
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.opt = opt
    # self.stn = SpatialTransformer(self.opt)
    self.cnn = self.getCNN_sr()
    self.rnn = self.getEncoder()
    # n_class,hidden_size,num_embedding,input_size
    # self.attention = Attention(self.n_class,256, 128,256)
    # NOTE(review): positional order appears to be (input_size, hidden_size,
    # num_classes, num_embeddings), matching the keyword call used elsewhere
    # in this file — the comment above disagrees; confirm.
    self.attention = Attention(256, 256, self.n_class, 128)
    self.conv_layers = nn.ModuleList()
    self.norm_layers = nn.ModuleList()
    # ========= ConvCaps Layers
    # NOTE(review): range(1, 2) iterates exactly once, so a single
    # SelfRouting2d + BatchNorm2d pair is appended; confirm whether a deeper
    # stack was intended. num_caps, caps_size and planes are module-level
    # names defined outside this view.
    for d in range(1, 2):
        '''autoregressive (self-routing) capsule layer'''
        self.conv_layers.append(
            SelfRouting2d(num_caps, num_caps, caps_size, caps_size,
                          kernel_size=3, stride=1, padding=1, pose_out=True))
        '''batch norm for the capsule layer output'''
        self.norm_layers.append(nn.BatchNorm2d(caps_size * num_caps))
    '''activation (identity) output'''
    self.conv_a = nn.Conv2d(8 * planes, num_caps, kernel_size=3, stride=1,
                            padding=1, bias=False)
    '''pose variables'''
    self.conv_pose = nn.Conv2d(8 * planes, num_caps * caps_size, kernel_size=3,
                               stride=1, padding=1, bias=False)
    '''two batch norms, one each for activations and poses'''
    self.bn_a = nn.BatchNorm2d(num_caps)
    self.bn_pose = nn.BatchNorm2d(num_caps * caps_size)
def predict(self, texts, embedding_alphabet: AlphabetEmbeddings, label_alphabet: Alphabet, batch_size):
    """Run batched inference over `texts` and map argmax indices to labels.

    :param texts: sequence of input samples
    :param embedding_alphabet: pads/tensorizes one batch of samples
    :param label_alphabet: maps predicted class indices back to label instances
    :param batch_size: number of samples per forward pass
    :return: list of predicted labels, one per input text
    """
    total = len(texts)
    num_batches = (total + batch_size - 1) // batch_size
    predictions = []
    for batch_idx in range(num_batches):
        lo = batch_idx * batch_size
        hi = min(lo + batch_size, total)
        batch, lengths, mask = embedding_alphabet.add_padding_tensor(
            texts[lo:hi], gpu=self.gpu)
        logits = self.forward(batch, lengths, mask)
        # Class index = argmax over the last (class) dimension.
        indices = torch.argmax(logits, dim=-1, keepdim=False).tolist()
        predictions.extend(label_alphabet.get_instance(indices))
    return predictions
def __init__(self, opt):
    """Feature extractor -> CAM attention maps -> DTD decoder."""
    nn.Module.__init__(self)
    self.opt = opt
    from alphabet.alphabet import Alphabet
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.fe = Feature_Extractor(
        strides=[(1, 1), (2, 2), (1, 1), (2, 2), (1, 1), (1, 1)],
        compress_layer=False,
        input_shape=[1, 32, 128],
    )
    # The per-scale feature shapes configure the attention module.
    scales = self.fe.Iwantshapes()
    self.cam = CAM(scales=scales, maxT=25, depth=8, num_channels=64)
    self.dtd = DTD(nclass=self.n_class, nchannel=512, dropout=0.3)
def __init__(self, opt):
    """CRANN: CNN + RNN pair looked up by name from the config.

    :param opt: project config; it must be JSON-serializable because it is
        round-tripped through JSON to obtain a plain nested dict.
    """
    # Call the Module initializer first so attribute registration is set up
    # before anything else is assigned.
    super(newCRANN, self).__init__()
    from alphabet.alphabet import Alphabet
    # +1 presumably reserves a class for the CTC blank label — confirm
    # against the loss used by the trainer.
    self.n_class = len(Alphabet(opt.ADDRESS.ALPHABET)) + 1
    # JSON round-trip converts the config object into plain dicts/lists.
    self.crann_config = json.loads(json.dumps(opt))
    crann_config = self.crann_config
    self.ngpu = crann_config['BASE']['NUM_GPUS']
    cnn_conf = crann_config['CNN']
    print('Constructing {}'.format(cnn_conf['MODEL']))
    self.cnn = ConvNets.__dict__[cnn_conf['MODEL']]()
    rnn_conf = crann_config['RNN']
    print('Constructing {}'.format(rnn_conf['MODEL']))
    self.rnn = SeqNets.__dict__[rnn_conf['MODEL']](rnn_conf, self.n_class)
def __init__(self, opt):
    """Initialize the MORAN model, composed of a MORN and an ASRN.

    Config fields read from `opt`:
        ADDRESS.ALPHABET  -- alphabet file; its length gives nclass
        nh                -- hidden size
        targetH / targetW -- output size after MORN rectification
        BidirDecoder      -- whether to use a bidirectional decoder
        inputDataType     -- tensor type of the input
        maxBatch          -- maximum batch size
        CUDA              -- whether to run on CUDA
        IMAGE.IMG_CHANNEL -- number of image channels (nc)
    """
    from alphabet.alphabet import Alphabet
    self.nclass = len(Alphabet(opt.ADDRESS.ALPHABET))
    self.nh = opt.nh
    self.targetH = opt.targetH
    self.targetW = opt.targetW
    self.BidirDecoder = opt.BidirDecoder
    self.inputDataType = opt.inputDataType
    self.maxBatch = opt.maxBatch
    self.CUDA = opt.CUDA
    self.nc = opt.IMAGE.IMG_CHANNEL
    super(newMORAN, self).__init__()
    # MORN rectifies the input image; ASRN recognizes the rectified image.
    self.MORN = MORN(self.nc, self.targetH, self.targetW, self.inputDataType,
                     self.maxBatch, self.CUDA)
    self.ASRN = ASRN(self.targetH, self.nc, self.nclass, self.nh,
                     self.BidirDecoder, self.CUDA)
# Fix the random seeds so runs are reproducible.
# torch.manual_seed seeds the default CPU generator (torch.random.manual_seed
# is the same call), and cuda.manual_seed_all seeds every GPU — one call per
# device type is sufficient; the duplicated calls were removed.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.random_state)
torch.manual_seed(args.random_state)

# Build the dataset and the input/label alphabets.
dataset = DataIOSST2(config['data'])
# Both branches construct the same AlphabetEmbeddings; only how it is
# populated differs, so the construction is hoisted out of the if/else.
seq_alphabet = AlphabetEmbeddings(**config['embedding'])
if config['use_pre_embedding']:
    seq_alphabet.load_embeddings_from_file()
else:
    seq_alphabet.add_instance(dataset.train_word)

label_alphabet = Alphabet('label', False, False)
label_alphabet.add_instance(dataset.train_label)

# Either resume from a saved model or build a fresh one from the config.
if args.load is not None:
    model = torch.load(args.load)
else:
    model = ModelFactory.get_model(config, args, seq_alphabet, label_alphabet)

process = Process(config, args, dataset, model, seq_alphabet, label_alphabet)
process.train()