예제 #1
0
파일: cn_ocr.py 프로젝트: wibruce/cnocr
    def __init__(self, root=data_dir(), model_epoch=MODEL_EPOCE):
        """Prepare the model files, load the charset and create empty state.

        :param root: root directory; model files live in its ``models``
            sub-directory. The default is whatever `data_dir()` returned
            when this method was defined.
        :param model_epoch: model epoch to use; stored unchanged.
        """
        models_subdir = os.path.join(root, 'models')
        self._model_dir = models_subdir
        self._model_epoch = model_epoch
        # Runs only after the two attributes above are set, as in the
        # original ordering (presumably the helper reads them -- confirm).
        self._assert_and_prepare_model_files(root)

        label_path = os.path.join(self._model_dir, 'label_cn.txt')
        charset, _ = read_charset(label_path)
        self._alphabet = charset

        self._hp = Hyperparams()
        self._mods = {}
예제 #2
0
def process_baidu_innovation(
    baidu_index_fp='/Users/king/Documents/beiye-Ein/语料/text-detection/baidu-innovation/train.list',
    out_index_fp='train_baidu_innovation.tsv',
    image_folder='baidu-innovation/train_images',
):
    """Convert the Baidu-innovation index file into a tab-separated index.

    Each example returned by `read_index_file` is a `(file name, tokens)`
    pair. One line is written per example:
    ``<image_folder>/<file name>\\t<tokens joined by single spaces>``.

    :param baidu_index_fp: path of the source index file. Defaults to the
        location that used to be hard-coded here.
    :param out_index_fp: path of the TSV file to write (overwritten).
    :param image_folder: folder prefix put in front of every file name.
        It is joined with ``/`` regardless of platform.
    """
    # Only the token -> id mapping is needed; the vocabulary list is unused.
    _, letter2id = read_charset(VOCAB_FP)
    data = read_index_file(baidu_index_fp, letter2id)
    logger.info(f'{len(data)} legal examples are found')

    # Explicit UTF-8: the labels are presumably non-ASCII text, and the
    # platform default encoding is not guaranteed to be able to encode it.
    with open(out_index_fp, 'w', encoding='utf-8') as f:
        f.writelines(
            '\t'.join((image_folder + '/' + fname, ' '.join(gt))) + '\n'
            for fname, gt in data
        )
예제 #3
0
    def __init__(self,
                 model_name='conv-lite-fc',
                 model_epoch=None,
                 cand_alphabet=None,
                 root=data_dir(),
                 gpus=0):
        """
        Locate the model files, load the charset and build the module.

        :param model_name: name of the model; validated by `check_model_name`.
        :param model_epoch: training epoch of the model. A falsy value (such
            as `None`) selects the first entry of
            `AVAILABLE_MODELS[model_name]`.
        :param cand_alphabet: candidate set of characters to recognise.
            `None` (the default) leaves the character range unrestricted.
        :param root: root directory of the model files; the model directory
            is `<root>/<MODEL_VERSION>/<model_name>`. The default is the
            value `data_dir()` returned when this method was defined
            (`~/.cnocr` on Linux/Mac according to the original notes).
        :param gpus: number of GPUs. A positive value creates one GPU
            context for each index in `range(gpus)`; otherwise a single CPU
            context is used.
        """
        check_model_name(model_name)
        self._model_name = model_name
        self._model_file_prefix = '{}-{}'.format(
            self.MODEL_FILE_PREFIX, model_name
        )
        if model_epoch:
            self._model_epoch = model_epoch
        else:
            self._model_epoch = AVAILABLE_MODELS[model_name][0]

        versioned_root = os.path.join(root, MODEL_VERSION)
        self._model_dir = os.path.join(versioned_root, self._model_name)
        self._assert_and_prepare_model_files()
        label_path = os.path.join(self._model_dir, 'label_cn.txt')
        self._alphabet, char2idx = read_charset(label_path)

        self._cand_alph_idx = None
        if cand_alphabet is not None:
            # Index 0 is always kept next to the candidates' indices
            # (presumably a blank/placeholder label -- confirm).
            candidate_ids = [char2idx[ch] for ch in cand_alphabet]
            self._cand_alph_idx = sorted([0] + candidate_ids)

        self._hp = Hyperparams()
        self._hp._loss_type = None  # infer mode

        # DCMMC: gpu context for mxnet
        self.context = (
            [mx.context.gpu(dev_id) for dev_id in range(gpus)]
            if gpus > 0
            else [mx.context.cpu()]
        )
        self._mod = self._get_module()
예제 #4
0
    def __init__(
            self,
            model_name='densenet-lite-fc',
            model_epoch=None,
            cand_alphabet=None,
            root=data_dir(),
            context='cpu',
            name=None,
    ):
        """
        Locate the model files, load the charset and build the module.

        :param model_name: name of the model; validated by `check_model_name`.
        :param model_epoch: training epoch of the model. A falsy value (such
            as `None`) selects the first entry of
            `AVAILABLE_MODELS[model_name]`.
        :param cand_alphabet: candidate set of characters to recognise.
            `None` (the default) leaves the character range unrestricted.
            It is handed to `set_cand_alphabet`.
        :param root: root directory of the model files; the model directory
            is `<root>/<MODEL_VERSION>/<model_name>`. The default is the
            value `data_dir()` returned when this method was defined
            (`~/.cnocr` on Linux/Mac according to the original notes).
        :param context: 'cpu' or 'gpu': whether prediction runs on CPU or
            GPU. Defaults to CPU. Passed on to `_get_module`.
        :param name: name of the instance being initialised. When several
            instances are created at the same time, each needs a different
            name. An empty string is treated like `None`.
        """
        check_model_name(model_name)
        self._model_name = model_name
        self._model_file_prefix = '{}-{}'.format(
            self.MODEL_FILE_PREFIX, model_name
        )
        if model_epoch:
            self._model_epoch = model_epoch
        else:
            self._model_epoch = AVAILABLE_MODELS[model_name][0]

        versioned_root = os.path.join(root, MODEL_VERSION)
        self._model_dir = os.path.join(versioned_root, self._model_name)
        self._assert_and_prepare_model_files()
        label_path = os.path.join(self._model_dir, 'label_cn.txt')
        self._alphabet, self._inv_alph_dict = read_charset(label_path)

        # Start unrestricted, then let the setter apply any candidate set.
        self._cand_alph_idx = None
        self.set_cand_alphabet(cand_alphabet)

        self._hp = Hyperparams()
        self._hp._loss_type = None  # infer mode

        # An empty name is normalised to None as well.
        if name == '':
            name = None
        self._net_prefix = name

        self._mod = self._get_module(context)
예제 #5
0
def main():
    """Print the index and the charset entry for every index in `BAD_CHARS`."""
    label_path = os.path.join(data_dir(), 'models', 'label_cn.txt')
    # Only the index -> character sequence is needed here.
    charset, _ = read_charset(label_path)
    line_template = 'idx: {}, char: {}'
    for bad_idx in BAD_CHARS:
        print(line_template.format(bad_idx, charset[bad_idx]))
예제 #6
0
def _gen_iters(hp,
               train_fp_prefix,
               val_fp_prefix,
               use_train_image_aug,
               dataset_fp,
               charset_fp,
               debug=False):
    """Build the training and validation `Hdf5ImgIter` iterators.

    :param hp: hyper-parameter object; `img_height`, `img_width`,
        `batch_size` and `num_label` are read from it.
    :param train_fp_prefix: currently unused. It belonged to an earlier
        `GrayImageIter` setup that read `<prefix>.rec` / `<prefix>.idx`.
    :param val_fp_prefix: currently unused, for the same reason.
    :param use_train_image_aug: when truthy, the training iterator gets a
        list of augmenters; otherwise its `aug_list` is `None`.
    :param dataset_fp: dataset path handed to both iterators.
    :param charset_fp: charset file, read with `read_charset`.
    :param debug: forwarded to both iterators.
    :return: the tuple `(train_iter, val_iter)`.
    """
    height, width = hp.img_height, hp.img_width
    img_shape = (3, height, width)

    if use_train_image_aug:
        augs = mx.image.CreateAugmenter(
            data_shape=img_shape,
            resize=0,
            rand_crop=False,
            rand_resize=False,
            rand_mirror=False,
            mean=None,
            std=None,
            brightness=0.001,
            contrast=0.001,
            saturation=0.001,
            hue=0.05,
            pca_noise=0.1,
            inter_method=2,
        )
        augs.append(FgBgFlipAug(p=0.2))
    else:
        augs = None

    _, token2id = read_charset(charset_fp)
    # Every non-empty token has to be exactly one character long.
    assert all([len(tok) == 1 for tok in token2id.keys() if tok])

    # Arguments that are identical for the two iterators.
    shared_kwargs = dict(
        batch_size=hp.batch_size,
        data_shape=img_shape,
        dataset_fp=dataset_fp,
        label_width=hp.num_label,
        classes_dict=token2id,
        dtype='float32',
        debug=debug,
    )
    train_iter = Hdf5ImgIter(
        mode='train', shuffle=True, aug_list=augs, **shared_kwargs
    )
    val_iter = Hdf5ImgIter(mode='val', **shared_kwargs)

    return train_iter, val_iter
예제 #7
0
파일: cn_ocr.py 프로젝트: showme890/cnocr
    def __init__(
            self,
            model_name: str = 'densenet_lite_136-fc',
            *,
            cand_alphabet: Optional[Union[Collection, str]] = None,
            context: str = 'cpu',  # ['cpu', 'gpu', 'cuda']
            model_fp: Optional[str] = None,
            root: Union[str, Path] = data_dir(),
            **kwargs,
    ):
        """
        Initialise the recognition model.

        Args:
            model_name (str): name of the model. Defaults to
                `densenet_lite_136-fc`.
            cand_alphabet (Optional[Union[Collection, str]]): candidate set
                of characters to recognise. `None` (the default) leaves the
                character range unrestricted.
            context (str): 'cpu' or 'gpu': whether prediction runs on CPU or
                GPU. Defaults to `cpu`. The value 'gpu' is stored and used
                as 'cuda'.
            model_fp (Optional[str]): use this model file (a '.ckpt' file)
                instead of one of the bundled models.
            root (Union[str, Path]): root directory of the model files.
                On Linux/Mac the default is `~/.cnocr`, so the model folder
                looks like `~/.cnocr/2.1/densenet_lite_136-fc`.
                On Windows the default is
                `C:/Users/<username>/AppData/Roaming/cnocr`.
            **kwargs: currently unused; passing `name` only logs a
                deprecation warning.

        Examples:
            With the default arguments:
            >>> ocr = CnOcr()

            With an explicit model:
            >>> ocr = CnOcr(model_name='densenet_lite_136-fc')

            Recognising digits only:
            >>> ocr = CnOcr(model_name='densenet_lite_136-fc', cand_alphabet='0123456789')

        """
        name_was_passed = 'name' in kwargs
        if name_was_passed:
            deprecation_msg = (
                'param `name` is useless and deprecated since version %s'
                % MODEL_VERSION
            )
            logger.warning(deprecation_msg)

        check_model_name(model_name)
        check_context(context)
        # 'gpu' is accepted as an alias of 'cuda'.
        device = 'cuda' if context == 'gpu' else context
        self._model_name = model_name
        self.context = device

        file_prefix = '{}-{}'.format(self.MODEL_FILE_PREFIX, model_name)
        # Models missing from AVAILABLE_MODELS get no epoch suffix.
        known_epoch = AVAILABLE_MODELS.get(model_name, [None])[0]
        if known_epoch is not None:
            file_prefix = '%s-epoch=%03d' % (file_prefix, known_epoch)
        self._model_file_prefix = file_prefix

        self._assert_and_prepare_model_files(model_fp, root)
        self._vocab, self._letter2id = read_charset(VOCAB_FP)

        # Start unrestricted, then let the setter apply any candidate set.
        self._candidates = None
        self.set_cand_alphabet(cand_alphabet)

        self._model = self._get_model(device)