Ejemplo n.º 1
0
def storage():
    """Run every document recogniser over each directory below the data folder.

    The data folder is ``<directory of this file>\\data``. ``os.walk`` visits
    every directory beneath it; each directory gets a fresh identification
    code from ``randomidcode()``, shared by all recognisers for that directory.

    For a directory that contains files, all recognisers are invoked with the
    directory path. If the final step (``pga.start``) raises, the error is
    logged and the whole sequence is retried for the next file entry of the
    same directory; after the first pass that completes, the directory is
    finished. Directories without files are skipped.

    The recognisers are currently run unconditionally: the per-document-type
    file-name filters that used to select one recogniser per file are
    disabled.
    """
    # The backslash is written explicitly: the former '\data' depended on an
    # invalid escape sequence, which newer Python versions warn about.
    datapath = os.path.dirname(__file__) + '\\data'
    # NOTE(review): this is a raw string, so it contains two literal
    # backslashes after the drive letter - confirm that is intended.
    imgpath = r'D:\\IMG'

    gmp = GMP(imgpath)
    regisration = Regisration(imgpath)
    business_license = License(imgpath)
    certificate = ProductionCertificate(datapath, imgpath)
    pga = Improtdrug(imgpath)

    for file in os.walk(datapath):
        # One identification code per walked directory.
        id_code = randomidcode()
        for file_name in file[2]:
            gmp.gmp(file[0], id_code)
            business_license.license(file[0], id_code)
            regisration.regisration(file[0], id_code)
            certificate.recognize(file[0], id_code)
            # NOTE(review): `introduction` is not created in this function;
            # it must be provided by the enclosing module.
            introduction.run_introduction(file[0], id_code)
            try:
                pga.start(file[0], id_code, 'shuai', '')
            except Exception as e:
                logmgr = LogMgr()
                logmgr.error(file[0] + ":" + str(e))
                # Retry the full sequence on the next file entry.
                continue

            # One successful pass per directory is enough.
            break
Ejemplo n.º 2
0
 def __init__(self, typeid, app_id=APP_ID, api_key=API_KEY, secret_key=SECRET_KEY):
     """Create the OCR client and make sure the local data directory exists.

     Args:
         typeid: Stored unchanged on the instance as ``self.typeid``.
         app_id: First argument passed to ``AipOcr``.
         api_key: Second argument passed to ``AipOcr``.
         secret_key: Third argument passed to ``AipOcr``.
     """
     self.client = AipOcr(app_id, api_key, secret_key)
     self.typeid = typeid
     self.codepath = os.path.dirname(__file__)
     # The backslash is written explicitly: the former '\data' depended on an
     # invalid escape sequence, which newer Python versions warn about.
     self.datapath = self.codepath + '\\data'
     os.makedirs(self.datapath, exist_ok=True)
     self.log = LogMgr()
Ejemplo n.º 3
0
def json2word(wordlist, savepath, savename):
    """Write every entry of *wordlist* on its own line to ``<savepath>/<savename>.doc``.

    The output is plain UTF-8 text; only the file extension is ``.doc``.
    Missing parent directories are created. Errors raised while writing are
    printed and logged through ``LogMgr`` instead of being propagated.

    Args:
        wordlist: Iterable of strings, one per output line.
        savepath: Target directory, e.g. ``'./word'``. An empty string means
            the current working directory.
        savename: File name without extension, e.g. ``'test1'``.
    """
    emb_filename = os.path.join(savepath, savename + '.doc')
    target_dir = os.path.dirname(emb_filename)
    # An empty directory part means "current directory"; os.makedirs('')
    # would raise, so only create the directory when there is one to create.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(emb_filename, "w", encoding='utf-8') as f:
            for i in wordlist:
                f.write(i + "\n")
    except Exception as e:
        print(e)
        log_mgr = LogMgr()
        # Include the failure reason so the log entry is actionable.
        log_mgr.error('[mylog]This is error log: ' + str(e))


# savepath = './word'
# savename = 'test1'
# wordlist = ["淋日期有合","【有效期】24个月","请仔细阅读说明书井在医师指导下使用"]
# json2word(wordlist,savepath,savename)
Ejemplo n.º 4
0
class JobTable(object):
    '''
    Job table: holds one job record at a time and writes it to the database.
    '''
    # Class attributes: both objects are created once, when the class body
    # is executed, and are shared by every JobTable instance.
    db = cxOracle()
    logmgr = LogMgr()

    def __init__(self):
        # Current job record (column name -> value).
        self.jobdict = dict()
        # Table used by update_item() and job_todb().
        self.dbtable = 'OCRWORKFILE'
        # Passed through to db.getsavesql(); its meaning is defined there.
        self.dbflag = 2

    def job_add(self, jobtmp):
        '''
        Make *jobtmp* the current job record.

        The dict is stored by reference, not copied.
        '''
        self.jobdict = jobtmp

    def job_del(self):
        '''
        Empty the current job record in place.

        Because job_add() keeps a reference, this also empties the dict
        object that the caller passed to job_add().
        '''
        if self.jobdict:
            self.jobdict.clear()

    def update_item(self, find_key, find_value, update_key, update_value):
        '''
        Forward an update request for the job table to the database layer.

        The arguments are passed unchanged to ``db.update``.
        NOTE(review): the original description of the parameters was lost to
        an encoding error; the meanings below are inferred from the names.
        @find_key       ----presumably the column used to locate the row
        @find_value     ----presumably the value that column must have
        @update_key     ----presumably the column to modify
        @update_value   ----presumably the new value for that column
        '''
        # Use the configured table name, as job_todb() does, instead of
        # repeating the literal.
        self.db.update(self.dbtable, find_key, find_value, update_key,
                       update_value)

    def job_todb(self):
        '''
        Insert the current job record into the job table.

        Any exception raised by the database layer is logged and swallowed.
        '''
        try:
            jobsql, jobparam = self.db.getsavesql(self.dbtable, self.jobdict,
                                                  self.dbflag)
            self.db.insert(jobsql, jobparam)
        except Exception as e:
            self.logmgr.error(str(e))
Ejemplo n.º 5
0
class License(Tools):
    """
    Recognise business licences from OCR results.
    """
    def __init__(self, imgpath):
        """Initialise the Tools base and remember the image root directory."""
        Tools.__init__(self)
        self.imgpath = imgpath
        self.logmgr = LogMgr()

    def _recognize(self, datas, nums):
        """
        Main extraction logic: turn OCR text fragments into a field dict.

        datas is a sequence of dicts that each carry the fragment text under
        'words'; at most nums fragments are examined. Returns a dict mapping
        field names to the extracted text.
        """
        keylist = []
        datadict = dict()
        # Bound up front so the continuation branch never meets an unbound
        # name, whatever the order of fragments.
        flag = 0

        def following_text(position):
            # Text of the next fragment, or '' when there is none. Without
            # this guard a label split across the final fragment raised
            # IndexError.
            if position + 1 < len(datas):
                return datas[position + 1]['words']
            return ''

        for (word, i) in zip(datas, range(0, nums)):
            # For every fragment, ask _judge_keywords whether it starts with
            # a known keyword. If it does, store the value in datadict.
            # If it does not, list_result is None and there are two cases:
            #   1. the fragment is not something we need;
            #   2. the fragment continues the value of the previous keyword.
            # The else branch decides between them: walking backwards, it
            # checks whether the keyword text recorded in keylist occurs in
            # an earlier fragment and, if so, appends the fragment.
            list_result = self._judge_keywords(word['words'])

            # Two-character labels that the OCR split over two fragments:
            # the first character stands alone and the second one starts the
            # next fragment, followed by the value.
            if '名' == word['words'] and following_text(i).startswith('称'):
                datadict['ENT_NAME'] = datas[i + 1]['words'][1:]
                continue
            elif '类' == word['words'] and following_text(i).startswith('型'):
                datadict['ENT_TYPE'] = datas[i + 1]['words'][1:]
                continue
            elif '住' == word['words'] and following_text(i).startswith('所'):
                datadict['住所'] = datas[i + 1]['words'][1:]
                continue

            if list_result is not None:
                if list_result[
                        0] in datadict and keylist[-1][0] != list_result[0]:
                    datadict[list_result[0]] += list_result[1]
                    flag = 1
                else:
                    datadict[list_result[0]] = list_result[1]
                    flag = 1
                # Remember the stored field name together with the keyword
                # text as it actually appeared in the fragment.
                keylist.append([list_result[0], list_result[2]])
            else:
                j = i
                # NOTE(review): the loop stops before index 0, so the first
                # fragment is never inspected here.
                while j > 0:
                    if not keylist:
                        break
                    if keylist[-1][0] == '统一社会信用代码':
                        # A fragment containing CJK characters is not
                        # appended to the credit-code field.
                        if re.search(r'[\u4e00-\u9fa5]+', word['words']):
                            break
                    if flag:
                        if keylist[-1][1] in datas[j]['words']:
                            datadict[keylist[-1][0]] += word['words']
                            break
                    j -= 1
        return datadict

    def _judge_keywords(self, strword):
        '''
        Check whether a fragment starts with a known keyword.

        Only the beginning of the fragment is inspected (8 characters for
        fragments of 10+ characters, 6 for fragments of 8-9 characters, the
        whole fragment otherwise). Returns None when nothing matches,
        otherwise a list:
        $resultlist[0] -----field name to store
        $resultlist[1] -----text following the keyword
        $resultlist[2] -----keyword text as it appeared in the fragment
        e.g. '成立日期2010年' gives ['成立日期', '2010年', '成立日期'].
        '''
        # Checked in this order; the first rule that hits wins. The patterns
        # tolerate characters dropped or repeated by the OCR.
        rules = (
            ('统一社会信用代码', re.compile(r"统*一社*会信用代码|统一*社会*信用*代码")),
            ('法定代表人', re.compile(r"法定*代表*人|法*定代表人*")),
            ('注册资本', re.compile(r'注册*资本*|注*册资*本')),
            ('成立日期', re.compile(r'成立*日期*|成*立日*期')),
            ('营业期限', re.compile(r'营业*期限*|营*业期*限')),
            ('经营范围', re.compile(r"经营*范围*|经*营范*围")),
            ('登记机关', re.compile(r"登记*机关*|登*记机*关")),
        )

        if len(strword) >= 10:
            index = 8
        elif len(strword) >= 8:
            index = 6
        else:
            index = len(strword)

        for label, pattern in rules:
            # The keyword must occur in the leading part of the fragment ...
            if pattern.search(strword[:index]):
                # ... but the value is cut from the match in the full text.
                found = pattern.search(strword)
                return [label, strword[found.span()[1]:], found.group()]
        return None

    def license_deploy(self, imgs, id_code):
        """
        Extract licence fields from already-loaded OCR results.

        imgs is an iterable of dicts with the keys 'imgpath' and 'imgjson'.
        Images whose OCR result contains 'error_code' are logged and skipped;
        the fields are extracted from the last remaining image.

        Returns the field dict extended with ID_CODE, REMARK, ADD_USER and
        JOB_ID and without the '登记机关' entry, or an empty dict when no
        image yielded any text (this case used to raise UnboundLocalError).
        """
        datas = []
        nums = 0
        img = None
        for img in imgs:
            if 'error_code' in img['imgjson']:
                self.logmgr.error(img['imgpath'] + " : Img Size Error!")
                continue

            datas = img['imgjson']['words_result']
            nums = img['imgjson']['words_result_num']

        datadict = {}
        if len(datas) > 0 and nums > 0:
            datadict = self._recognize(datas, nums)
            # Bookkeeping fields added to every result.
            datadict['ID_CODE'] = id_code
            datadict['REMARK'] = ''
            datadict['ADD_USER'] = '******'
            # NOTE(review): the id is derived from the last image iterated,
            # which may be one that was skipped because of an error.
            datadict['JOB_ID'] = self._generatemd5(img['imgpath'])
            if '登记机关' in datadict:
                del datadict['登记机关']
        return datadict

    def license(self, path, id_code):
        """
        Process every business-licence OCR file below path.

        For each file whose name contains '营业执照' a job record is built
        and written through self.job. After each directory, the fields of
        the most recently loaded OCR result are extracted and stored in the
        'BUSINESSLICENCE' table; on failure the job row is flagged with
        IS_TO_DB = 'F'.
        """
        flag = 0
        temp = ''
        jobdict = {}
        for file in os.walk(path):
            page = 1
            for file_name in file[2]:
                if '营业执照' in file_name:
                    imgname = file_name.split('.')[0]
                    curpath = file[0].split('data')[1]
                    index = imgname.rfind('_')
                    folder_id = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: the first run of CJK characters in the
                    # folder name, or else in the file name.
                    if re.search(r'[\u4e00-\u9fa5]+', folder_id):
                        dragname = re.search(r'[\u4e00-\u9fa5]+',
                                             folder_id).group()
                    else:
                        dragname = re.search(r'[\u4e00-\u9fa5]+',
                                             file_name).group()
                    if dragname.find('(') > 0:
                        dragname = dragname[:dragname.find('(')]
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    original_path = (self.imgpath + '\\' + curpath + '\\' +
                                     imgname[:index - 2] + '.' + 'pdf')

                    # server
                    jobdict['SER_IP'] = '10.67.28.8'
                    # job id
                    jobdict['JOB_ID'] = self._generatemd5(file[0] + imgname)
                    jobid = jobdict['JOB_ID']
                    jobdict['SRC_FILE_NAME'] = imgname[:index - 2] + '.' + 'pdf'
                    jobdict['SRC_FILE_PATH'] = original_path
                    # original file
                    jobdict['CUT_FILE_NAME'] = (
                        imgname[:index] + '.' + imgname[index:].split('_')[1])
                    # original path
                    jobdict['CUT_FILE_PATH'] = 'G:\\IMG' + '\\' + curpath
                    # time
                    jobdict['HANDLE_TIME'] = time.strftime(
                        "%Y-%m-%d %X", time.localtime())
                    # drug name
                    jobdict['DRUG_NAME'] = dragname
                    # image document type
                    jobdict['FILE_TYPE'] = '营业执照'
                    # identification code shared by one set of images
                    jobdict['ID_CODE'] = id_code
                    # branch company
                    jobdict['SRC_CO'] = curpath.split('\\')[1]
                    # relative path of the source file
                    jobdict['FILE_REL_PATH'] = (
                        '\\' + imgname[:index] + '.' +
                        imgname[index:].split('_')[1])
                    # file server domain name
                    jobdict['SYS_URL'] = '10.67.28.8'
                    # page number
                    jobdict['PAGE_NUM'] = page
                    # OCR parsing state of the file (fk sysparams)
                    jobdict['OCR_STATE'] = 'T'
                    # remarks
                    jobdict['REMARK'] = ''
                    # creating user
                    jobdict['ADD_USER'] = '******'
                    # When the image is too large, or recognition failed for
                    # another reason, the result carries an error_code field.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          ": img size error!")
                        continue
                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    flag = 1

                    # intermediate file
                    jobdict['MID_FILE_NAME'] = file_name
                    # intermediate file path
                    jobdict['MID_FILE_PATH'] = file[0]
                    # score
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))

                    # whether the image content is stored in the database
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'

                    # text content of the file
                    jobdict['FILE_TEXT'] = self._middict(
                        datas, self.codepath + '\\middata\\' + curpath,
                        imgname)
                    # Kept for the JOB_ID of the extracted-field record.
                    temp = jobdict['FILE_TEXT']

                    page += 1
                    self.job.job_add(jobdict)
                    self.job.job_todb()
                    self.job.job_del()
            # NOTE(review): flag, datas, nums, temp and jobid are not reset
            # per directory; presumably _cleandata empties them after use -
            # verify against Tools.
            if flag:
                if len(datas) > 0 and nums > 0:
                    datadict = self._recognize(datas, nums)
                    # Bookkeeping fields added to every result.
                    datadict['ID_CODE'] = id_code
                    datadict['REMARK'] = ''
                    datadict['ADD_USER'] = '******'
                    datadict['JOB_ID'] = self._generatemd5(temp)
                    print(datadict)
                    # NOTE(review): cannot be true here, the bookkeeping
                    # fields were just added.
                    if not datadict:
                        nums = self._cleandata(datadict, datas, nums)
                        continue
                    if '登记机关' in datadict:
                        del datadict['登记机关']
                    try:
                        self._data_to_db('BUSINESSLICENCE', datadict)
                        nums = self._cleandata(datadict, datas, nums)
                    except Exception as e:
                        print('Error: ', e)
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          "insert error!! : " + str(e))
                        # Mark the job row as not stored.
                        self._update_item('OCRWORKFILE', 'JOB_ID', jobid,
                                          'IS_TO_DB', 'F')
                        nums = self._cleandata(datadict, datas, nums)
                        continue
Ejemplo n.º 6
0
 def __init__(self, imgpath):
     """Initialise the Tools base, then attach a logger and the image root.

     Args:
         imgpath: Stored unchanged as ``self.imgpath``.
     """
     Tools.__init__(self)
     self.logmgr = LogMgr()
     self.imgpath = imgpath
Ejemplo n.º 7
0
class Regisration(Tools):
    """
    Recognise drug re-registration approval documents from OCR results.
    """
    def __init__(self, imgpath):
        """Initialise the Tools base and remember the image root directory."""
        Tools.__init__(self)
        self.imgpath = imgpath
        self.logmgr = LogMgr()

    def _recognize(self, datas, nums):
        """
        Main extraction logic: turn OCR text fragments into a field dict.

        datas is a sequence of dicts that each carry the fragment text under
        'words'; at most nums fragments are examined. Returns a dict mapping
        field names to the extracted text.
        """
        keylist = []
        datadict = dict()
        # Bound up front so the continuation branch never meets an unbound
        # name, whatever the order of fragments.
        flag = 0

        for (word, i) in zip(datas, range(0, nums)):
            # For every fragment, ask _judge_keywords whether it starts with
            # a known keyword. If it does, store the value in datadict.
            # If it does not, list_result is None and there are two cases:
            #   1. the fragment is not something we need;
            #   2. the fragment continues the value of the previous keyword.
            # The else branch decides between them: walking backwards, it
            # checks whether the keyword text recorded in keylist occurs in
            # an earlier fragment and, if so, appends the fragment.
            list_result = self._judge_keywords(word['words'])
            if list_result is not None:
                if list_result[
                        0] in datadict and keylist[-1][0] != list_result[0]:
                    datadict[list_result[0]] += list_result[1]
                    flag = 1
                else:
                    datadict[list_result[0]] = list_result[1]
                    flag = 1
                # Remember the stored field name together with the keyword
                # text as it actually appeared in the fragment.
                keylist.append([list_result[0], list_result[2]])
            else:
                j = i
                # NOTE(review): the loop stops before index 0, so the first
                # fragment is never inspected here.
                while j > 0:
                    if not keylist:
                        break
                    # NOTE(review): _judge_keywords in this class never
                    # returns the field name '批准文号', so this guard does
                    # not fire at present.
                    if keylist[-1][0] == '批准文号':
                        # Character class corrected from [a-zA-z], which
                        # also accepted the punctuation between 'Z' and 'a'.
                        if re.search(r'.?[a-zA-Z][0-9]+', word['words']):
                            break
                    if keylist[-1][0] == '规格':
                        # Only fragments containing 'g' or 'l' are appended
                        # to the specification field.
                        if not re.search(r'.*m*g|.*m*l', word['words']):
                            break
                    if flag:
                        if keylist[-1][1] in datas[j]['words']:
                            datadict[keylist[-1][0]] += word['words']
                            break
                    j -= 1
        return datadict

    def _judge_keywords(self, strword):
        '''
        Check whether a fragment starts with a known keyword.

        Returns None when nothing matches, otherwise a list:
        $resultlist[0] -----field name to store
        $resultlist[1] -----text following the keyword
        $resultlist[2] -----keyword text as it appeared in the fragment

        Two rule sets exist. When an ASCII ':' follows at least one character
        within the inspected prefix, only four fields are tried and, except
        for the drug name, one extra character after the keyword is skipped.
        Otherwise the full rule list is tried and nothing is skipped.
        '''
        # The patterns tolerate characters dropped or repeated by the OCR.
        # TODO: the manufacturer field found on some approval documents is
        # not extracted yet.
        re_coname = re.compile(r"名称")
        re_common_name = re.compile(r'药品*通?用名称?|药*品通?用名?称')
        re_pinyin = re.compile(r'汉语?拼音?|汉?语拼?音')
        re_coaddr = re.compile(r"生产*地址*|生*产地*址")

        if len(strword) >= 8:
            index = 6
        else:
            index = len(strword)

        def probe(label, pattern, limit, skip):
            # limit None means "ask Tools._short_index", evaluated only when
            # the rule is actually reached.
            if limit is None:
                limit = self._short_index(strword)
            # The keyword must occur within the first `limit` characters ...
            if not pattern.search(strword[:limit]):
                return None
            # ... but the value is cut from the match in the full text.
            found = pattern.search(strword)
            return [label, strword[found.span()[1] + skip:], found.group()]

        # Each rule: (field name, pattern, prefix length, characters skipped
        # after the keyword). Rules are tried in order; the first hit wins.
        if (re.match(r'.+?(?:\:)', strword[:index])):
            rules = (
                ('药品名称', re_common_name, 8, 0),
                ('汉语拼音', re_pinyin, index, 1),
                ('名称', re_coname, 4, 1),
                ('生产地址', re_coaddr, index, 1),
            )
        else:
            # NOTE(review): the approval-number and registration-number
            # rules precede their "validity period" variants, and the
            # registration validity rule stores under '批准文号有效期' -
            # confirm both are intended.
            rules = (
                ('药品名称', re_common_name, 8, 0),
                ('汉语拼音', re_pinyin, index, 0),
                ('名称', re_coname, 4, 0),
                ('生产地址', re_coaddr, index, 0),
                ('审批结论', re.compile(r"审批*结论*|审*批结*论"), index, 0),
                ('再注册证批准文号',
                 re.compile(r"药品*批准文*号|药*品批*准文号"), index, 0),
                ('药品批准文号有效期',
                 re.compile(r"药*品批准文号有*效期|药品*批准文号*有效*期"), index, 0),
                ('注册证号', re.compile(r"注册*证号*|注*册证*号"), index, 0),
                ('批准文号有效期',
                 re.compile(r"注册*证号有效期*|注*册证号有效*期"), 8, 0),
                ('主送', re.compile(r"主送"), index, 0),
                ('规格', re.compile(r'规格'), None, 0),
                ('剂型', re.compile(r'剂型'), None, 0),
                ('药品分类', re.compile(r"药品*分类*|药*品分*类"), index, 0),
            )

        for rule in rules:
            hit = probe(*rule)
            if hit is not None:
                return hit
        return None

    def _strip_colon_fields(self, recognized):
        """
        Copy the output fields from recognized, dropping a leading colon.

        Only the five fields listed below are copied, in this order; a value
        matching the colon pattern at its start loses its first character.
        """
        # NOTE(review): the character class lists ':' twice; possibly one of
        # them was meant to be the full-width colon - confirm.
        cleaned = dict()
        for field in ('药品名称', '剂型', '规格', '生产厂家', '日期'):
            if field in recognized:
                value = recognized[field]
                if re.match('[::]', value):
                    cleaned[field] = value[1:]
                else:
                    cleaned[field] = value
        return cleaned

    def regisration_deploy(self, imgs, id_code):
        """
        Extract approval fields from already-loaded OCR results.

        imgs is an iterable of dicts with the keys 'imgpath' and 'imgjson'.
        Images whose OCR result contains 'error_code' are logged and skipped;
        the fields are extracted from the last remaining image. id_code is
        accepted for interface compatibility and not used.

        Returns the field dict, the string 'None' when no output field was
        found, or None when no image yielded any text (the case without any
        usable image used to raise UnboundLocalError).
        """
        datas = []
        nums = 0
        for img in imgs:
            if 'error_code' in img['imgjson']:
                self.logmgr.error(img['imgpath'] + " : Img Size Error!")
                continue

            datas = img['imgjson']['words_result']
            nums = img['imgjson']['words_result_num']

        if len(datas) > 0 and nums > 0:
            datadict = self._strip_colon_fields(self._recognize(datas, nums))
            if not datadict:
                return 'None'

            return datadict
        return None

    def regisration(self, path, id_code):
        """
        Process every re-registration approval OCR file below path.

        For each file whose name contains '药品再注册批件' a job record is
        built and written through self.job. After each directory, the fields
        of the most recently loaded OCR result are extracted and stored in
        the 'DRUGREGAPPROVAL' table; on failure the job row is flagged with
        IS_TO_DB = 'F'.
        """
        flag = 0
        temp = ''
        for file in os.walk(path):
            page = 1
            jobdict = {}
            for file_name in file[2]:
                if '药品再注册批件' in file_name:
                    imgname = file_name.split('.')[0]
                    curpath = file[0].split('data')[1]
                    index = imgname.rfind('_')
                    folder_id = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: the first run of CJK characters in the
                    # folder name, or else in the file name.
                    if re.search(r'[\u4e00-\u9fa5]+', folder_id):
                        dragname = re.search(r'[\u4e00-\u9fa5]+',
                                             folder_id).group()
                    else:
                        dragname = re.search(r'[\u4e00-\u9fa5]+',
                                             file_name).group()
                    if dragname.find('(') > 0:
                        dragname = dragname[:dragname.find('(')]
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    original_path = (self.imgpath + '\\' + curpath + '\\' +
                                     imgname[:index - 2] + '.' + 'pdf')
                    # server
                    jobdict['SER_IP'] = '10.67.28.8'
                    # job id
                    jobdict['JOB_ID'] = self._generatemd5(file[0] + imgname)
                    jobid = jobdict['JOB_ID']
                    jobdict['SRC_FILE_NAME'] = imgname[:index - 2] + '.' + 'pdf'
                    jobdict['SRC_FILE_PATH'] = original_path
                    # original file
                    jobdict['CUT_FILE_NAME'] = (
                        imgname[:index] + '.' + imgname[index:].split('_')[1])
                    # original path
                    jobdict['CUT_FILE_PATH'] = 'G:\\IMG' + '\\' + curpath
                    # time
                    jobdict['HANDLE_TIME'] = time.strftime(
                        "%Y-%m-%d %X", time.localtime())
                    # drug name
                    jobdict['DRUG_NAME'] = dragname
                    # image document type
                    jobdict['FILE_TYPE'] = '药品再注册批件'
                    # identification code shared by one set of images
                    jobdict['ID_CODE'] = id_code
                    # branch company
                    jobdict['SRC_CO'] = curpath.split('\\')[1]
                    # relative path of the source file
                    jobdict['FILE_REL_PATH'] = (
                        '\\' + imgname[:index] + '.' +
                        imgname[index:].split('_')[1])
                    # file server domain name
                    jobdict['SYS_URL'] = '10.67.28.8'
                    # page number
                    jobdict['PAGE_NUM'] = page
                    # OCR parsing state of the file (fk sysparams)
                    jobdict['OCR_STATE'] = 'T'
                    # remarks
                    jobdict['REMARK'] = ''
                    # creating user
                    jobdict['ADD_USER'] = '******'
                    # When the image is too large, or recognition failed for
                    # another reason, the result carries an error_code field.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          ": img size error!")
                        continue
                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    flag = 1

                    # intermediate file
                    jobdict['MID_FILE_NAME'] = file_name
                    # intermediate file path
                    jobdict['MID_FILE_PATH'] = file[0]
                    # score
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))

                    # whether the image content is stored in the database
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'

                    # text content of the file
                    jobdict['FILE_TEXT'] = self._middict(
                        datas, self.codepath + '\\middata\\' + curpath,
                        imgname)
                    # Kept for the JOB_ID of the extracted-field record.
                    temp = jobdict['FILE_TEXT']

                    page += 1
                    self.job.job_add(jobdict)
                    self.job.job_todb()
                    self.job.job_del()
            # NOTE(review): flag, datas, nums, temp and jobid are not reset
            # per directory; presumably _cleandata empties them after use -
            # verify against Tools.
            if flag:
                if len(datas) > 0 and nums > 0:
                    datadict = self._strip_colon_fields(
                        self._recognize(datas, nums))

                    # Bookkeeping fields added to every result.
                    datadict['ID_CODE'] = id_code
                    datadict['REMARK'] = ''
                    datadict['ADD_USER'] = '******'
                    datadict['JOB_ID'] = self._generatemd5(temp)
                    print(datadict)
                    # NOTE(review): cannot be true here, the bookkeeping
                    # fields were just added.
                    if not datadict:
                        nums = self._cleandata(datadict, datas, nums)
                        continue
                    try:
                        self._data_to_db('DRUGREGAPPROVAL', datadict)
                        nums = self._cleandata(datadict, datas, nums)
                    except Exception as e:
                        print('Error: ', e)
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          "insert error!! : " + str(e))
                        # Mark the job row as not stored.
                        self._update_item('OCRWORKFILE', 'JOB_ID', jobid,
                                          'IS_TO_DB', 'F')
                        nums = self._cleandata(datadict, datas, nums)
                        continue
Ejemplo n.º 8
0
import json
from DatabaseToolsNew import cxOracle
import re
from FindKeyword import findImportWords
import HowManyColumn4 as hmc
#import openpyxl
import xlwings as xw
import time
import hashlib
from log import LogMgr
from job import JobTable
import random
from json2word import json2word
from tool import Tools

# Module-level LogMgr instance.
logmgr = LogMgr()

# The string literal below is disabled legacy code kept for reference. Its
# first line says openpyxl was too slow and that xlwings is used instead; the
# remaining lines are the old openpyxl workbook/column loading.
'''
使用openpyxl太慢了,改用xlwings
wb = openpyxl.load_workbook('C:\\Users\\DevinChang\\Desktop\\四家分公司影印件清单_去重匹配版.xlsx')
sheets = wb.sheetnames
sheet = wb.get_sheet_by_name(sheets[0])
shopid = sheet['B']
name = sheet['C']
strength = sheet['D']
mfrs = sheet['F']
'''



Ejemplo n.º 9
0
class ProductionCertificate(Tools):
    def __init__(self, jsonpath, imgpath):
        """Initialise the Tools base and remember the two input locations.

        Args:
            jsonpath: stored unchanged as ``self.jsonpath``.
            imgpath: stored unchanged as ``self.imgpath``; ``recognize``
                prefixes it to build image and PDF paths.
        """
        Tools.__init__(self)
        self.jsonpath, self.imgpath = jsonpath, imgpath
        # Logger used by the recognition methods to report per-file errors.
        self.logmgr = LogMgr()
        

    def generatemd5(strid):
        md5 = hashlib.md5()
        md5.update(strid.encode('utf-8'))
        return md5.hexdigest()

    def subfiledata(self,direction, parameter, boundary, datas):
        leftdata = []
        rightdata = []
        for data in datas:
            if direction == 1 or direction == 2:
                if data['location'][parameter] >= boundary:
                    # 此处有bug
                    leftdata.append(data)
                else:
                    rightdata.append(data)
            else:
                if data['location'][parameter] <= boundary:
                    leftdata.append(data)
                else:
                    rightdata.append(data)
        return leftdata+(rightdata)

    def _productionCertificate(self, datas, nums):
        """
        识别生产许可证
        """
        keylist = []
        datadict = {}#这里做了一点小改动!!!!!!!!!!!!!!!!!!!1
        i = 0
        flag = 0
        for (word, i) in zip(datas, range(0, nums)):
            list_result = self._judge_keywords(word['words'])
            if list_result != None:
                if list_result[0] in datadict and keylist[-1][0] != list_result[0]:
                    datadict[list_result[0]] += list_result[1]
                    flag = 1
                else:
                    datadict[list_result[0]] = list_result[1]
                    flag = 1
                keylist.append([list_result[0], list_result[2]])
            else:
                flag = 1
                j = i
                while j >= 0:
                    if not keylist:
                        break
                    if ("分类码" in keylist[-1][0]):
                        if re.match(r'[a-zA-z]+', word['words']):
                            flag = 1
                        else:
                            break
                    elif "有效期至" in keylist[-1][0]:
                        if re.match(r'[0-9]+年?[0-9]+月?[0-9]+日?', word['words']):
                            flag = 1
                        else:
                            break

                    # # 字段追加问题
                    # if re.match(r'.?[::]', word['words'][:10]) and not re.match(r'质*量受*权人*',word['words']):
                    #     if  not re.match(r'质*量受*权人*', word['words']):
                    #         flag = 0
                    #         break
                    if flag:
                        if keylist[-1][1] in datas[j]['words']:
                            datadict[keylist[-1][0]] += word['words']
                            break
                    j -= 1
                # datadict[list_result[0]] = list_result[1]
                # flag = 1

        return datadict


    def _judge_keywords(self, strword):
        '''判断关键字'''
        # re_coname = re.compile(r"企业*名称*|企*业名*称")
        # re_cernum = re.compile(r"证书*编号*|证*书编*号")
        # re_addr = re.compile(r"地址")
        # re_cerscope = re.compile(r"认证*范围*|认*证范*围")
        # re_valid = re.compile(r"有效期至*|有效*期至")
        # re_liceauth = re.compile(r"发证*机关*|发*证机*关")
        # re_licedate = re.compile(r"发证*日期*|发*证日*期")

        re_entname = re.compile(r"企业*名称*|企*业名*称")
        re_regAddr = re.compile(r"注册*地址|注册地*址")
        re_uscc = re.compile(r"社会*信用社*代*码|社*会信用*社*代码*")
        re_legalReps = re.compile(r"法定*代表*人|法*定代表人*")
        re_entPrincipal = re.compile(r"企.负责人|.业负*责人|企.负.人")
        re_qcPrincipal = re.compile(r"质*量负责*人|质*量负责人*|.量负.人*")
        re_vld = re.compile(r"有*效期*至|有效*期至")
        re_supervisionDEP = re.compile(r"日常*监管*机构*|日*常监*管机*构")
        re_supervisor = re.compile(r"日常*监管*人员*|日*常监*管人*员")
        re_supervisorCT = re.compile(r"监督*举报*电话*|监*督举*报电*话")
        re_licNO = re.compile(r"编号|編号|号:|号:|号")
        re_licNO2 = re.compile(r"号")
        re_cateCode = re.compile(r"分*类码")
        re_prodAddrScope = re.compile(r"生*产地*址和生产*范*围|生*产*地址和*生*产范*围|.产.址和.产.围|生产地址.生产范.")
        re_issueOrg = re.compile(r"发证机.|发证.关")
        re_issuer = re.compile(r"签发*人")
        re_issueDate = re.compile(r"发证*日*期")
        re_kindsOfEnterprise = re.compile(r"企业*类型*")
        re_useLimit = re.compile(r"此*复印件*仅*限用*于*")
        re_qcLegal = re.compile(r"质*量受*权人*")
        re_NO = re.compile(r"NO|N0")
        re_authorizedDEPT = re.compile(r"国*家*食品*药品*监督*管*理局制*|.家*食.药.监督*.理局.")
        re_country = re.compile(r"中华人民共和国")
        re_kindsOfDocument = re.compile(r"药品生产许可证")






        #这里将提取关键字段的长度延长到了12个,尽可能的将由于印章等造成的干扰降低
        if len(strword) >= 4:
            index = 6
        else:
            index = len(strword)

        if (re.match(r'.+?(?:\:)', strword[:index])):
            if re_entname.search(strword[:index]):
                return ['企业名称_许可证', strword[re_entname.search(strword).span()[1]+1:], re_entname.search(strword).group()]
            elif re_regAddr.search(strword[:index]):
                return ['注册地址', strword[re_regAddr.search(strword).span()[1] + 1:], re_regAddr.search(strword).group()]
            elif re_uscc.search(strword[:9]):
                return ['社会信用社代码', strword[re_uscc.search(strword).span()[1] + 1:], re_uscc.search(strword).group()]
            elif re_legalReps.search(strword[:7]):
                return ['法定代表人', strword[re_legalReps.search(strword).span()[1] + 1:], re_legalReps.search(strword).group()]
            elif re_entPrincipal.search(strword[:7]):
                return ['企业负责人', strword[re_entPrincipal.search(strword).span()[1] + 1:], re_entPrincipal.search(strword).group()]
            elif re_qcPrincipal.search(strword[:7]):
                return ['质量负责人', strword[re_qcPrincipal.search(strword).span()[1] + 1:], re_qcPrincipal.search(strword).group()]
            elif re_vld.search(strword[:index]):
                return ['有效期至', strword[re_vld.search(strword).span()[1] + 1:], re_vld.search(strword).group()]
            elif re_supervisionDEP.search(strword[:8]):
                return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()]
            elif re_supervisionDEP.search(strword[:8]):
                return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()]
            elif re_supervisor.search(strword[:8]):
                return ['日常监管人员', strword[re_supervisor.search(strword).span()[1] + 1:], re_supervisor.search(strword).group()]
            elif re_supervisorCT.search(strword[:8]):
                return ['监督举报电话', strword[re_supervisorCT.search(strword).span()[1] + 1:], re_supervisorCT.search(strword).group()]
            elif re_licNO.search(strword[:3]):
                return ['许可证编号', strword[re_licNO.search(strword).span()[1] + 1:], re_licNO.search(strword).group()]
            elif re_licNO2.search(strword[:1]):
                return ['许可证编号', strword[re_licNO2.search(strword).span()[1] + 1:], re_licNO2.search(strword).group()]
            elif re_cateCode.search(strword[:5]):
                return ['分类码', strword[re_cateCode.search(strword).span()[1] + 1:], re_cateCode.search(strword).group()]
            elif re_prodAddrScope.search(strword[:11]):
                return ['生产地址和生产范围', strword[re_prodAddrScope.search(strword).span()[1] + 1:], re_prodAddrScope.search(strword).group()]
            elif re_issueOrg.search(strword[:index]):
                return ['发证机关', strword[re_issueOrg.search(strword).span()[1] + 1:], re_issueOrg.search(strword).group()]
            elif re_issuer.search(strword[:5]):
                return ['签发人', strword[re_issuer.search(strword).span()[1] + 1:], re_issuer.search(strword).group()]
            elif re_issueDate.search(strword[:index]):
                return ['发证日期', strword[re_issueDate.search(strword).span()[1] + 1:],re_issueDate.search(strword).group()]
            elif re_kindsOfEnterprise.search(strword[:index]):
                return ['企业类型', strword[re_kindsOfEnterprise.search(strword).span()[1] + 1:],re_kindsOfEnterprise.search(strword).group()]
            elif re_useLimit.search(strword[:10]):
                return ['此复印件仅限用于', strword[re_useLimit.search(strword).span()[1] + 1:],re_useLimit.search(strword).group()]
            # elif re_qcLegal.search(strword[:index]):
            #     return ['质量受权人', strword[re_qcLegal.search(strword).span()[1] + 1:],re_qcLegal.search(strword).group()]
            elif re_NO.search(strword[:3]):
                return ['NO', strword[re_NO.search(strword).span()[1] + 1:],re_NO.search(strword).group()]
            elif re_authorizedDEPT.search(strword[:13]):
                return ['国家食品药品监督管理局制', strword[re_authorizedDEPT.search(strword).span()[1]:],re_authorizedDEPT.search(strword).group()]
            elif re_country.search(strword[:8]):
                return ['中华人民共和国', strword[re_country.search(strword).span()[1]:],re_country.search(strword).group()]
            elif re_kindsOfDocument.search(strword[:8]):
                return ['药品生产许可证', strword[re_kindsOfDocument.search(strword).span()[1]:],re_kindsOfDocument.search(strword).group()]
            else:
                return None
        else:
            if re_entname.search(strword[:index]):
                return ['企业名称_许可证', strword[re_entname.search(strword).span()[1]+1:], re_entname.search(strword).group()]
            elif re_regAddr.search(strword[:index]):
                return ['注册地址', strword[re_regAddr.search(strword).span()[1] + 1:], re_regAddr.search(strword).group()]
            elif re_uscc.search(strword[:9]):
                return ['社会信用社代码', strword[re_uscc.search(strword).span()[1] + 1:], re_uscc.search(strword).group()]
            elif re_legalReps.search(strword[:7]):
                return ['法定代表人', strword[re_legalReps.search(strword).span()[1] + 1:], re_legalReps.search(strword).group()]
            elif re_entPrincipal.search(strword[:7]):
                return ['企业负责人', strword[re_entPrincipal.search(strword).span()[1] + 1:], re_entPrincipal.search(strword).group()]
            elif re_qcPrincipal.search(strword[:7]):
                return ['质量负责人', strword[re_qcPrincipal.search(strword).span()[1] + 1:], re_qcPrincipal.search(strword).group()]
            elif re_vld.search(strword[:index]):
                return ['有效期至', strword[re_vld.search(strword).span()[1] + 1:], re_vld.search(strword).group()]
            elif re_supervisionDEP.search(strword[:8]):
                return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()]
            elif re_supervisionDEP.search(strword[:8]):
                return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()]
            elif re_supervisor.search(strword[:8]):
                return ['日常监管人员', strword[re_supervisor.search(strword).span()[1] + 1:], re_supervisor.search(strword).group()]
            elif re_supervisorCT.search(strword[:8]):
                return ['监督举报电话', strword[re_supervisorCT.search(strword).span()[1] + 1:], re_supervisorCT.search(strword).group()]
            elif re_licNO.search(strword[:3]):
                return ['许可证编号', strword[re_licNO.search(strword).span()[1] + 1:], re_licNO.search(strword).group()]
            elif re_cateCode.search(strword[:5]):
                return ['分类码', strword[re_cateCode.search(strword).span()[1] + 1:], re_cateCode.search(strword).group()]
            elif re_prodAddrScope.search(strword[:11]):
                return ['生产地址和生产范围', strword[re_prodAddrScope.search(strword).span()[1] + 1:], re_prodAddrScope.search(strword).group()]
            elif re_issueOrg.search(strword[:index]):
                return ['发证机关', strword[re_issueOrg.search(strword).span()[1] + 1:], re_issueOrg.search(strword).group()]
            elif re_issuer.search(strword[:5]):
                return ['签发人', strword[re_issuer.search(strword).span()[1] + 1:], re_issuer.search(strword).group()]
            elif re_issueDate.search(strword[:index]):
                return ['发证日期', strword[re_issueDate.search(strword).span()[1] + 1:],re_issueDate.search(strword).group()]
            elif re_kindsOfEnterprise.search(strword[:index]):
                return ['企业类型', strword[re_kindsOfEnterprise.search(strword).span()[1] + 1:],re_kindsOfEnterprise.search(strword).group()]
            elif re_useLimit.search(strword[:10]):
                return ['此复印件仅限用于', strword[re_useLimit.search(strword).span()[1] + 1:],re_useLimit.search(strword).group()]
            # elif re_qcLegal.search(strword[:index]):
            #     return ['质量受权人', strword[re_qcLegal.search(strword).span()[1] + 1:], re_qcLegal.search(strword).group()]
            elif re_NO.search(strword[:3]):
                return ['NO', strword[re_NO.search(strword).span()[1]+1:], re_NO.search(strword).group()]
            elif re_authorizedDEPT.search(strword[:13]):
                return ['国家食品药品监督管理局制', strword[re_authorizedDEPT.search(strword).span()[1]:],re_authorizedDEPT.search(strword).group()]
            elif re_country.search(strword[:8]):
                return ['中华人民共和国', strword[re_country.search(strword).span()[1]:], re_country.search(strword).group()]
            elif re_kindsOfDocument.search(strword[:8]):
                return ['药品生产许可证', strword[re_kindsOfDocument.search(strword).span()[1]:],re_kindsOfDocument.search(strword).group()]
            else:
                return None

    def recognize_deploy(self, imgs, id_code):
        """Recognise a production licence from already-loaded OCR results.

        The OCR lines of all usable pages are collected (two-column pages are
        reordered with ``subfiledata``) and parsed by
        ``_productionCertificate``. Boilerplate fields are dropped and the
        record metadata is added.

        Args:
            imgs: iterable of dicts with ``'imgpath'`` (image path) and
                ``'imgjson'`` (OCR response containing ``'words_result'``
                and ``'words_result_num'``, or ``'error_code'`` on failure).
            id_code: value stored under ``'ID_CODE'``.

        Returns:
            The field dict with ``ID_CODE``, ``REMARK``, ``ADD_USER`` and
            ``JOB_ID`` added, or ``None`` when no OCR line was collected.
        """
        # Keys that _judge_keywords emits for headers and stamps; they are
        # not stored. The earlier spellings ('此复印件仅限于' / '次复印件仅限于' and
        # '药品许可证') never matched the emitted keys.
        noise_keys = (
            '企业类型',
            '此复印件仅限用于',
            'NO',
            '国家食品药品监督管理局制',
            '中华人民共和国',
            '药品生产许可证',
        )
        nums = 0
        datas = []
        for file in imgs:
            if 'error_code' in file['imgjson']:
                # A failed OCR response carries no usable lines: skip it
                # instead of falling through to the lookups below.
                self.logmgr.error(file['imgpath'] + ' : ' + 'Size Error!')
                continue
            # Decide whether the page has one or several columns.
            try:
                kindict = hmc.kinds(file['imgpath'], file['imgjson'])
            except Exception as e:
                self.logmgr.error(file['imgpath'] + ' : ' + 'Size Error!')
                continue
            print('Current processing: {}'.format(file['imgpath']))
            # Collect the OCR lines of this page.
            datatmp = file['imgjson']['words_result']
            nums += file['imgjson']['words_result_num']
            if kindict['kinds'] == 2:
                # subfiledata is a method; the bare name raised NameError.
                datas += self.subfiledata(kindict['direction'], kindict['parameter'],
                                          kindict['boundary'][0], datatmp)
            elif kindict['kinds'] == 1:
                datas += datatmp
            # NOTE(review): pages with any other 'kinds' value contribute to
            # nums but not to datas; recognize() treats kinds 0 like 1.
            # Confirm which behaviour is intended.

        if not (len(datas) > 0 and nums > 0):
            return None

        datadict = self._productionCertificate(datas, nums)
        for noise_key in noise_keys:
            datadict.pop(noise_key, None)

        # Record metadata. JOB_ID is the digest of an empty string here,
        # because this entry point has no intermediate file text to hash.
        datadict['ID_CODE'] = id_code
        datadict['REMARK'] = ''
        datadict['ADD_USER'] = '******'
        datadict['JOB_ID'] = self._generatemd5('')
        return datadict

    def recognize(self, path, id_code):
        """Walk ``path`` for production-licence OCR JSON files and store them.

        For every file whose name contains '生产许可证' a job record is built
        and submitted through ``self.job``. The OCR lines are then parsed
        with ``_productionCertificate`` and written to the 'DRUGMFRSCERT'
        table with ``_data_to_db``; a failed insert is logged and the job is
        marked with ``IS_TO_DB = 'F'``.

        Args:
            path: directory tree holding the OCR JSON files. Each directory
                path must contain 'data'; the part after it is reused as a
                relative path with backslash separators.
            id_code: identifier shared by one set of scanned documents.
        """
        # Keys that _judge_keywords emits for headers and stamps; they are
        # not stored. The earlier spellings ('此复印件仅限于' / '次复印件仅限于' and
        # '药品许可证') never matched the emitted keys, so those fields leaked
        # into the record.
        noise_keys = (
            '企业类型',
            '此复印件仅限用于',
            'NO',
            '国家食品药品监督管理局制',
            '中华人民共和国',
            '药品生产许可证',
        )
        flag = 0
        page = 0
        temp = ''
        jobdict = {}
        for file in os.walk(path):  # walks the JSON tree, not the image tree
            for file_name in file[2]:
                if '生产许可证' in file_name:
                    jsonname = file_name.split('.')[0]
                    curpath = file[0].split('data')[1]
                    index = jsonname.rfind('_')
                    folder_name = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: first run of CJK characters in the folder
                    # name, falling back to the file name.
                    if re.search(r'[\u4e00-\u9fa5]+', folder_name):
                        dragname = re.search(r'[\u4e00-\u9fa5]+', folder_name).group()
                    else:
                        dragname = re.search(r'[\u4e00-\u9fa5]+', file_name).group()
                    if dragname.find('(') > 0:
                        dragname = dragname[:dragname.find('(')]
                    jsonPath = file[0] + '\\' + file_name
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    source_img_path = self.imgpath + curpath + '\\' + jsonname[:index] + '.' + jsonname[index:].split('_')[1]
                    original_path = self.imgpath + '\\' + curpath + '\\' + jsonname[:index - 2] + '.' + 'pdf'

                    # Server
                    jobdict['SER_IP'] = '10.67.28.8'
                    # Job id
                    jobdict['JOB_ID'] = self._generatemd5(file[0] + jsonname)
                    jobid = jobdict['JOB_ID']
                    jobdict['SRC_FILE_NAME'] = jsonname[:index - 2] + '.' + 'pdf'
                    jobdict['SRC_FILE_PATH'] = original_path
                    # Cut (per-page) image file name
                    jobdict['CUT_FILE_NAME'] = jsonname[:index] + '.' + jsonname[index:].split('_')[1]
                    # Cut image path
                    jobdict['CUT_FILE_PATH'] = 'G:\\IMG' + '\\' + curpath
                    # Handling time
                    jobdict['HANDLE_TIME'] = time.strftime("%Y-%m-%d %X", time.localtime())
                    # Drug name
                    jobdict['DRUG_NAME'] = dragname
                    # Document type
                    jobdict['FILE_TYPE'] = '药品生产许可证'
                    # Identifier shared by one set of scanned documents
                    jobdict['ID_CODE'] = id_code
                    # Branch company
                    jobdict['SRC_CO'] = curpath.split('\\')[1]
                    # Relative path of the source file
                    jobdict['FILE_REL_PATH'] = '\\' + jsonname[:index] + '.' + jsonname[index:].split('_')[1]
                    # File server host
                    jobdict['SYS_URL'] = '10.67.28.8'
                    # Page number
                    jobdict['PAGE_NUM'] = page
                    # OCR parsing state
                    jobdict['OCR_STATE'] = 'T'
                    # Remark
                    jobdict['REMARK'] = ''
                    # Creating user
                    jobdict['ADD_USER'] = '******'
                    # The OCR response carries 'error_code' when the image
                    # was too large or could not be recognised.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name + ": img size error!")
                        continue

                    # FIXME: the paths above are environment specific and
                    # must be adapted when the working environment changes.
                    try:
                        kindict = hmc.kinds(source_img_path, jsonPath)
                    except Exception as e:
                        self.logmgr.error(file[0] + '\\' + file_name + ':' + str(e))
                        continue

                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    if kindict['kinds'] == 2:
                        # Two-column page: reorder the lines column by column.
                        datas = self.subfiledata(kindict['direction'], kindict['parameter'],
                                                 kindict['boundary'][0], datas)
                    flag = 1
                    page += 1

                    # Intermediate file
                    jobdict['MID_FILE_NAME'] = file_name
                    # Intermediate file path
                    jobdict['MID_FILE_PATH'] = file[0]
                    # OCR score
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))

                    # Whether the document content goes to the database
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'

                    # Text content of the file
                    jobdict['FILE_TEXT'] = self._middict(datas, self.codepath + '\\middata\\' + curpath, jsonname)
                    temp = jobdict['FILE_TEXT']

                    # NOTE(review): page is incremented twice per processed
                    # file (here and above), so PAGE_NUM advances by 2.
                    # Kept as is; confirm whether that is intended.
                    page += 1
                    self.job.job_add(jobdict)
                    self.job.job_todb()
                    self.job.job_del()
                # Runs for every file once one licence page was processed;
                # datas/nums are those of the most recent processed page.
                if flag:
                    if len(datas) > 0 and nums > 0:
                        datadict = self._productionCertificate(datas, nums)
                        for noise_key in noise_keys:
                            datadict.pop(noise_key, None)

                        print(source_img_path)
                        # Record metadata
                        datadict['ID_CODE'] = id_code
                        datadict['REMARK'] = ''
                        datadict['ADD_USER'] = '******'
                        datadict['JOB_ID'] = self._generatemd5(temp)
                        print(datadict)
                        try:
                            self._data_to_db('DRUGMFRSCERT', datadict)
                            nums = self._cleandata(datadict, datas, nums)
                        except Exception as e:
                            print('Error: ', e)
                            self.logmgr.error(file[0] + '\\' + file_name + "insert error!! : " + str(e))
                            self._update_item('OCRWORKFILE', 'JOB_ID', jobid, 'IS_TO_DB', 'F')
                            nums = self._cleandata(datadict, datas, nums)
Ejemplo n.º 10
0
class GMP(Tools):
    """
    GMP证书的识别
    """
    def __init__(self, imgpath):
        """Initialise the Tools base, a logger and the image root path.

        Args:
            imgpath: stored unchanged as ``self.imgpath``.
        """
        Tools.__init__(self)
        # Logger used to report unusable pages.
        self.logmgr = LogMgr()
        self.imgpath = imgpath

    def _recognize(self,datas, nums):
        """
        识别GMP证书, 程序的主逻辑
        """
        keylist = []
        datadict = dict()
      
        for (word, i) in zip(datas, range(0, nums)):
            '''
            循环读识别出的数据,然后根据judge_keywords函数是否提取到了关键信息;
            若提取到了,则保存到datadict中。
            若未提取到,list_result为空。有两种情况,
                1.这段信息不是我们所需要的。
                2.这段信息是上个关键字的值。
                然后执行else,进行更精确的判别。若是需归到上个字段,则循环递减,根据
                keylist[1],也就是list_reault[2]是否出现再上面的某个字段。若有则追加。
            '''
            list_result = self._judge_keywords(word['words'])
            if list_result != None:
                if list_result[0] in datadict and keylist[-1][0] != list_result[0]:
                    datadict[list_result[0]] += list_result[1]
                    flag = 1
                else:
                    datadict[list_result[0]] = list_result[1]
                    flag = 1
                #保存关键字段的信息,以及这段信息原本关键字段的信息
                keylist.append([list_result[0],list_result[2]])
            else:
                j = i
                while j > 0:
                    if not keylist:
                        break
                    #FIXMEED:逻辑问题  4/10 DONE
                    if re.match(r'\s[a-zA-Z]+', word['words']):
                        break
                    #提取"有效期至"与"发证日期"字段
                    if re.match(r'\d{4}|\d{2}', word['words']):
                        if len(word['words']) <= 4:
                            break
                        elif '/' in word['words']:
                            if keylist[-1][0] == '发证机关':
                                datadict['发证日期'] = word['words']
                                keylist.append(['杂', '杂'])
                                break
                            if '有效期至' in datadict:
                                if re.search(r'\d{4}|\d{2}', datadict['有效期至']):
                                    break
                            else:
                                datadict['有效期至'] = word['words']
                                break
                    if flag:
                        if keylist[-1][0] == '地址':
                            if i + 1 >= nums:
                                break
                            is_scope = self._judge_keywords(datas[i + 1]['words'])
                            if is_scope != None and is_scope[0] == '认证范围':
                                datadict['认证范围'] = word['words']
                                break
                        if keylist[-1][0] == '有效期至':
                            break
                        if keylist[-1][1] in datas[j]['words']:
                            datadict[keylist[-1][0]] += word['words']
                            break
                    j -= 1  
        return datadict

    def _judge_keywords(self, strword):
        '''
        判断关键字,若识别到关键字,返回一个包含关键字的list。
        $resultlist[0] -----要入库的关键字
        $resultlist[1] -----提取到内容
        $resultlist[2] -----需判断的信息中本来的关键字
        如:'证书编号:H12345',resultlist = ['证书编号', 'H12345', '证书编号']
           '证书号:H123', resultlist = ['证书编号', 'H123', '证书号']
        '''
        re_coname = re.compile(r"企业*名称*|企*业名*称")
        re_cernum = re.compile(r"证书*编号*|证*书编*号")
        re_addr = re.compile(r"地址")
        re_cerscope = re.compile(r"认证*范围*|认*证范*围")
        re_valid = re.compile(r"有效期至*|有效*期至")
        re_liceauth = re.compile(r"发证*机关*|发*证机*关")
        re_licedate = re.compile(r"发证*日期*|发*证日*期")
        re_abandon = re.compile(r"经审*查")

        if len(strword) >= 8: 
            index = 6
        else:
            index = len(strword)

        if(re.match(r'.+?(?:\:)', strword[:index])):
            if re_coname.search(strword[:index]):
                return ['企业名称_GMP', strword[re_coname.search(strword).span()[1]:], re_coname.search(strword).group()]
            elif re_cernum.search(strword[:index]):
                return ['证书编号' , strword[re_cernum.search(strword).span()[1] + 1:], re_cernum.search(strword).group()]
            elif re_addr.search(strword[:self._sort_index(strword)]):
                return ['地址' , strword[re_addr.search(strword).span()[1]:],re_addr.search(strword).group()]
            elif re_cerscope.search(strword[:index]):
                return ['认证范围' , strword[re_cerscope.search(strword).span()[1]:],re_cerscope.search(strword).group()]
            elif re_valid.search(strword[:index]):
                return ['有效期至' , strword[re_valid.search(strword).span()[1]:],re_valid.search(strword).group()]
            elif re_liceauth.search(strword[:index]):
                return ['发证机关' , strword[re_liceauth.search(strword).span()[1]:],re_liceauth.search(strword).group()]
            elif re_licedate.search(strword[:index]):
                return ['发证时间' , strword[re_licedate.search(strword).span()[1]:],re_licedate.search(strword).group()]
            else:
                return None
        else: 
            if re_coname.search(strword[:index]):
                return ['企业名称_GMP', strword[re_coname.search(strword).span()[1]:], re_coname.search(strword).group()]
            elif re_cernum.search(strword[:index]):
                return ['证书编号' , strword[re_cernum.search(strword).span()[1] + 1:], re_cernum.search(strword).group()]
            elif re_addr.search(strword[:self._sort_index(strword)]):
                return ['地址' , strword[re_addr.search(strword).span()[1]:],re_addr.search(strword).group()]
            elif re_cerscope.search(strword[:index]):
                return ['认证范围' , strword[re_cerscope.search(strword).span()[1]:],re_cerscope.search(strword).group()]
            elif re_valid.search(strword[:index]):
                return ['有效期至' , strword[re_valid.search(strword).span()[1]:],re_valid.search(strword).group()]
            elif re_liceauth.search(strword[:index]):
                return ['发证机关' , strword[re_liceauth.search(strword).span()[1]:],re_liceauth.search(strword).group()]
            elif re_licedate.search(strword[:index]):
                return ['发证时间' , strword[re_licedate.search(strword).span()[1]:],re_licedate.search(strword).group()]
            elif re_abandon.search(strword[:index]):
                return ['经审查', strword[re_abandon.search(strword).span()[1]:], re_abandon.search(strword).group()]
            else:
                return None

    def gmp_delploy(self, imgs, idcode):
        """Extract GMP certificate fields from already-OCR'd images.

        Args:
            imgs: iterable of dicts. Each one has an 'imgpath' string and an
                'imgjson' dict. 'imgjson' either contains 'error_code' (the
                image is logged and skipped) or 'words_result' and
                'words_result_num'.
            idcode: not used by this method; kept so existing callers keep
                working.

        Returns:
            A dict of recognized fields. '地址' and '企业名称_GMP' are always
            present (possibly empty). Returns None when no image produced a
            non-empty OCR result.
        """
        # Start from an explicit "no data" state so an empty `imgs`, or a
        # batch in which every image failed, returns None instead of raising
        # UnboundLocalError.
        datas, nums = [], 0
        for file in imgs:
            if 'error_code' in file['imgjson']:
                self.logmgr.error(file['imgpath'] + " : Img Size Error!")
                continue
            # Only the last image without an error is kept; earlier results
            # are overwritten.
            datas = file['imgjson']['words_result']
            nums = file['imgjson']['words_result_num']

        if not (len(datas) > 0 and nums > 0):
            return None

        datadicttmp = self._recognize(datas, nums)
        datadict = dict()
        # NOTE(review): a nearby parser labels the issue date '发证时间';
        # confirm that _recognize really emits the key '发证日期'.
        for key in ('企业名称_GMP', '证书编号', '地址', '认证范围',
                    '有效期至', '发证机关', '发证日期'):
            if key not in datadicttmp:
                continue
            value = datadicttmp[key]
            # Drop a single leading colon left over from the "label:value" layout.
            datadict[key] = value[1:] if re.match('[::]', value) else value

        datadict.setdefault('地址', '')
        datadict.setdefault('企业名称_GMP', '')
        # When the company name has text after '公司', that text is appended
        # to the address and the name is cut right after '公司'.
        if re.search(r'.+公司.+', datadict['企业名称_GMP']):
            parts = datadict['企业名称_GMP'].split('公司')
            datadict['地址'] = datadict['地址'] + parts[1]
            datadict['企业名称_GMP'] = parts[0] + '公司'

        return datadict


    def gmp(self, datapath, id_code):
        """Process the OCR JSON files of GMP certificates found under `datapath`.

        For every file whose name contains 'GMP证书' the method loads the JSON
        through `_load_json`, fills a job record (`jobdict`) and hands it to
        `self.job` (`job_add` / `job_todb` / `job_del`). After the files of a
        directory have been visited it runs `_recognize` on the most recently
        loaded OCR result, normalizes the extracted fields and stores them
        with `_data_to_db('GMPCERT', ...)`.

        Args:
            datapath: root directory walked with `os.walk`. Paths are split on
                the substring 'data' and on backslashes, so Windows-style
                paths containing 'data' are assumed.
            id_code: identification code written to both the job record and
                the certificate record.

        Returns:
            None.
        """
        # NOTE(review): `flag` is set to 1 on the first usable file and never
        # reset, and `datas`, `nums`, `jobid`, `file_name` and `temp` keep
        # their values across directories. Presumably `_cleandata` empties the
        # data so stale results are not stored twice -- confirm.
        flag = 0
        temp = ''
        for file in os.walk(datapath):
            jobdict = {}
            for file_name in file[2]:
                # `page` is fixed at 1, so PAGE_NUM below is always 1.
                page = 1
                if 'GMP证书' in file_name:
                    imgname = file_name.split('.')[0]
                    # Part of the directory path after the first 'data'.
                    curpath = file[0].split('data')[1]
                    index = imgname.rfind('_')
                    # Last path component of `curpath`.
                    id = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: first run of CJK characters in the directory
                    # name, falling back to the file name. The fallback raises
                    # AttributeError when the file name has none either.
                    if re.search(r'[\u4e00-\u9fa5]+', id):
                        dragname = re.search(r'[\u4e00-\u9fa5]+', id).group()
                    else:
                        dragname = re.search(r'[\u4e00-\u9fa5]+', file_name).group()

                    if dragname.find('(') > 0:
                        dragname = dragname[:dragname.find('(')]
                    #id_code = id[name_index_e - 1:]
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    # presumably `index - 2` strips a page suffix in front of
                    # the last '_' -- TODO confirm the naming scheme.
                    original_path = self.imgpath + '\\' + curpath + '\\' + imgname[:index - 2] + '.' + 'pdf'

                    # server
                    jobdict['SER_IP'] = '10.67.28.8'
                    # job id
                    jobdict['JOB_ID'] = self._generatemd5(file[0] + imgname)
                    jobid = jobdict['JOB_ID']
                    jobdict['SRC_FILE_NAME'] = imgname[:index - 2] + '.' + 'pdf'
                    jobdict['SRC_FILE_PATH'] = original_path
                    # jobdict['JOB_ID'] = self._generatemd5(jobdict[])
                    # original file
                    jobdict['CUT_FILE_NAME'] = imgname[:index] + '.' + imgname[index:].split('_')[1]
                    # original path
                    jobdict['CUT_FILE_PATH'] = 'G:\\IMG' + '\\' + curpath
                    # handling time
                    jobdict['HANDLE_TIME'] = time.strftime("%Y-%m-%d %X", time.localtime())
                    # drug name
                    jobdict['DRUG_NAME'] = dragname
                    # image document type
                    jobdict['FILE_TYPE'] = 'GMP证书'
                    # identification code shared by one set of image documents
                    jobdict['ID_CODE'] = id_code
                    # branch company
                    jobdict['SRC_CO'] = curpath.split('\\')[1]
                    # relative path of the source file
                    jobdict['FILE_REL_PATH'] = '\\' + imgname[:index] + '.' + imgname[index:].split('_')[1]
                    # file server domain name
                    jobdict['SYS_URL'] = '10.67.28.8'
                    # page count
                    jobdict['PAGE_NUM'] = page
                    # OCR parse/recognition state of the file (fk sysparams)
                    jobdict['OCR_STATE'] = 'T'
                    # remark
                    jobdict['REMARK'] = ''
                    # creating user
                    jobdict['ADD_USER'] = '******'
                    # When the image is too large, or recognition failed for
                    # some other reason, the JSON carries an 'error_code' field.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name + ": img size error!")
                        continue
                    #source_img_path = imgpaht_root_desktop + '\\' + curpath + '\\' + imgname[:index] + '.' + imgname[index:].split('_')[1]
                    #try:
                    #    kindict = hmc.kinds(source_img_path, datajson)
                    #except Exception as e:
                    #    logmgr.error(file[0] + '\\' + file_name + ':' + str(e))
                    #    continue
                    #print('Current processing: {}'.format(imgpaht_root_desktop + '\\' + curpath + 
                    #                        '\\' + imgname[:index] + 
                    #                        '.' + imgname[index:].split('_')[1], 
                    #                        file[0] + '\\' + file_name))
                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    flag = 1

                    # intermediate file
                    jobdict['MID_FILE_NAME'] = file_name
                    # intermediate file path
                    jobdict['MID_FILE_PATH'] = file[0]
                    # score
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))
                    # whether the image content is stored in the database
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'

                    # text content of the file
                    jobdict['FILE_TEXT'] = self._middict(datas, self.codepath + '\\middata\\' + curpath, imgname)
                    ###############
                    # Kept so the certificate record's JOB_ID can be derived
                    # from the file text further down.
                    temp = jobdict['FILE_TEXT']
                    #jobdict['JOB_ID'] = self._generatemd5(jobdict['FILE_TEXT'])
                    ###############

                    try:
                        self.job.job_add(jobdict)
                    except Exception:
                        self.job.update_item('JOB_ID', jobid, 'IS_TO_DB', 'F')
                    self.job.job_todb()
                    self.job.job_del()
            # Runs once per walked directory, on whatever `datas`/`nums` hold
            # at that point (the last file loaded without an error).
            if flag:
                if len(datas) > 0 and nums > 0:
                    datadicttmp = self._recognize(datas, nums)
                    datadict = dict()
                    # Each stanza copies one field and drops a single leading
                    # colon when the value starts with one.
                    if '企业名称_GMP' in datadicttmp:
                        if re.match('[::]',datadicttmp['企业名称_GMP']):
                            datadict['企业名称_GMP'] = datadicttmp['企业名称_GMP'][1:]
                        else:
                            datadict['企业名称_GMP'] = datadicttmp['企业名称_GMP']
                    if '证书编号' in datadicttmp:
                        if re.match('[::]',datadicttmp['证书编号']):
                            datadict['证书编号'] = datadicttmp['证书编号'][1:]
                        else:
                            datadict['证书编号'] = datadicttmp['证书编号']
                    if '地址' in datadicttmp:
                        if re.match('[::]',datadicttmp['地址']):
                            datadict['地址'] = datadicttmp['地址'][1:]
                        else:
                            datadict['地址'] = datadicttmp['地址']
                    if '认证范围' in datadicttmp:
                        if re.match('[::]',datadicttmp['认证范围']):
                            datadict['认证范围'] = datadicttmp['认证范围'][1:]
                        else:
                            datadict['认证范围'] = datadicttmp['认证范围']

                    if '有效期至' in datadicttmp:
                        if re.match('[::]',datadicttmp['有效期至']):
                            datadict['有效期至'] = datadicttmp['有效期至'][1:]
                        else:
                            datadict['有效期至'] = datadicttmp['有效期至']


                    if '发证机关' in datadicttmp:
                        if re.match('[::]',datadicttmp['发证机关']):
                            datadict['发证机关'] = datadicttmp['发证机关'][1:]
                        else:
                            datadict['发证机关'] = datadicttmp['发证机关']

                    if '发证日期' in datadicttmp:
                        if re.match('[::]',datadicttmp['发证日期']):
                            datadict['发证日期'] = datadicttmp['发证日期'][1:]
                        else:
                            datadict['发证日期'] = datadicttmp['发证日期']

                    ###################################### added section ###########################################
                    datadict['ID_CODE']=id_code
                    datadict['REMARK']=''
                    datadict['ADD_USER']='******'
                    # NOTE(review): this JOB_ID is the md5 of the file text,
                    # while the job record above uses md5(file[0] + imgname);
                    # confirm the two are meant to differ.
                    datadict['JOB_ID'] = self._generatemd5(temp)
                    if '地址' not in datadict:
                        datadict['地址'] = ''
                    if '企业名称_GMP' not in datadict:
                        datadict['企业名称_GMP'] = ''
                    # When the company name has text after '公司', append that
                    # text to the address and cut the name right after '公司'.
                    if re.search(r'.+公司.+',datadict['企业名称_GMP']):
                        datadict['地址'] = datadict['地址']+datadict['企业名称_GMP'].split('公司')[1]
                        datadict['企业名称_GMP'] = datadict['企业名称_GMP'].split('公司')[0]+'公司'

                    ###################################### added section ###########################################
                    print(datadict)
                    # `datadict` always holds ID_CODE, REMARK, ADD_USER and
                    # JOB_ID at this point, so this branch cannot be taken.
                    if not datadict:
                        nums = self._cleandata(datadict, datas, nums)
                        continue
                    try:
                        self._data_to_db('GMPCERT', datadict)
                        nums = self._cleandata(datadict, datas, nums)
                    except Exception as e:
                        print('Error: ', e)
                        # Mark the job as not stored when the insert failed.
                        self._update_item('OCRWORKFILE','JOB_ID', jobid,'IS_TO_DB','F')
                        self.logmgr.error(file[0] + '\\' + file_name + "insert error!! : " + str(e))
                        nums = self._cleandata(datadict, datas, nums)
                        continue 
Ejemplo n.º 11
0
class MyOcr(object):
    """
    Text recognition (OCR) through the Baidu AipOcr client.

    @app_id @api_key @secret_key  values applied for on the Baidu AI platform
    @typeid  accuracy selection
            1 -- general text recognition
            2 -- general text recognition with position information
            3 -- high-accuracy text recognition
            4 -- high-accuracy text recognition with position information
    """

    # typeid -> name of the AipOcr method used by `_ocr`.
    _TYPEID_METHODS = {
        1: 'basicGeneral',
        2: 'general',
        3: 'basicAccurate',
        4: 'accurate',
    }

    def __init__(self, typeid, app_id = APP_ID, api_key = API_KEY, secret_key = SECRET_KEY):
        """Create the OCR client and make sure the JSON output folder exists."""
        self.client = AipOcr(app_id, api_key, secret_key)
        self.typeid = typeid
        self.codepath = os.path.dirname(__file__)
        # Escaped backslash: the value is the same as the former '\data',
        # without relying on an invalid escape sequence.
        self.datapath = self.codepath + '\\data'
        os.makedirs(self.datapath, exist_ok=True)
        self.log = LogMgr()

    @staticmethod
    def _default_options():
        """Return a fresh dict with the request options sent with every OCR call."""
        return {
            "detect_direction": "true",
            "detect_language": "true",
            "probability": "true",
        }

    def _get_file_content(self, filePath):
        """Read an image file and return its bytes."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def _write_json_file(self, filepath, data):
        """Serialize `data` as JSON (non-ASCII kept as is) into `filepath`."""
        with open(filepath, 'w', encoding = 'utf-8') as fw:
            fw.write(json.dumps(data, ensure_ascii=False))

    def _list_custom(self, path):
        """Return (entries of the first sub-entry of `path`, that sub-entry's path).

        Raises IndexError when `path` is empty.
        """
        root = os.listdir(path)
        first = path + '\\' + root[0]
        return os.listdir(first), first

    def ocr_deploy(self, rec_dict):
        """Run high-accuracy OCR on base64 images carried in `rec_dict`.

        Every entry of rec_dict['files'] whose 'type' matches one of the
        supported document kinds has each of its 'imgs' decoded from
        img['base64'] and sent to `client.accurate`; the response is stored
        under img['imgjson']. Images whose request raises are logged and left
        without 'imgjson'.

        Returns:
            The same `rec_dict` object, updated in place.
        """
        files = rec_dict['files']
        # parameters required by the OCR service
        options = self._default_options()

        for file in files:
            if re.search(r'进口注册证|GMP|说明书|药品再注册批件|营业执照|生产许可证|进口药品许可证|进口药品注册证', file['type']):
                for img in file['imgs']:
                    print('Current img: {}'.format(img['imgpath']))
                    try:
                        data = self.client.accurate(base64.b64decode(bytes(img['base64'], encoding='utf-8')), options)
                    except Exception as e:
                        print('Error: ', e)
                        self.log.error(img['imgpath'] + "Error! : " + str(e))
                        continue
                    img.update({"imgjson" : data})
        return rec_dict

    def _ocr(self, imgpath):
        """
        Recognize the JPG images found under `imgpath`.

        The JSON response of every image is written below `self.datapath`,
        mirroring the part of the directory path that follows 'IMG'. Images
        whose JSON file already exists are skipped.

        Raises:
            ValueError: when an image has to be recognized and `self.typeid`
                is not one of 1-4.
            IndexError: when a walked directory path does not contain 'IMG'.
        """
        # FIXME: paths differ between machines; adjust when switching environment.
        options = self._default_options()

        # NOTE(review): unlike `ocr_deploy`, this pattern does not include
        # '进口药品注册证' -- confirm whether that is intended.
        for dirpath, _dirnames, filenames in os.walk(imgpath):
            for file_name in filenames:
                if not re.search(r'进口注册证|GMP|说明书|药品再注册批件|营业执照|生产许可证|进口药品许可证', file_name):
                    continue
                if '备案' in file_name:
                    continue
                src_path = dirpath + '\\' + file_name
                if os.path.isdir(src_path):
                    continue
                if not re.match(r'[jJ][pP][gG]', file_name[-3:]):
                    continue
                datafilepath = self.datapath + dirpath.split('IMG')[1]
                if not os.path.exists(datafilepath):
                    os.makedirs(datafilepath)
                img = self._get_file_content(src_path)
                # Remove every dot from the stem so the name splits into
                # exactly (prefix, suffix). The former `find('.')` truthiness
                # test skipped this for names that start with a dot.
                file_name = file_name[:-4].replace('.', '') + file_name[-4:]
                try:
                    prefix, suffix = file_name.split('.')
                except ValueError as e:
                    print('split error: {}\ncurrent file: {}'.format(e, dirpath + '\\' + file_name))
                    self.log.error(dirpath + '\\' + file_name + " Error!! : " + str(e))
                    continue
                # Plain concatenation: braces in a directory name must not be
                # interpreted as format fields.
                json_path = datafilepath + '\\' + prefix + '_' + suffix + '.json'
                # skip images that already have a result file
                if os.path.isfile(json_path):
                    continue
                print('Current img: {}'.format(dirpath + '\\' + file_name))
                method_name = self._TYPEID_METHODS.get(self.typeid)
                if method_name is None:
                    # Previously this ended in UnboundLocalError on `data`.
                    raise ValueError('unsupported typeid: {!r}'.format(self.typeid))
                try:
                    data = getattr(self.client, method_name)(img, options)
                except Exception as e:
                    print('Error: ', e)
                    self.log.error(dirpath + '\\' + file_name + " Error!! : " + str(e))
                    continue

                self._write_json_file(json_path, data)

    def _write_dict(self):
        """Print the result of `introduction.introduction` for every entry of the data folder."""
        files = os.listdir(self.datapath)
        for file in files:
            format_data = introduction.introduction(self.datapath + '\\' + file)
            print(format_data)

    def pdf2img(self):
        """Convert every page of the PDFs under <codepath>/PDF/说明书/ to a JPG.

        The pages are saved as <codepath>/IMG/图片/<pdf name without
        extension><page index>.jpg.
        """
        file_dir = self.codepath + '/PDF/说明书/'
        save_dir = self.codepath + '/IMG/图片/'
        for dirpath, _dirnames, filenames in os.walk(file_dir):
            for file_name in filenames:
                # splitext copes with names that contain several dots or none.
                file_name_prefix = os.path.splitext(file_name)[0]
                # Join with the walked directory so files in sub-folders are
                # found as well.
                file = os.path.join(dirpath, file_name)
                with(Image(filename=file, resolution=300)) as img:
                    images = img.sequence
                    pages = len(images)
                    # Index access kept on purpose: the type is set on the
                    # very sequence item that is then copied and saved.
                    for i in range(pages):
                        images[i].type = 'truecolor'
                        save_name = save_dir + file_name_prefix + str(i) + '.jpg'
                        with Image(images[i]) as page:
                            page.save(filename=save_name)

    def run(self, imgpath):
        """Entry point: run `_ocr` on `imgpath`, printing start and end banners."""
        print('********Start Identify********')
        self._ocr(imgpath)
        print('********End********')
Ejemplo n.º 12
0
# -*- coding : utf-8 -*-

import os
import json
from DatabaseToolsNew import cxOracle
import re
from FindKeyword import findImportWords
import HowManyColumn4 as hmc
#import openpyxl
import xlwings as xw
from log import LogMgr

# Module-level LogMgr instance.
logmgr = LogMgr()
# NOTE: the bare triple-quoted string below is an expression statement that is
# never used; it acts as a block comment. Its first line says that openpyxl
# was too slow and was replaced by xlwings; the remaining lines are the old
# openpyxl-based column loading code.
'''
使用openpyxl太慢了,改用xlwings
wb = openpyxl.load_workbook('C:\\Users\\DevinChang\\Desktop\\四家分公司影印件清单_去重匹配版.xlsx')
sheets = wb.sheetnames
sheet = wb.get_sheet_by_name(sheets[0])
shopid = sheet['B']
name = sheet['C']
strength = sheet['D']
mfrs = sheet['F']
'''


def load_excel(excel):
    wb = xw.Book(excel)
    sheet = wb.sheets[0]
    shopid = sheet['B:B'].value
    name = sheet['C:C'].value
    strength = sheet['D:D'].value