Esempio n. 1
0
def demo_flask(image_file):
    """Run OCR on an image and extract the company id and company name.

    The image is passed through ``ocr_whole.model``; the framed image it
    returns is saved under ``./result`` using the input's file name.  Each
    recognised text line longer than 8 characters is stripped of
    punctuation and classified, first by keyword rules and otherwise by the
    saved 'NameIdAdd_NLP' text model.  Lines labelled 'company-id' are
    concatenated; the first line labelled 'company-name' ends the scan.

    Args:
        image_file: Path of the image to process.

    Returns:
        Tuple ``(output_file, id_total, name_total)``: the path of the saved
        framed image, the id text with non-word characters removed, and the
        name text after ``stupid_revise``.
    """
    grocery = Grocery('NameIdAdd_NLP')
    model_name = grocery.name
    tgm = GroceryTextModel(None, model_name)
    tgm.load(model_name)
    grocery.model = tgm

    t = time.time()
    result_dir = './result'
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)

    # Loop invariants: compile the punctuation stripper once (raw string,
    # with the literal '[' escaped) and build the keyword character sets once.
    punct = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}\[]+")
    legal_rep = set('法定代表人')
    scope = set('经营范围')
    date_chars = set('年月日')
    authority = set('登记机关')
    business_place = set('经营场所')
    operating_place = set('营业场所')
    credit_code = set('统一社会信用代码')

    name_total = ''
    id_total = ''
    for key in result:
        string1 = result[key][1]
        if len(string1) <= 8:
            continue
        string2 = punct.sub("", string1)
        if not string2:
            # Nothing but punctuation/whitespace: there is no text to
            # classify, and the ratio below would divide by zero.
            continue
        # Count ASCII digits directly.  The previous GBK round trip fails on
        # Python 3 (iterating bytes yields ints, which str.isdigit rejects)
        # and on any character that GBK cannot encode.
        no_digit = sum(1 for ch in string2 if '0' <= ch <= '9')
        no_alpha = sum(1 for ch in string2 if is_alphabet(ch))
        chars = set(string2)

        if (len(legal_rep & chars) >= 2
                or len(scope & chars) >= 2
                or '资本' in string2
                or '类型' in string2
                or len(date_chars & chars) >= 2
                or len(authority & chars) >= 2
                or '电话' in string2):
            predict_result = 'others'
        elif (len(business_place & chars) >= 3
                or '住所' in string2
                or len(operating_place & chars) >= 3):
            predict_result = 'company-address'
        elif (len(credit_code & chars) >= 2
                or ((no_digit + no_alpha) / len(string2) > 0.5
                    and no_digit > 8)):
            predict_result = 'company-id'
        elif '名称' in string2:
            predict_result = 'company-name'
        else:
            predict_result = grocery.predict(string2)

        label = str(predict_result)
        if label == 'company-name':
            name_total += string1
            break  # the first name line ends the scan
        if label == 'company-id':
            id_total += string1

    id_total = re.sub(r'\W', '', id_total)
    name_total = stupid_revise(name_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(id_total)
    print(name_total)
    return output_file, id_total, name_total
"""
import re
import pandas as pd
from tgrocery import Grocery
from tgrocery.classifier import *

# Address library CSV; rows are selected by their 'level' column below.
extra_addrs_dir = 'addrs_libs/full_address1.csv'
extra_lib = pd.read_csv(extra_addrs_dir, encoding='utf-8')
# 'Name' values of the level-1 rows (provinces) and level-2 rows (cities).
provinces = extra_lib.loc[extra_lib['level'] == 1, 'Name']
cities = extra_lib.loc[extra_lib['level'] == 2, 'Name']

# Build the classifier and attach the text model loaded from the location
# given by the grocery's own name.
grocery = Grocery('NameIdAdd_NLP')
model_name = grocery.name
text_converter = None
tgm = GroceryTextModel(text_converter, model_name)
tgm.load(model_name)
grocery.model = tgm


class Found(Exception):
    """Marker exception with no behaviour of its own.

    NOTE(review): presumably raised to leave nested loops once a match is
    found -- its call sites are outside this view, so confirm.
    """


def is_alphabet(uchar):
    """Return True if *uchar* is an ASCII English letter (A-Z or a-z)."""
    is_upper = u'A' <= uchar <= u'Z'
    is_lower = u'a' <= uchar <= u'z'
    return is_upper or is_lower


def preprocess_ocr(result):
Esempio n. 3
0
def demo_flask(image_file):
    """Run OCR on an image and extract the address text.

    If a saved 'Addrss_NLP' model exists on disk it is loaded; otherwise a
    new model is trained from the pickled samples under ``pkl_data/`` and
    saved.  The image is passed through ``ocr_whole.model``; the framed
    image it returns is written to the bbox directory under the input's
    file name.  Each recognised text line is stripped of punctuation and
    classified by keyword rules or, failing those, by the model.  Lines
    labelled 'address' are accumulated, trailing remarks are cut off, and
    the result is passed through ``stupid_revise``.

    Args:
        image_file: Path of the image to process.

    Returns:
        Tuple ``(output_file, ret_total)``: the path of the saved framed
        image and the extracted address string.
    """
    grocery = Grocery('Addrss_NLP')
    model_name = grocery.name
    if os.path.exists(model_name):
        tgM = GroceryTextModel(None, model_name)
        tgM.load(model_name)
        grocery.model = tgM
        print('load!!!!!')
    else:
        # NOTE(security): pickle.load can execute arbitrary code; these
        # files must come from a trusted source.
        with open('pkl_data/address1.pkl', 'rb') as add_file:
            add_list = pickle.load(add_file)
        with open('pkl_data/others1.pkl', 'rb') as other_file:
            other_list = pickle.load(other_file)
        grocery = Grocery('Addrss_NLP')
        add_list.extend(other_list)
        grocery.train(add_list)
        print(grocery.get_load_status())
        grocery.save()

    t = time.time()
    result_dir = '/data/share/nginx/html/bbox'
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr_whole.model(image)
    output_file = os.path.join(result_dir, image_file.split('/')[-1])
    Image.fromarray(image_framed).save(output_file)

    # Loop invariants: compile the punctuation stripper once (raw string,
    # with the literal '[' escaped) and build the keyword sets once.
    punct = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*{}\[]+")
    # '注册' plus the OCR misreadings of it that the rules also accept.
    registration_marks = ('注册', '洼册', '洼·册', '洼.册', '汪·册')
    authority = set('登记机关')
    scope_chemicals = set('经营范围化学品')
    date_chars = set('年月日')

    ret_total = ''
    for key in result:
        string1 = result[key][1]
        string2 = punct.sub("", string1)
        if not string2:
            # Empty or punctuation-only line: nothing to classify, and the
            # digit ratio below would divide by zero.
            continue
        no_digit = sum(1 for ch in string2 if ch.isdigit())
        no_alpha = sum(1 for ch in string2 if is_alphabet(ch))
        chars = set(string2)

        if (any(mark in string2 for mark in registration_marks)
                or len(authority & chars) >= 3
                or '电话' in string2
                or (no_digit / len(string2) > 0.7 and no_digit > 5)):
            predict_result = 'others'
        elif (no_alpha > 5
                or len(scope_chemicals & chars) >= 3
                or len(date_chars & chars) >= 2):
            predict_result = 'others'
        else:
            predict_result = grocery.predict(string2)

        if str(predict_result) == 'address':
            string1 = string1.replace('《', '(').replace('》', ')')
            # NOTE(review): the original replaced '(' and ')' with
            # themselves, which does nothing.  Presumably full-width
            # parentheses were meant to be normalised -- confirm.
            string1 = string1.replace(u'\uff08', '(')
            string1 = string1.replace(u'\uff09', ')')
            string1 = string1.replace('((', '(')
            if (not ret_total) or len(string1) > len(ret_total):
                # A longer line replaces what was collected so far.
                ret_total = string1
            else:
                ret_total += string1

    # A ')' with no '(' suggests the opening bracket was misread as 'C'.
    if ')' in ret_total and '(' not in ret_total:
        ret_total = ret_total.replace('C', '(')

    # Drop parenthesised remarks from the opening bracket to the end of the
    # line.  The bracket is escaped: unescaped it opened a group that was
    # never closed, so every one of these patterns raised re.error.
    trailing_remarks = (
        r'\((\w)住所(.*)',
        r'\((\w)住房(.*)',
        r'\(不作为(.*)',
        r'\(有效期(.*)',
        r'\(仅限(.*)',
        r'\(临时经营(.*)',
        r'\(仅限办公(.*)',
        r'\(经营场所(.*)',
    )
    for pattern in trailing_remarks:
        ret_total = re.sub(pattern, '', ret_total)
    # Strip a leading field label left in front of the address.
    ret_total = re.sub(r"^[经]*[营]*[场/住]*[所]*", "", ret_total)
    ret_total = stupid_revise(ret_total)
    print("Mission complete, it took {:.3f}s".format(time.time() - t))
    print('\nRecongition Result:\n')
    print(ret_total)
    return output_file, ret_total