def detect_text(self, file):
        """Detect text in an image file using the Google Cloud Vision API."""
        ocr_time = timeutil.TimeElapsed()
        words_list = []

        client = vision.ImageAnnotatorClient()

        # [START vision_python_migration_text_detection]
        with io.open(file, 'rb') as image_file:
            content = image_file.read()

        image = vision.types.Image(content=content)

        response = client.text_detection(image=image)
        texts = response.text_annotations


        for text in texts:
            # Method 1: convert the vertex info to a list and return it

            vertices = [[int(vertex.x), int(vertex.y)]
                        for vertex in text.bounding_poly.vertices]

            temp = [text.description, vertices]

            # Method 2: return the generator object directly; extra handling
            # is needed when it is consumed (kept here for reference).
            # vertices = ('[{},{}]'.format(int(vertex.x), int(vertex.y))
            #             for vertex in text.bounding_poly.vertices)

            words_list.append(temp)

        print("Time to spend for OCR 인식", ocr_time.getelapsed())
        return words_list
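
For reference, a minimal sketch of consuming the [description, vertices] pairs returned above; the print_words helper and the DESC/VERTEX constants here are hypothetical, not part of the original code.

DESC, VERTEX = 0, 1

def print_words(words):
    # Entry 0 of a GCV text_annotations list covers the whole image, so skip it.
    for word in words[1:]:
        top_left = word[VERTEX][0]  # vertices are [[x, y], ...] per the code above
        print(word[DESC], 'at top-left', tuple(top_left))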
Example #2
    def load_data(self, filename):
        self.filename = filename
        loadingtime = timeutil.TimeElapsed()
        with open(self.filename, 'r', encoding='utf8') as words_f:
            # Reading and loading from the file may change with the real environment.
            for line in words_f:
                data = line.replace('\n', '')
                if len(data) == 0:
                    continue
                    continue

                data = data.replace('\t', ' ')
                wordslist = data.split(' ')
                print(wordslist)

                for word in wordslist:
                    DataLoader.wordsset.add(word)

        print("Time to spend for loading", loadingtime.getelapsed())
        print("Words count", len(DataLoader.wordsset))
Example #3
def detect_text(path):
    """Detect text in an image file using the Google Cloud Vision API."""
    ocrtimer = timeutil.TimeElapsed()
    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations
    print('Type(texts):', type(texts))
    print('Texts:', texts)
    print('--------------------')

    print("\nOCR elapsed: ", ocrtimer.getelapsed())
    print('\nAll Data:')

    # ---- List every word and its position
    count = 0
    for text in texts:
        newtext = text.description.replace('\n', ' ')
        print('{:>2} Desc:[{:>20s}],'.format(count, newtext), end='')

        vertices = ([
            '({},{})'.format(vertex.x, vertex.y)
            for vertex in text.bounding_poly.vertices
        ])
        # print('\n\tvertices:', vertices, ' Type:', type(vertices))
        print('bounds: {}'.format(','.join(vertices)))

        count = count + 1

    # ---- Fetch the positions of a specific word
    print('--------------------')
    for text in texts:
        if text.description == '당도':  # '당도' means "sugar content"
            for vertex in text.bounding_poly.vertices:
                print('Found....', vertex)
Example #4
import sys
sys.path.insert(1, '/Users/andrew/Documents/py_basic/util')
import timeutil


def loadDataToSet(data):
    # Demo stub: ignores `data` and returns a fixed two-element set.
    dataset = set()
    dataset.add('123')
    dataset.add('abc')
    return dataset


if __name__ == '__main__':
    loadtimer = timeutil.TimeElapsed()
    dataset = loadDataToSet('copy2clip is working')
    print("loading....", loadtimer.getelapsed())

    count = 0
    testdata = ['abc', 'def']
    for word in testdata:
        if word in dataset:
            count = count + 1

    print("searching....", loadtimer.getelapsed(), ",", count)

'''
Option 1:
Build a set from the words and postpositions stored in the DB,
then look up each OCR-recognized word in that set.
Option 2:
Build the set from the OCR-recognized words.
'''
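
A minimal sketch of Option 1 under these notes; the particle list and every name below are illustrative assumptions, not the project's actual data or API.

# Sketch of Option 1: pre-expand DB words with common postpositions,
# then test OCR tokens by set membership.
POSTPOSITIONS = ['은', '는', '이', '가', '을', '를']  # sample Korean particles

def build_wordset(db_words):
    wordset = set()
    for word in db_words:
        wordset.add(word)
        for particle in POSTPOSITIONS:
            wordset.add(word + particle)  # e.g. '당도' -> '당도는'
    return wordset

def search_ocr_tokens(tokens, wordset):
    return [t for t in tokens if t in wordset]

if __name__ == '__main__':
    ws = build_wordset(['당도', '최고'])
    print(search_ocr_tokens(['당도는', '좋은', '최고'], ws))  # ['당도는', '최고']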
Example #5
    def load_data(self, fname):
        self.filename = fname
        loadingtime = timeutil.TimeElapsed()

        num_not_necessary = 0
        name_of_sheet = '05.31'

        # df = pd.read_excel(filename, sheet_name='Sheet1')
        df = pd.read_excel(self.filename,
                           sheet_name=name_of_sheet,
                           header=num_not_necessary)

        count = 0
        # col_lists = ['공통', '공통\n(실증필요단어)', '의류', '속옷', '패션잡화', '가구', '침구/침장', '가전', '스포츠/레저', '식품', '주방용품', \
        #              '이미용\n(실증필요단어)', '생활/잡화', '유아동용품', '문화/서비스', '보석/장신구', '건강', '이미용']
        col_lists = df.columns
        # print('col_lists ==>', col_lists)

        # ---- Get the maximum length of each column ----
        # maxColumnLengths = []
        # for col in range(len(df.columns)):
        #     maxColumnLengths.append(max(df.iloc[:, col].astype(str).apply(len)))
        # print('Max Column Lengths ', maxColumnLengths)

        if num_not_necessary is not None:
            max_index = len(df) - num_not_necessary
        else:
            max_index = len(df)

        for col in col_lists:
            print('-------------', col, '------------------')
            for i in range(0, max_index):
                cell_value = df.at[i, col]
                if str(cell_value).replace(' ', '') == 'nan':
                    continue

                # Normalize the value (needed especially for numeric cells)
                if cell_value == 1:
                    cell_value = '100%'

                print('{:>3d} {}'.format(count, cell_value))

                DataLoader.wordsset.add(cell_value)
                count = count + 1

                count = self.add_word_with_postposition(cell_value, count)


        print("Time to spend for 금지어  로딩", loadingtime.getelapsed())
        print("Words count", len(DataLoader.wordsset))
Example #6
    # ---- List every word and its position
    count = 0
    for text in texts:
        newtext = text.description.replace('\n', ' ')
        print('{:>2} Desc:[{:>20s}],'.format(count, newtext), end='')

        vertices = ([
            '({},{})'.format(vertex.x, vertex.y)
            for vertex in text.bounding_poly.vertices
        ])
        # print('\n\tvertices:', vertices, ' Type:', type(vertices))
        print('bounds: {}'.format(','.join(vertices)))

        count = count + 1

    # ---- Fetch the positions of a specific word
    print('--------------------')
    for text in texts:
        if text.description == '당도':  # '당도' means "sugar content"
            for vertex in text.bounding_poly.vertices:
                print('Found....', vertex)


if __name__ == '__main__':
    wholetimer = timeutil.TimeElapsed()
    # detect_text('/Volumes/USB3-64/Image/10167389_1.jpg')
    detect_text(
        '/Volumes/USB3-64/Image/3. 10164315  쁘띠프루티 울트라 모이스처라이징 페이셜 마스크팩.jpg'
    )
    print("\n\nTotal elapsed: ", wholetimer.getelapsed())
Example #7
def main():
    global loader
    STX = 2  # ASCII STX (start-of-text) control byte

    # Load the banned words from the Excel file
    loader = dataloader.DataLoader()
    loader.load_data('/Users/andrew/Downloads/pWords.xlsx')

    # Load the morphological analyzer
    tokenizer = analyser.Analyser()

    # Load the TCP/IP configuration
    global tcp_config
    global conn
    global interval

    try:
        with open('tcp_config.json', encoding='utf-8') as json_file:
            tcp_config = json.load(json_file)
    except FileNotFoundError:
        print("No File exists...")
        exit('socket configuration exception')

    host = tcp_config['hostname']
    port = tcp_config['port']
    interval = tcp_config['interval']

    error_cnt = 0
    while True:
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            conn.connect((host, port))
        except ConnectionRefusedError:
            conn.close()
            time.sleep(10)
            continue

        while True:
            try:
                content = conn.recv(200)
            except socket.error as e:
                print("error while receiving :: " + str(e), e.errno)
                if e.errno == errno.EPIPE:
                    conn.close()
                    break
                else:
                    raise
                    # exit("terminating")
            except Exception as e:
                print("error 2 while receiving ::", e)
                break

            line_time = timeutil.TimeElapsed()

            if len(content) == 0:
                error_cnt += 1
                if error_cnt > 3:
                    error_cnt = 0
                    break
                continue  # empty read: retry instead of indexing into empty bytes

            if content[0] != STX:
                print('wrong data from the server..')
                continue

            actual_data = content[2:]
            print('[' + actual_data.decode(encoding='cp949') + ']')

            # Per the normal logic, check each word of the received sentence against the banned words
            pword_list = check_pword(actual_data.decode(encoding='cp949'))
            if len(pword_list) > 1:
                print(pword_list, len(pword_list))

            # For compound-noun handling, morphologically analyze the transcript and keep only the nouns
            nouns_list = tokenizer.get_noun_tokens(
                actual_data.decode(encoding='cp949'))
            for noun in nouns_list:
                print('Noun', noun)

            print("Time spent to analyse line: ", line_time.getelapsed())
            print('------------------------------------')
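
The receive loop above expects frames that start with an STX byte and carry cp949 text from offset 2; what byte 1 holds is not shown in these snippets. A minimal test sender under that assumption (the host, port, and zero filler byte are placeholders):

import socket

STX = 2

def send_line(host, port, text):
    # Frame layout assumed from the reader: [STX][1 unknown byte][cp949 payload]
    frame = bytes([STX, 0]) + text.encode('cp949')
    with socket.create_connection((host, port)) as conn:
        conn.sendall(frame)

if __name__ == '__main__':
    send_line('127.0.0.1', 5000, '테스트 문장입니다')  # placeholder host/port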
Example #8
import sys

sys.path.insert(1, '/Users/andrew/Documents/py_basic/util')
import timeutil

a = timeutil.TimeElapsed()
for i in range(10000):
    print(".", end='')
print()
print("elapsed....", a.getelapsed())

for i in range(2000):
    print(i, end='')
print()
print("elapsed2....", a.getelapsed())
Example #9
def main():
    # Load the banned words from the Excel file
    loader = dataloader.DataLoader()
    loader.load_data(
        '/Users/andrew/Documents/RPA_신세계TV쇼핑/SSG닷컴_RM_키워드(금칙어)_해제 완료.xlsx'
    )

    totaltime = timeutil.TimeElapsed()
    print('-----------------------------------------------')

    # Load the dictionary of words that caused GCV recognition errors
    gcv_error_data = {}
    try:
        with open('gcv_error_dic.json', encoding='utf-8') as json_file:
            gcv_error_data = json.load(json_file)
    except FileNotFoundError:
        print("No File exists...")
    print('-----------------------------------------------')

    # Recognize text in the product image file using GCV
    hndlr = textDetectionUsingGCV.OCRHandler()
    text_lists = hndlr.detect_text(
        '/Volumes/USB3-64/Image/1. 10160920 다나한 인삼잎 보윤 수분크림.jpg'
    )
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/2. 10160902 참존 마유 골든컴플렉스 2종 세트.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/3. 10164315  쁘띠프루티 울트라 모이스처라이징 페이셜 마스크팩.jpg')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/4. 10164318 쁘띠프루티 카밍 앤 브라이트 페이셜 마스크팩.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/5. 10164572 참존 인텐시브 골드 앰플.jpg')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약 퍼스트랩 프로바이오틱 마스크팩.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-1.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-2.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-3.png')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/7. 10167823 이오 에브리원 솝 코코넛레몬.jpg')
    # text_lists =  hndlr.detect_text('/Volumes/USB3-64/Image/8. 10060298 티레이저 기기 + 크림.jpg')
    print('-----------------------------------------------')

    words_found_list = []

    # Check whether each recognized text is in the banned-word list
    count = 1
    for curr_word in text_lists[1:]:
        # 1. Look up the single word
        result = loader.find_word(curr_word[DESC])
        if result == FOUND:
            print("금지어 발견 -->", curr_word[DESC], curr_word[VERTEX])
            words_found_list.append(curr_word)
            count = count + 1
            continue

        # Not found: try compound-word handling, but skip it when there is no next word
        if count + 1 >= len(text_lists):
            count = count + 1
            continue

        next_word = text_lists[count + 1]
        # If the single word misses, combine it with the adjacent word and
        # search; skip the combination when the words are not adjacent.
        front_p2_x = curr_word[VERTEX][P2][X]
        front_p2_y = curr_word[VERTEX][P2][Y]
        back_p1_x = next_word[VERTEX][P1][X]
        back_p1_y = next_word[VERTEX][P1][Y]
        if abs(front_p2_x - back_p1_x) > 10 or abs(front_p2_y - back_p1_y) > 10:
            count = count + 1
            continue

        # 2. Combine with the next word, joined by a space, and search
        combined_word = curr_word[DESC] + ' ' + next_word[DESC]
        result = loader.find_word(combined_word)
        if result == FOUND:
            print("복합 금지어 발견 1-->", combined_word)
            count = count + 1
            continue

        # 3. Combine with the next word without a space and search
        combined_word = curr_word[DESC] + next_word[DESC]
        result = loader.find_word(combined_word)
        if result == FOUND:
            print("복합 금지어 발견 2-->", combined_word)
            count = count + 1
            continue

        # 4. Look up a replacement noun for the combined word in the error
        #    dictionary and, if one exists, search with the replacement
        try:
            replaced_word = gcv_error_data[combined_word]
        except KeyError:
            replaced_word = ''

        if replaced_word != '':
            result = loader.search_in_dict(replaced_word)
            if result:
                print("대체어 금지어 발견 3-->", curr_word[DESC], curr_word[VERTEX])
                count = count + 1
                continue

        # 5. If the combined word is 4+ syllables long, split it into n-grams and search each
        grams = [2, 3, 4, 5]
        if len(combined_word) > 3:
            for gram in grams:
                start = 0
                end_pos = len(combined_word) + 1
                for end in range(gram, end_pos):
                    substr = combined_word[start:end]
                    start = start + 1
                    # print('NGRAM word:', substr)

                    result = loader.search_in_dict(substr)
                    if result == FOUND:
                        print("부분 금지어 발견 4-->", curr_word[DESC],
                              curr_word[VERTEX])
                        break

        count = count + 1

    print("Time spent  for 금지어 발견: ", totaltime.getelapsed())
    print('-----------------------------------------------')
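
The n-gram pass in step 5 slides windows of length 2 to 5 across the combined word; a standalone sketch of just that enumeration, independent of the loader and FOUND constants above:

def ngrams(word, sizes=(2, 3, 4, 5)):
    # Every contiguous substring of each requested length, as in step 5.
    found = []
    for n in sizes:
        for start in range(0, len(word) - n + 1):
            found.append(word[start:start + n])
    return found

if __name__ == '__main__':
    print(ngrams('마스크팩'))  # ['마스', '스크', '크팩', '마스크', '스크팩', '마스크팩']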
Example #10

# cap = cv2.VideoCapture('BotInsight_M7_L3.mp4')
# cap = cv2.VideoCapture('/Volumes/NO NAME/20191021 영상기술검수 에러반려 예시/video black/N2018010300026_기술검수반려 [M]제왕수산 갈치  뷰티_갈치조림 (수정).MXF')
cap = cv2.VideoCapture('/Volumes/NO NAME/20191021 영상기술검수 에러반려 예시/video black/N2018010800080_기술검수반려 [M]AHC 아이크림 시즌6_인포.MXF')
if not cap.isOpened():
    print("failed to open the video..")
    print("so, exit..")
    exit(0)


timer = timeutil.TimeElapsed()
count = 0

frametimer = timeutil.TimeElapsed()
framepos = 0.0
stop_flag = False
while True:
    if stop_flag:
        break

    # Set the frame position to read
    cap.set(cv2.CAP_PROP_POS_FRAMES, framepos)
    ret, frame = cap.read()
    if not ret:
        break