def detect_text(self, file):
    """Detects text in the file and returns a list of [description, vertices] pairs."""
    ocr_time = timeutil.TimeElapsed()
    words_list = []
    client = vision.ImageAnnotatorClient()

    # [START vision_python_migration_text_detection]
    with io.open(file, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations

    for text in texts:
        # Method 1: convert the vertex info to a plain list and return it.
        vertices = [[int(vertex.x), int(vertex.y)]
                    for vertex in text.bounding_poly.vertices]
        words_list.append([text.description, vertices])
        # Method 2 (alternative): return the generator object directly;
        # extra handling is then needed when consuming it.
        # vertices = ('[{},{}]'.format(int(vertex.x), int(vertex.y))
        #             for vertex in text.bounding_poly.vertices)

    print("Time spent for OCR recognition", ocr_time.getelapsed())
    return words_list
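# --- Usage sketch (illustrative, not part of the original source). A minimal
# driver for detect_text() above, assuming it lives on the OCRHandler class
# that the main script instantiates later; the image path is a placeholder.
handler = OCRHandler()
for desc, vertices in handler.detect_text('sample_product_image.jpg'):
    # Each entry pairs the recognized text with its four [x, y] corners.
    print(desc, vertices)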
def load_data(self, filename):
    self.filename = filename
    loadingtime = timeutil.TimeElapsed()

    with open(self.filename, 'r', encoding='utf8') as words_f:
        # Reading from a file here; in a real deployment the source may differ.
        for line in words_f:
            data = line.replace('\n', '')
            if len(data) == 0:
                continue

            data = data.replace('\t', ' ')
            wordslist = data.split(' ')
            print(wordslist)

            for word in wordslist:
                DataLoader.wordsset.add(word)

    print("Time spent for loading", loadingtime.getelapsed())
    print("Words count", len(DataLoader.wordsset))
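# --- Usage sketch (illustrative). Loading a plain-text word file and probing
# the class-level set, assuming DataLoader exposes `wordsset` as used above;
# the file name is a placeholder.
loader = DataLoader()
loader.load_data('banned_words.txt')
print('example' in DataLoader.wordsset)   # O(1) membership test on the set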
def detect_text(path):
    """Detects text in the file."""
    ocrtimer = timeutil.TimeElapsed()
    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations
    print('Type(texts):', type(texts))
    print('Texts:', texts)
    print('--------------------')
    print("\nOCR elapsed: ", ocrtimer.getelapsed())
    print('\nAll Data:')

    # ---- List every word together with its position.
    count = 0
    for text in texts:
        newtext = text.description.replace('\n', ' ')
        print('{:>2} Desc:[{:>20s}],'.format(count, newtext), end='')
        vertices = ['({},{})'.format(vertex.x, vertex.y)
                    for vertex in text.bounding_poly.vertices]
        print('bounds: {}'.format(','.join(vertices)))
        count = count + 1

    # ---- Fetch the position of a specific word.
    print('--------------------')
    for text in texts:
        if text.description == '당도':
            for vertex in text.bounding_poly.vertices:
                print('Found....', vertex)
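# --- Hedged addition (not in the original source). The Vision response
# carries an `error` field that stays empty on success; a sketch of the
# standard check one could call right after client.text_detection(...)
# to surface API-level failures instead of iterating an empty result:
def raise_on_vision_error(response):
    # `response` is the AnnotateImageResponse returned by text_detection().
    if response.error.message:
        raise RuntimeError('Vision API error: ' + response.error.message)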
import sys

sys.path.insert(1, '/Users/andrew/Documents/py_basic/util')
import timeutil


def loadDataToSet(data):
    # `data` is placeholder input for now; the set contents are hard-coded.
    dataset = set()
    dataset.add('123')
    dataset.add('abc')
    return dataset


if __name__ == '__main__':
    loadtimer = timeutil.TimeElapsed()
    dataset = loadDataToSet('copy2clip is working')
    print("loading....", loadtimer.getelapsed())

    count = 0
    testdata = ['abc', 'def']
    for word in testdata:
        if word in dataset:
            count = count + 1
    print("searching....", loadtimer.getelapsed(), ",", count)

'''
Option 1: build the set from the words and postpositions completed from the DB,
          then look each OCR-recognized word up in the set.
Option 2: build the set from the OCR-recognized words
'''
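# --- Hedged sketch of the two options above, with hard-coded stand-in data
# (not in the original source).
# Option 1: pre-build the set from the banned-word DB, probe with OCR words.
banned_set = {'banned1', 'banned2'}            # would come from the DB
ocr_words = ['abc', 'banned1']                 # would come from OCR
hits = [w for w in ocr_words if w in banned_set]

# Option 2: build the set from the OCR output, probe with each banned word.
ocr_set = set(ocr_words)
hits2 = [w for w in banned_set if w in ocr_set]

# Either way each probe is an O(1) set lookup; the choice mainly decides
# which side is built once and which side drives the loop.
print(hits, hits2)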
def load_data(self, fname):
    self.filename = fname
    loadingtime = timeutil.TimeElapsed()

    num_not_necessary = 0
    name_of_sheet = '05.31'
    # df = pd.read_excel(filename, sheet_name='Sheet1')
    df = pd.read_excel(self.filename, sheet_name=name_of_sheet,
                       header=num_not_necessary)

    count = 0
    # col_lists = ['공통', '공통\n(실증필요단어)', '의류', '속옷', '패션잡화', '가구',
    #              '침구/침장', '가전', '스포츠/레저', '식품', '주방용품',
    #              '이미용\n(실증필요단어)', '생활/잡화', '유아동용품', '문화/서비스',
    #              '보석/장신구', '건강', '이미용']
    col_lists = df.columns

    if num_not_necessary is not None:
        max_index = len(df) - num_not_necessary
    else:
        max_index = len(df)

    for col in col_lists:
        print('-------------', col, '------------------')
        for i in range(0, max_index):
            cell_value = df.at[i, col]
            if str(cell_value).replace(' ', '') == 'nan':
                continue

            # Normalize the value (needed especially for numeric cells).
            if cell_value == 1:
                cell_value = '100%'

            print('{:>3d} {}'.format(count, cell_value))
            DataLoader.wordsset.add(cell_value)
            count = count + 1
            count = self.add_word_with_postposition(cell_value, count)

    print("Time spent loading banned words", loadingtime.getelapsed())
    print("Words count", len(DataLoader.wordsset))
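# --- Hypothetical sketch: add_word_with_postposition() is called above but
# not shown. Presumably it registers word+particle variants so that e.g.
# '무료배송도' still matches '무료배송'. The particle list and body below are
# assumptions, written as the DataLoader method the call site implies.
POSTPOSITIONS = ['은', '는', '이', '가', '을', '를', '도']   # assumed list

def add_word_with_postposition(self, word, count):
    # Add each word+postposition variant to the shared banned-word set.
    for p in POSTPOSITIONS:
        DataLoader.wordsset.add(str(word) + p)
        count += 1
    return count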
if __name__ == '__main__':
    wholetimer = timeutil.TimeElapsed()
    # detect_text('/Volumes/USB3-64/Image/10167389_1.jpg')
    detect_text(
        '/Volumes/USB3-64/Image/3. 10164315 쁘띠프루티 울트라 모이스처라이징 페이셜 마스크팩.jpg'
    )
    print("\n\nTotal elapsed: ", wholetimer.getelapsed())
def main():
    global loader
    STX = 2

    # Load the banned words from the Excel file.
    loader = dataloader.DataLoader()
    loader.load_data('/Users/andrew/Downloads/pWords.xlsx')

    # Load the morphological analyser.
    tokenizer = analyser.Analyser()

    # Load the TCP/IP configuration.
    global tcp_config
    global conn
    global interval
    try:
        with open('tcp_config.json', encoding='utf-8') as json_file:
            tcp_config = json.load(json_file)
    except FileNotFoundError:
        print("No file exists...")
        exit('socket configuration exception')

    host = tcp_config['hostname']
    port = tcp_config['port']
    interval = tcp_config['interval']

    error_cnt = 0
    while True:
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            conn.connect((host, port))
        except ConnectionRefusedError:
            conn.close()
            time.sleep(10)
            continue

        while True:
            try:
                content = conn.recv(200)
            except socket.error as e:
                print("error while receiving :: " + str(e), e.errno)
                if e.errno == errno.EPIPE:
                    conn.close()
                    break
                else:
                    raise
            except Exception as e:
                print("error 2 while receiving ::", e)
                break

            line_time = timeutil.TimeElapsed()
            if len(content) == 0:
                error_cnt += 1
                if error_cnt > 3:
                    error_cnt = 0
                    break
                continue  # nothing to parse; avoid indexing empty data

            if content[0] != STX:
                print('wrong data from the server..')
                continue

            actual_data = content[2:]
            print('[' + actual_data.decode(encoding='cp949') + ']')

            # Normal logic: check each word of the received sentence
            # against the banned-word set.
            pword_list = check_pword(actual_data.decode(encoding='cp949'))
            if len(pword_list) > 1:
                print(pword_list, len(pword_list))

            # For compound-noun handling, run the transcript through the
            # morphological analyser and keep only the nouns.
            nouns_list = tokenizer.get_noun_tokens(
                actual_data.decode(encoding='cp949'))
            for noun in nouns_list:
                print('Noun', noun)

            print("Time spent to analyse line: ", line_time.getelapsed())
            print('------------------------------------')
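# --- Hedged test peer (not in the original source). A minimal server that
# frames messages the way the receive loop above expects: byte 0 is STX
# (0x02), byte 1 is skipped by the receiver (treated here as a reserved
# byte, an assumption), and the payload from byte 2 on is cp949-encoded.
# Host and port are placeholders.
import socket

def send_line(sock, text):
    frame = bytes([2, 0]) + text.encode('cp949')   # [STX][reserved][payload]
    sock.sendall(frame)

if __name__ == '__main__':
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(('127.0.0.1', 9999))
    srv.listen(1)
    peer, _ = srv.accept()
    send_line(peer, 'test sentence with a banned word')
    peer.close()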
import sys

# sys.path.append('/Users/andrew/Documents/py_basic/util')
sys.path.insert(1, '/Users/andrew/Documents/py_basic/util')
import timeutil

a = timeutil.TimeElapsed()

for i in range(10000):
    print(".", end='')
print()
print("elapsed....", a.getelapsed())

for i in range(2000):
    print(i, end='')
print()
print("elapsed2....", a.getelapsed())
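# --- Hypothetical reconstruction: timeutil is a local helper not shown in
# this snapshot. A sketch consistent with how TimeElapsed is called above
# (construction starts the clock, getelapsed() returns seconds since then).
import time

class TimeElapsed:
    def __init__(self):
        self._start = time.time()

    def getelapsed(self):
        # Seconds elapsed since this object was created.
        return time.time() - self._start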
def main():
    # Load the banned words from the Excel file.
    loader = dataloader.DataLoader()
    loader.load_data(
        '/Users/andrew/Documents/RPA_신세계TV쇼핑/SSG닷컴_RM_키워드(금칙어)_해제 완료.xlsx'
    )

    totaltime = timeutil.TimeElapsed()
    print('-----------------------------------------------')

    # Load the dictionary of words that GCV is known to misrecognize.
    gcv_error_data = {}
    try:
        with open('gcv_error_dic.json', encoding='utf-8') as json_file:
            gcv_error_data = json.load(json_file)
    except FileNotFoundError:
        print("No file exists...")
    print('-----------------------------------------------')

    # Recognize text in the product image file using GCV.
    hndlr = textDetectionUsingGCV.OCRHandler()
    text_lists = hndlr.detect_text(
        '/Volumes/USB3-64/Image/1. 10160920 다나한 인삼잎 보윤 수분크림.jpg'
    )
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/2. 10160902 참존 마유 골든컴플렉스 2종 세트.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/3. 10164315 쁘띠프루티 울트라 모이스처라이징 페이셜 마스크팩.jpg')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/4. 10164318 쁘띠프루티 카밍 앤 브라이트 페이셜 마스크팩.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/5. 10164572 참존 인텐시브 골드 앰플.jpg')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약 퍼스트랩 프로바이오틱 마스크팩.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-1.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-2.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/6. 10166107 일동제약-3.png')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/7. 10167823 이오 에브리원 솝 코코넛레몬.jpg')
    # text_lists = hndlr.detect_text('/Volumes/USB3-64/Image/8. 10060298 티레이저 기기 + 크림.jpg')
    print('-----------------------------------------------')

    words_found_list = []

    # Check each recognized text against the banned-word list.
    count = 1
    for curr_word in text_lists[1:]:
        # 1. Look the single word up.
        result = loader.find_word(curr_word[DESC])
        if result == FOUND:
            print("Banned word found -->", curr_word[DESC], curr_word[VERTEX])
            words_found_list.append(curr_word)
            count = count + 1
            continue

        # Not found: fall through to compound handling, but stop here
        # if there is no next word.
        if count + 1 >= len(text_lists):
            count = count + 1
            continue
        next_word = text_lists[count + 1]

        # Combine with the neighbouring word only when the two bounding
        # boxes are actually adjacent; otherwise skip compound handling.
        front_p2_x = curr_word[VERTEX][P2][X]
        front_p2_y = curr_word[VERTEX][P2][Y]
        back_p1_x = next_word[VERTEX][P1][X]
        back_p1_y = next_word[VERTEX][P1][Y]
        if abs(front_p2_x - back_p1_x) > 10 or abs(front_p2_y - back_p1_y) > 10:
            count = count + 1
            continue

        # 2. Combine with the next word, joined by a space, and search.
        combined_word = curr_word[DESC] + ' ' + next_word[DESC]
        result = loader.find_word(combined_word)
        if result == FOUND:
            print("Compound banned word found 1 -->", combined_word)
            count = count + 1
            continue

        # 3. Combine with the next word, without a space, and search.
        combined_word = curr_word[DESC] + next_word[DESC]
        result = loader.find_word(combined_word)
        if result == FOUND:
            print("Compound banned word found 2 -->", combined_word)
            count = count + 1
            continue

        # 4. Look the combined word up in the error dictionary; if a
        #    replacement noun exists, search with the replacement instead.
        try:
            replaced_word = gcv_error_data[combined_word]
            # print('replacement: [', combined_word, '->', replaced_word, ']')
        except KeyError:
            replaced_word = ''
        if replaced_word != '':
            result = loader.search_in_dict(replaced_word)
            if result:
                print("Replacement banned word found 3 -->",
                      curr_word[DESC], curr_word[VERTEX])
                count = count + 1
                continue
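        # --- Note (added): DESC, VERTEX, P1, P2, X, Y and FOUND are used
        # above but defined elsewhere in this module. Hypothetical values
        # consistent with the [description, [[x, y] * 4]] entries produced
        # by detect_text() would be:
        #     DESC, VERTEX = 0, 1      # indices into each word entry
        #     P1, P2 = 0, 1            # top-left / top-right corners
        #     X, Y = 0, 1              # coordinate indices within a vertex
        #     FOUND = True             # assumed return flag of find_word()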
        # 5. If the combined word is four or more characters (syllables)
        #    long, split it into n-grams and search each one.
        grams = [2, 3, 4, 5]
        if len(combined_word) > 3:
            for gram in grams:
                start = 0
                end_pos = len(combined_word) + 1
                for end in range(gram, end_pos):
                    substr = combined_word[start:end]
                    start = start + 1
                    result = loader.search_in_dict(substr)
                    if result == FOUND:
                        print("Partial banned word found 4 -->",
                              curr_word[DESC], curr_word[VERTEX])
                        break

        count = count + 1

    print("Time spent finding banned words: ", totaltime.getelapsed())
    print('-----------------------------------------------')
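# --- Standalone illustration (not in the original source). The step-5
# sliding window above in isolation: all contiguous substrings of length
# `gram`, exactly as the inner loop extracts them.
def ngrams(word, gram):
    return [word[i:i + gram] for i in range(len(word) - gram + 1)]

print(ngrams('미백크림', 2))   # ['미백', '백크', '크림']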
# print('length : ', length)
# print('width : ', width)
# print('height : ', height)
# print('fps : ', fps)

# cap = cv2.VideoCapture('BotInsight_M7_L3.mp4')
# cap = cv2.VideoCapture('/Volumes/NO NAME/20191021 영상기술검수 에러반려 예시/video black/N2018010300026_기술검수반려 [M]제왕수산 갈치 뷰티_갈치조림 (수정).MXF')
cap = cv2.VideoCapture('/Volumes/NO NAME/20191021 영상기술검수 에러반려 예시/video black/N2018010800080_기술검수반려 [M]AHC 아이크림 시즌6_인포.MXF')

if not cap.isOpened():
    print("failed to open the video..")
    print("so, exit..")
    exit(0)

timer = timeutil.TimeElapsed()
count = 0
frametimer = timeutil.TimeElapsed()
framepos = 0.0
stop_flag = False

while True:
    if stop_flag:
        break

    # Seek to the frame position to read.
    cap.set(cv2.CAP_PROP_POS_FRAMES, framepos)
    ret, frame = cap.read()
    if not ret:
        break
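# --- Hedged sketch (not in the original source). The 'video black'
# directory name suggests black-frame detection; one plausible per-frame
# check, not necessarily the author's method: a frame whose mean intensity
# is near zero is treated as black. The threshold is an assumption.
import numpy as np

def is_black_frame(frame, threshold=10.0):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    return float(np.mean(gray)) < threshold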