def create_documents_list(self, params, index):
    """Export every document matching ``params`` to a tagged text file.

    The first page is fetched with ``es.get_documents`` and, when the reported
    total exceeds one page, the remaining pages are fetched through the scroll
    id returned by the previous response.  For each hit one line per entry of
    ``FIELDS_DOCUMENTS`` is written as ``<FIELD>value`` followed by CRLF; the
    ``doc_id`` entry is written as ``<DOCID>`` using the hit's ``_id``.
    Fields missing from ``_source`` are written as the literal ``null``.

    The file ``self.file_path / self.file_name`` is only created when the
    search reports at least one hit.

    Args:
        params: Search parameters forwarded unchanged to ``es.get_documents``.
        index: Name of the index to search.
    """
    size = 10000  # documents requested per search / scroll call

    def write_hits(out, hits):
        """Write the configured fields of every hit to ``out``."""
        for hit in hits:
            source = hit["_source"]
            for field in FIELDS_DOCUMENTS:
                if field == 'doc_id':
                    # The id is hit metadata, not part of _source.
                    out.write("<DOCID>%s" % hit["_id"])
                else:
                    val = source[field] if field in source else "null"
                    out.write("<%s>%s" % (field, val))
                out.write("\r\n")

    # Start the search.
    result = es.get_documents(params, size, index, "")
    scroll_id = result["_scroll_id"] if "_scroll_id" in result else None

    try:
        total = result["hits"]["total"] if "hits" in result else 0
        if total > 0:
            out_path = os.path.join(self.file_path, self.file_name)
            # codecs.open keeps the explicit "\r\n" terminators untouched and
            # the with-block closes the file even if a page fails.
            with codecs.open(out_path, 'w', 'utf-8') as scdfile:
                write_hits(scdfile, result["hits"]["hits"])

                # More results than one page: keep scrolling.  Page 0 was
                # already written above, so start at 1.
                for _ in range(1, math.ceil(total / size)):
                    scrolled_result = es.get_documents(
                        params, size, index, scroll_id=scroll_id)
                    # Always continue from the most recently returned id;
                    # it is allowed to change between scroll requests.
                    if ("_scroll_id" in scrolled_result
                            and scrolled_result["_scroll_id"]):
                        scroll_id = scrolled_result["_scroll_id"]
                    write_hits(scdfile, scrolled_result["hits"]["hits"])
    finally:
        # Release the scroll context on every exit path, including the
        # single-page case and failures part-way through.
        if scroll_id:
            es.clear_scroll(scroll_id)
def create_documents_list(self, params, index):
    """Write every document matching ``params`` into the workbook.

    The number of matching documents is obtained from ``es.get_count``; the
    documents are then fetched page by page with ``es.get_list`` (passing the
    scroll id of the previous response) and each page is written to its own
    worksheet named ``원문(<start>~<end>)(<page number>)``.  Row 0 holds the
    titles from ``self.DOCUMENTS_FIELDS_KOREAN``; every following row holds
    the values of ``self.DOCUMENTS_FIELDS`` for one hit.  A field name may be
    a dotted path into nested objects; missing values are written as the
    literal ``null``.

    Nothing is added to the workbook when the count is zero.

    Args:
        params: Search parameters; ``start_date`` and ``end_date`` are also
            used (first 10 characters) to build the worksheet name.
        index: Name of the index to search.
    """
    size = 10000  # documents per scroll page, and therefore per worksheet

    def lookup(source, dotted_name):
        """Return the value at ``dotted_name`` in ``source`` or ``"null"``."""
        node = source
        try:
            for key in dotted_name.split("."):
                node = node[key]
        except (KeyError, TypeError, IndexError):
            # Missing key, or an intermediate value that is null / not a
            # mapping: report it the same way as an absent field.
            return "null"
        return node

    # A fresh query object is built for every request on purpose: whether the
    # es helpers modify the query they receive is not visible from here.
    total_count = es.get_count(
        "/" + index + "/doc/_count",
        self.queryObj.get_documents_query(params))
    self.logger.debug(
        "[ReportStatistics][create_documents_list] %s",
        self.queryObj.get_documents_query(params))

    if total_count <= 0:
        return

    date_range = "~".join(
        [params['start_date'][0:10], params['end_date'][0:10]])
    page_count = math.ceil(total_count / size)
    scroll_id = None

    try:
        for page in range(page_count):  # 0, 1, 2, ...
            # One worksheet per page, since the export is expected to be big.
            worksheet = self.workbook.add_worksheet(
                "원문(%s)(%d)" % (date_range, page + 1))

            scrolled_result = es.get_list(
                "/" + index + "/doc/_search",
                self.queryObj.get_documents_query(params),
                size,
                scroll_id)
            scroll_id = scrolled_result['_scroll_id']

            # Header row.
            for colidx, title in enumerate(self.DOCUMENTS_FIELDS_KOREAN):
                worksheet.write(0, colidx, title, self.header)

            # Data rows start right below the header.
            for row, hit in enumerate(scrolled_result["hits"]["hits"], 1):
                source = hit["_source"]
                for col, field in enumerate(self.DOCUMENTS_FIELDS):
                    worksheet.write(
                        row, col, lookup(source, field), self.default)
    finally:
        # Release the scroll context after the last page, and also when a
        # page fails part-way through.
        if scroll_id:
            es.clear_scroll(scroll_id)
def create_documents_list(self, params, index):
    """Write every document matching ``params`` into the workbook.

    The column set depends on the index name: indices whose name starts with
    ``documents`` use ``self.DOCUMENTS_FIELDS`` / ``DOCUMENTS_FIELDS_KOREAN``,
    all others use ``self.EMOTIONS_FIELDS`` / ``EMOTIONS_FIELDS_KOREAN``.

    The number of matching documents is obtained from
    ``es.get_documents_count``; the documents are then fetched page by page
    with ``es.get_documents`` (passing the scroll id of the previous
    response) and each page is written to its own worksheet named
    ``원문(<start>~<end>)(<page number>)``.  Row 0 holds the Korean titles,
    every following row one hit.  A field name may be a dotted path into
    nested objects; missing values are written as the literal ``null``.

    Nothing is added to the workbook when the count is zero.

    Args:
        params: Search parameters; ``start_date`` and ``end_date`` are also
            used (first 10 characters) to build the worksheet name.
        index: Name of the index to search.
    """
    size = 10000  # documents per scroll page, and therefore per worksheet

    is_documents_index = index.startswith('documents')
    if is_documents_index:
        output_fields_korean = self.DOCUMENTS_FIELDS_KOREAN
        output_fields = self.DOCUMENTS_FIELDS
    else:
        output_fields_korean = self.EMOTIONS_FIELDS_KOREAN
        output_fields = self.EMOTIONS_FIELDS

    def lookup(source, dotted_name):
        """Return the value at ``dotted_name`` in ``source`` or ``"null"``."""
        node = source
        try:
            for key in dotted_name.split("."):
                node = node[key]
        except (KeyError, TypeError, IndexError):
            # Missing key, or an intermediate value that is null / not a
            # mapping: report it the same way as an absent field.
            return "null"
        return node

    # Start the search by asking how many documents match.
    total_count = es.get_documents_count(params, index)
    if total_count <= 0:
        return

    date_range = "~".join(
        [params['start_date'][0:10], params['end_date'][0:10]])
    page_count = math.ceil(total_count / size)
    scroll_id = None

    try:
        for page in range(page_count):  # 0, 1, 2, ...
            # One worksheet per page, since the export is expected to be big.
            worksheet = self.workbook.add_worksheet(
                "원문(%s)(%d)" % (date_range, page + 1))

            scrolled_result = es.get_documents(params, size, index, scroll_id)
            scroll_id = scrolled_result['_scroll_id']

            # Header row.
            for colidx, title in enumerate(output_fields_korean):
                worksheet.write(0, colidx, title, self.header)

            # Data rows start right below the header.
            for row, hit in enumerate(scrolled_result["hits"]["hits"], 1):
                source = hit["_source"]
                for col, field in enumerate(output_fields):
                    worksheet.write(
                        row, col, lookup(source, field), self.default)
    finally:
        # Release the scroll context after the last page, and also when a
        # page fails part-way through.
        if scroll_id:
            es.clear_scroll(scroll_id)
def create_documents_list(self, params, index):
    """Write every document matching ``params`` into the workbook.

    The number of matching documents is obtained from ``es.get_count``; the
    documents are then fetched page by page with ``es.get_list`` (passing the
    scroll id of the previous response) and each page is written to its own
    worksheet named ``원문(<start>~<end>)(<page number>)``.  Row 0 holds the
    titles from ``self.DOCUMENTS_FIELDS_KOREAN``; every following row holds
    the values of ``self.DOCUMENTS_FIELDS`` for one hit.  A field name may be
    a dotted path into nested objects; missing values are written as the
    literal ``null``.

    Nothing is added to the workbook when the count is zero.

    Args:
        params: Search parameters; ``start_date`` and ``end_date`` are also
            used (first 10 characters) to build the worksheet name.
        index: Name of the index to search.
    """
    size = 10000  # documents per scroll page, and therefore per worksheet

    def lookup(source, dotted_name):
        """Return the value at ``dotted_name`` in ``source`` or ``"null"``."""
        node = source
        try:
            for key in dotted_name.split("."):
                node = node[key]
        except (KeyError, TypeError, IndexError):
            # Missing key, or an intermediate value that is null / not a
            # mapping: report it the same way as an absent field.
            return "null"
        return node

    # A fresh query object is built for every request on purpose: whether the
    # es helpers modify the query they receive is not visible from here.
    total_count = es.get_count(
        "/" + index + "/doc/_count",
        self.queryObj.get_documents_query(params))
    self.logger.debug(
        "[ReportStatistics][create_documents_list] %s",
        self.queryObj.get_documents_query(params))

    if total_count <= 0:
        return

    date_range = "~".join(
        [params['start_date'][0:10], params['end_date'][0:10]])
    page_count = math.ceil(total_count / size)
    scroll_id = None

    try:
        for page in range(page_count):  # 0, 1, 2, ...
            # One worksheet per page, since the export is expected to be big.
            worksheet = self.workbook.add_worksheet(
                "원문(%s)(%d)" % (date_range, page + 1))

            scrolled_result = es.get_list(
                "/" + index + "/doc/_search",
                self.queryObj.get_documents_query(params),
                size,
                scroll_id)
            scroll_id = scrolled_result['_scroll_id']

            # Header row.
            for colidx, title in enumerate(self.DOCUMENTS_FIELDS_KOREAN):
                worksheet.write(0, colidx, title, self.header)

            # Data rows start right below the header.
            for row, hit in enumerate(scrolled_result["hits"]["hits"], 1):
                source = hit["_source"]
                for col, field in enumerate(self.DOCUMENTS_FIELDS):
                    worksheet.write(
                        row, col, lookup(source, field), self.default)
    finally:
        # Release the scroll context after the last page, and also when a
        # page fails part-way through.
        if scroll_id:
            es.clear_scroll(scroll_id)
def create_documents_list(self, params, index):
    """Write every document matching ``params`` into the workbook.

    The first page is fetched with ``es.get_documents`` and written to the
    worksheet ``원문(<start>~<end>)(0)``; when the reported total exceeds one
    page, the remaining pages are fetched through the scroll id and each is
    written to its own worksheet numbered 1, 2, ...  Every worksheet has a
    header row followed by one row per document with id, date, writer, URL,
    title, content, three channel names and the relevance score.

    The characters ``+ = - /`` are removed from writer, title and content.
    Documents whose cleaned title or content matches one of the project's
    ``regex_filter_keywords`` (comma separated, case-insensitive) are left
    out of the sheet.

    Args:
        params: Search parameters; ``project_seq`` selects the project filter
            keywords, ``start_date`` / ``end_date`` (first 10 characters)
            build the worksheet name.
        index: Name of the index to search.
    """
    size = 10000  # documents requested per search / scroll call
    header_titles = ('ID', '게시일', '작성자', 'URL', '제목', '내용',
                     '채널1', '채널2', '채널3', '정확도')
    # Raw string: same pattern as before without invalid escape sequences.
    strip_chars = re.compile(r"[\+=\-/]")

    # Load the patterns that exclude a document when found in title/content.
    project_filter_keywords = db.get_project_filter_keywords(
        params['project_seq'])
    exclude_patterns = None
    if (project_filter_keywords
            and 'regex_filter_keywords' in project_filter_keywords):
        raw_keywords = project_filter_keywords['regex_filter_keywords'] or ""
        # Drop empty alternatives: an empty keyword list or a stray comma
        # would otherwise yield a pattern that matches every document.
        keywords = [kw for kw in raw_keywords.strip().split(",") if kw]
        if keywords:
            exclude_patterns = re.compile(
                "(?i)(" + "|".join(keywords) + ")")

    date_range = "~".join(
        [params['start_date'][0:10], params['end_date'][0:10]])

    def new_sheet(page):
        """Add the worksheet for ``page`` and write its header row."""
        sheet = self.workbook.add_worksheet(
            "원문(%s)(%d)" % (date_range, page))
        for col, title in enumerate(header_titles):
            sheet.write(0, col, title, self.header)
        return sheet

    def write_hits(sheet, hits):
        """Write the non-excluded hits to ``sheet`` starting at row 1."""
        row = 0  # restarted per sheet so data always follows the header
        for hit in hits:
            source = hit["_source"]
            doc_writer = strip_chars.sub("", str(source["doc_writer"]))
            doc_title = strip_chars.sub("", str(source["doc_title"]))
            doc_content = strip_chars.sub("", str(source["doc_content"]))

            # 2018.04.05: skip rows whose title or content contains one of
            # the configured patterns.
            if exclude_patterns is not None and (
                    exclude_patterns.search(doc_title) is not None
                    or exclude_patterns.search(doc_content) is not None):
                continue

            row += 1
            values = (
                hit["_id"],
                source["doc_datetime"],
                doc_writer,
                source["doc_url"],
                doc_title,
                doc_content,
                source["depth1_nm"],
                source["depth2_nm"],
                source["depth3_nm"],
                hit["_score"],  # relevance score column
            )
            for col, value in enumerate(values):
                sheet.write(row, col, value, self.default)

    # Start the search.
    result = es.get_documents(params, size, index, "")
    scroll_id = result["_scroll_id"] if "_scroll_id" in result else None

    try:
        total = result["hits"]["total"] if "hits" in result else 0

        # The first sheet is created even when nothing matched.
        worksheet = new_sheet(0)
        logger.info("<%s> Total Documents : %d", self.dataset_names, total)

        if total > 0:
            write_hits(worksheet, result["hits"]["hits"])

            # More results than one page: keep scrolling, one new worksheet
            # per page because the export is expected to be large.
            for page in range(1, math.ceil(total / size)):
                worksheet = new_sheet(page)
                scrolled_result = es.get_documents(
                    params, size, index, scroll_id=scroll_id)
                # Always continue from the most recently returned id; it is
                # allowed to change between scroll requests.
                if ("_scroll_id" in scrolled_result
                        and scrolled_result["_scroll_id"]):
                    scroll_id = scrolled_result["_scroll_id"]
                write_hits(worksheet, scrolled_result["hits"]["hits"])
    finally:
        # Release the scroll context on every exit path, including the
        # single-page case and failures part-way through.
        if scroll_id:
            es.clear_scroll(scroll_id)