def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)
    if not scroll_id:
        # First page: open a scroll context that stays alive for one day.
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = queryObj.get_documents_query()
        request['size'] = size
    else:
        # Subsequent pages: continue the existing scroll.
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    logger.debug("get_documents() ==> request : ")
    for k, v in request.items():
        logger.debug("\t{} : {}".format(k, v))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d"
                     % int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))
    return result
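# Minimal usage sketch for the scroll flow above. This helper is hypothetical
# (not part of the original module) and assumes es_ip / es_port and Query are
# configured as in the surrounding code.
def fetch_all_documents(params, index, page_size=1000):
    """Hypothetical helper: pages through every hit with the scroll API."""
    page = get_documents(params, page_size, index)
    docs = list(page["hits"]["hits"])
    while True:
        page = get_documents(params, page_size, index,
                             scroll_id=page["_scroll_id"])
        if not page["hits"]["hits"]:
            break
        docs.extend(page["hits"]["hits"])
    return docs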
def __init__(self, params):
    self.compare = params['compare_yn'] == 'Y'
    self.start_date = re.sub(r"[-:\s]", "", params['start_date'])[:8]
    self.end_date = re.sub(r"[-:\s]", "", params['end_date'])[:8]
    self.seq = params['seq']
    self.reg_dt = re.sub(r"[-:\s]", "", params['reg_dt'])
    self.report_type = db.get_exceltype_name(params['type_cd'])    # e.g. 'RSP' -> '리포트_소셜모니터링_추이분석'
    self.project_name = db.get_project_name(params['project_seq'])
    self.channel = '전체' if not params['channels'] or params['channels'] == 'all' else "채널일부"
    # e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
    self.dataset_names = ",".join(
        [db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
         for x in params['datasets'].split("^")]) if params['datasets'] else ''
    # On Windows, replace characters that are not allowed in file names.
    # (re.search, not re.match: the invalid character can appear anywhere.)
    if os.name == 'nt' and re.search(r"[\/\\\"*?<>\|]", self.dataset_names):
        self.dataset_names = re.sub(r"[\/\\\"*?<>\|]", "_", self.dataset_names)
    self.queryObj = Query()

    compare_yn = "동일기간비교" if params['compare_yn'] == 'Y' else "해당기간"
    if not params['datasets']:    # search-trend report (검색트렌드)
        self.file_name = "_".join([str(self.seq), self.report_type,
                                   self.start_date, self.end_date, compare_yn]) + ".xlsx"
    else:                         # social monitoring report (소셜모니터링)
        if len(params['datasets'].split("^")) > 1:
            self.file_name = "_".join([str(self.seq), self.report_type, self.channel,
                                       self.start_date, self.end_date, compare_yn]) + ".xlsx"
        else:
            self.file_name = "_".join([str(self.seq),
                                       self.report_type + "(" + self.dataset_names + ")",
                                       self.channel, self.start_date, self.end_date,
                                       compare_yn]) + ".xlsx"

    self.logger.info("=======================================================================================")
    for k, v in params.items():
        self.logger.info(k + " :\t\t" + str(v))
    self.logger.info("=======================================================================================")
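# Illustrative example (assuming the db lookups resolve as in the comments
# above): params with seq=42, type_cd='RSP', compare_yn='N', channels='all',
# datasets='6', start_date='2018-01-01 00:00:00', end_date='2018-12-31 23:59:59'
# would produce a file name like:
#   "42_리포트_소셜모니터링_추이분석(신라면)_전체_20180101_20181231_해당기간.xlsx"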
def get_documents_count(params, index):
    queryObj = Query(params)
    es_uri = "/" + index + "/doc/_count"
    request = queryObj.get_documents_query()

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'count' in result:
        return result['count']
    logger.error("[get_documents_count] %s" % str(result))
    return -1
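# Usage sketch (hedged): counts documents matching the same Query body that
# get_documents() pages through; -1 signals an error response.
#
#   count = get_documents_count(params, "documents")
#   if count < 0:
#       logger.error("count query failed")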
def get_request_query(params, scroll_id=None):
    queryObj = Query(params)
    if not scroll_id:
        request = {"query": {"bool": {}}}
    else:
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence filter.
        filters.append(queryObj.get_project_seq_query())
        # Even when several project seqs come in, they all share the same
        # filter keyword, so only the first project_seq is used.
        filters.append(queryObj.get_project_filter_query(
            params['project_seqs'].split(",")[0]))
        # Target channels
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())
        # Target period
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query(params['mode']))
        request["query"]["bool"]["filter"] = filters
        request["query"]["bool"]["must"] = queryObj.get_total_dataset_query(
            params['project_seqs'])

    logger.debug("[get_request_query] Query >>> %s " % json.dumps(request))
    return request
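# Illustrative (hedged): the exact sub-clauses come from the Query helper
# methods, which are not shown here, but the assembled request body has
# roughly this shape:
# {
#   "query": {
#     "bool": {
#       "filter": [ {<project_seq term>}, {<project filter>},
#                   {<channel filter>}, {<date range>} ],
#       "must":   {<dataset keyword query>}
#     }
#   }
# }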
def __init__(self, params):
    self.mode = params['mode']
    self.compare = params['compare_yn'] == 'Y'
    self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
    self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
    self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])
    # e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
    self.dataset_names = ",".join(
        [db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
         for x in str(params['datasets']).split("^")]) if params['datasets'] else ''
    self.query = Query(params)

    if self.mode == MODE_DOCUMENTS:
        self.file_name = "_".join(["SNS", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
    elif self.mode == MODE_TOPICS:
        self.file_name = "_".join(["화제어", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
    elif self.mode == MODE_EMOTIONS:
        self.file_name = "_".join(["감성분석", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
    elif self.mode == MODE_TREND:
        self.file_name = "_".join(["연관검색어", str(params['project_seq']), self.start_date, self.end_date]) + ".xlsx"
def __init__(self, params):
    self.seq = params['seq']
    self.compare = params['compare_yn'] == 'Y'
    self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
    self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
    self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])
    # e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
    self.dataset_names = ",".join(
        [db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
         for x in str(params['datasets']).split("^")]) if params['datasets'] else ''
    self.query = Query(params)
    self.file_name = "B-%d-%s-I-C.SCD" % (self.seq, get_current_datetime())
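# Illustrative (hedged): assuming get_current_datetime() returns a compact
# timestamp such as "20180101120000", seq 42 yields the SCD file name
#   "B-42-20180101120000-I-C.SCD"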
def get_request_query(params, scroll_id=None):
    queryObj = Query(params)
    if not scroll_id:
        request = {"query": {"bool": {}}}
    else:
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence filter.
        filters.append(queryObj.get_project_seq_query())
        # Even when several project seqs come in, they all share the same
        # filter keyword, so only the first project_seq is used.
        filters.append(queryObj.get_project_filter_query(
            params['project_seqs'].split(",")[0]))
        # Target channels
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())
        # Target period
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query(params['mode']))
        request["query"]["bool"]["filter"] = filters
        request["query"]["bool"]["must"] = queryObj.get_total_dataset_query(
            params['project_seqs'])
        # Kept for reference: an example of overriding the must clause with a
        # hand-written query_string search.
        # request["query"]["bool"]["must"] = {
        #     "bool": {
        #         "should": [{
        #             "query_string": {
        #                 "fields": ["doc_title^100", "doc_content"],
        #                 "query": "신한은행",
        #                 "default_operator": "AND",
        #                 "tie_breaker": 0.0
        #             }
        #         }]
        #     }
        # }

    logger.debug("[get_request_query] Query >>> %s " % json.dumps(request))
    return request
def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)
    if not scroll_id:
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = {"size": size, "query": {"bool": {}}}
    else:
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence filter.
        filters.append(queryObj.get_project_seq_query())
        filters.append(queryObj.get_project_filter_query(params['project_seq']))
        # Target channels
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())
        # Target period
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query())
        request["query"]["bool"]["filter"] = filters
        # Keywords included in the dataset
        if "datasets" in params and params["datasets"]:
            request["query"]["bool"]["must"] = queryObj.get_dataset_query(
                params['project_seq'], params["datasets"])

    logger.debug("[get_documents] Query >>> %s " % json.dumps(request))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d"
                     % int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))
    return result
if __name__ == '__main__':
    params = {
        "start_date": "2018-01-01T00:00:00",
        "end_date": "2018-12-31T23:59:59",
        "project_seq": 176,
        "compare_yn": "N",
        "channels": "all",
        "datasets": "2852"
    }
    queryObj = Query(params)
    # print(queryObj.ALL_TOPICS_LIST("신한금융지주"))
    print(get_documents(params, 10, "documents"))
class ReportKDICDocuments:
    mode = ""
    seq = -1
    reg_dt = ""
    report_day = ""
    report_time = ""
    report_type = ""
    project_name = ""
    channel = ""
    start_date = ""
    end_date = ""
    dataset_names = ""
    query = None
    compare = ''
    save_path = ""
    file_name = ""
    file_path = ""

    # BASE_EXCEL_DIRECTORY = '/data/dmap-data/dmap-excel'
    conf = Config()
    BASE_EXCEL_DIRECTORY = conf.get_report_home()

    HEADER_FORMAT = {
        'bold': True,
        'font_size': 9,
        'bg_color': '#F2F2F2',
        'align': 'center',
        'border': 1
    }
    DEFAULT_FORMAT = {
        'font_size': 9,
        'border': 1
    }

    def __init__(self, params):
        self.mode = params['mode']
        self.compare = params['compare_yn'] == 'Y'
        self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
        self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
        self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])
        # e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
        self.dataset_names = ",".join(
            [db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
             for x in str(params['datasets']).split("^")]) if params['datasets'] else ''
        self.query = Query(params)

        if self.mode == MODE_DOCUMENTS:
            self.file_name = "_".join(["SNS", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TOPICS:
            self.file_name = "_".join(["화제어", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_EMOTIONS:
            self.file_name = "_".join(["감성분석", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TREND:
            self.file_name = "_".join(["연관검색어", str(params['project_seq']), self.start_date, self.end_date]) + ".xlsx"

    def get_file_name(self):
        return self.file_name

    def create_file_path(self, path):
        self.file_path = path
        return file_util.search_create_directory(self.file_path)
        # Previously the destination depended on the mode:
        # if mode == 'documents':
        #     '''
        #     - documents land in the Social directory under the report folder.
        #     '''
        #     self.file_path = os.path.join(self.BASE_EXCEL_DIRECTORY, self.reg_dt, 'raw')
        #     return file_util.search_create_directory(self.file_path)
        # else:
        #     '''
        #     - topics land in the Social_topics directory under the report folder.
        #     '''
        #     self.file_path = os.path.join(self.BASE_EXCEL_DIRECTORY, self.reg_dt, 'topic')
        #     return file_util.search_create_directory(self.file_path)

    def topics_list(self, params):
        worksheet = self.workbook.add_worksheet(
            "화제어(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Header; dates must be written as YYYYMMDD.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '순위', self.header)
        worksheet.write(0, 2, '화제어', self.header)
        worksheet.write(0, 3, '문서수', self.header)
        worksheet.write(0, 4, '연관어', self.header)
        worksheet.write(0, 5, '문서수', self.header)

        # Data
        result_topic = es.get_aggregations(
            self.query.ALL_TOPICS_LIST(params['dataset_name']), params, Query.INDEX_TOPICS)
        row = 0
        seq = 0    # rank of the topic
        # topics_date = params['start_date'][0:10].replace('-', '')
        for bucket0 in result_topic['aggregations']['my_aggs0']['buckets']:
            for bucket1 in bucket0['my_aggs1']['buckets']:
                topic = re.sub(r"[\+=\-/]", "", str(bucket1['key']))
                seq += 1
                topics_date = bucket0['key_as_string']
                if len(bucket1['my_aggs2']['buckets']) > 0:
                    for bucket2 in bucket1['my_aggs2']['buckets']:
                        # worksheet.write(1+row, 0, params['start_date'][0:10].replace('-',''), self.default)
                        worksheet.write(1 + row, 0, re.sub("-", "", topics_date[:topics_date.find("T")]), self.default)
                        worksheet.write(1 + row, 1, seq, self.default)
                        worksheet.write(1 + row, 2, re.sub(r"[\[\]]", "", topic), self.default)
                        worksheet.write(1 + row, 3, bucket1['doc_count'], self.default)
                        worksheet.write(1 + row, 4, bucket2['key'], self.default)
                        worksheet.write(1 + row, 5, bucket2['doc_count'], self.default)
                        # worksheet.write(1+row, 6, verb_list, self.default)
                        row += 1
                else:
                    worksheet.write(1 + row, 0, re.sub("-", "", topics_date[:topics_date.find("T")]), self.default)
                    worksheet.write(1 + row, 1, seq, self.default)
                    worksheet.write(1 + row, 2, re.sub(r"[\[\]]", "", topic), self.default)
                    worksheet.write(1 + row, 3, bucket1['doc_count'], self.default)
                    worksheet.write(1 + row, 4, '', self.default)
                    worksheet.write(1 + row, 5, '', self.default)
                    # worksheet.write(1+row, 6, '', self.default)
                    row += 1

        logger.info("<%s> Total Topics : %d" % (self.dataset_names, row))

    def emotions_per_causes(self, params):
        worksheet = self.workbook.add_worksheet(
            "감성분석(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Header; dates must be written as YYYYMMDD.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '채널1', self.header)
        worksheet.write(0, 2, '채널2', self.header)
        worksheet.write(0, 3, '채널3', self.header)
        worksheet.write(0, 4, '대분류', self.header)
        worksheet.write(0, 5, '중분류', self.header)
        worksheet.write(0, 6, '소분류', self.header)
        worksheet.write(0, 7, '긍부정', self.header)
        worksheet.write(0, 8, '문서수', self.header)

        # Data
        qdsl = self.query.EMOTIONS_PER_CAUSES()
        result = es.get_aggregations(copy.copy(qdsl), params, INDEX_EMOTIONS)
        # total = result['hits']['total']
        total = 0
        row = 0
        # emotions_date = params['start_date'][0:10].replace('-', '')
        for bucket0 in result['aggregations']['my_aggs0']['buckets']:
            for bucket1 in bucket0['my_aggs1']['buckets']:
                for bucket2 in bucket1['my_aggs2']['buckets']:
                    for bucket5 in bucket2['my_aggs3']['my_aggs4']['my_aggs5']['buckets']:
                        # 2018.01.11: when the name contains "(주)", the match must also
                        # work without the "(주)" part, hence the containment check.
                        if params['dataset_name'].find(bucket2['key']) >= 0:
                            depth_level = bucket1['key'].split(">")
                            # worksheet.write(1+row, 0, emotions_date, self.default)
                            emotions_date = bucket0['key_as_string']
                            worksheet.write(1 + row, 0, re.sub("-", "", emotions_date[:emotions_date.find("T")]), self.default)
                            # Guard each depth: depth_level[i] needs len(depth_level) >= i + 1.
                            worksheet.write(1 + row, 1, re.sub(r"[\[\]]", "", depth_level[0]) if len(depth_level) >= 1 else '', self.default)
                            worksheet.write(1 + row, 2, re.sub(r"[\[\]]", "", depth_level[1]) if len(depth_level) >= 2 else '', self.default)
                            worksheet.write(1 + row, 3, re.sub(r"[\[\]]", "", depth_level[2]) if len(depth_level) >= 3 else '', self.default)
                            worksheet.write(1 + row, 4, bucket2['key'], self.default)
                            worksheet.write(1 + row, 5, '', self.default)
                            worksheet.write(1 + row, 6, '', self.default)
                            worksheet.write(1 + row, 7, bucket5['key'], self.default)
                            worksheet.write(1 + row, 8, bucket5['doc_count'], self.default)
                            total += int(bucket5['doc_count'])
                            row += 1

        # Totals row (only when a single dataset is selected)
        if len(params['datasets'].split("^")) == 1:
            worksheet.write(row + 1, 0, '합계', self.header)
            for col in range(1, 8):
                worksheet.write(row + 1, col, '', self.header)
            worksheet.write(row + 1, 8, total, self.header)

        logger.info("<%s> Total Emotions : %d" % (self.dataset_names, row))

    # Raw documents
    def create_documents_list(self, params, index):
        # Patterns that exclude a row when they appear in the title or content.
        project_filter_keywords = db.get_project_filter_keywords(params['project_seq'])
        EXCLUDE_PATTERNS = None
        if project_filter_keywords and 'regex_filter_keywords' in project_filter_keywords:
            EXCLUDE_PATTERNS = re.compile(
                "(?i)(" + re.sub(",", "|", project_filter_keywords['regex_filter_keywords'].strip()) + ")")

        size = 10000    # paging size

        # Run the search.
        result = es.get_documents(params, size, index, "")

        # Create the sheet.
        worksheet = self.workbook.add_worksheet(
            "원문(%s)(0)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Excel header
        worksheet.write(0, 0, 'ID', self.header)
        worksheet.write(0, 1, '게시일', self.header)
        worksheet.write(0, 2, '작성자', self.header)
        worksheet.write(0, 3, 'URL', self.header)
        worksheet.write(0, 4, '제목', self.header)
        worksheet.write(0, 5, '내용', self.header)
        worksheet.write(0, 6, '채널1', self.header)
        worksheet.write(0, 7, '채널2', self.header)
        worksheet.write(0, 8, '채널3', self.header)
        worksheet.write(0, 9, '정확도', self.header)    # accuracy (score) column

        logger.info("<%s> Total Documents : %d" % (self.dataset_names, result["hits"]["total"]))

        # Excel body
        if "hits" in result and result["hits"]["total"] > 0:
            row = 0
            for this_result in result["hits"]["hits"]:
                doc_id = this_result["_id"]
                doc_datetime = this_result["_source"]["doc_datetime"]
                doc_writer = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_writer"]))
                doc_url = this_result["_source"]["doc_url"]
                doc_title = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_title"]))
                doc_content = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_content"]))
                depth1_nm = this_result["_source"]["depth1_nm"]
                depth2_nm = this_result["_source"]["depth2_nm"]
                depth3_nm = this_result["_source"]["depth3_nm"]
                score = this_result["_score"]
                # 2018.04.05: skip rows whose title or content matches an exclusion pattern.
                if EXCLUDE_PATTERNS is not None and (EXCLUDE_PATTERNS.search(doc_title) is not None
                                                     or EXCLUDE_PATTERNS.search(doc_content) is not None):
                    continue
                row += 1
                worksheet.write(row, 0, doc_id, self.default)
                worksheet.write(row, 1, doc_datetime, self.default)
                worksheet.write(row, 2, doc_writer, self.default)
                worksheet.write(row, 3, doc_url, self.default)
                worksheet.write(row, 4, doc_title, self.default)
                worksheet.write(row, 5, doc_content, self.default)
                worksheet.write(row, 6, depth1_nm, self.default)
                worksheet.write(row, 7, depth2_nm, self.default)
                worksheet.write(row, 8, depth3_nm, self.default)
                worksheet.write(row, 9, score, self.default)

        # When the hit count exceeds one page, scroll through the remaining
        # pages; each page goes on a fresh sheet since the volume can be large.
        if "hits" in result and result["hits"]["total"] > size:
            for page in range(1, math.ceil(result["hits"]["total"] / size)):    # 1, 2, ...
                row = 0    # each sheet restarts under its own header row
                worksheet = self.workbook.add_worksheet(
                    "원문(%s)(%d)" % ("~".join([params['start_date'][0:10], params['end_date'][0:10]]), page))

                # Excel header
                worksheet.write(0, 0, 'ID', self.header)
                worksheet.write(0, 1, '게시일', self.header)
                worksheet.write(0, 2, '작성자', self.header)
                worksheet.write(0, 3, 'URL', self.header)
                worksheet.write(0, 4, '제목', self.header)
                worksheet.write(0, 5, '내용', self.header)
                worksheet.write(0, 6, '채널1', self.header)
                worksheet.write(0, 7, '채널2', self.header)
                worksheet.write(0, 8, '채널3', self.header)
                worksheet.write(0, 9, '정확도', self.header)    # accuracy (score) column

                scrolled_result = es.get_documents(params, size, index, scroll_id=result["_scroll_id"])
                for this_result in scrolled_result["hits"]["hits"]:
                    doc_id = this_result["_id"]
                    doc_datetime = this_result["_source"]["doc_datetime"]
                    doc_writer = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_writer"]))
                    doc_url = this_result["_source"]["doc_url"]
                    doc_title = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_title"]))
                    doc_content = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_content"]))
                    depth1_nm = this_result["_source"]["depth1_nm"]
                    depth2_nm = this_result["_source"]["depth2_nm"]
                    depth3_nm = this_result["_source"]["depth3_nm"]
                    score = this_result["_score"]
                    # 2018.04.05: skip rows whose title or content matches an exclusion pattern.
                    if EXCLUDE_PATTERNS is not None and (EXCLUDE_PATTERNS.search(doc_title) is not None
                                                         or EXCLUDE_PATTERNS.search(doc_content) is not None):
                        continue
                    row += 1
                    worksheet.write(row, 0, doc_id, self.default)
                    worksheet.write(row, 1, doc_datetime, self.default)
                    worksheet.write(row, 2, doc_writer, self.default)
                    worksheet.write(row, 3, doc_url, self.default)
                    worksheet.write(row, 4, doc_title, self.default)
                    worksheet.write(row, 5, doc_content, self.default)
                    worksheet.write(row, 6, depth1_nm, self.default)
                    worksheet.write(row, 7, depth2_nm, self.default)
                    worksheet.write(row, 8, depth3_nm, self.default)
                    worksheet.write(row, 9, score, self.default)

                # After the last page has been processed, clear the scroll context.
                if page == math.ceil(result["hits"]["total"] / size) - 1:
                    if result["_scroll_id"]:
                        es.clear_scroll(result["_scroll_id"])

    def make_trend_report(self, params):
        logger.info("============================= \"make_trend_report\" starts.")
        today = re.sub("[-]", "", params['start_date'][0:10])
        worksheet = self.workbook.add_worksheet(
            "연관어(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Header; dates must be written as YYYYMMDD.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '시간', self.header)
        worksheet.write(0, 2, '검색그룹', self.header)
        worksheet.write(0, 3, '검색아이템', self.header)
        worksheet.write(0, 4, '검색키워드', self.header)
        worksheet.write(0, 5, '키워드', self.header)

        # Data
        result = db.get_data_for_report_trend(params['project_seq'], today)
        for idx, row in enumerate(result, 1):
            for col in range(6):
                worksheet.write(idx, col, row[col], self.default)

    def create_report(self, params):
        self.workbook = xlsxwriter.Workbook(
            os.path.join(self.file_path.replace("/", os.path.sep), self.file_name),
            options={'strings_to_urls': False, 'strings_to_numbers': True})
        self.header = self.workbook.add_format(self.HEADER_FORMAT)
        self.default = self.workbook.add_format(self.DEFAULT_FORMAT)

        if self.mode == MODE_TOPICS:
            self.topics_list(params)
        elif self.mode == MODE_DOCUMENTS:
            self.create_documents_list(params, INDEX_DOCUMENTS)
        elif self.mode == MODE_EMOTIONS:
            self.emotions_per_causes(params)
        elif self.mode == MODE_TREND:
            self.make_trend_report(params)

        self.close_workbook()

    def close_workbook(self):
        self.workbook.close()
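# Usage sketch (hedged): assumes the project's MODE_* constants, db, es and
# file_util modules are importable and that Elasticsearch and the DB are
# reachable; the output directory below is an assumed example path.
if __name__ == '__main__':
    params = {
        "mode": MODE_DOCUMENTS, "compare_yn": "N",
        "start_date": "2018-01-01T00:00:00", "end_date": "2018-12-31T23:59:59",
        "reg_dt": "2019-01-02 09:00:00", "project_seq": 176, "datasets": "2852"
    }
    report = ReportKDICDocuments(params)
    report.create_file_path("/tmp/dmap-report")    # assumed example directory
    report.create_report(params)                   # writes and closes the workbook
    print(report.get_file_name())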
def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)
    if not scroll_id:
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = {"size": size, "query": {"bool": {"must": []}}}
    else:
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    must = []
    # Include the project sequence filter.
    must.append(get_project_seq_query(params))
    # Target channels
    if "channels" in params and params["channels"] and params["channels"] != 'all':
        must.append(get_channel_query(params))
    # Target period
    if "start_date" in params and "end_date" in params:
        must.append(get_period_query(params))
    # Keywords included in the datasets
    if "datasets" in params and params["datasets"]:
        # e.g. "신라면^삼양라면^안성탕면"
        if len(params["datasets"].split("^")) > 1:
            should = []
            for dataset in params["datasets"].split("^"):
                should.append(queryObj.get_dataset_query(params['project_seq'], dataset))
            must.append({"bool": {"should": should}})
        else:
            must.append(queryObj.get_dataset_query(params['project_seq'], params["datasets"]))
    # elif params["type_cd"] == "CCT002":    # social monitoring - document statistics
    # elif params["type_cd"] == "CCT003":    # social monitoring - sentiment analysis
    # .....
    # Reshape the request per type code as each report requires.

    if "query" in request:
        request["query"]["bool"]["must"] = must

    logger.debug("get_documents() ==> request : ")
    for k, v in request.items():
        logger.debug("\t{} : {}".format(k, v))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d"
                     % int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))
    return result
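# Illustrative (hedged): for datasets "A^B" the must clause above ends up as a
# bool/should wrapping one dataset query per entry, i.e. documents matching
# ANY of the datasets are returned:
# {"bool": {"should": [ {<dataset query for A>}, {<dataset query for B>} ]}}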