class SearchAppCore(object):
    max_query = 20
    es = es_instance()
    index = IndexName
    # obj.set_seg_query(lst_query)
    # str_abst = obj.abstract('艺术创造美,创业的现象,创和业的结。')
    # print str_abst
    qt_type = {
        DEF_QUERY_TYPE_ORG: OrgInfo,
        DEF_QUERY_TYPE_STAFF: StaffInfo,
        DEF_QUERY_TYPE_CID: CourseTypeName,
        DEF_QUERY_TYPE_CREDICT: CreditCourse
    }

    @staticmethod
    def param_check(param):
        print json.dumps(param)

    @staticmethod
    def get_result(param):
        # parse the request parameters
        hl_obj = Highlighting(w=56)
        SearchAppCore.param_check(param)
        highlight_tokens = set()
        # the analysed query is written back into param
        sum_token_score_w, sum_token_analyzer = analyse_query_all(
            param["query"], param, highlight_tokens, SearchAppCore.es)
        # print sum_token_score_w
        count = False
        # build the DSL for this query type
        dsl = precess_func_dict[param["qt"]]["create_dsl"](
            sum_token_score_w, param, sum_token_analyzer, count)
        # first pass: recall the entities
        res = SearchAppCore.es.search(
            index=IndexName,
            doc_type=SearchAppCore.qt_type[param["qt"]],
            body=dsl,
            preference="_primary_first")
        # second pass: enrich the recalled documents
        res, object_num = precess_func_dict[param["qt"]]["process_res"](
            res, param, hl_obj, highlight_tokens)
        result_json = {
            "data": res,
            "param": param,
            "total_num": object_num,
            "error_code": 0,
            "error_msg": "",
        }
        return result_json
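# Usage sketch, not part of the original module: the exact keys that
# analyse_query_all and the precess_func_dict handlers expect are assumed
# here (query/qt plus paging fields), so treat this as illustrative only.
# param = {"query": u"机器学习", "qt": DEF_QUERY_TYPE_ORG, "pn": 0, "num": 15}
# print json.dumps(SearchAppCore.get_result(param), ensure_ascii=False)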
def __init__(self, course_id):
    # base info
    self.course_id = course_id
    self.course_meta_id = -1
    self.id_ = ""
    self.org = ""
    self.course = ""
    self.run = ""
    self.course_hash = ""
    self.owner = ""
    self.serialized = ""
    self.cid = []
    self.groupcid = []
    self.start = ""
    self.end = ""
    self.status = 0
    self.last_chapter = ""
    self.mode = []
    self.expire = "-"
    self.is_paid_only = 0
    # searchable info
    self.course_name = ""
    self.about = ""
    self.prerequisites = ""
    self.title_type = 'title'
    self.about_type = 'about'
    self.staff_type = 'staff'
    self.category_type = 'category'
    self.subtitle_type = 'subtitle'
    self.prerequisites_type = 'prerequisites'
    # chunks
    self.title_chunks = []
    self.about_chunks = []
    self.structure_chunks = []
    self.category_chunks = []
    # self.problem_chunks = []
    # self.html_chunks = []
    self.staff_chunks = []
    self.prerequisites_chunks = []
    self.subtitle_chunks = []
    self.fragment_chunks = []
    # structure info
    self.children = []  # child list, used to build a "children" sub type
    self.chapters = []
    self.staff = []  # list of dicts, one per teacher/staff member
    self.fragment = []  # knowledge fragments for this course id (course_meta_fragmentknowledge)
    # database connections
    self.db = mongo_db()
    self.es = es_instance()
    self.mysql_conn = mysql_connection()
    self.cursor = self.mysql_conn.cursor()
    # finish initialising the course data
    self._init_course_data()
    self.location = [self.course_id]
    self.course_chunks = []
    self.ut = datetime.datetime.now()
def get_scroll_search_res(query_dsl,
                          scroll='5m',
                          index=es_assist_word_index,
                          doc_type=doc_type,
                          timeout="1m"):
    es_result = helpers.scan(
        client=es_instance(),
        query=query_dsl,
        scroll=scroll,
        index=index,
        doc_type=doc_type,
        timeout=timeout
    )
    return es_result
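# Usage sketch (the match_all DSL below is illustrative, not from the
# original code): helpers.scan wraps the scroll API in a generator, so a
# caller can stream an arbitrarily large result set without managing
# scroll ids by hand.
# for hit in get_scroll_search_res({"query": {"match_all": {}}}):
#     print hit["_id"]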
def spider_live(row_id_list, live_type, mode):
    conn = mysql_connection()
    cursor = conn.cursor()
    assert live_type in ("livecast_livecastitem", "livecast_event")
    assert isinstance(row_id_list, list), "row_id_list must be a list"
    if mode == "ALL":
        row_id_list = []
        sql_text = "SELECT id FROM {table_name}".format(table_name=live_type)
        cursor.execute(sql_text)
        res = cursor.fetchall()
        for item in res:
            row_id_list.append(item[0])
    sum = len(row_id_list)
    counter = 0
    live_bulk_list = []
    for item_id in row_id_list:
        # fetch and assemble the content for this live item
        live_dict = deepcopy(live_info_base)
        counter += 1
        process_msg = "{}%; sum={}; now={}".format((counter * 100 / sum),
                                                   sum, item_id)
        print process_msg
        get_live_base_info(item_id, live_type, cursor, live_dict)
        if live_dict["start_time"]:
            live_dict["start_time"] = live_dict["start_time"].strftime(
                "%Y-%m-%dT%H:%M:%S+08:00")
        if live_dict["end_time"]:
            live_dict["end_time"] = live_dict["end_time"].strftime(
                "%Y-%m-%dT%H:%M:%S+08:00")
        live_bulk_list.append(live_dict)
    conn.close()
    for chunk in live_bulk_list:
        chunk["_index"] = IndexName
        chunk["_type"] = LiveTypeName
        chunk["_id"] = str(chunk['live_key'])
        # print json.dumps(chunk)
    success_num, error_list = helpers.bulk(es_instance(), live_bulk_list)
    print "valid updates in total: %d" % success_num
    if error_list:
        print "error list:", error_list
    return success_num
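# Usage sketch: with mode="ALL" the id list is ignored and every row of the
# chosen table is re-indexed; any other mode value only refreshes the listed
# row ids (the ids and the "INCR" label below are hypothetical).
# spider_live([], "livecast_event", "ALL")
# spider_live([17, 42], "livecast_livecastitem", "INCR")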
def upsert_course(course_id, special_type, special_field):
    success_flag = True
    assert isinstance(course_id, list)
    es = es_instance()
    # assert es.indices.exists_type(index="course", doc_type=special_type)
    assert special_type == "course"
    if special_field == "accumulate_num":
        # switch to the new interface
        special_field = "accumulate_num_v2"
    res = es.indices.get_mapping(index="course", doc_type=special_type)
    assert special_field in res["course"]["mappings"]["course"][
        "properties"].keys()
    result = {}
    chunks = []
    for item_course in course_id:
        if item_course != "":
            enrollment_num_v2 = get_course_enrollment_v2(es, item_course)
            doc_id = md5(item_course)  # not every doc type computes its id this way
            if es.exists(index="course", doc_type=special_type, id=doc_id):
                result[item_course] = 1  # ok
                doc = {
                    "_type": special_type,
                    "doc": {
                        "ut": datetime.datetime.now().strftime(
                            "%Y-%m-%dT%H:%M:%S"),
                        special_field: enrollment_num_v2
                    },
                    "_index": "course",
                    "doc_as_upsert": "true",
                    "_id": doc_id,
                    "_op_type": "update"
                }
                chunks.append(doc)
            else:
                result[item_course] = 0  # document does not exist
                success_flag = False
        else:
            result[item_course] = -1
            success_flag = False
    # note: helpers.bulk returns error *dicts*, not course ids, so the keys
    # written below are the raw error entries
    success, error = helpers.bulk(es, chunks)
    for item_error in error:
        result[item_error] = 0
    return success_flag, result
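# Usage sketch (the course id is hypothetical): the first return value is an
# overall flag, the second maps each course id to 1 (updated), 0 (document
# missing) or -1 (empty id).
# ok, detail = upsert_course(["course-v1:SomeOrg+CS101+2019"], "course",
#                            "accumulate_num")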
def get_course_info_specify_fields(fields, course_ids, index="course",
                                   type="course"):
    assert isinstance(course_ids, list), u"course_ids must be a list"
    assert isinstance(fields, list), u"fields must be a list"
    dsl_body = {
        "query": {
            "terms": {
                "course_id": course_ids
            }
        }
    }
    # note: no explicit size is set, so ES returns at most 10 hits by default
    res = es_instance().search(index=index, doc_type=type, body=dsl_body,
                               preference="_primary_first")
    res_dict_t = dict()
    data_num = res["hits"]["total"]
    for item_doc in res["hits"]["hits"]:
        key_ = item_doc["_source"]["course_id"]
        res_dict_t[key_] = item_doc["_source"]
    res_list = list()
    for item_course_id in course_ids:
        item_dict_ = dict({"course_id": item_course_id})
        for item_field in fields:
            item_dict_[item_field] = res_dict_t.get(item_course_id,
                                                    {}).get(item_field, None)
        res_list.append(item_dict_)
    return res_list, data_num
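# Usage sketch (ids and fields are hypothetical): results come back in the
# same order as course_ids, with None for any course or field the index
# does not have, so callers never need to handle misses themselves.
# rows, hit_num = get_course_info_specify_fields(
#     ["course_name", "start"], ["course-v1:SomeOrg+CS101+2019"])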
def spider_microdegree(row_id_list, mode):
    conn = mysql_connection()
    cursor = conn.cursor()
    assert isinstance(row_id_list, list), "row_id_list must be a list"
    if mode == "ALL":
        row_id_list = []
        sql_text = "SELECT id FROM livecast_microdegree"
        cursor.execute(sql_text)
        res = cursor.fetchall()
        for item in res:
            row_id_list.append(item[0])
    sum = len(row_id_list)
    counter = 0
    microdegree_bulk_list = []
    for item_id in row_id_list:
        # fetch and assemble the content for this microdegree id
        microdegree_dict = deepcopy(microdegree_info_base)
        counter += 1
        process_msg = "{}%; sum={}; now={}".format((counter * 100 / sum),
                                                   sum, item_id)
        print process_msg
        get_microdegree_base_info(item_id, cursor, microdegree_dict)
        microdegree_bulk_list.append(microdegree_dict)
    conn.close()
    for chunk in microdegree_bulk_list:
        del chunk["course_ids"]
        chunk["_index"] = IndexName
        chunk["_type"] = MicrodegreeTypeName
        chunk["_id"] = str(chunk['microdegree_key'])
        # print json.dumps(chunk)
    success_num, error_list = helpers.bulk(es_instance(),
                                           microdegree_bulk_list)
    print "valid updates in total: %d" % success_num
    if error_list:
        print "error list:", error_list
    return success_num
COURSE_TOPIC_MYSQL = 'edxapp'
TAP_PARENT_COURSE = 'course-edit'
# es
tap_index_name = "course_ancestor"
tap_type_name = "course_ancestor"
GROUP_ID1 = 'search_index_update_gujia'
GROUP_ID2 = 'search_index_update_gujia'
GROUP_ID3 = 'search_index_update_tap'
course_queue = Queue.Queue()  # size = infinite
Index = "monit_table"
Type = "search_add_index_kafka"
doc_id = "1"
es = es_instance()
es_tap = es_instance_tap()
last_time = 0


def write_es(kafka_source):
    time_str = time.strftime('%Y-%m-%d %X', time.localtime())
    assert kafka_source in ("gujia_kafka", "mysql_kafka",
                            "tap_parent_kafka"), "please pass a valid key"
    doc = {"doc": {kafka_source: time_str}}
    es.update(index=Index, doc_type=Type, id=doc_id, body=doc,
              retry_on_conflict=5)
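# Usage sketch: write_es stamps a heartbeat per kafka source into a single
# monitoring document; reading it back (illustrative, not part of the
# original module) shows when each consumer last made progress.
# write_es("gujia_kafka")
# beat = es.get(index=Index, doc_type=Type, id=doc_id)["_source"]
# print beat.get("gujia_kafka"), beat.get("tap_parent_kafka")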
def spider_xuetang_score(row_id_list, mode):
    def proc_stage(chunks_):
        # merge the rows of one sub-query into a single dict; multi-valued
        # columns become lists, single values stay scalar
        chunk = dict()
        if len(chunks_) >= 1:
            for item_key in chunks_[0].keys():
                if item_key not in chunk:
                    if item_key == "creditcourse_id":
                        pass
                    else:
                        chunk[item_key] = set()
                for item_doc in chunks_:
                    if item_key == "creditcourse_id":
                        chunk[item_key] = item_doc[item_key]
                    else:
                        chunk[item_key].add(item_doc[item_key])
            for item_key in chunk:
                if isinstance(chunk[item_key], set):
                    chunk[item_key] = list(chunk[item_key])
                    if len(chunk[item_key]) == 1:
                        chunk[item_key] = chunk[item_key][0]
        return chunk

    sql_dict = {
        "school": [
            """select * from
               (SELECT a1.id as creditcourse_row_id, a1.id as l2_id,
                       a1.name as l2_name, a2.name as l1_name, a2.id as l1_id
                FROM newcloud_credit_organization as a1
                left join newcloud_credit_college as a2
                on a1.college_id = a2.id) a3
               where a3.creditcourse_row_id = {param_key_}""",
            proc_stage, "creditcourse_row_id"
        ],
        "stage": [
            """SELECT * from
               (select a1.creditcourse_id as creditcourse_id,
                       a2.id as stage_id, a2.name as stage_name,
                       a2.weight as stage_weight
                from newcloud_credit_creditcourse_stages as a1
                left join newcloud_credit_coursestage as a2
                on a1.coursestage_id = a2.id) a3
               where creditcourse_id = {param_key_}""",
            proc_stage, "creditcourse_row_id"
        ],
        "categorys": [
            """select * from
               (select a1.creditcourse_id, a2.id as category_id,
                       a2.name as category_name, a2.weight, a2.parent_id
                from newcloud_credit_creditcourse_categorys as a1
                left join newcloud_credit_coursecategory as a2
                on a1.coursecategory_id = a2.id) a3
               where a3.creditcourse_id = {param_key_}""",
            proc_stage, "creditcourse_row_id"
        ],
        "student_num": [
            """select count(user_id) as student_num
               from student_courseenrollment
               where course_id="{param_key_}" """,
            proc_stage, "course_id"
        ],
        "platform_num": [
            """select count(DISTINCT(plat_id)) as platform_num
               from newcloud_termcourse
               where coursekey="{param_key_}" """,
            proc_stage, "course_id"
        ]
    }
    conn = mysql_connection()
    cursor = conn.cursor()
    assert isinstance(row_id_list, list), "row_id_list must be a list"
    if mode == "ALL":
        row_id_list = []
        sql_text = "SELECT id FROM newcloud_credit_creditcourse"
        cursor.execute(sql_text)
        res = cursor.fetchall()
        for item in res:
            row_id_list.append(item[0])
    sum = len(row_id_list)  # total count
    all_sum = 0  # success count
    creditcourse_bulk_list = []
    counter = 0  # progress counter
    for creditcourse_row__id in row_id_list:
        counter += 1
        process_msg = "{}%; sum={}; now={}".format(
            (counter * 100 / sum), sum, creditcourse_row__id)
        print process_msg
        sql_text = """
            select a.id, a.course_id, a.org, a.name, a.start, a.end,
                   a.thumbnail, a.owner, a.status, a.serialized, a.subtitle,
                   b.id as creditcourse_row_id, b.created, b.modified,
                   b.visible, b.is_enroll, b.is_apply, b.credit
            from (select * from newcloud_credit_creditcourse
                  where id={id}) b
            left join course_meta_course as a on a.id = b.course_id
        """.format(id=creditcourse_row__id)
        cursor.execute(sql_text)
        res = cursor.fetchall()
        desc = cursor.description
        chunks = [dict(zip([col[0] for col in desc], row)) for row in res]
        assert len(chunks) == 1, \
            "unexpected value while crawling credit course, id={id}".format(
                id=creditcourse_row__id)
        chunk = chunks[0]
        for item_fields in sql_dict:
            param_key_ = sql_dict[item_fields][2]
            if chunk[param_key_]:
                sql_t = sql_dict[item_fields][0].format(
                    param_key_=chunk[param_key_])
                # print sql_t
                cursor.execute(sql_t)
                res = cursor.fetchall()
                desc = cursor.description
                res = list(res)
                chunks_ = [
                    dict(zip([col[0] for col in desc], row)) for row in res
                ]
                res = sql_dict[item_fields][1](chunks_)
                chunk.update(res)
        sql_text = """select name from newcloud_credit_coursecategory
                      where id = {id}"""
        if "parent_id" in chunk:
            if not isinstance(chunk["parent_id"], list):
                chunk["parent_id"] = [chunk["parent_id"]]
            chunk["parent_name"] = []
            for item_parent in chunk["parent_id"]:
                try:
                    cursor.execute(sql_text.format(id=item_parent))
                    res = cursor.fetchall()
                    chunk["parent_name"].append(res[0][0])
                except Exception, e:
                    chunk["parent_name"].append(None)
        # print_time_json(chunk)
        add_key_prefix_4_dict(CreditCourse + "tp", chunk)
        chunk["ut"] = time.strftime("%Y-%m-%dT%H:%M:%S+08:00",
                                    time.localtime())
        chunk["_index"] = IndexName
        chunk["_type"] = CreditCourse
        chunk["_id"] = md5(str(chunk[CreditCourse + "tp_" + "course_id"]))
        creditcourse_bulk_list.append(chunk)
        if len(creditcourse_bulk_list) == 100:
            success_num, error_list = helpers.bulk(es_instance(),
                                                   creditcourse_bulk_list)
            all_sum += success_num
            del creditcourse_bulk_list[:]
            if error_list:
                process_msg = "build error {ids}".format(ids=str(error_list))
                print process_msg
def spider_staffinfo(row_name_list, mode):
    conn = mysql_connection()
    cursor = conn.cursor()
    all_sum = 0
    if mode == "ALL":
        row_name_list = []
        sql_text = "SELECT name FROM course_meta_staff"
        cursor.execute(sql_text)
        res = cursor.fetchall()
        for item in res:
            if not item[0] or u"#" in item[0] or len(item[0]) <= 1 \
                    or u"," in item[0] or u"1" in item[0] \
                    or u"2" in item[0] or u"3" in item[0]:
                print item[0], "****"
                continue
            if item[0] not in row_name_list:  # dedupe by staff name
                row_name_list.append(item[0])
    assert isinstance(row_name_list, list), "row_name_list must be a list"
    sum = len(row_name_list)
    counter = 0
    staffinfo_list = []
    for item_name in row_name_list:
        counter += 1
        process_msg = "{}%; sum={}; now={}".format((counter * 100 / sum),
                                                   sum, item_name)
        print process_msg
        sql_text = """
            SELECT id as staffinfo_id, name as staffinfo_name,
                   org_id_id as staffinfo_org, company as staffinfo_company,
                   department as staffinfo_department,
                   position as staffinfo_position,
                   avartar as staffinfo_avartar, about as staffinfo_about
            FROM course_meta_staff WHERE name="{name}"
        """.format(name=MySQLdb.escape_string(item_name))
        cursor.execute(sql_text)
        res = cursor.fetchall()
        desc = cursor.description
        chunks = [dict(zip([col[0] for col in desc], row)) for row in res]
        # usually there is only one element here
        if len(chunks) >= 1:
            # score each row by its number of empty fields and keep the most
            # complete one
            for chunk in chunks:
                score_ = 0
                for key_ in chunk:
                    if not chunk[key_]:
                        score_ += 1
                chunk["score_"] = score_
            goal_chunk = sorted(chunks, key=lambda x: x["score_"],
                                reverse=True)[-1]
            # url_ = "http://192.168.9.30:9999/search?query={name}&process=0&group=&qt=2&cid=&serialized=&expiration=&course_type=0&st=1&hasTA=&course_id=&num=15&platform=0&version=2&org=&owner=xuetangX%3BedX&pn=0&home=0&mode=&fields_v=1&persent=10"
            url_ = "http://newsearch.xuetangx.info:9998/search?process=0&group=&qt=2&cid=&serialized=&expiration=&course_type=0&st=1&hasTA=&course_id=&num=15&platform=0&version=2&org=&owner=xuetangX%3BedX&query={name}&pn=0&home=0&mode=&fields_v=1&persent=10"
            url = url_.format(name=query2urlcode(item_name))
            course_num = 0
            count_num = 3
            while count_num >= 1:
                try:
                    url_res = requests.get(url)
                    course_num = url_res.json()["total"]["course"]
                    count_num = 0
                except Exception, e:
                    print e
                    time.sleep(1)
                    count_num -= 1
            goal_chunk["staffinfo_course_num"] = course_num
            goal_chunk["ut"] = time.strftime("%Y-%m-%dT%H:%M:%S+08:00",
                                             time.localtime())
            del goal_chunk["score_"]
            goal_chunk["staffinfo_company_md5"] = md5(
                str(goal_chunk["staffinfo_company"]))
            goal_chunk["_index"] = IndexName
            goal_chunk["_type"] = StaffInfo
            goal_chunk["_id"] = md5(str(goal_chunk["staffinfo_name"]))
            # print course_num, "----"
            staffinfo_list.append(goal_chunk)
            if len(staffinfo_list) == 100:
                success_num, error_list = helpers.bulk(es_instance(),
                                                       staffinfo_list)
                all_sum += success_num
                del staffinfo_list[:]
                if error_list:
                    process_msg = "build error {ids}".format(
                        ids=str(error_list))
                    print process_msg
def spider_orginfo(org_name_list, mode):
    conn = mysql_connection()
    cursor = conn.cursor()
    all_sum = 0
    if mode == "ALL":
        org_name_list = []
        sql_text = "SELECT name FROM course_meta_organization"
        cursor.execute(sql_text)
        res = cursor.fetchall()
        for item in res:
            if item[0]:
                if item[0] not in org_name_list:
                    org_name_list.append(item[0])
            else:
                pass
                # print item[0].replace("\n", "").replace(" ", ""), "--"
    assert isinstance(org_name_list, list), "org_name_list must be a list"
    sum = len(org_name_list)
    counter = 0
    orginfo_list = []
    for item_name in org_name_list:
        counter += 1
        process_msg = "{}%; sum={}; now={}".format((counter * 100 / sum),
                                                   sum, item_name)
        print process_msg
        sql_text = """
            SELECT id as orginfo_id, org as orginfo_org,
                   name as orginfo_name, about as orginfo_about,
                   cover_image, school_motto
            FROM course_meta_organization WHERE name="{org_name}"
        """.format(org_name=item_name)
        cursor.execute(sql_text)
        res = cursor.fetchall()
        desc = cursor.description
        if len(res) >= 1:
            chunks = [dict(zip([col[0] for col in desc], row))
                      for row in res]
            # usually there is only one element here
            orgs = set()
            for chunk in chunks:
                orgs.add(chunk["orginfo_org"])
            chunk = chunks[0]
            chunk["orginfo_org"] = u";".join(orgs)
            # url_ = "http://192.168.9.30:9999/search?process=0&group=&qt=2&cid=&serialized=&expiration=&course_type=0&st=1&hasTA=&course_id=&num=10&platform=0&version=2&org={org}&owner=xuetangX%3bedX&query=&pn=0&home=0&mode="
            url_ = "http://newsearch.xuetangx.info:9998/search?process=0&group=&qt=2&cid=&serialized=&expiration=&course_type=0&st=1&hasTA=&course_id=&num=10&platform=0&version=2&org={org}&owner=xuetangX%3bedX&query=&pn=0&home=0&mode="
            url = url_.format(org=query2urlcode(chunk["orginfo_org"]))
            # print url
            course_num = 0
            count_num = 3
            while count_num >= 1:
                try:
                    url_res = requests.get(url)
                    course_num = url_res.json()["total"]["course"]
                    count_num = 0
                except Exception, e:
                    print e
                    time.sleep(1)
                    count_num -= 1
            chunk["course_num"] = course_num
            print course_num, chunk["orginfo_org"], "------"
chunk["ut"] = time.strftime("%Y-%m-%dT%H:%M:%S+08:00", time.localtime()) chunk["_index"] = IndexName chunk["_type"] = OrgInfo chunk["_id"] = str(chunk["orginfo_id"]) orginfo_list.append(chunk) if len(orginfo_list) == 100: sucess_num, error_list = helpers.bulk(es_instance(), orginfo_list) all_sum += sucess_num del orginfo_list[:] if error_list: process_msg = "build error {ids}".format(ids=str(error_list)) print process_msg sucess_num, error_list = helpers.bulk(es_instance(), orginfo_list) all_sum += sucess_num if error_list: process_msg = "build error {ids}".format(ids=str(error_list)) print process_msg conn.close() print "一共有效更新%d" % (all_sum) return all_sum def spider_staffinfo(row_name_list, mode): conn = mysql_connection() cursor = conn.cursor() all_sum = 0 if mode == "ALL":
class SearchCore(object):
    max_query = 20
    es = es_instance()
    index = IndexName

    @staticmethod
    def high_light(item_class_dict, query_tokens, param):
        obj = Highlighting(w=56)
        obj.set_seg_query(query_tokens)
        for item_resource in item_class_dict.itervalues():
            if item_resource["item_type"] == "course":
                if "sub_about" in item_resource:
                    item_resource["highlight"]["about"] = obj.abstract(
                        item_resource["sub_about"]["about"])
                    del item_resource["sub_about"]
                if "sub_title" in item_resource:
                    item_resource["highlight"]["title"] = obj.abstract(
                        item_resource["sub_title"]["title"])
                    del item_resource["sub_title"]
                if "sub_staff" in item_resource:
                    for item in ["department", "position", "name", "company"]:
                        item_resource["highlight"][
                            "staff-" + item] = obj.abstract(
                                item_resource["sub_staff"][item])
                    del item_resource["sub_staff"]
                if "sub_structure" in item_resource:
                    display = item_resource["sub_structure"]["name"]
                    item_resource["sub_structure"]["structure"].append(
                        display)
                    temp_str = " ".join(
                        item_resource["sub_structure"]["structure"])
                    item_resource["highlight"]["chapter"] = obj.abstract(
                        temp_str)
                    temp_dict = {
                        "center": {
                            "process": obj.abstract(display),
                            "display": display,
                            "location":
                                item_resource["sub_structure"]["location"]
                        }
                    }
                    item_resource["highlight"]["structure"] = temp_dict
                    del item_resource["sub_structure"]
            if item_resource["item_type"] == "fragment":
                item_resource["highlight"]["frag_title"] = obj.abstract(
                    item_resource["frag_title"])
                item_resource["highlight"]["frag_desc"] = obj.abstract(
                    item_resource["frag_desc"])
            if item_resource["item_type"] == "live":
                item_resource["highlight"][
                    "live_album_sub_title"] = obj.abstract(
                        item_resource["live_album_sub_title"])
                item_resource["highlight"]["live_album_title"] = obj.abstract(
                    item_resource["live_album_title"])
                item_resource["highlight"]["live_item_title"] = obj.abstract(
                    item_resource["live_item_title"])
            if item_resource["item_type"] == "microdegree":
                item_resource["highlight"][
                    "microdegree_sub_title"] = obj.abstract(
                        item_resource["microdegree_sub_title"])
                item_resource["highlight"]["match_course"] = obj.abstract(
                    item_resource["match_course"])
                item_resource["highlight"][
                    "microdegree_title"] = obj.abstract(
                        item_resource["microdegree_title"])
                item_resource["highlight"]["tips"] = obj.abstract(
                    u"courses related to {q}:".format(q=param["query"]))

    @staticmethod
    def param_check(param):
        qt = param["qt"]
        st = param["st"]
        version = param["version"]
        home = param["home"]
        log_str = u"param_check: parameter check failed"
        if version == 2:
            # new interface (live and microdegree)
            if home == 1:
                # the home page may only use relevance sorting for these
                # resource types
                allow_qt = [
                    DEF_QUERY_TYPE_MIX_CK,
                    DEF_QUERY_TYPE_COURSE,
                    DEF_QUERY_TYPE_KNOWLE,
                    # DEF_QUERY_TYPE_ENGINEER,  engineering master's is not
                    # allowed here
                    DEF_QUERY_TYPE_LIVE,
                    DEF_QUERY_TYPE_MICRODEGREE
                ]
                assert qt in allow_qt, u"version=2, home=1: invalid qt"
            elif home == 0:
                assert qt != DEF_QUERY_TYPE_MIX_CK, u"qt=1 requires home=1"
                # live and microdegree may only use relevance sorting
                if qt in [DEF_QUERY_TYPE_LIVE, DEF_QUERY_TYPE_MICRODEGREE]:
                    assert st == DEF_SORT_TYPE_SCORE, log_str
        elif version == 1:
            # old interface
            # parameter correction so v1 requests can go through the v2
            # interface: the app's main filter box
            if qt == DEF_QUERY_TYPE_MIX_CK and st == DEF_SORT_TYPE_STATUS:
                param["qt"] = DEF_QUERY_TYPE_COURSE
        else:
            raise Exception, "invalid version parameter"

    @staticmethod
    def get_result(param):
        # query = u"花卉学"
        # query = u"大学化学"
        SearchCore.param_check(param)
        query = param["query"]
        t1 = time.time()
        highlight_tokens.clear()
        # configure tokenizer behaviour;
        # the analysed query is written back into param
        sum_token_score_w, sum_token_analyzer = analyse_query_all(
            query, param, highlight_tokens, SearchCore.es)
        # search_logger.info(json.dumps(param))
        dsl_list, index_end = create_dsl(sum_token_score_w, param,
                                         sum_token_analyzer)
        # print json.dumps(dsl_list)
        # query ES and fetch the specified parent documents
        dsl_all_in_body = ""
        query_dsl_head = {"index": IndexName, "preference": "_primary_first"}
        for item_dsl in dsl_list:
            dsl_all_in_body += json.dumps(query_dsl_head) + '\n' + \
                json.dumps(item_dsl) + '\n'
        log_str = u"first-pass parent-score recall finished"
        result = SearchCore.es.msearch(body=dsl_all_in_body)
        # search_logger.info("{b}==={a}".format(b=log_str,
        #                                       a=str(time.time() - t1)))
        if param['st'] == DEF_SORT_TYPE_SCORE and param["persent"] < 10:
            second_search = False
            # decide whether a second pass is needed: only when the "all"
            # tab returned zero hits
            if param["home"] == 1:
                # search from the main search box
                if param["qt"] == DEF_QUERY_TYPE_MIX_CK:
                    if result["responses"][0]["hits"]["total"] == 0:
                        # the "all" tab is empty
                        second_search = True
                        search_logger.info("zero hits for the all tab")
            else:
                # single-resource search
                assert param["qt"] in [
                    DEF_QUERY_TYPE_MIX_CK, DEF_QUERY_TYPE_COURSE,
                    DEF_QUERY_TYPE_KNOWLE, DEF_QUERY_TYPE_LIVE,
                    DEF_QUERY_TYPE_MICRODEGREE
                ], "parameter check failed (2)"
                if result["responses"][0]["hits"]["total"] == 0:
                    second_search = True
                    search_logger.info(
                        "zero hits for single-resource recall")
            if second_search:
                search_logger.info(u"triggering second-pass search")
                param["must_tokens"] = []
                param["should_tokens"] = sum_token_score_w.keys()
                param["persent_scale"] = len(
                    param["should_tokens"]) / 5 + 1 if len(
                        param["should_tokens"]) > 4 else 0
                if DEBUG_FLAG:
                    search_logger.info('---- second-pass should tokens ----')
                    for j in param["should_tokens"]:
                        search_logger.info(j)
                dsl_list, index_end = create_dsl(sum_token_score_w, param,
                                                 sum_token_analyzer)
                # print json.dumps(dsl_list)
                dsl_all_in_body = ""
                query_dsl_head = {
                    "index": IndexName,
                    "preference": "_primary_first"
                }
                for item_dsl in dsl_list:
                    dsl_all_in_body += json.dumps(
                        query_dsl_head) + '\n' + json.dumps(item_dsl) + '\n'
                log_str = u"second-pass parent-score recall finished"
                result = SearchCore.es.msearch(body=dsl_all_in_body)
                search_logger.info("{b}==={a}".format(
                    b=log_str, a=str(time.time() - t1)))
                # second-pass search done
        item_class_dict = dict()
        all_sum = None
        course_sum = None
        knowledge_sum = None
        live_sum = None
        microdegree_sum = None
        if param["home"]:
            # order: all_sum, course_sum, knowledge_sum, live_sum,
            # microdegree_sum
            item_sum = []
            # multi-resource recall
            for index_t, item_resource in enumerate(result["responses"]):
                assert item_resource["status"] == 200, u"ES response error"
                this_sum = item_resource["hits"]["total"]
                item_sum.append(this_sum)
                if (index_t + 1) == index_end[0][0]:
                    # this is the resource tab to display
                    for item in item_resource["hits"]["hits"]:
                        # build the key for the item dict;
                        # engineering master's courses also go through here
                        if item["_type"] == "course":
                            course_id = item["_source"]["course_id"]
                            item_id = course_id
                        elif item["_type"] == "fragment":
                            knowledge_id = item["_source"]["course_id"] + \
                                item["_source"]["knowledge_id"]
                            item_id = knowledge_id
                        elif item["_type"] == "live":
                            live_id = item["_source"]["live_key"]
                            item_id = live_id
                        elif item["_type"] == "microdegree":
                            microdegree_id = "{t}_{k}".format(
                                t="microdegree",
                                k=item["_source"]["microdegree_key"])
                            item_id = microdegree_id
                            del item["_source"]["course_info_list"]
                        else:
                            raise Exception, "unknown resource type"
                        # search_logger.info("this is a {t}:{s}".format(
                        #     t=item["_type"], s=item_id))
                        item_class_dict[item_id] = item["_source"]
                        item_class_dict[item_id]["item_type"] = item["_type"]
                        item_class_dict[item_id]["score"] = item["_score"]
                        item_class_dict[item_id]["highlight"] = dict()
            all_sum = item_sum[0]
            course_sum = item_sum[1]
            knowledge_sum = item_sum[2]
            live_sum = item_sum[3]
            microdegree_sum = item_sum[4]
        else:
            # single-resource recall: this loop runs once
            for index_t, item_resource in enumerate(result["responses"]):
                assert item_resource["status"] == 200, u"ES response error"
                all_sum = item_resource["hits"]["total"]
                for item in item_resource["hits"]["hits"]:
                    # engineering master's courses also go through here
                    resource_type = item["_type"]
                    if item["_type"] == "course":
                        course_id = item["_source"]["course_id"]
                        item_id = course_id
                        course_sum = all_sum
                    elif item["_type"] == "fragment":
                        knowledge_id = item["_source"]["course_id"] + \
                            item["_source"]["knowledge_id"]
                        item_id = knowledge_id
                        knowledge_sum = all_sum
                    elif item["_type"] == "live":
                        live_id = item["_source"]["live_key"]
                        item_id = live_id
                        live_sum = all_sum
                    elif item["_type"] == "microdegree":
                        microdegree_id = "{t}_{k}".format(
                            t="microdegree",
                            k=item["_source"]["microdegree_key"])
                        item_id = microdegree_id
                        del item["_source"]["course_info_list"]
                        microdegree_sum = all_sum
                    else:
                        print json.dumps(item)
                        raise Exception, "unknown resource type"
                    item_class_dict[item_id] = item["_source"]
                    item_class_dict[item_id]["item_type"] = item["_type"]
                    item_class_dict[item_id]["score"] = item["_score"]
                    item_class_dict[item_id]["highlight"] = dict()
        # the per-type sums are kept for compatibility with the old
        # interface's return fields
        # search_logger.info("all_sum:{a} course_sum:{b} knowledge_sum:{c} "
        #                    "live_sum:{l} microdegree_sum:{m}".format(
        #                        a=all_sum, b=course_sum, c=knowledge_sum,
        #                        l=live_sum, m=microdegree_sum))
        get_main_info(item_class_dict, param["query"], SearchCore.es, param)
        if param["st"] == DEF_SORT_TYPE_SCORE:
            SearchCore.high_light(item_class_dict, highlight_tokens, param)
            res_out_list = sorted(item_class_dict.values(),
                                  key=lambda x: x.get("score", 0),
                                  reverse=True)
        else:
            res_out_list = sorted(item_class_dict.values(),
                                  cmp=sort_func_by_status,
                                  reverse=True)
        process_result(res_out_list)
        result_json = {
            "data": res_out_list,
            "param": param,
            "total": {
                "microdegree": microdegree_sum if microdegree_sum else 0,
                "live": live_sum if live_sum else 0,
                "course": course_sum if course_sum else 0,
                "knowledge": knowledge_sum if knowledge_sum else 0,
                "all": all_sum if all_sum else 0
            },
            "error_code": 0,
            "error_msg": "",
        }
        correct_server = conf['correct']['host']
        # correct_server = "10.0.0.160"
        port = conf['correct']['port']
        if query != '':
            try:
                url = 'http://%s:%s/query_correct' % (correct_server, port)
                payload = {'query': query}
                correct_result = requests.get(url, timeout=1, params=payload)
                result_json['correct'] = correct_result.json().get(
                    'correct_result')
            except Exception, e:
                search_logger.warn(
                    "ERROR_CORRECT_SERVICE url={url} query = {q}".format(
                        url=url, q=query))
        result_json["time"] = str(time.time() - t1)
        return result_json