def get_user_qa_content(uid, begin, end):
    """Collect the plain-text content of every QA the user created in (begin, end).

    Deprecated: superseded by the HBase-backed get_user_qa_content2.

    :param uid: user id
    :param begin: window start (seconds or milliseconds timestamp)
    :param end: window end (seconds or milliseconds timestamp)
    :return: list of unicode text snippets (may be empty)
    """
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))
    all_qa_text = []
    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
        % (uid, begin_ds, end_ds)
    id_rows = get_medicaldb_handler().do_one(sql1)
    if id_rows is None or len(id_rows) == 0:
        return all_qa_text
    for id_row in id_rows:
        problem_id = id_row[0]
        sql = 'select content from ask_problemcontent where problem_id=%s;' % problem_id
        content_rows = get_medicaldb_handler().do_one(sql)
        if content_rows is None or len(content_rows) == 0:
            continue
        # Content is a JSON list; only the first segment is inspected, and
        # only 'text'-typed segments are kept.
        first_segment = json.loads(content_rows[0][0])[0]
        if first_segment['type'] != 'text':
            continue
        all_qa_text.append(first_segment['text'])
    return all_qa_text
def test5(): from rpc_services.medical_service_utils import get_entities from rpc_services.search_api import more_topic from general_utils.db_utils import get_medicaldb_handler sql = "select ask from ask_problem order by id desc limit 1000;" o = get_medicaldb_handler().do_one(sql) yes = 0 all = 0 for item in o: print "---=-=-==-==-==--=--===-==" text = item[0] tags = " ".join(get_entities(text)) print "text", text if not text: continue o = more_topic(text) o = json.loads(o)["result"] for item in o: print item['title'] print len(o) print "==================" print "tags", tags if not tags: continue o = more_topic(tags) o = json.loads(o)["result"] for item in o: print item['title'] cnt = 0 print len(o)
def test9():
    """Dump score and metadata for every topic doc in the pickled
    'all_doc_big_2' list to topic_score.csv (GBK-encoded) for manual review."""
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    fo = open("topic_score.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    header = [u'topic id', u'score', u'topic title', u'content len',
              u'image num', u'is original', u'doctor id', u'职称',
              u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
    csvwriter.writerow(convert2gbk(header))
    data_dir = "data_dir/topic_data/"
    docs = pickle_from_file(data_dir + 'all_doc_big_2')
    for doc in docs:
        topic_id = int(doc['id'].split('_')[-1])
        # NOTE(review): score is recovered from the 'tid' field divided by 10 —
        # confirm 'tid' really carries score*10 in these pickled docs.
        score = doc['tid'] / 10.0
        title = doc['title']
        content_len = doc['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % topic_id
        o = get_medicaldb_handler().do_one(sql)
        doctor_id = o[0][0]
        ti = topic_info(topic_id)
        di = doctor_info(doctor_id)
        row = [str(topic_id), str(score), title, str(content_len),
               str(ti['image_num']), str(ti['is_original']), doctor_id,
               di['title'], di['hospital_level'],
               di['first_class_clinic_no'], di['second_class_clinic_no'],
               di['city'], di['hospital_name']]
        csvwriter.writerow(convert2gbk(row))
    fo.close()
def get_problem_contents(): from general_utils.db_utils import get_medicaldb_handler pid = sys.argv[2] sql = 'select content from ask_problemcontent where problem_id=%s;' % pid o = get_medicaldb_handler().dbhandler.do_one(sql) for item in o: print item
def get_topic_data(): # score old_score = pickle_from_file(TOPIC_SCORE_FILE) biggest_id = max(old_score.keys()) # 最大的topic_id sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id o = get_medicaldb_handler().do_one(sql1) cnt = 0 for item in o: id = item[0] doctor_id = item[1] info_of_topic = topic_info(id) info_of_doc = doctor_info(doctor_id) title_tags = get_entities_cyseg(info_of_topic["title"]) content_tags = get_entities_cyseg(info_of_topic["text"]) # print "content",info_of_topic["text"] if len(content_tags) == 0 or len(info_of_topic['title']) == 0: print "no content tag", id continue score = grade_topic(info_of_topic, info_of_doc, title_tags, content_tags) old_score[int(id)] = score cnt += 1 print "new topic id num", cnt pickle_to_file(old_score, TOPIC_SCORE_FILE)
def get_unique_clinic_no_1(topic_id):
    """Resolve a doctor topic to a single first-level clinic number.

    Prefers the topic's own first-level clinic number when it belongs to the
    standalone first-level set (LONELY_FIRST_CLINIC_NO); otherwise maps the
    second-level clinic number up to its first-level parent.

    :return: clinic number as str, or None when neither is available.
    """
    first_no, second_no = get_medicaldb_handler().get_topic_clinic_no(topic_id)
    if first_no and first_no in LONELY_FIRST_CLINIC_NO:
        return str(first_no)
    if second_no:
        return map_second_clinic_2_first(second_no)
    return None
def one_user_last_qa_info(pid):
    """Build a last-event dict in the same shape hbase_utils'
    cy_time_event_one_user_kernel emits, using the ask text of problem `pid`
    from the ask_problem table.

    Sex and age are unknown here, so empty placeholders are used.
    """
    info = {"last_event": None, "last_event_time": 0}
    ask_text = get_medicaldb_handler().get_ask_by_pid(pid)
    sex = ''
    age = ''
    info["last_event"] = ["free_problem_create", [ask_text, sex, age]]
    return info
def get_qa_uids(begin, end):
    """Return the set of user ids (as ints) who created a problem strictly
    between `begin` and `end` (timestamps in seconds or milliseconds)."""
    begin_dt = timestamp2datetime(ensure_second_timestamp(begin))
    end_dt = timestamp2datetime(ensure_second_timestamp(end))
    sql = 'select distinct user_id from ask_problem where created_time>"%s" and created_time<"%s";' % (
        begin_dt, end_dt)
    rows = get_medicaldb_handler().dbhandler.do_one(sql)
    return set(int(row[0]) for row in rows)
def doctor_info(doctor_id):
    """Fetch title / clinic / hospital / location info for a doctor.

    :return: dict with title, hospital_level, first/second clinic numbers,
             city and hospital_name, or None when the doctor is missing from
             either backing table.
    """
    sql = "select title,level_title,second_class_clinic_no,first_class_clinic_no,hospital_name from symptomchecker_doctor where id='%s';" % doctor_id
    o = get_medicaldb_handler().do_one(sql)
    if not o:
        return None
    title, level_title, second_class_clinic_no, first_class_clinic_no, hospital_name = o[0]
    sql = "select province from clinic_clinicdoctorinfo where doctor_id='%s';" % doctor_id
    o = get_medicaldb_handler().do_one(sql)
    if not o:
        return None
    # NOTE(review): the 'city' key is populated from the province column.
    city = o[0][0]
    return {
        "title": title,
        "hospital_level": level_title,
        "second_class_clinic_no": second_class_clinic_no,
        "first_class_clinic_no": first_class_clinic_no,
        "city": city,
        "hospital_name": hospital_name,
    }
def test19(): import time from general_utils.db_utils import get_medicaldb_handler from general_utils.time_utils import timestamp2datetime uid = sys.argv[2] print 'uid', uid t1 = time.time() sql = 'select id from ask_problem where user_id=%s and created_time>"%s";' % ( uid, timestamp2datetime(time.time() - 180 * 86400)) o = get_medicaldb_handler().do_one(sql) if o is None or len(o) == 0: print 'nothing' return all_content = [] for item in o: id = item[0] print id sql1 = 'select content from ask_problemcontent where problem_id=%s;' % id o1 = get_medicaldb_handler().do_one(sql1) all_content.append(o1) t2 = time.time() print 'time', t2 - t1
def test14():
    """Export (topic id, doctor id, content length) for every live doctor
    topic to topic_content_len.csv."""
    from add_data_to_solr.manager.add_utils import topic_info
    from general_utils.db_utils import get_medicaldb_handler
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "";'
    rows = get_medicaldb_handler().do_one(sql)
    fo = open('topic_content_len.csv', 'w')
    csvwriter = csv.writer(fo)
    csvwriter.writerow(['topic id', 'doctor id', 'content length'])
    for row in rows:
        topic_id = int(row[0])
        info = topic_info(topic_id)
        csvwriter.writerow([str(topic_id), str(info['doctor_id']),
                            str(info['content_len'])])
    fo.close()
def get_user_qa_content_smart(uid, num=5): all_qa_text = [] sql1 = 'select id from ask_problem where user_id=%s order by created_time limit %s;' % ( uid, num) t1 = time.time() o1 = get_medicaldb_handler().do_one(sql1) t2 = time.time() print "get_user_qa_content_smart mysql time", t2 - t1 if o1 is None or len(o1) == 0: return all_qa_text for item in o1: problem_id = item[0] print '-' * 10 t1 = time.time() qa_texts = get_qa_texts_by_pid(problem_id) t2 = time.time() print "get_qa_texts_by_pid time", problem_id, t2 - t1 all_qa_text.extend(qa_texts) return all_qa_text
def get_user_qa_content2(uid, begin, end):
    """Collect the full QA text (via the HBase 'problem2' helper
    get_qa_texts_by_pid) for every problem the user created in (begin, end).

    :param uid: user id
    :param begin: window start (seconds or milliseconds timestamp)
    :param end: window end (seconds or milliseconds timestamp)
    :return: list of text snippets (may be empty)
    """
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))
    all_qa_text = []
    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
        % (uid, begin_ds, end_ds)
    id_rows = get_medicaldb_handler().do_one(sql1)
    if id_rows is None or len(id_rows) == 0:
        return all_qa_text
    for id_row in id_rows:
        all_qa_text.extend(get_qa_texts_by_pid(id_row[0]))
    return all_qa_text
def topic_info(topic_id): sql = "select doctor_id,title,content,html,image,is_original from api_doctortopic where id=%s;" % topic_id o = get_medicaldb_handler().do_one(sql) if o is None or len(o) == 0: return None doctor_id = o[0][0] # unicode if not doctor_id: return None title = o[0][1] # unicode content = o[0][2] # unicode text = "" if content and len(content) > 0: content = content.replace(u"\r", u"\\r") content = content.replace(u"\n", u"\\n") content = re.sub(ur"\\+[nrt]", u" ", content) content = json.loads(content) text = get_text(content) elif len(text) == 0: html_text = o[0][3] # unicode if html_text: # default None text = filterHTML(html_text) content_len = len(text) image = o[0][4] if image: image_num = len(json.loads(image)) else: image_num = 0 is_original = o[0][5] return { "doctor_id": doctor_id, "title": title, "text": text, "content_len": content_len, "image_num": image_num, "is_original": is_original, }
def get_qa_text(uid, begin, end, num):
    """Return ([first-ask texts], [timestamps]) for the user's `num` most
    recent problems that fall inside [begin, end].

    Reads the ask column directly from MySQL — fast, while still preserving
    each event's creation time.
    """
    empty_result = [], []
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    sql = 'select id,created_time,ask from ask_problem where user_id=%s order by id desc limit %s;' % (
        uid, num)
    rows = get_medicaldb_handler().do_one(sql)
    if rows is None or len(rows) == 0:
        return empty_result
    text_list = []
    ts_list = []
    for _pid, created, ask in rows:
        ts = datetime_str2timestamp(str(created))
        # Keep only events inside the requested window.
        if ts < begin or ts > end:
            continue
        text_list.append(unicode(ask))
        ts_list.append(ts)
    return text_list, ts_list
def main8(test_uid=None): # test cf from recommend.manager.recommend_resource import Recommend_by_user_info if test_uid == "n": test_uid = None now = time.time() # now = 1513780888 data_dict = cy_time_event_kernel_test(now - 6000, now, test_uid) if not test_uid: fo = open("20171229_1_cfr.csv", "w") else: fo = open('test.csv', 'w') csvwriter = csv.writer(fo, dialect="excel") first_line = [ u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info", u"trigger_time", u"material_id", u"material_type", u"score", u"title", u"m_tags", u"only_topic" ] csvwriter.writerow(first_line) fail_cases = { 'big_search': defaultdict(int), 'free_problem_create': defaultdict(int) } all_uid_cnt = 0 all_valid_res_cnt = 0 qa_score = [[1.0 - i / 10.0, 0] for i in range(11)] bs_score = [[1.0 - i / 10.0, 0] for i in range(11)] trigger_cnt = {'qa': 0, 'bs': 0} cal_time = {} for uid in data_dict.keys(): time.sleep(0.5) print '=' * 10, uid, '=' * 10 user_info0 = data_dict[uid] t1 = time.time() res = Recommend_by_user_info(user_info0=user_info0, uid=uid, log_mark="test8", num=6, test=True) t2 = time.time() cal_time[uid] = t2 - t1 user_info = res['user_info'] res1 = res['res'] topn_ids_scores = res['topn_ids_scores'] only_topic = res['only_topic'] status = res['status'] v_score_dict = res['v_score_dict'] if not user_info: continue all_uid_cnt += 1 trigger = user_info["trigger"] if trigger == "big_search": trigger_cnt['bs'] += 1 else: trigger_cnt['qa'] += 1 if status != 'succeed': fail_cases[trigger][status] += 1 continue texts = user_info["texts"] tags = user_info["tags"] special_population = user_info["special_population"] timestamp = user_info['timestamp'] best_id, best_title, mtype = res1[0] best_score = v_score_dict[mtype + '_' + str(best_id)] if trigger == 'big_search': for i, item in enumerate(bs_score): if best_score >= item[0]: bs_score[i][1] += 1 break else: for i, item in enumerate(qa_score): if best_score >= item[0]: qa_score[i][1] += 1 break if trigger == 'big_search': 
trigger_info = "-".join(texts) elif trigger == "free_problem_create": problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp( uid, timestamp) if not ask: ask = texts[0] trigger_info = '-'.join([str(problem_id), str(ask)]) # [u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info", u"trigger_time", # u"material_id", u"material_type", u"score", u"title", u"m_tags", u"only_topic", for id, title, mtype in res1: prefix = 'news_' if mtype == 'news' else 'r_topic_' mtags = get_news_tags_from_solr(prefix + str(uid)) rows = [ str(uid), '-'.join(tags), special_population, trigger, trigger_info, str(timestamp), str(id), mtype, v_score_dict[mtype + '_' + str(id)], title, '-'.join(mtags), str(only_topic) ] rows = convert2gbk(rows) csvwriter.writerow(rows) if res1: all_valid_res_cnt += 1 # fail_cases for trigger in fail_cases: for reason in fail_cases[trigger]: rows = [trigger, reason, str(fail_cases[trigger][reason])] rows = convert2gbk(rows) csvwriter.writerow(rows) # ana rows = ['all', str(all_uid_cnt), 'res_cnt', str(all_valid_res_cnt)] rows = convert2gbk(rows) csvwriter.writerow(rows) # score cut rows = ['bs score cut'] csvwriter.writerow(rows) cum_cnt = 0 for score, cnt in bs_score: cum_cnt += cnt true_recall = cum_cnt / float(trigger_cnt['bs']) rows = [str(score), str(cnt), str(true_recall)] csvwriter.writerow(rows) rows = ['qa score cut'] csvwriter.writerow(rows) cum_cnt = 0 for score, cnt in qa_score: cum_cnt += cnt true_recall = cum_cnt / float(trigger_cnt['bs']) rows = [str(score), str(cnt), str(true_recall)] csvwriter.writerow(rows) # cal time s_cal_time = sorted(cal_time.iteritems(), key=lambda x: x[1], reverse=True) for u, t in s_cal_time[:20]: csvwriter.writerow([str(u), str(t)]) fo.close()
def main5(test_uid=None, now=None): if test_uid == "n": test_uid = None now = time.time() if not now: now = 1512379920.1 else: now = float(ensure_second_timestamp(now)) t10 = time.time() data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid) t20 = time.time() print "len(data_dict)", len(data_dict) if not test_uid: fo = open("20171220_1_res.csv", "w") else: fo = open('test.csv', 'w') csvwriter = csv.writer(fo, dialect="excel") first_line = [ u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info", u"trigger_time", u"material_id", u"material_type", u"score", u"title", u"m_tags", u"only_topic", u"best_id", u"best_score", u"time" ] csvwriter.writerow(first_line) all_call_cnt = 0 all_valid_res_cnt = 0 exception_cnt = 0 status_dict = defaultdict(int) total_time = [] slow_case = [] for uid in data_dict: all_call_cnt += 1 user_info0 = data_dict[uid] try: # if True: t1 = time.time() res = Recommend_by_user_info(user_info0, uid, log_mark='testmain5', test=True) # return = {"user_info": None, "res": None, "topn_ids_scores": None, "only_topic": None,"status":"succeed"} t2 = time.time() print t2 - t1 if t2 - t1 >= 3: break user_info = res['user_info'] res1 = res['res'] topn_ids_scores = res['topn_ids_scores'] only_topic = res['only_topic'] status = res['status'] v_score_dict = res['v_score_dict'] best_id, best_title, mtype = res1[0] this_time = t2 - t1 if this_time >= 1.0: slow_case.append([uid, this_time]) total_time.append(t2 - t1) except Exception, e: print e exception_cnt += 0 continue status_dict[status] += 1 #################### # if not only_topic: # continue #################### if best_id == -1 or user_info is None: continue print '=================' print uid texts = user_info["texts"] tags = user_info["tags"] special_population = user_info["special_population"] trigger = user_info["trigger"] timestamp = user_info['timestamp'] best_score = v_score_dict[mtype + '_' + str(best_id)] # if trigger == "big_search": # continue if trigger == 
'big_search': trigger_info = "-".join(texts) elif trigger == "free_problem_create": problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp( uid, timestamp) if not ask: ask = texts[0] trigger_info = '-'.join([str(problem_id), str(ask)]) print "u tags", "-".join(tags), special_population print trigger_info, best_id, best_score, best_title for unique_id, score in topn_ids_scores: material_type, id = unique_id.split('_') if material_type == "news": title, _ = get_newsdb_handler().get_title_digest_by_nid(id) m_tags = get_news_tags_from_solr("news_" + str(id)) elif material_type == "topic": title = get_medicaldb_handler().get_topic_title(id) m_tags = get_news_tags_from_solr("r_topic_" + str(id)) rows = [ str(uid), "-".join(tags), str(special_population), trigger, trigger_info, str(timestamp), str(id), material_type, str(score), title, "-".join(m_tags), str(only_topic), str(best_id), str(best_score), str(this_time) ] rows = convert2gbk(rows) csvwriter.writerow(rows) all_valid_res_cnt += 1
def get_systag_data(): # 获取热卖tag相关数据,keywords,target_param,name等 sql = "select sysTag_id, keywords ,clinic_no,second_clinic_no from ner_systagsolrgenerateconf;" data = dict() data["systag"] = {} # 9:{'tag_name':'gastroscope_colonoscope','plan':[{'url':url1,'name':name1},{'url':url2,'name':name2}]} data['keyword'] = defaultdict(list) # '感冒':[systag_id1,systag_id2...] data['keyword_extend'] = {} data['clinic_no'] = defaultdict(list) # u'1':[systag_id1] all_plan_name = [] o = get_diagnose_handler().dbhandler.do_one(sql) for item in o: systag_id = item[0] keywords = item[1].strip() clinic_no = item[2].strip() second_clinic_no = item[3].strip() # 科室信息与systag_id的对应关系,不标记区分一二级科室 if clinic_no: clinic_nos = clinic_no.split() for x in clinic_nos: x = ensure_unicode(x) data['clinic_no'][x].append(systag_id) if second_clinic_no: second_clinic_nos = second_clinic_no.split() for x in second_clinic_nos: x = ensure_unicode(x) data['clinic_no'][x].append(systag_id) # data['systag'] tag_name = get_diagnose_handler().get_systag_en_name(systag_id) sql1 = 'select id,name,target_param from api_userhomehotsalegallery where tag="%s" and is_online=1;' % tag_name o1 = get_medicaldb_handler().do_one(sql1) data['systag'][systag_id] = {'tag_name': tag_name, 'plan': []} if not o1: continue for item1 in o1: plan_id = item1[0] name = item1[1] url = item1[2].replace('\r\n', '') print systag_id, tag_name, name, url data['systag'][systag_id]['plan'].append({ 'url': url, 'name': name, 'plan_id': plan_id }) all_plan_name.append([systag_id, name]) if keywords == u"*": continue # data['keyword'] keywords = keywords.lower().split() for k in keywords: if systag_id not in data['keyword'][k]: data['keyword'][k].append(systag_id) # 用相似词将keyword扩充 num = 20 master_slave = {} high_freq_words = get_high_freq_words() for k in data['keyword']: systag_id_list = data['keyword'][k] # data['keyword_extend'][k] = [systag_id_list, 1.0] master_slave[k] = [systag_id_list, []] for w, s in get_similar_redis(k, num): w = 
ensure_unicode(w) if len(w) < 2: # 去掉长度为1的相似词 continue if s < 0.41: # 分数过低的不要 break if w in high_freq_words: # 去掉公认的高频词 continue data['keyword_extend'][w] = [systag_id_list, s] master_slave[k][1].append([w, s]) for k in data['keyword']: systag_id_list = data['keyword'][k] data['keyword_extend'][k] = [systag_id_list, 1.0] # 把keyword_extend信息存文件里,方便查看 with open(SYSTAG_DATA_CHECK_FILE, 'w') as fc: for k in master_slave: systag_id_list, ws_list = master_slave[k] fc.write('###' + k + '|||' + json.dumps(systag_id_list) + '=' * 10 + '\n') for w, s in ws_list: fc.write(w + '|||' + str(s) + '\n') for systag_id, plan_name in all_plan_name: fc.write(str(systag_id) + '---' + plan_name + '\n') pickle_to_file(data, SYSTAG_DATA_FILE)
def add_topic(): batch_size = 1000 all_doc_small = [] all_doc_big = [] docs_small = [] docs_big = [] sql = 'select id from api_doctortopic where is_deleted=0 and title <> "" and id > 154517 limit 20000;' o = get_medicaldb_handler().do_one(sql) id_prefix_small = "r_topic_" id_prefix_big = "r_topicbig_" content_type_small = "r_topic" content_type_big = "r_topicbig" # fo = open("topic_score.csv", "w") # csvwriter = csv.writer(fo, dialect='excel') # first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original', # u'doctor id', u'职称', u'医院级别', u'科室', u'城市'] # first_line = convert2gbk(first_line) # csvwriter.writerow(first_line) # index = range(len(o)) # shuffle(index) ff = open("failed_id","a") solr = SolrHelper("online").get_solr("topic_tpl") is_end = False for item in o: if item == o[-1]: is_end = True #print "is_end",is_end topic_id = item[0] print "topic_id",topic_id info_of_topic = topic_info(topic_id) topic_title = info_of_topic['title'] if len(topic_title) == 0: #print "empty title",topic_id continue doctor_id = info_of_topic["doctor_id"] info_of_doctor = doctor_info(doctor_id) title_tags = get_entities_cyseg(info_of_topic["title"]) content_tags = get_entities_cyseg(info_of_topic["text"]) if len(content_tags) == 0: print "no content tag",topic_id continue title_vecs = get_vecs2(title_tags) content_vecs = get_vecs2(content_tags) print "content_vecs len",len(content_vecs) score = int(grade_topic(info_of_topic, info_of_doctor, title_tags, content_tags) * 10) if title_vecs and len(title_vecs) > 0: #若title有vec,存之 try: add_topic_kernel(topic_id=topic_id, docs=docs_small, tags=title_tags, score=score, info_of_topic=info_of_topic, info_of_doctor=info_of_doctor, vecs=title_vecs, id_prefix=id_prefix_small, content_type=content_type_small ) except: ff.write("small|||" + str(topic_id) + "\n") if content_vecs and len(content_vecs) > 0: #若content有vec,存之 try: add_topic_kernel(topic_id=topic_id, docs=docs_big, tags=content_tags, score=score, 
info_of_topic=info_of_topic, info_of_doctor=info_of_doctor, vecs=content_vecs, id_prefix=id_prefix_big, content_type=content_type_big) except: ff.write("big|||" + str(topic_id) + "\n") ########### ############ print "eln docs_small",len(docs_small) print "len docs_big",len(docs_big) if len(docs_small) == batch_size or is_end: print "topic_id",topic_id print "is end",is_end print "add small", len(docs_small) #print json.dumps(docs_small) #add(docs_small,solr) all_doc_small.extend(docs_small) docs_small = [] if len(docs_big) == batch_size or is_end: print "topic_id", topic_id print "is end", is_end print "add big", len(docs_big) #print json.dumps(docs_big) #add(docs_big, solr) all_doc_big.extend(docs_big) docs_big = [] ff.close() pickle_to_file(all_doc_small,"all_doc_small_3") pickle_to_file(all_doc_big,"all_doc_big_3")
def recall_together(text, tags, weights, cates, special_population, trigger_type=None, only_topic=False, yxjt=False): # news and topic 一起召回 news_cons = population_cons2(special_population) # 不能在这些分类 news_limits = population_limits(special_population) # 必须在这些分类 if trigger_type == "big_search": drug_words = [ x for x in tags if (x in cates and cates[x] == 'DRUG_DESC') ] else: drug_words = None rows = 25 # 这里太大可能导致文章多->访问word2vec次数多->超时的可能性变大 if yxjt: # 医学讲堂 rows = 25 # 医学讲堂只处理标题所以这里可以取大一些 res_ids, title_dict, score_dict = more_news_and_topic_from_solr( # text=text, text='', tags=tags, weights=weights, rows=rows, drug_words=drug_words, news_cons=news_cons, news_limits=news_limits, topic_only=only_topic) # 召回后,进行过滤 res_ids1 = [] all_titles = set() all_doctor_ids = set() for id in res_ids: print '--------======-----' print id # news and topic 标题去重 title = title_dict.get(id, '') if title in all_titles: # 对所有标题去重 continue all_titles.add(title) # 获取物料类型和真实id type, true_id = id.split('_') # 医学讲堂标题长度限制 if type == 'topic' and yxjt: if len(title) < 8: continue # 对topic的医生id去重 if type == 'topic': doctor_id = get_medicaldb_handler().get_topic_doctor_id(true_id) if doctor_id and doctor_id in all_doctor_ids: continue all_doctor_ids.add(doctor_id) # 规则过滤 if not child_match(special_population, text, title_dict[id]): continue res_ids1.append(id) print "recall ids" for id in res_ids1: print id, score_dict[id], title_dict[id] return res_ids1, title_dict, score_dict