def get_topic_data():
    # Incrementally score topics: load the pickled {topic_id: score} dict,
    # then grade only rows newer than the largest id already scored.
    old_score = pickle_from_file(TOPIC_SCORE_FILE)
    biggest_id = max(old_score.keys())  # largest topic_id already scored
    sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id
    o = get_medicaldb_handler().do_one(sql1)
    cnt = 0
    for item in o:
        id = item[0]
        doctor_id = item[1]
        info_of_topic = topic_info(id)
        info_of_doc = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])
        if len(content_tags) == 0 or len(info_of_topic['title']) == 0:
            print "no content tag", id
            continue
        score = grade_topic(info_of_topic, info_of_doc, title_tags, content_tags)
        old_score[int(id)] = score
        cnt += 1
    print "new topic id num", cnt
    pickle_to_file(old_score, TOPIC_SCORE_FILE)
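# A minimal sketch of the pickle helpers used above, assuming they simply
# wrap the standard `pickle` module (the real implementations live in
# general_utils and may use a different protocol or file layout). It shows
# why get_topic_data can be re-run cheaply: only ids greater than
# max(old_score) ever get (re)graded.
import pickle

def pickle_to_file_sketch(obj, path):
    # binary mode is required for pickle on all platforms
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def pickle_from_file_sketch(path):
    with open(path, "rb") as f:
        return pickle.load(f)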
def test9():
    # Dump scored topics (from the pickled all_doc_big_2 batch) to a CSV
    # for manual inspection in Excel.
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info

    fo = open("topic_score.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    # Chinese header cells: 职称 = professional title, 医院级别 = hospital
    # level, 1科室/2科室 = first/second-class clinic dept, 城市 = city.
    first_line = [u'topic id', u'score', u'topic title', u'content len',
                  u'image num', u'is original', u'doctor id', u'职称',
                  u'医院级别', u'1科室', u'2科室', u'城市', u'hospital_name']
    first_line = convert2gbk(first_line)
    csvwriter.writerow(first_line)
    d = "data_dir/topic_data/"
    b2 = pickle_from_file(d + 'all_doc_big_2')
    for item in b2:
        id = int(item['id'].split('_')[-1])
        score = item['tid'] / 10.0  # scores were stored as int(score * 10)
        title = item['title']
        content_len = item['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % id
        o = get_medicaldb_handler().do_one(sql)
        doctor_id = o[0][0]
        ti = topic_info(id)
        di = doctor_info(doctor_id)
        image_num = ti['image_num']
        is_original = ti['is_original']
        d_title = di['title']
        h_level = di['hospital_level']
        h_name = di['hospital_name']
        clinic_no = di['first_class_clinic_no']
        s_clinic_no = di['second_class_clinic_no']
        city = di['city']
        rows = [str(id), str(score), title, str(content_len), str(image_num),
                str(is_original), doctor_id, d_title, h_level, clinic_no,
                s_clinic_no, city, h_name]
        rows = convert2gbk(rows)
        csvwriter.writerow(rows)
    fo.close()
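# test9 pushes every row through convert2gbk before csv.writerow. A plausible
# sketch of that helper, assuming it just GBK-encodes each unicode cell so
# Excel on a Chinese-locale system opens the CSV with readable headers; the
# real helper lives in the repo's utils and may handle errors differently.
def convert2gbk_sketch(row):
    out = []
    for cell in row:
        if isinstance(cell, unicode):  # Python 2: only unicode needs encoding
            out.append(cell.encode("gbk", "ignore"))
        else:
            out.append(cell)
    return out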
def test14():
    from add_data_to_solr.manager.add_utils import topic_info
    from general_utils.db_utils import get_medicaldb_handler

    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "";'
    o = get_medicaldb_handler().do_one(sql)
    fo = open('topic_content_len.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['topic id', 'doctor id', 'content length']
    csvwriter.writerow(first_line)
    for item in o:
        id = int(item[0])
        info_of_topic = topic_info(id)
        doctor_id = info_of_topic['doctor_id']
        content_len = info_of_topic['content_len']
        csvwriter.writerow([str(id), str(doctor_id), str(content_len)])
    fo.close()
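# The queries in this file interpolate ids into SQL with `%`, which is safe
# here only because the values are integers from our own tables. If the db
# handler wraps a DB-API connection (an assumption; `do_one`'s real signature
# is in general_utils.db_utils), driver placeholders avoid quoting problems:
import MySQLdb  # assumed driver

def fetch_doctor_id(conn, topic_id):
    cur = conn.cursor()
    # the driver escapes the parameter itself; no manual string formatting
    cur.execute("select doctor_id from api_doctortopic where id=%s",
                (topic_id,))
    row = cur.fetchone()
    return row[0] if row else None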
def add_topic():
    # Build Solr docs for topics: title vectors go into the "small" core
    # (r_topic), content vectors into the "big" core (r_topicbig). Batches
    # are currently pickled instead of posted; failures are logged by id.
    batch_size = 1000
    all_doc_small = []
    all_doc_big = []
    docs_small = []
    docs_big = []
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "" and id > 154517 limit 20000;'
    o = get_medicaldb_handler().do_one(sql)
    id_prefix_small = "r_topic_"
    id_prefix_big = "r_topicbig_"
    content_type_small = "r_topic"
    content_type_big = "r_topicbig"
    ff = open("failed_id", "a")
    solr = SolrHelper("online").get_solr("topic_tpl")  # kept for the disabled add() calls below
    for item in o:
        topic_id = item[0]
        print "topic_id", topic_id
        info_of_topic = topic_info(topic_id)
        topic_title = info_of_topic['title']
        if len(topic_title) == 0:
            continue
        doctor_id = info_of_topic["doctor_id"]
        info_of_doctor = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])
        if len(content_tags) == 0:
            print "no content tag", topic_id
            continue
        title_vecs = get_vecs2(title_tags)
        content_vecs = get_vecs2(content_tags)
        print "content_vecs len", len(content_vecs)
        score = int(grade_topic(info_of_topic, info_of_doctor, title_tags, content_tags) * 10)
        if title_vecs:  # the title has vectors: queue the small doc
            try:
                add_topic_kernel(topic_id=topic_id, docs=docs_small, tags=title_tags,
                                 score=score, info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor, vecs=title_vecs,
                                 id_prefix=id_prefix_small,
                                 content_type=content_type_small)
            except Exception:
                ff.write("small|||" + str(topic_id) + "\n")
        if content_vecs:  # the content has vectors: queue the big doc
            try:
                add_topic_kernel(topic_id=topic_id, docs=docs_big, tags=content_tags,
                                 score=score, info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor, vecs=content_vecs,
                                 id_prefix=id_prefix_big,
                                 content_type=content_type_big)
            except Exception:
                ff.write("big|||" + str(topic_id) + "\n")
        print "len docs_small", len(docs_small)
        print "len docs_big", len(docs_big)
        if len(docs_small) >= batch_size:
            print "add small", len(docs_small)
            # add(docs_small, solr)  # direct Solr add disabled; batches are pickled below
            all_doc_small.extend(docs_small)
            docs_small = []
        if len(docs_big) >= batch_size:
            print "add big", len(docs_big)
            # add(docs_big, solr)
            all_doc_big.extend(docs_big)
            docs_big = []
    # Flush the remainders after the loop. The old in-loop `is_end` check
    # (comparing item to o[-1]) dropped the tail whenever the last row was
    # skipped by a `continue`.
    all_doc_small.extend(docs_small)
    all_doc_big.extend(docs_big)
    ff.close()
    pickle_to_file(all_doc_small, "all_doc_small_3")
    pickle_to_file(all_doc_big, "all_doc_big_3")
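# add_topic flushes docs_small/docs_big whenever a batch fills, plus one
# final flush for the tail. A small generator capturing the same pattern;
# the names here are illustrative, not part of the repo:
def iter_batches(items, batch_size):
    batch = []
    for it in items:
        batch.append(it)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:  # tail flush, so a final partial batch is never dropped
        yield batch

# Usage, e.g. to replay the pickled docs into Solr in 1000-doc chunks:
#   for chunk in iter_batches(all_doc_small, 1000):
#       add(chunk, solr)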