def test9():
    """Export topic scores with topic/doctor metadata to topic_score.csv.

    Reads pre-scored solr docs from data_dir/topic_data/all_doc_big_2,
    looks up the owning doctor of each topic in the medical DB, and writes
    one GBK-encoded CSV row per topic.

    Fixes vs. original: the output handle is now closed even when a DB or
    info lookup raises (try/finally), and the local no longer shadows the
    builtin ``id``.
    """
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    fo = open("topic_score.csv", "w")
    try:
        csvwriter = csv.writer(fo, dialect='excel')
        first_line = [u'topic id', u'score', u'topic title', u'content len',
                      u'image num', u'is original', u'doctor id', u'职称',
                      u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
        csvwriter.writerow(convert2gbk(first_line))
        d = "data_dir/topic_data/"
        b2 = pickle_from_file(d + 'all_doc_big_2')
        for item in b2:
            # solr doc ids look like "<prefix>_<topic_id>"
            topic_id = int(item['id'].split('_')[-1])
            score = item['tid'] / 10.0  # 'tid' stores score scaled by 10
            title = item['title']
            content_len = item['content_len']
            sql = 'select doctor_id from api_doctortopic where id=%s;' % topic_id
            o = get_medicaldb_handler().do_one(sql)
            doctor_id = o[0][0]
            ti = topic_info(topic_id)
            di = doctor_info(doctor_id)
            rows = [str(topic_id), str(score), title, str(content_len),
                    str(ti['image_num']), str(ti['is_original']), doctor_id,
                    di['title'], di['hospital_level'],
                    di['first_class_clinic_no'], di['second_class_clinic_no'],
                    di['city'], di['hospital_name']]
            csvwriter.writerow(convert2gbk(rows))
    finally:
        fo.close()
def get_topic_data(): # score old_score = pickle_from_file(TOPIC_SCORE_FILE) biggest_id = max(old_score.keys()) # 最大的topic_id sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id o = get_medicaldb_handler().do_one(sql1) cnt = 0 for item in o: id = item[0] doctor_id = item[1] info_of_topic = topic_info(id) info_of_doc = doctor_info(doctor_id) title_tags = get_entities_cyseg(info_of_topic["title"]) content_tags = get_entities_cyseg(info_of_topic["text"]) # print "content",info_of_topic["text"] if len(content_tags) == 0 or len(info_of_topic['title']) == 0: print "no content tag", id continue score = grade_topic(info_of_topic, info_of_doc, title_tags, content_tags) old_score[int(id)] = score cnt += 1 print "new topic id num", cnt pickle_to_file(old_score, TOPIC_SCORE_FILE)
def test(): p = pickle_from_file(showlist_parsed_filename) for key in p: before_key = p[key].get('before_key', None) after_key = p[key].get('after_key', None) print '*' * 30 print 'before key', before_key print 'this key', key print 'after key', after_key
def test7():
    """Push a pickled list of solr docs (filename from argv[2]) to the
    online 'topic_tpl' solr core."""
    from general_utils.file_utils import pickle_from_file
    from add_data_to_solr.manager.add import add as add_all
    from add_data_to_solr.cy_solr_local.solr_base import SolrHelper
    docs = pickle_from_file("data_dir/topic_data/" + sys.argv[2])
    target = SolrHelper("online").get_solr("topic_tpl")
    add_all(docs, target)
def test11():
    """Dump every medical word as (id, word, cate) rows to
    all_medical_words.csv, GBK-encoded.

    Fixes vs. original: output handle is closed even if a row write raises
    (try/finally), and the local no longer shadows the builtin ``id``.
    """
    filename = "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    endict = pickle_from_file(filename)
    fo = open("all_medical_words.csv", "w")
    try:
        csvwriter = csv.writer(fo, dialect='excel')
        for w in endict:
            word_id = endict[w]['id']
            cate = endict[w]['cate']
            csvwriter.writerow(convert2gbk([str(word_id), w, cate]))
    finally:
        fo.close()
def insert_one_day(): # 必须加id,否则默认为'',会覆盖之前的数据 log_file_name1, ana_file_name1, bdp_file_name1, log_file_name2, ana_file_name2, bdp_file_name2, bdp_file_name = get_yesterday_log_filename( ) bdp_data = pickle_from_file(bdp_file_name) now = time.time() yesterday = now - 86400.0 ds = timestamp2datetime(now) date_int = int(timestamp2date(yesterday)) fields = ["created_date", "id"] data0 = [ds, date_int] for field in bdp_data: cnt = bdp_data[field] print field, type(field) print cnt, type(cnt) fields.append(field) data0.append(cnt) print "fields", fields print "data0", data0 data = [data0] insert_kernel(fields, data)
def test8():
    """Rebuild data_dir/topic_score.pickle as {topic_id: score} from the
    'big' doc dumps.

    Fixes vs. original: the three 'small' dumps were loaded and summed into
    a variable that was explicitly marked unused, and get_medicaldb_handler
    was imported but never called — that dead work is removed.
    """
    from general_utils.file_utils import pickle_from_file, pickle_to_file
    d = "data_dir/topic_data/"
    b1 = pickle_from_file(d + 'all_doc_big')
    b2 = pickle_from_file(d + 'all_doc_big_2')
    b3 = pickle_from_file(d + 'all_doc_big_3')
    fo_name = "data_dir/topic_score.pickle"
    res = {}
    # Truncate any stale score file before writing the fresh pickle
    # (kept from the original — presumably pickle_to_file appends; confirm).
    open(fo_name, "w").close()
    for item in b1 + b2 + b3:
        # solr doc ids look like "<prefix>_<topic_id>"
        topic_id = int(item['id'].split('_')[-1])
        res[topic_id] = item['tid'] / 10.0  # 'tid' stores score scaled by 10
    pickle_to_file(res, fo_name)
class DbDataLocalHandler(object):
    """Read-only, in-process lookups over pickled DB snapshots.

    All data files are loaded once at class-definition (module import)
    time; every accessor is a classmethod over that shared class state.

    NOTE(review): several methods below read ``cls.data["news"]`` but the
    line that loads ``data`` is commented out here — either it is assigned
    elsewhere or those methods will raise AttributeError; confirm.
    """
    # data = pickle_from_file(RESOURCE_DATA_FILE)
    topic_score = pickle_from_file(TOPIC_SCORE_FILE)
    # stop_word = load_simple_lines(STOP_WORD_PATH)
    bodypart_data = pickle_from_file(BODYPART_FILE)
    medical_entity_cate = pickle_from_file(MEDICAL_ENTITY_FILE)
    medical_relation_drug = pickle_from_file(MEDICAL_RELATION_DRUG_FILE)
    systag_data = pickle_from_file(SYSTAG_DATA_FILE)

    @classmethod
    def get_news_title(cls, news_id):
        """Return the title of the news item, or u'' if unknown."""
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][0]
        else:
            return u''

    @classmethod
    def get_news_digest(cls, news_id):
        """Return the digest of the news item, or u'' if unknown."""
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][1]
        else:
            return u''

    @classmethod
    def get_news_content(cls, news_id):
        """Return the body text of the news item, or u'' if unknown."""
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][2]
        return u''

    @classmethod
    def get_news_type(cls, news_id):
        """Return the type field of the news item, or u'' if unknown."""
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][3]
        return u''

    @classmethod
    def get_topic_score(cls, topic_id):
        """Return the precomputed score for a topic, 0.0 if unscored."""
        topic_id = int(topic_id)
        if topic_id in cls.topic_score:
            return cls.topic_score[topic_id]
        return 0.0

    @classmethod
    def is_in_bodypart(cls, word):
        """True if *word* is a known body-part term (u'血' is excluded)."""
        word = ensure_unicode(word)
        if word == u"血":
            return False
        return word in cls.bodypart_data

    @classmethod
    def get_entity_cate(cls, word):
        """Return the entity category for *word* (lowercased), '' if none."""
        word = ensure_unicode(word.lower())
        return cls.medical_entity_cate.get(word, '')

    @classmethod
    def is_entity(cls, word):
        """True if *word* is a known medical entity."""
        word = ensure_unicode(word)
        return word in cls.medical_entity_cate

    @classmethod
    def get_relation_drug(cls, word, num=100):
        """Return up to *num* drugs related to *word*, [] if none."""
        word = ensure_unicode(word)
        return cls.medical_relation_drug.get(word, [])[:num]

    @classmethod
    def get_keyword_relation_systag_id(cls, word):
        """Return the systag-id list associated with the keyword, [] if none."""
        word = ensure_unicode(word)
        return cls.systag_data['keyword'].get(word, [])
        # return choice([[1, 2], [4]])

    @classmethod
    def get_systag_relation_plan(cls, systag_id):
        """Return the plan list for a systag id, [] if unknown."""
        # systag_id = unicode(systag_id)
        if systag_id not in cls.systag_data['systag']:
            return []
        return cls.systag_data['systag'][systag_id]['plan']

    @classmethod
    def get_systagid_name(cls, systag_id):
        """Return the tag name for a systag id (KeyError if unknown)."""
        return cls.systag_data['systag'][systag_id]['tag_name']

    @classmethod
    def get_systagid_relation_planid(cls, systag_id):
        """Return the plan ids attached to a systag id, [] on any failure."""
        systag_id = int(systag_id)
        try:
            return [
                item['plan_id']
                for item in cls.systag_data['systag'][systag_id]['plan']
            ]
        except:
            return []

    @classmethod
    def get_extend_keyword_relation_systag_id(cls, word):
        """Lookup after similar-word expansion of hot-sale tags.

        Returns [systag_id_list, similarity]; [[], 0.0] if *word* is absent.
        """
        word = ensure_unicode(word)
        return cls.systag_data['keyword_extend'].get(word, [[], 0.0])

    @classmethod
    def clinic_no_relation_systag_id(cls, clinic_no):
        """Return the systag ids mapped to a clinic number, [] if none."""
        clinic_no = unicode(clinic_no)
        return cls.systag_data['clinic_no'].get(clinic_no, [])
def test10(): from rpc_services.word2vec_api import get_similar from rpc_services.medical_service_api import tokenizer_default # 寻找相似词 # id query 分词结果 实体词分类 疾病词1 疾病词2 疾病词3 症状词1 症状词2 症状词3 药品词1 药品词2 药品词3 # input_file = "/Users/satoshi/Documents/work file/query_result_o1.csv" input_file = sys.argv[2] endict = pickle_from_file( "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle" ) first_line = [ u"id", u"query", u"words", u"cates", u"disease", u"symptom", u"drug", ] fo = open("query_similar_words.csv", "w") csvwriter = csv.writer(fo, dialect='excel') csvwriter.writerow(first_line) with open(input_file, 'r') as f: for l in f: ll = l.strip('\n').split(',') print l print ll id, text = ll[0], ll[1] text = text.decode('gbk', 'ignore') similar_word_score_dict = {} seged = [] cates = [] tokens = tokenizer_default([text])["tokens"][0] for item in tokens: if u"neg_ne" in item: continue if "cate" not in item: continue word = item['token'] if word in seged: continue seged.append(word) cates.append(item['cate']) for x in seged: x_s = get_similar(x, 100) if not x_s: continue for w, s in x_s: if w not in similar_word_score_dict: similar_word_score_dict[w] = s elif s > similar_word_score_dict[w]: similar_word_score_dict[w] = s dis = [] sym = [] drug = [] s_similar_word_score = sorted(similar_word_score_dict.iteritems(), key=lambda x: x[1], reverse=True) for w, s in s_similar_word_score: if w not in endict: continue cate = endict[w]['cate'] if cate == "SYMPTOM_DESC" and len(sym) < 3: sym.append(w) if cate == "DISEASE_DESC" and len(dis) < 3: dis.append(w) if cate == "DRUG_DESC" and len(drug) < 3: drug.append(w) row = [ id, text, u"|||".join(seged), u"|||".join(cates), u"|||".join(dis), u"|||".join(sym), u"|||".join(drug) ] row = convert2gbk(row) csvwriter.writerow(row) fo.close()