Ejemplo n.º 1
0
def test9():
    """Dump per-topic score/metadata rows to topic_score.csv (GBK-encoded).

    Reads pre-scored solr docs from 'all_doc_big_2', looks up each topic's
    owning doctor in the medical DB, enriches with topic/doctor info, and
    writes one CSV row per topic.
    """
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    # 'with' guarantees the CSV handle is closed even if a DB lookup or
    # conversion raises (the original leaked the handle on error).
    with open("topic_score.csv", "w") as fo:
        csvwriter = csv.writer(fo, dialect='excel')
        first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
                      u'doctor id', u'职称', u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
        csvwriter.writerow(convert2gbk(first_line))
        d = "data_dir/topic_data/"
        b2 = pickle_from_file(d + 'all_doc_big_2')
        for item in b2:
            # solr doc id looks like "<prefix>_<topic_id>"; renamed from 'id'
            # to avoid shadowing the builtin.
            topic_id = int(item['id'].split('_')[-1])
            score = item['tid'] / 10.0  # 'tid' field stores score * 10
            title = item['title']
            content_len = item['content_len']
            sql = 'select doctor_id from api_doctortopic where id=%s;' % topic_id
            o = get_medicaldb_handler().do_one(sql)
            doctor_id = o[0][0]
            ti = topic_info(topic_id)
            di = doctor_info(doctor_id)
            rows = [str(topic_id), str(score), title, str(content_len),
                    str(ti['image_num']), str(ti['is_original']), doctor_id,
                    di['title'], di['hospital_level'],
                    di['first_class_clinic_no'], di['second_class_clinic_no'],
                    di['city'], di['hospital_name']]
            csvwriter.writerow(convert2gbk(rows))
Ejemplo n.º 2
0
def get_topic_data():
    """Incrementally score newly added topics and persist the score table.

    Loads the existing {topic_id: score} dict from TOPIC_SCORE_FILE, queries
    the medical DB for topics newer than the largest already-scored id,
    grades each new topic, and writes the merged dict back to the same file.
    """
    # score
    old_score = pickle_from_file(TOPIC_SCORE_FILE)
    biggest_id = max(old_score.keys())  # largest topic_id scored so far
    sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id
    # NOTE(review): do_one() is iterated as a multi-row result set below --
    # presumably it returns all rows despite the name; confirm in db_utils.
    o = get_medicaldb_handler().do_one(sql1)
    cnt = 0
    for item in o:
        id = item[0]
        doctor_id = item[1]
        info_of_topic = topic_info(id)
        info_of_doc = doctor_info(doctor_id)
        # entity tags extracted from the topic title and body text
        title_tags = get_entities_cyseg(info_of_topic["title"])

        content_tags = get_entities_cyseg(info_of_topic["text"])
        # print "content",info_of_topic["text"]

        # skip topics whose body yields no entities or whose title is empty
        if len(content_tags) == 0 or len(info_of_topic['title']) == 0:
            print "no content tag", id
            continue

        score = grade_topic(info_of_topic, info_of_doc, title_tags,
                            content_tags)
        old_score[int(id)] = score
        cnt += 1
    print "new topic id num", cnt
    pickle_to_file(old_score, TOPIC_SCORE_FILE)
Ejemplo n.º 3
0
def test():
    p = pickle_from_file(showlist_parsed_filename)
    for key in p:
        before_key = p[key].get('before_key', None)
        after_key = p[key].get('after_key', None)
        print '*' * 30
        print 'before key', before_key
        print 'this key', key
        print 'after key', after_key
Ejemplo n.º 4
0
def test7():
    """Load a pickled batch of solr docs (filename from sys.argv[2]) and push
    it to the online 'topic_tpl' solr core."""
    from general_utils.file_utils import pickle_from_file
    filename = sys.argv[2]
    solr_file = "data_dir/topic_data/" + filename
    solr_docs = pickle_from_file(solr_file)
    from add_data_to_solr.manager.add import add as add_all
    from add_data_to_solr.cy_solr_local.solr_base import SolrHelper
    # "online" selects the production solr environment
    solr = SolrHelper("online").get_solr("topic_tpl")
    add_all(solr_docs, solr)
Ejemplo n.º 5
0
def test11():
    """Export every medical word (id, word, category) to all_medical_words.csv
    in GBK encoding."""
    filename = "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    endict = pickle_from_file(filename)
    # 'with' guarantees the CSV handle is closed even if convert2gbk or
    # writerow raises (the original leaked the handle on error).
    with open("all_medical_words.csv", "w") as fo:
        csvwriter = csv.writer(fo, dialect='excel')
        for w in endict:
            entry = endict[w]
            # renamed from 'id'/'cate' locals; avoids shadowing builtin 'id'
            row = convert2gbk([str(entry['id']), w, entry['cate']])
            csvwriter.writerow(row)
def insert_one_day():
    """Aggregate yesterday's BDP counters and insert them as one DB row.

    Builds a single row whose columns are created_date, an integer date id,
    and one column per counter name found in the pickled BDP data.
    """
    # The id column must be set explicitly: it defaults to '' otherwise and
    # would overwrite the previously inserted data.
    log_file_name1, ana_file_name1, bdp_file_name1, log_file_name2, ana_file_name2, bdp_file_name2, bdp_file_name = get_yesterday_log_filename(
    )
    bdp_data = pickle_from_file(bdp_file_name)
    now = time.time()
    yesterday = now - 86400.0  # 24 hours in seconds
    ds = timestamp2datetime(now)
    # integer form of yesterday's date, used as the row id
    date_int = int(timestamp2date(yesterday))
    fields = ["created_date", "id"]
    data0 = [ds, date_int]

    # one column per counter in the pickled dict
    for field in bdp_data:
        cnt = bdp_data[field]
        print field, type(field)
        print cnt, type(cnt)
        fields.append(field)
        data0.append(cnt)
    print "fields", fields
    print "data0", data0

    data = [data0]
    insert_kernel(fields, data)
Ejemplo n.º 7
0
def test8():
    """Rebuild data_dir/topic_score.pickle ({topic_id: score}) from the
    pickled solr doc dumps."""
    from general_utils.file_utils import pickle_from_file, pickle_to_file
    from general_utils.db_utils import get_medicaldb_handler
    base = "data_dir/topic_data/"

    small_docs = (pickle_from_file(base + 'all_doc_small') +
                  pickle_from_file(base + 'all_doc_small_2') +
                  pickle_from_file(base + 'all_doc_small_3'))  # loaded but unused
    big_docs = (pickle_from_file(base + 'all_doc_big') +
                pickle_from_file(base + 'all_doc_big_2') +
                pickle_from_file(base + 'all_doc_big_3'))

    out_name = "data_dir/topic_score.pickle"
    # truncate any stale file before writing the fresh table
    open(out_name, "w").close()
    # doc id looks like "<prefix>_<topic_id>"; 'tid' stores score * 10
    scores = {int(doc['id'].split('_')[-1]): doc['tid'] / 10.0
              for doc in big_docs}
    pickle_to_file(scores, out_name)
Ejemplo n.º 8
0
class DbDataLocalHandler(object):
    """In-process cache of pickled lookup tables (topic scores, body parts,
    medical entities, drug relations, systag data) exposed via classmethod
    accessors. All tables are loaded once, at class-definition time.
    """
    # data = pickle_from_file(RESOURCE_DATA_FILE)
    # NOTE(review): the get_news_* methods below read cls.data, but the load
    # above is commented out -- calling them raises AttributeError unless
    # 'data' is assigned elsewhere. Confirm before relying on them.
    topic_score = pickle_from_file(TOPIC_SCORE_FILE)  # {topic_id: score}
    # stop_word = load_simple_lines(STOP_WORD_PATH)
    bodypart_data = pickle_from_file(BODYPART_FILE)
    medical_entity_cate = pickle_from_file(MEDICAL_ENTITY_FILE)
    medical_relation_drug = pickle_from_file(MEDICAL_RELATION_DRUG_FILE)
    systag_data = pickle_from_file(SYSTAG_DATA_FILE)

    @classmethod
    def get_news_title(cls, news_id):
        # Return the news title, or u'' if the id is unknown.
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][0]
        else:
            return u''

    @classmethod
    def get_news_digest(cls, news_id):
        # Return the news digest (tuple slot 1), or u'' if unknown.
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][1]
        else:
            return u''

    @classmethod
    def get_news_content(cls, news_id):
        # Return the news body (tuple slot 2), or u'' if unknown.
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][2]
        return u''

    @classmethod
    def get_news_type(cls, news_id):
        # Return the news type (tuple slot 3), or u'' if unknown.
        news_id = int(news_id)
        if news_id in cls.data["news"]:
            return cls.data["news"][news_id][3]
        return u''

    @classmethod
    def get_topic_score(cls, topic_id):
        # Return the precomputed topic score; 0.0 for unscored topics.
        topic_id = int(topic_id)
        if topic_id in cls.topic_score:
            return cls.topic_score[topic_id]
        return 0.0

    @classmethod
    def is_in_bodypart(cls, word):
        # True if word is a known body part; u"血" (blood) is explicitly
        # excluded from the body-part set.
        word = ensure_unicode(word)
        if word == u"血":
            return False
        return word in cls.bodypart_data

    @classmethod
    def get_entity_cate(cls, word):
        # Return the entity category for a (lowercased) word; '' if unknown.
        word = ensure_unicode(word.lower())
        return cls.medical_entity_cate.get(word, '')

    @classmethod
    def is_entity(cls, word):
        # True if the word is a known medical entity.
        word = ensure_unicode(word)
        return word in cls.medical_entity_cate

    @classmethod
    def get_relation_drug(cls, word, num=100):
        # Return up to `num` drugs related to the word; [] if unknown.
        word = ensure_unicode(word)
        return cls.medical_relation_drug.get(word, [])[:num]

    @classmethod
    def get_keyword_relation_systag_id(cls, word):
        # Return the systag id list associated with a keyword; [] if unknown.
        word = ensure_unicode(word)
        return cls.systag_data['keyword'].get(word, [])
        # return choice([[1, 2], [4]])

    @classmethod
    def get_systag_relation_plan(cls, systag_id):
        # Return the plan list for a systag id; [] if the id is unknown.
        # systag_id = unicode(systag_id)
        if systag_id not in cls.systag_data['systag']:
            return []
        return cls.systag_data['systag'][systag_id]['plan']

    @classmethod
    def get_systagid_name(cls, systag_id):
        # Return the tag name; raises KeyError for an unknown systag id.
        return cls.systag_data['systag'][systag_id]['tag_name']

    @classmethod
    def get_systagid_relation_planid(cls, systag_id):
        # Return the plan ids for a systag id; [] on any lookup failure.
        systag_id = int(systag_id)
        # NOTE(review): bare except also hides unexpected errors (not just
        # missing keys); consider narrowing to KeyError.
        try:
            return [
                item['plan_id']
                for item in cls.systag_data['systag'][systag_id]['plan']
            ]
        except:
            return []

    @classmethod
    def get_extend_keyword_relation_systag_id(cls, word):
        # After similar-word expansion of the hot-sale tags.
        word = ensure_unicode(word)
        # return [systag_id_list, similarity]
        return cls.systag_data['keyword_extend'].get(word, [[], 0.0])

    @classmethod
    def clinic_no_relation_systag_id(cls, clinic_no):
        # Return the systag id list for a clinic number; [] if unknown.
        clinic_no = unicode(clinic_no)
        return cls.systag_data['clinic_no'].get(clinic_no, [])
Ejemplo n.º 9
0
def test10():
    """Expand each query (from a GBK CSV given by sys.argv[2]) into similar
    disease/symptom/drug words and write the result to
    query_similar_words.csv.

    Pipeline per query: tokenize via the medical RPC service, collect entity
    tokens, pull word2vec neighbours for each token, then keep the top-3
    neighbours per category (disease / symptom / drug).
    """
    from rpc_services.word2vec_api import get_similar
    from rpc_services.medical_service_api import tokenizer_default

    # Find similar words.
    # Output columns: id, query, tokenized words, entity categories,
    # disease words 1-3, symptom words 1-3, drug words 1-3.
    # input_file = "/Users/satoshi/Documents/work file/query_result_o1.csv"
    input_file = sys.argv[2]
    endict = pickle_from_file(
        "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    )
    first_line = [
        u"id",
        u"query",
        u"words",
        u"cates",
        u"disease",
        u"symptom",
        u"drug",
    ]
    fo = open("query_similar_words.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    csvwriter.writerow(first_line)

    with open(input_file, 'r') as f:
        for l in f:
            ll = l.strip('\n').split(',')
            print l
            print ll
            id, text = ll[0], ll[1]
            # input CSV is GBK-encoded; undecodable bytes are dropped
            text = text.decode('gbk', 'ignore')
            similar_word_score_dict = {}
            seged = []
            cates = []
            tokens = tokenizer_default([text])["tokens"][0]

            # keep only positive entity tokens with a category, de-duplicated
            for item in tokens:
                if u"neg_ne" in item:
                    continue
                if "cate" not in item:
                    continue
                word = item['token']
                if word in seged:
                    continue
                seged.append(word)
                cates.append(item['cate'])

            # merge word2vec neighbours of all tokens, keeping the best
            # similarity score per neighbour word
            for x in seged:
                x_s = get_similar(x, 100)
                if not x_s:
                    continue
                for w, s in x_s:
                    if w not in similar_word_score_dict:
                        similar_word_score_dict[w] = s
                    elif s > similar_word_score_dict[w]:
                        similar_word_score_dict[w] = s

            dis = []
            sym = []
            drug = []
            # highest-similarity neighbours first
            s_similar_word_score = sorted(similar_word_score_dict.iteritems(),
                                          key=lambda x: x[1],
                                          reverse=True)
            for w, s in s_similar_word_score:
                if w not in endict:
                    continue
                cate = endict[w]['cate']
                if cate == "SYMPTOM_DESC" and len(sym) < 3:
                    sym.append(w)
                if cate == "DISEASE_DESC" and len(dis) < 3:
                    dis.append(w)
                if cate == "DRUG_DESC" and len(drug) < 3:
                    drug.append(w)
            row = [
                id, text, u"|||".join(seged), u"|||".join(cates),
                u"|||".join(dis), u"|||".join(sym), u"|||".join(drug)
            ]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()