Example #1
def get_topic_data():
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    # Load the existing topic-score map and score only topics newer than
    # the largest id already processed.
    old_score = pickle_from_file(TOPIC_SCORE_FILE)
    biggest_id = max(old_score.keys())  # largest topic_id already scored
    sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id
    o = get_medicaldb_handler().do_one(sql1)
    cnt = 0
    for item in o:
        topic_id = item[0]
        doctor_id = item[1]
        info_of_topic = topic_info(topic_id)
        info_of_doc = doctor_info(doctor_id)
        # Extract entity tags from the title and the body text.
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])

        if len(content_tags) == 0 or len(info_of_topic['title']) == 0:
            print "no content tags or empty title", topic_id
            continue

        score = grade_topic(info_of_topic, info_of_doc, title_tags,
                            content_tags)
        old_score[int(topic_id)] = score
        cnt += 1
    print "new topic id num", cnt
    pickle_to_file(old_score, TOPIC_SCORE_FILE)
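
# pickle_from_file / pickle_to_file are used throughout but never defined in
# this example. A minimal sketch of what they might look like, assuming plain
# pickle files on local disk (the project's real helpers may differ):
import pickle

def pickle_from_file(path):
    # Load and return whatever object was pickled at `path`.
    with open(path, "rb") as f:
        return pickle.load(f)

def pickle_to_file(obj, path):
    # Overwrite `path` with the pickled representation of `obj`.
    with open(path, "wb") as f:
        pickle.dump(obj, f)
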
def test9():
    import csv
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    fo = open("topic_score.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
                  u'doctor id', u'doctor title', u'hospital level', u'primary dept', u'secondary dept',
                  u'city', u'hospital_name']
    first_line = convert2gbk(first_line)  # encode for Excel-friendly GBK output
    csvwriter.writerow(first_line)
    d = "data_dir/topic_data/"
    b2 = pickle_from_file(d + 'all_doc_big_2')
    for item in b2:
        topic_id = int(item['id'].split('_')[-1])  # doc ids look like "r_topicbig_<topic_id>"
        score = item['tid'] / 10.0  # scores were stored scaled by 10 when the docs were built
        title = item['title']
        content_len = item['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % topic_id
        o = get_medicaldb_handler().do_one(sql)
        if not o:
            continue  # topic row no longer exists
        doctor_id = o[0][0]
        ti = topic_info(topic_id)
        di = doctor_info(doctor_id)
        image_num = ti['image_num']
        is_original = ti['is_original']
        d_title = di['title']
        h_level = di['hospital_level']
        h_name = di['hospital_name']
        clinic_no = di['first_class_clinic_no']
        s_clinic_no = di['second_class_clinic_no']
        city = di['city']
        rows = [str(topic_id), str(score), title, str(content_len), str(image_num),
                str(is_original), doctor_id, d_title, h_level, clinic_no, s_clinic_no, city, h_name]
        rows = convert2gbk(rows)
        csvwriter.writerow(rows)
    fo.close()
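
# convert2gbk is not defined in this example either. Given the Excel-dialect
# CSV written above, it presumably encodes each unicode cell to GBK bytes so
# the file opens cleanly in Chinese-locale Excel. A Python 2 sketch under that
# assumption:
def convert2gbk(row):
    # Encode unicode cells to GBK; pass byte strings through unchanged.
    return [cell.encode("gbk") if isinstance(cell, unicode) else cell
            for cell in row]
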
def test14():
    import csv
    from add_data_to_solr.manager.add_utils import topic_info
    from general_utils.db_utils import get_medicaldb_handler
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "";'
    o = get_medicaldb_handler().do_one(sql)
    fo = open('topic_content_len.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['topic id', 'doctor id', 'content length']
    csvwriter.writerow(first_line)

    for item in o:
        topic_id = int(item[0])
        info_of_topic = topic_info(topic_id)
        doctor_id = info_of_topic['doctor_id']
        content_len = info_of_topic['content_len']
        csvwriter.writerow([str(topic_id), str(doctor_id), str(content_len)])
    fo.close()
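
# The queries in these functions build SQL with "%" string interpolation. The
# interpolated ids come from internal data here, but parameterized queries are
# the safer pattern. A sketch with a raw MySQLdb cursor; whether the project's
# do_one() wrapper accepts parameters is unknown, so this bypasses it and the
# helper name fetch_doctor_id is hypothetical:
import MySQLdb

def fetch_doctor_id(conn, topic_id):
    # Let the driver escape the value instead of splicing it into the SQL.
    cur = conn.cursor()
    cur.execute("select doctor_id from api_doctortopic where id=%s", (topic_id,))
    row = cur.fetchone()
    return row[0] if row else None
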
def add_topic():
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    # get_entities_cyseg, get_vecs2, grade_topic, add_topic_kernel, SolrHelper
    # and pickle_to_file are assumed to be imported at module level.
    batch_size = 1000
    all_doc_small = []
    all_doc_big = []
    docs_small = []
    docs_big = []
    # Fetch the next 20,000 undeleted, titled topics (154517 was presumably
    # the largest id already indexed).
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "" and id > 154517 limit 20000;'
    o = get_medicaldb_handler().do_one(sql)
    id_prefix_small = "r_topic_"
    id_prefix_big = "r_topicbig_"
    content_type_small = "r_topic"
    content_type_big = "r_topicbig"
    ff = open("failed_id","a")
    solr = SolrHelper("online").get_solr("topic_tpl")
    is_end = False
    for item in o:
        if item == o[-1]:
            is_end = True
        #print "is_end",is_end
        topic_id = item[0]
        print "topic_id",topic_id
        info_of_topic = topic_info(topic_id)
        topic_title = info_of_topic['title']
        if len(topic_title) == 0:
            continue  # skip topics with an empty title
        doctor_id = info_of_topic["doctor_id"]
        info_of_doctor = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])

        if len(content_tags) == 0:
            print "no content tag",topic_id
            continue

        title_vecs = get_vecs2(title_tags)
        content_vecs = get_vecs2(content_tags)
        print "content_vecs len",len(content_vecs)


        score = int(grade_topic(info_of_topic, info_of_doctor, title_tags, content_tags) * 10)  # store the grade scaled by 10 as an int
        if title_vecs:
            # If the title produced vectors, build the "small" doc from it.
            try:
                add_topic_kernel(topic_id=topic_id,
                                 docs=docs_small,
                                 tags=title_tags,
                                 score=score,
                                 info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor,
                                 vecs=title_vecs,
                                 id_prefix=id_prefix_small,
                                 content_type=content_type_small)
            except Exception:
                ff.write("small|||" + str(topic_id) + "\n")
        if content_vecs:
            # If the body text produced vectors, build the "big" doc from it.
            try:
                add_topic_kernel(topic_id=topic_id,
                                 docs=docs_big,
                                 tags=content_tags,
                                 score=score,
                                 info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor,
                                 vecs=content_vecs,
                                 id_prefix=id_prefix_big,
                                 content_type=content_type_big)
            except Exception:
                ff.write("big|||" + str(topic_id) + "\n")

        print "len docs_small", len(docs_small)
        print "len docs_big", len(docs_big)
        # Flush a full batch, or whatever remains once the last row is reached.
        if len(docs_small) >= batch_size or is_end:
            print "add small", len(docs_small)
            # add(docs_small, solr)  # Solr write disabled for this run; batches are pickled below
            all_doc_small.extend(docs_small)
            docs_small = []
        if len(docs_big) >= batch_size or is_end:
            print "add big", len(docs_big)
            # add(docs_big, solr)  # Solr write disabled for this run; batches are pickled below
            all_doc_big.extend(docs_big)
            docs_big = []



    ff.close()
    pickle_to_file(all_doc_small, "all_doc_small_3")
    pickle_to_file(all_doc_big, "all_doc_big_3")
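
# add(docs, solr) is commented out in add_topic, so this run only pickles the
# batches. If SolrHelper("online").get_solr() returns a pysolr-style client,
# re-enabling it could look like this sketch (the immediate-commit policy is
# an assumption):
def add(docs, solr):
    # Index one batch of document dicts, then make them searchable.
    solr.add(docs)
    solr.commit()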