Ejemplo n.º 1
0
def Recommend_list(uid, num, end=None, pid=None, lookback=5 * 61.0):
    """Return up to ``num`` recommended materials for one user.

    The user's recent activity is read either from the problem identified by
    ``pid`` (QA-triggered call) or, when ``pid`` is falsy, from the events
    recorded between ``end - lookback`` and ``end + 5`` seconds.  The actual
    recommendation is delegated to ``Recommend_by_user_info``.

    Args:
        uid: User id; anything ``int()`` accepts.  ``-1`` is rejected.
        num: Number of items requested (forwarded as ``num``).
        end: End of the time window.  Falsy means "now"; any other value is
            normalised with ``ensure_second_timestamp``.
        pid: Optional problem id; when truthy the time window is not used.
        lookback: Length of the time window in seconds.

    Returns:
        A list such as
        ``[{'id': 111, 'title': 'xxxx', 'type': 'topic'}, ...]``,
        or an empty list when the uid is invalid or nothing could be
        recommended.
    """
    bad_return = []
    log_mark = "recommend_topn"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    # Validate the uid.  ``Exception`` (rather than a bare ``except``) keeps
    # the best-effort behaviour without trapping KeyboardInterrupt/SystemExit.
    try:
        uid = int(uid)
    except Exception:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return
    if uid == -1:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    # Time window.
    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)

    begin = end - lookback

    # Push the end point 5 s later in case the real-time rows have not
    # reached the HBase table yet.
    end += 5.0
    if pid:  # QA-triggered: look the information up by the given problem id
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0,
                                      uid,
                                      log_mark=log_mark,
                                      num=num)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    # Log every recommended item (previously only the last one was logged)
    # while building the result in the same pass.
    recommendations = []
    for best_id, title, mtype in res:
        info_logger.info(
            "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
            log_mark, best_id, title, mtype, uid)
        recommendations.append({'id': best_id, 'title': title, 'type': mtype})
    return recommendations
Ejemplo n.º 2
0
def Recommend_news(uid, num, solr_first=True, redis_second=True):
    """Return up to ``num`` recommended news ids for a user.

    Sources are tried in this order, returning on the first non-empty one:

    1. ``get_ramdom_topn_news_from_solr`` (only when ``solr_first``);
    2. ``get_random_topn_news_from_redis`` (only when ``redis_second``);
    3. ``recommend_news_kernel`` asked for ``num + 10`` candidates, from
       which ``select_newsid`` picks ``num``.  When ``redis_second`` is set,
       both the full candidate list and the picked ids are written to redis.

    Args:
        uid: User id; ``-1`` is rejected.  It is converted to ``str`` before
            being handed to the helpers.
        num: Number of news ids wanted.
        solr_first: Whether to try solr first.
        redis_second: Whether to read from / write to redis.

    Returns:
        A list of news ids, at most ``num`` long.  On an invalid uid the
        ``BAD_TEST_RETURN_Recommend_plan`` constant is returned.  When
        ``IS_ONLINE_WEB_SERVER`` is falsy a random slice of
        ``TEST_RETURN_Recommend_news`` is returned instead.
    """
    # NOTE(review): this reuses the *plan* fallback constant for news;
    # confirm that is intended.
    bad_return = BAD_TEST_RETURN_Recommend_plan
    if not IS_ONLINE_WEB_SERVER:
        # Shuffle a copy so the shared module-level fixture is not reordered
        # in place on every call.
        candidates = list(TEST_RETURN_Recommend_news)
        shuffle(candidates)
        return candidates[:num]
    log_mark = "recommend_news"
    info_logger.info("%s===uid=%s====start====num=%s=======", log_mark, uid,
                     num)

    # NOTE(review): this check runs before the str() conversion, so the
    # string "-1" is not rejected here.
    if uid == -1:
        return bad_return
    try:
        uid = str(uid)
    except Exception:  # was a bare except; do not trap KeyboardInterrupt
        info_logger.info("%s===uid=%s============uid unvalid=======", log_mark,
                         uid)
        return bad_return

    # First try to fetch a result from solr.
    if solr_first:
        solr_res = get_ramdom_topn_news_from_solr(uid, num)
        if solr_res:
            log_string = '|'.join([str(x) for x in solr_res])
            info_logger.info("%s===uid=%s=====solr_newsids=%s============",
                             log_mark, uid, log_string)
            return solr_res[:num]

    # Then try to fetch a result from redis.
    if redis_second:
        redis_ids = get_random_topn_news_from_redis(uid, num)
        if redis_ids:
            log_string = '|'.join([str(x) for x in redis_ids])
            info_logger.info("%s===uid=%s=====redis_newsids=%s============",
                             log_mark, uid, log_string)
            return redis_ids[:num]

    # Fall back to computing the recommendation, asking for a few extra ids.
    res = recommend_news_kernel(uid=uid, test=False, num=num + 10)

    ids = res['ids']
    # When redis is in use, store the computed result so the next refresh
    # can be served from it.
    if redis_second:
        write_user_all_news_to_redis(uid, ids)

    # Randomly pick ``num`` articles.
    ids = select_newsid(ids, num)

    if redis_second:
        write_user_showed_news_to_redis(uid, ids)

    log_string = '|'.join([str(x) for x in ids])
    info_logger.info("%s===newsids=%s=======uid=%s=============", log_mark,
                     log_string, uid)

    return ids[:num]
 def do_one(self, sql, commit=False):
     cur = self.conn.cursor()
     # print "sql"
     # print sql
     try:
         cur.execute(sql)
         return cur.fetchall()
     except Exception, e:
         print "Exception:", e
         info_logger.info("sql exception %s", e)
         return None
Ejemplo n.º 4
0
def Recommend_tags(uid):
    """Recommend related words and plans from the user's last query.

    The last query is fetched with ``get_user_last_query``; similar words
    come from ``get_similar_entities`` and plans from ``get_relation_plan``.
    Every step is logged.

    Per the original author's note the words are expected to be the two most
    similar disease words, one symptom word and two drug words, in that
    order -- that is decided inside ``get_similar_entities``, not here.

    Returns:
        ``{'words': tags, 'plan': plans}``, or
        ``BAD_TEST_RETURN_Recommend_tags`` when ``uid`` is ``-1``.
    """
    mark = "recommend_tag_tag_tag"
    info_logger.info("%s===uid=%s====start==========", mark, uid)
    fallback = BAD_TEST_RETURN_Recommend_tags
    if uid == -1:
        return fallback

    query = get_user_last_query(uid)
    info_logger.info("%s===uid=%s====last_query=%s=======", mark, uid, query)

    words = get_similar_entities(query=query)
    plan_items = get_relation_plan(query=query)

    joined_words = '|||'.join(words)
    info_logger.info("%s===tags=%s=======uid=%s=============", mark,
                     joined_words, uid)

    joined_plans = '|||'.join(plan['name'] for plan in plan_items)
    info_logger.info("%s===plans=%s=======uid=%s=============", mark,
                     joined_plans, uid)

    result = {'words': words, 'plan': plan_items}
    return result
Ejemplo n.º 5
0
def Recommend(uid, lookback, end=None, pid=None):
    """Recommend the single best material for one user.

    The user's recent activity is read either from the problem identified by
    ``pid`` (QA-triggered call) or, when ``pid`` is falsy, from the events
    recorded between ``end - lookback`` and ``end + 5`` seconds.  The
    recommendation itself is delegated to ``Recommend_by_user_info`` with its
    default ``num``.

    Args:
        uid: User id; anything ``int()`` accepts.  ``-1`` is rejected.
        lookback: Length of the time window in seconds.
        end: End of the time window.  Falsy means "now"; any other value is
            normalised with ``ensure_second_timestamp``.
        pid: Optional problem id; when truthy the time window is not used.

    Returns:
        ``[material_id, title, material_type]`` for the top result, or
        ``[-1, "", "nothing"]`` when the uid is invalid or nothing could be
        recommended.
    """
    bad_return = [-1, "", "nothing"]  # material_id, title, material_type
    log_mark = "recommend_one"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    # ``Exception`` (rather than a bare ``except``) keeps the best-effort
    # behaviour without trapping KeyboardInterrupt/SystemExit.
    try:
        uid = int(uid)
    except Exception:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    if uid == -1:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)

    begin = end - lookback

    # Push the end point 5 s later in case the real-time rows have not
    # reached the HBase table yet.
    end += 5.0

    if pid:  # QA-triggered: look the information up by the given problem id
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0, uid, log_mark=log_mark)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    # Only the first (best) entry is returned.
    best_id, title, mtype = res[0]
    info_logger.info(
        "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
        log_mark, best_id, title, mtype, uid)
    return [int(best_id), title, mtype]
Ejemplo n.º 6
0
def Recommend_topics(uid, num, timestamp=None, test=False):
    """Logging wrapper around ``Recommend_topics_kernel``.

    Logs the request, forwards every argument unchanged to the kernel, logs
    whatever the kernel returned and hands that value back to the caller.
    """
    mark = "recommend_topic_topic"
    info_logger.info("%s===uid=%s====start====num=%s=======", mark, uid, num)

    kernel_result = Recommend_topics_kernel(
        uid=uid, num=num, timestamp=timestamp, test=test)

    info_logger.info("%s===uid=%s====return====topic_ids=%s=======", mark,
                     uid, kernel_result)
    return kernel_result
def cy_time_event_one_user_viewnews(uid, begin, end):
    """Collect the news a user viewed inside a time window.

    Scans the ``cy_real_time_event`` HBase table for rows whose key starts
    with ``"<uid>|"``.  Row keys are split on ``'|'`` into exactly three
    parts (user id, timestamp, event type); only ``view_news`` events are
    kept.

    Args:
        uid: User id used to build the row prefix.
        begin: Inclusive lower bound, compared against the timestamp
            returned by ``ensure_second_timestamp``.
        end: Inclusive upper bound, same unit as ``begin``.

    Returns:
        dict mapping ``int`` news id -> timestamp of the matching row; when
        the same news id appears more than once, the row scanned last wins.
    """
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    res = {}
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=str(uid) + '|'):
            # The user-id part of the key is not needed; do not overwrite
            # the ``uid`` parameter with it.
            _, timestamp, event_type = key.split('|')

            if event_type != "view_news":
                continue
            timestamp = ensure_second_timestamp(timestamp)
            info_logger.info("real timestamp=%s", timestamp)
            if begin <= timestamp <= end:
                news_id = int(data["info:news_id"])
                res[news_id] = timestamp
    finally:
        # Release the Thrift connection even when the scan fails; it used
        # to be left open on every call.
        connection.close()
    return res
Ejemplo n.º 8
0
def Recommend_plan(uid, num):
    """Return recommended plan ids for a user.

    ``get_relation_plan3`` is asked for system tag ids, which
    ``systagid_2_planid`` then converts into plan ids.

    Args:
        uid: User id; ``-1`` is rejected.  It is converted to ``str`` before
            being handed to ``get_relation_plan3``.
        num: Forwarded to ``get_relation_plan3``.

    Returns:
        Whatever ``systagid_2_planid`` returns for the tag ids, or
        ``BAD_TEST_RETURN_Recommend_plan`` for an invalid uid.

    Raises:
        KeyError: if the ``get_relation_plan3`` result has no ``'status'``.
    """
    bad_return = BAD_TEST_RETURN_Recommend_plan
    log_mark = "recommend_plan"
    info_logger.info("%s===uid=%s====start====num=%s=======", log_mark, uid,
                     num)

    # NOTE(review): this check runs before the str() conversion, so the
    # string "-1" is not rejected here.
    if uid == -1:
        return bad_return
    try:
        uid = str(uid)
    except Exception:  # was a bare except; do not trap KeyboardInterrupt
        info_logger.info("%s===uid=%s============uid unvalid=======", log_mark,
                         uid)
        return bad_return

    res = get_relation_plan3(uid, num)
    systagids = res.get('ids', [])  # 'ids' may be absent; 'status' may not
    status = res['status']
    plan_ids = systagid_2_planid(systagids)
    plans_string = '|||'.join([str(item) for item in plan_ids])
    info_logger.info("%s===plans=%s=======uid=%s======status=%s=======",
                     log_mark, plans_string, uid, status)
    return plan_ids
Ejemplo n.º 9
0
def is_valid_user_info(user_info):
    """Tell whether parsed user info has at least one tag and one vector.

    ``user_info['vecs']`` is only inspected when ``user_info['tags']`` is
    non-empty.  A failure is logged before ``False`` is returned.
    """
    has_tags = len(user_info['tags']) > 0
    if has_tags and len(user_info['vecs']) > 0:
        return True
    info_logger.info("=failed no tags=======")
    return False
Ejemplo n.º 10
0
def parse_user_info(user_info):
    last_event = user_info['last_event']
    info_logger.info("last_event %s", str(last_event))
    if last_event is None:
        return None
    timestamp = user_info['last_event_time']
    # output = {}  # trigger,timestamp,tags,vecs,weights,center,texts,sex,age,special_population
    if last_event[0] == 'big_search':
        texts = [item[0] for item in user_info['big_search']]
        text = ' '.join(texts).strip()

        # tags, weights, cates, _ = get_medical_entities_info([text], [1.0], 1.0)
        # 改为不仅仅使用医学实体词
        tags, weights, cates, counts, entities, entity_counts = get_medical_entities_info2_cyseg(
            text_list=[text],
            weight_list=[1.0],
            weights_is_dict=False  # weights以list形式返回
        )

        vecs, keep_indices = get_vecs_weighted3(tags)
        weights = [weights[i] for i in keep_indices]
        tags = [tags[i] for i in keep_indices]  # 去掉没有向量的tag
        entities = [x for x in entities if x in tags]  # 去掉没有向量的实体词

        center = get_center(vecs)
        special_population, sex = special_population_big(text)
        age = None
        output = {
            "trigger": 'big_search',
            "timestamp": timestamp,  # 这个没有用
            "tags": tags,
            "vecs": vecs,
            "entities": entities,
            "weights": weights,
            "cates": cates,
            "center": center,
            "texts": texts,
            "sex": sex,
            "age": age,
            "special_population": special_population,
        }

    elif last_event[0] == 'free_problem_create':
        text, sex, age = last_event[1]
        if sex:
            s_text = text + ' (%s,%s)' % (sex, age)
        else:
            s_text = text
            text, sex, age = qa_ask_info(text)
        special_population, sex = special_population_big(s_text)
        print special_population

        tags, weights, cates, counts, entities, entity_counts = get_medical_entities_info2_cyseg(
            text_list=[text],
            weight_list=[1.0],
            weights_is_dict=False  # weights以list形式返回
        )

        vecs, keep_indices = get_vecs_weighted3(tags)
        weights = [weights[i] for i in keep_indices]
        # cates is a dict
        tags = [tags[i] for i in keep_indices]  # 去掉没有向量的tag
        entities = [x for x in entities if x in tags]  # 去掉没有向量的实体词

        info_logger.info("len vecs %s", len(vecs))
        center = get_center(vecs)
        output = {
            "trigger": 'free_problem_create',
            "timestamp": timestamp,
            "tags": tags,
            "entities": entities,
            "vecs": vecs,
            "weights": weights,
            "cates": cates,
            "center": center,
            "texts": [text],
            "sex": sex,
            "age": age,
            "special_population": special_population,
        }
    else:
        print "output is None"
        output = None

    return output
Ejemplo n.º 11
0
def Recommend_topics_kernel(uid, num, timestamp=None, test=False):
    if not test:
        bad_return = []
    else:
        bad_return = [], None, None
    log_mark = "recommend_topic_topic"
    # step 1 先从solr里找,看有没有被离线计算过,8ms左右
    if not test:
        solr_res = get_caled_user_topn_topics_yesterday0(uid)
        if solr_res is not None:
            info_logger.info(
                "%s===uid=%s====return==offline_data==topic_ids=%s=======",
                log_mark, uid, solr_res)
            print 'hehehehe', uid, solr_res
            return solr_res

    user_info0 = get_yesterday_user_info(uid, timestamp)
    if user_info0['last_event'] is None:
        info_logger.info("%s==failed in no last event=uid=%s========",
                         log_mark, uid)
        return bad_return
    # print json.dumps(user_info0)
    t1 = time.time()
    user_info = parse_user_info2(user_info0)
    t2 = time.time()
    print "Recommend_topics_kernel parse_user_info2 time", t2 - t1
    texts = user_info['texts']
    tags = user_info['tags']
    u_weights = user_info['weights']
    u_cates = user_info['cates']
    special_population = user_info['special_population']
    center = None
    trigger = user_info['trigger']
    u_vecs = user_info['vecs']

    if len(tags) == 0:
        info_logger.info("%s==failed in no tags=uid=%s========", log_mark, uid)
        return bad_return
    t3 = time.time()
    recall_ids, title_dict, score_dict = recall(
        text=' '.join(texts),
        tags=tags,
        weights=u_weights,
        cates=u_cates,
        special_population=special_population,
        center=center,
        trigger_type=trigger,
        only_topic=True,
        yxjt=True)

    t4 = time.time()
    print "Recommend_topics_kernel recall time", t4 - t3

    info_logger.info((log_mark + "===recall_ids", recall_ids))
    t5 = time.time()
    topn_ids_scores, title_dict, v_score_dict = rank(
        uid=uid,
        recall_ids=recall_ids,
        title_dict=title_dict,
        solr_score_dict=score_dict,
        u_vecs=u_vecs,
        u_weights=u_weights,
        u_tags=tags,
        u_cates=u_cates,
        keep=num)
    t6 = time.time()
    print "Recommend_topics_kernel parse_user_info2 rank", t6 - t5

    threshhold = trigger_threshhold(trigger)

    if test:
        return [
            int(item[0].split('_')[-1]) for item in topn_ids_scores
            if v_score_dict[item[0]] >= threshhold
        ], user_info, v_score_dict
    return [
        int(item[0].split('_')[-1]) for item in topn_ids_scores
        if v_score_dict[item[0]] >= threshhold
    ]
Ejemplo n.º 12
0
def Recommend_by_user_info(user_info0, uid, log_mark, num=1, test=False):
    if test:
        bad_return = {
            "user_info": None,
            "res": None,
            "topn_ids_scores": None,
            "only_topic": None,
            "v_score_dict": None
        }
    else:
        bad_return = {"res": None}
    t1 = time.time()
    user_info = parse_user_info(user_info0)
    t2 = time.time()
    print "Recommend_by_user_info parse_user_info time", t2 - t1

    if user_info is None:
        bad_return["status"] = "user info is None"
        return bad_return
    if not is_valid_user_info(user_info):
        bad_return["status"] = "not valid user info"
        return bad_return

    only_topic = False

    if not filter_user_info(user_info):
        only_topic = True

    texts = user_info["texts"]
    tags = user_info["tags"]
    special_population = user_info["special_population"]
    center = user_info["center"]
    u_vecs = user_info["vecs"]
    u_weights = user_info["weights"]
    u_cates = user_info["cates"]
    trigger = user_info["trigger"]

    info_logger.info("%s=texts=%s======uid=%s===============", log_mark,
                     '|||'.join(texts), uid)
    info_logger.info("%s=tags=%s======uid=%s===============", log_mark,
                     '|||'.join(tags), uid)
    info_logger.info("%s=special_population=%s======uid=%s===============",
                     log_mark, special_population, uid)
    info_logger.info("%s=trigger=%s======uid=%s===============", log_mark,
                     trigger, uid)
    info_logger.info("%s=only_topic=%s======uid=%s===============", log_mark,
                     only_topic, uid)
    ###############
    if trigger not in ("big_search", "free_problem_create"):
        bad_return["status"] = "trigger_type not bs or qa"
        return bad_return
    ################

    # 召回
    t3 = time.time()
    recall_ids, title_dict, score_dict = recall(
        text=' '.join(texts),
        tags=tags,
        weights=u_weights,
        cates=u_cates,
        special_population=special_population,
        center=center,
        trigger_type=trigger,
        only_topic=only_topic)

    t4 = time.time()
    print "Recommend_by_user_info recall time", t4 - t3

    info_logger.info("%s=recall_ids=%s===============uid=%s========", log_mark,
                     '-'.join(recall_ids), uid)

    # 排序
    t5 = time.time()
    topn_ids_scores, title_dict, v_score_dict = rank(
        uid=uid,
        recall_ids=recall_ids,
        title_dict=title_dict,
        solr_score_dict=score_dict,
        u_vecs=u_vecs,
        u_weights=u_weights,
        u_tags=tags,
        u_cates=u_cates,
        keep=num + 9)

    t6 = time.time()
    print "Recommend_by_user_info rank time", t6 - t5

    info_logger.info("%s=topn_ids_scores_len=%s================uid=%s========",
                     log_mark, len(topn_ids_scores), uid)

    if len(topn_ids_scores) == 0:
        bad_return["status"] = "topn_ids_scores empty"  # 往往因为物料不足或者召回量过低导致
        return bad_return
    t7 = time.time()
    best = aftertreatment(topn_ids_scores, {}, special_population, num)
    t8 = time.time()
    print "Recommend_by_user_info aftertreatment time", t8 - t7

    if not best:
        bad_return["status"] = "no best"
        return bad_return

    # get threshhold
    if not test:
        threshhold = trigger_threshhold(trigger)
    else:
        threshhold = -9999999.9

    best_id, _ = best[0]
    best_score = v_score_dict[best_id]
    if best_score < threshhold:  # 最高分低于阈值
        bad_return["status"] = "best_score too low"
        return bad_return

    res = [[
        int(item[0].split('_')[1]),
        title_dict.get(item[0], u''), item[0].split('_')[0]
    ] for item in best]  # [[111,title1,"news"],[222,title2],...]

    if not test:
        return {"res": res, "status": "succeed"}
    else:
        return {
            "res": res,
            "user_info": user_info,
            "topn_ids_scores": topn_ids_scores,
            "only_topic": only_topic,
            "status": "succeed",
            "v_score_dict": v_score_dict
        }