Example 1
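These excerpts are standalone functions lifted from a larger module; they call shared helpers (convert2gbk, nat_get_title, nat_get_digest, the recommend.* packages) and rely on module-level imports that are not shown. A minimal preamble for the standard-library names used below (Python 2, matching the print-statement syntax of the excerpts):

import csv
import json
import random
import sys
import time
from random import shuffle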
def test17():
    # write each item's top-10 nearest neighbours to a CSV for review
    from general_utils.solr_utils import nat_get_title
    fin = sys.argv[2]
    mtype = 'topic' if 'topic' in fin else 'news'
    fon = mtype + '_nearest_top10.csv'
    fo = open(fon, 'w')
    csvwriter = csv.writer(fo)
    first_line = ['id', 'title', 'top_id', 'top_title', 'score']
    csvwriter.writerow(first_line)
    cnt = 0
    with open(fin, 'r') as f:
        for l in f:
            if cnt > 1000:
                break
            cnt += 1
            l = l.strip('\n')
            this_dict = json.loads(l)
            main_id = this_dict['id']
            main_title = nat_get_title(mtype + '_' + str(main_id))
            if not main_title:
                continue
            top = this_dict['top'][:10]
            for subordinate_id, score in top:
                subordinate_title = nat_get_title(mtype + '_' +
                                                  str(subordinate_id))
                row = [
                    str(main_id), main_title,
                    str(subordinate_id), subordinate_title,
                    str(score)
                ]
                row = convert2gbk(row)
                csvwriter.writerow(row)
    fo.close()
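convert2gbk is called by nearly every example but never defined here. A minimal sketch of what it plausibly does, assuming it coerces each CSV cell to a GBK byte string so the output opens cleanly in GBK-locale spreadsheet tools (the behavior is inferred from the name and usage, not confirmed by the source):

def convert2gbk(row):
    # hypothetical helper: coerce every cell to a GBK-encoded byte string,
    # since Python 2's csv.writer writes byte strings as-is
    out = []
    for cell in row:
        if isinstance(cell, unicode):
            out.append(cell.encode('gbk', 'ignore'))
        else:
            out.append(str(cell))
    return out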
Example 2
def main6(test_uid=None):
    # test recommend_topics
    from recommend.manager.recommend_resource import Recommend_topics
    from recommend.manager.recommend_topic_data_helper import parse_user_info as parse_user_info2
    now = time.time()
    if test_uid == "n":
        test_uid = None
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    fo = open("20180102_rt.csv", "w")
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'tags', 'sp', 'topicid', 'score', 't_title', 't_tags']
    csvwriter.writerow(first_line)
    times = {}
    for uid in data_dict.keys():
        t1 = time.time()
        topic_ids, user_info, score_dict = Recommend_topics(uid, 5, now, True)
        t2 = time.time()
        times[uid] = t2 - t1
        if not user_info:
            continue
        tags = user_info['tags']
        sp = user_info['special_population']
        for x in topic_ids:
            title = nat_get_title('topic_' + str(x))
            score = score_dict['topic_' + str(x)]
            t_tags = get_news_tags_from_solr("r_topic_" + str(x))
            row = [
                str(uid), '-'.join(tags), sp,
                str(x),
                str(score), title, '-'.join(t_tags)
            ]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()

    s_times = sorted(times.iteritems(), key=lambda x: x[1], reverse=True)[:10]
    for x, y in s_times:
        print x, y
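A minimal driver for this test, assuming the module is run as a script with the same positional-argument style the other examples use (this wrapper is an illustration, not part of the source):

if __name__ == '__main__':
    # hypothetical entry point: pass "n" (or nothing) to sample all users
    # from the last 12000 seconds, or a specific uid to test one user
    main6(sys.argv[2] if len(sys.argv) > 2 else None)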
Example 3
def rank_news(news_ids, solr_score_dict, score_dict, v_score_dict, uid, u_vecs,
              u_weights, u_bp_words):
    t1 = time.time()
    bad_ids = user_half_year_newsids(uid)  # int
    t2 = time.time()
    print "user_half_year_newsids time", t2 - t1
    titles = set()
    cnt = 0
    all_n_tags_cnt = 0
    all_tags_vecs = {}  # tag:vec
    for id in news_ids:
        print '=' * 20, id
        if cnt >= 10:
            break
        _, true_id = id.split('_')
        if int(true_id) in bad_ids:
            continue
        # deduplicate by title
        title = nat_get_title(id)

        # let empty titles through, so a database read error
        # does not abort the whole push
        if title in titles and len(title) > 0:
            continue
        titles.add(title)

        digest = nat_get_digest(id)
        # extract the article's tags
        t3 = time.time()
        is_good_article, n_tags, n_weights, n_cates, empty_title, title_tags = weighted_news_tags3(
            title, digest)
        # n_weights is a dict
        t4 = time.time()
        print "weighted_news_tags3 time", id, t4 - t3
        try:
            all_n_tags_cnt += len(n_tags)
        except TypeError:  # n_tags may be None when extraction fails
            pass
        # drop articles with too few tags
        if not is_good_article or len(n_tags) <= 1:
            continue

        t5 = time.time()
        print "user_news_tag_hard_match time", t5 - t4

        new_tags = set(n_tags) - set(all_tags_vecs.keys())
        new_tags_vecs_dict = get_vec_dict_norm_ndarray_redis(new_tags)

        all_tags_vecs.update(new_tags_vecs_dict)  # merge into all_tags_vecs

        t6 = time.time()
        print "get_vecs_weighted3 time", t6 - t5

        # compute similarity
        # score = vecs_similarity2(u_vecs, u_weights, n_vecs, n_weights)
        score = vecs_similarity3(u_vecs=u_vecs,
                                 u_weights=u_weights,
                                 n_vecs_dict=all_tags_vecs,
                                 n_weights_dict=n_weights,
                                 n_tags=n_tags)
        # body-part match score
        bp_score = user_news_tag_bodypart_match2(u_bp_words, title_tags)
        print 'bp_score', bp_score
        print 'score0', score
        print 'ntags', '-'.join(n_tags)
        t7 = time.time()
        print "vecs_similarity2 time", t7 - t6
        if empty_title:
            score *= 0.5  # down-weight articles whose title has no entity words
        solr_score = solr_score_dict[id]

        v_score_dict[id] = score * bp_score
        score_dict[id] = (0.7 * score + 0.3 * solr_score) * bp_score
        cnt += 1
    print "all_n_tangs_cnt news", all_n_tangs_cnt
Example 4
def user_view_action_texts(uid, num=10):
    ts_dict = user_last_view_actions(uid, num=num)
    text_dict = dict([[key,
                       nat_get_title(key) + ' ' + nat_get_digest(key)]
                      for key in ts_dict])
    return text_dict, ts_dict
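The concatenation above raises a TypeError when nat_get_title or nat_get_digest returns None, which the guards in Example 1 suggest can happen. A defensive variant, assuming empty-or-None on lookup failure (the _safe name is hypothetical):

def user_view_action_texts_safe(uid, num=10):
    # hypothetical variant: fall back to '' when a lookup fails
    ts_dict = user_last_view_actions(uid, num=num)
    text_dict = dict(
        (key, (nat_get_title(key) or '') + ' ' + (nat_get_digest(key) or ''))
        for key in ts_dict)
    return text_dict, ts_dict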
Example 5
def g2():
    # test recommend_news
    '''Sample 1000 app users and dump recommend_news results to CSV.'''
    from recommend.manager.feed_data_helper import recommend_news_kernel
    from general_utils.solr_utils import nat_get_title

    test_uids = get_one_day_uid_from_file('log_event_20180222')
    print "test_uids num", len(test_uids)

    # shuffle and take a sample of 1000 uids
    random.shuffle(test_uids)
    selected_uids = test_uids[:1000]

    fo = open('20180321_rn_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'utags', 'user_bs', 'user_qa', 'user_look_title',
        'news_id', 'title', 'score'
    ]
    csvwriter.writerow(first_line)
    total_time = {}

    cnt_all = 0
    cnt_good = 0

    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        if not is_app:
            continue
        cnt_all += 1
        t1 = time.time()
        recommend_res = recommend_news_kernel(uid, True)
        t2 = time.time()
        total_time[uid] = t2 - t1
        parsed_user_info = recommend_res['parsed_user_info']
        utags = parsed_user_info['weight_dict'].keys()
        user_info_list = recommend_res['user_info_list']

        bs_text_list = []
        qa_text_list = []
        view_news_title_list = []
        view_topic_title_list = []

        for ts, obj, action_type in user_info_list:
            if action_type in ('bs', 'sd'):
                bs_text_list.append(obj)
            elif action_type == 'qa':
                qa_text_list.append(obj)
            elif action_type == 'vt':
                title = nat_get_title('topic_' + str(obj))
                view_topic_title_list.append(title)
            elif action_type == 'vn':
                title = nat_get_title('news_' + str(obj))
                view_news_title_list.append(title)

        user_bs = '~'.join([str(item) for item in bs_text_list])
        user_qa = '~'.join([str(item) for item in qa_text_list])
        user_look_title = '~'.join([
            str(item) for item in view_news_title_list + view_topic_title_list
        ])

        title_dict = recommend_res['title_dict']
        ids_list = recommend_res['ids']
        score_dict = recommend_res['v_score_dict']
        # prefix each news id with its list index, then flatten
        ids = [
            '%s-news_%s' % (i, x)
            for i, sub_ids in enumerate(ids_list)
            for x in sub_ids
        ]
        tcnt = 0

        if ids:
            cnt_good += 1
        for id in ids:
            id0 = id.split('-')[1]
            title = title_dict[id0]
            score = score_dict[id0]
            if tcnt == 0:
                line = convert2gbk([
                    str(uid), username, '~'.join(utags), user_bs, user_qa,
                    user_look_title,
                    str(id), title, score
                ])
            else:
                line = convert2gbk([
                    ' ', ' ', '~'.join(utags), user_bs, user_qa,
                    user_look_title,
                    str(id), title, score
                ])
            csvwriter.writerow(line)
            tcnt += 1

    min_t = min(total_time.values())
    max_t = max(total_time.values())
    mean_t = sum(total_time.values()) / len(total_time)

    line = ['min', 'max', 'mean']
    csvwriter.writerow(line)
    line = [str(min_t), str(max_t), str(mean_t)]
    csvwriter.writerow(line)

    sorted_total_time = sorted(total_time.iteritems(),
                               key=lambda x: x[1],
                               reverse=True)
    for uid, t in sorted_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)

    line = ['all_app_user_num', 'good_add_user_num']
    csvwriter.writerow(line)
    line = [str(cnt_all), str(cnt_good)]
    csvwriter.writerow(line)

    fo.close()
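get_one_day_uid_from_file is not defined in these excerpts. A plausible sketch, assuming the event log carries the uid as the first comma-separated field of each line (the file layout is an assumption):

def get_one_day_uid_from_file(filename):
    # hypothetical reader: collect distinct uids from a one-day event log
    uids = set()
    with open(filename, 'r') as f:
        for l in f:
            field = l.strip('\n').split(',')[0]
            if field.isdigit():
                uids.add(int(field))
    return list(uids)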
Example 6
def test_many():
    file_name = sys.argv[2]
    num = int(sys.argv[3])
    qs = []
    uids = set()
    with open(file_name, 'r') as f:
        for l in f:
            ll = l.strip('\n').split(',')
            # print ll
            if len(ll) != 12:
                continue
            if ll[0] == 'uid':
                continue
            uid = int(ll[0])
            if uid in uids:
                continue
            uids.add(uid)
            ts = float(ll[5])
            info = ll[4].decode('gbk')
            qs.append([uid, ts, info])
    endpoint = RPC_LOCAL_PROXY

    get_fast_transport(endpoint)

    protocol = get_service_protocol(service, fast=True)
    client = Client(protocol)

    times_ar = {}
    times_topic = {}
    times_tags = {}
    times_list = {}
    times_news = {}
    ar_ecps = []
    rl_ecps = []
    # begin = True

    print len(qs)
    shuffle(qs)

    for uid, ts, info in qs[:num]:
        time.sleep(1)  # throttle so we don't hammer the service
        # if uid == 3024070:
        #     begin = True
        # if not begin:
        #     continue
        #  test article_recommend
        input = json.dumps([{'user_id': uid, 'timestamp': ts}])
        t1 = time.time()
        try:
            output = client.article_recommend(input)
        except Exception as e:
            output = json.dumps({'output': []})
            ar_ecps.append([uid, e])

        t2 = time.time()
        times_ar[uid] = t2 - t1
        print "========ar============"
        print uid, ts, info
        try:
            print json.loads(output)["output"][0]["title"]
        except:
            pass

        # test recommend_topn_topcs
        input = json.dumps({'user_id': uid})
        t1 = time.time()
        output = json.loads(client.recommend_topn_topic(input))['output']
        t2 = time.time()
        times_topic[uid] = t2 - t1
        if output:
            print "==========recommend topics========"
            for id in output:
                title = nat_get_title('topic_' + str(id))
                print uid, id, title

        # test recommend_list
        input = json.dumps({'user_id': uid, 'timestamp': ts})

        t1 = time.time()
        try:
            output = client.recommend_list(input)
        except Exception as e:
            output = json.dumps({'output': []})
            rl_ecps.append([uid, e])
        t2 = time.time()
        times_list[uid] = t2 - t1

        output = json.loads(output)['output']
        if output:
            print "========recommend_list=========="
            for item in output:
                print uid, item['id'], item['type'], item['title']

        # test recommend tags
        input = json.dumps({
            'user_id': uid,
        })
        t1 = time.time()
        output = client.recommend_tags(input)
        t2 = time.time()
        times_tags[uid] = t2 - t1
        output = json.loads(output)['output']
        words = output['words']
        plan = output['plan']
        print "=======recommend tags=========="
        print uid
        last_query = user_last_query(uid)
        print "last_query", last_query
        print "words", '-'.join(words)
        for item in plan:
            print item['name'], item['url']

        # test recommend_news
        input = json.dumps({'user_id': uid, 'top_n': 2})

        t1 = time.time()
        output = client.recommend_news(input)
        t2 = time.time()
        times_news[uid] = t2 - t1
        output = json.loads(output)
        ids = output['ids']
        titles = [nat_get_title('news_' + str(id)) for id in ids]
        print "=======recommend news=========="
        print uid
        for i, id in enumerate(ids):
            print id, titles[i]

    print "mean time ar", sum(times_ar.values()) / len(times_ar)
    s_times = sorted(times_ar.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t

    print '---------'

    print "mean time recommend topic", sum(
        times_topic.values()) / len(times_topic)
    s_times = sorted(times_topic.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t

    print '---------'

    print "mean time recommend list", sum(
        times_list.values()) / len(times_list)
    s_times = sorted(times_list.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t

    print '---------'

    print "mean time recommend tags", sum(
        times_tags.values()) / len(times_tags)
    s_times = sorted(times_tags.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t

    print '---------'

    print "mean time recommend news", sum(
        times_news.values()) / len(times_news)
    s_times = sorted(times_news.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t

    print '---------'

    for u, e in ar_ecps:
        print " ar exceptions", u, e

    for u, e in rl_ecps:
        print "rl exceptions", u, e
Example 7
def main1():
    uids = []

    # collect all uids
    for i in (0, 1, 2, 3):
        uid_filename = get_parti_uid_filename(part=i, mode='news')
        with open(uid_filename, 'r') as f:
            ls = f.readlines()
            t_uids = [int(item.strip('\n')) for item in ls]
            uids.extend(t_uids)

    output_filename = '20180312_user_event_and_recommend_news.csv'
    yesterday_begin, yesterday_end = get_yesterday_timestamp()
    yesterday_begin = int(yesterday_begin * 1000)
    yesterday_end = int(yesterday_end * 1000)


    fo = open(output_filename, 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'is_app_user', 'event_datetime', 'event_type', 'event_obj',
        'recommended_news'
    ]
    csvwriter.writerow(first_line)

    all_cnt = 0
    good_cnt = 0

    shuffle(uids)
    for uid in uids[:1000]:
        all_cnt += 1
        is_app = is_app_user(uid)
        print '+' * 10, uid, '+' * 10
        user_action_list = cy_time_event_one_user_kernel2(
            uid, yesterday_begin, yesterday_end)
        recommended_news_ids = get_caled_user_topn_news(uid)
        recommended_news_ids = new_newsids_check(recommended_news_ids, 2)
        if recommended_news_ids:
            good_cnt += 1

        cnt = 0
        for i in range(max(len(user_action_list),
                           len(recommended_news_ids))):
            if cnt == 0:
                user_id = str(uid)
            else:
                user_id = ''
                is_app = ''
            try:
                event_datetime = timestamp2datetime(user_action_list[i][0] /
                                                    1000.0)
                event_type = user_action_list[i][2]
                event_obj = user_action_list[i][1]
                if event_type == 'vn':
                    title = nat_get_title('news_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                elif event_type == 'vt':
                    title = nat_get_title('topic_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                else:
                    event_obj_str = event_obj

            except (IndexError, TypeError):  # past the actions list, or a title lookup failed
                event_datetime = ''
                event_obj_str = ''
                event_type = ''

            try:
                recommended_news_id = recommended_news_ids[i]
                title = nat_get_title('news_' + str(recommended_news_id))
                recommend_str = str(recommended_news_id) + '|' + title

            except (IndexError, TypeError):  # past the recommendations, or a title lookup failed
                recommend_str = ''

            line = convert2gbk([
                user_id,
                str(is_app), event_datetime, event_type, event_obj_str,
                recommend_str
            ])
            csvwriter.writerow(line)

            cnt += 1

    line = ['all', 'good']
    csvwriter.writerow(line)
    csvwriter.writerow([str(all_cnt), str(good_cnt)])
    fo.close()
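The padding loop in main1 indexes two lists of different lengths and relies on try/except to fill the shorter one. An equivalent pairing sketched with itertools.izip_longest (Python 2), which pads with None instead of raising IndexError:

from itertools import izip_longest

def paired_rows(user_action_list, recommended_news_ids):
    # hypothetical rewrite of the padding loop: izip_longest yields None
    # for the exhausted list, replacing the IndexError-driven try/except
    for action, news_id in izip_longest(user_action_list,
                                        recommended_news_ids):
        event_type = '' if action is None else action[2]
        rec = '' if news_id is None else str(news_id)
        yield event_type, rec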