import csv
import json
import random
import sys
import time
from random import shuffle

# Project helpers used below (convert2gbk, nat_get_title, nat_get_digest,
# cy_time_event_kernel_test, get_news_tags_from_solr, RPC_LOCAL_PROXY, ...)
# are assumed to be imported at module level elsewhere in this file.


def test17():
    from general_utils.solr_utils import nat_get_title
    fin = sys.argv[2]
    mtype = 'topic' if 'topic' in fin else 'news'
    fon = mtype + '_nearest_top10.csv'
    fo = open(fon, 'w')
    csvwriter = csv.writer(fo)
    first_line = ['id', 'title', 'top_id', 'top_title', 'score']
    csvwriter.writerow(first_line)
    cnt = 0
    with open(fin, 'r') as f:
        for l in f:
            if cnt > 1000:  # cap the run at ~1000 input lines
                break
            cnt += 1
            l = l.strip('\n')
            this_dict = json.loads(l)
            main_id = this_dict['id']
            main_title = nat_get_title(mtype + '_' + str(main_id))
            if not main_title:
                continue
            top = this_dict['top'][:10]
            for subordinate_id, score in top:
                subordinate_title = nat_get_title(mtype + '_' + str(subordinate_id))
                row = [
                    str(main_id), main_title,
                    str(subordinate_id), subordinate_title,
                    str(score)
                ]
                row = convert2gbk(row)
                csvwriter.writerow(row)
    fo.close()
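
# test17 expects a JSON-lines input file where each line carries an item id
# and its nearest neighbours as [id, score] pairs. A minimal sketch that
# writes one such line; the helper name, path, ids and scores are made-up
# illustrations, not part of this module.
def _write_test17_sample_input(path='topic_nearest_sample.json'):
    with open(path, 'w') as f:
        f.write(json.dumps({'id': 101, 'top': [[202, 0.93], [303, 0.88]]}) + '\n')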
def main6(test_uid=None):
    # Test recommend_topics.
    from recommend.manager.recommend_resource import Recommend_topics
    from recommend.manager.recommend_topic_data_helper import parse_user_info as parse_user_info2
    now = time.time()
    if test_uid == "n":
        test_uid = None
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    fo = open("20180102_rt.csv", "w")
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'tags', 'sp', 'topicid', 'score', 't_title', 't_tags']
    csvwriter.writerow(first_line)
    times = {}
    for uid in data_dict.keys():
        t1 = time.time()
        topic_ids, user_info, score_dict = Recommend_topics(uid, 5, now, True)
        t2 = time.time()
        times[uid] = t2 - t1
        if not user_info:
            continue
        tags = user_info['tags']
        sp = user_info['special_population']
        for x in topic_ids:
            title = nat_get_title('topic_' + str(x))
            score = score_dict['topic_' + str(x)]
            t_tags = get_news_tags_from_solr("r_topic_" + str(x))
            row = [
                str(uid), '-'.join(tags), sp,
                str(x), str(score), title, '-'.join(t_tags)
            ]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()
    # Print the 10 slowest recommendation calls.
    s_times = sorted(times.iteritems(), key=lambda x: x[1], reverse=True)[:10]
    for x, y in s_times:
        print x, y
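
# Minimal usage sketch for main6. Passing "n" (e.g. forwarded from a CLI
# argument) appears to mean "no specific test uid", i.e. sample events for
# all users in the 12000-second window; a concrete uid restricts the test to
# that user. The calls below are illustrative only.
def _run_main6_sketch():
    main6("n")        # all users seen in the window
    # main6(3024070)  # or a single (hypothetical) uid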
def rank_news(news_ids, solr_score_dict, score_dict, v_score_dict, uid,
              u_vecs, u_weights, u_bp_words):
    t1 = time.time()
    bad_ids = user_half_year_newsids(uid)  # set of int news ids already seen
    t2 = time.time()
    print "user_half_year_newsids time", t2 - t1
    titles = set()
    cnt = 0
    all_n_tags_cnt = 0
    all_tags_vecs = {}  # tag -> vec
    for id in news_ids:
        print '=' * 20, id
        if cnt >= 10:
            break
        _, true_id = id.split('_')
        if int(true_id) in bad_ids:
            continue
        # De-duplicate by title. An empty title may just mean the DB lookup
        # failed, so only skip when the duplicate title is non-empty,
        # to keep a DB error from killing the push.
        title = nat_get_title(id)
        if title in titles and len(title) > 0:
            continue
        titles.add(title)
        digest = nat_get_digest(id)
        # Extract article tags.
        t3 = time.time()
        is_good_article, n_tags, n_weights, n_cates, empty_title, title_tags = \
            weighted_news_tags3(title, digest)  # n_weights is a dict
        t4 = time.time()
        print "weighted_news_tags3 time", id, t4 - t3
        try:
            all_n_tags_cnt += len(n_tags)
        except:
            pass
        # Drop articles with too few tags.
        if not is_good_article or len(n_tags) <= 1:
            continue
        t5 = time.time()
        print "tag filter time", t5 - t4
        # Fetch vectors only for tags not seen yet, then merge them into
        # all_tags_vecs.
        new_tags = set(n_tags) - set(all_tags_vecs.keys())
        new_tags_vecs_dict = get_vec_dict_norm_ndarray_redis(new_tags)
        all_tags_vecs.update(new_tags_vecs_dict)
        t6 = time.time()
        print "get_vec_dict_norm_ndarray_redis time", t6 - t5
        # Compute user/article similarity.
        # score = vecs_similarity2(u_vecs, u_weights, n_vecs, n_weights)
        score = vecs_similarity3(u_vecs=u_vecs,
                                 u_weights=u_weights,
                                 n_vecs_dict=all_tags_vecs,
                                 n_weights_dict=n_weights,
                                 n_tags=n_tags)
        # Body-part match score.
        bp_score = user_news_tag_bodypart_match2(u_bp_words, title_tags)
        print 'bp_score', bp_score
        print 'score0', score
        print 'ntags', '-'.join(n_tags)
        t7 = time.time()
        print "vecs_similarity3 time", t7 - t6
        if empty_title:
            score *= 0.5  # down-weight articles whose title has no entity words
        solr_score = solr_score_dict[id]
        v_score_dict[id] = score * bp_score
        score_dict[id] = (0.7 * score + 0.3 * solr_score) * bp_score
        cnt += 1
    print "all_n_tags_cnt news", all_n_tags_cnt
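
# A minimal sketch of the final ranking formula used in rank_news, pulled out
# as a pure helper for clarity. The 0.7/0.3 blend weights and the 0.5 title
# penalty come from the code above; the helper name itself is hypothetical
# and is not called by rank_news.
def _blend_score_sketch(sim_score, solr_score, bp_score, empty_title=False):
    # Blend semantic similarity with the solr score, then scale by the
    # body-part match score; entity-free titles are halved first.
    if empty_title:
        sim_score *= 0.5
    return (0.7 * sim_score + 0.3 * solr_score) * bp_score

# Example: sim=0.8, solr=0.5, bp=1.0 -> 0.7*0.8 + 0.3*0.5 = 0.71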
def user_view_action_texts(uid, num=10):
    ts_dict = user_last_view_actions(uid, num=num)
    # nat_get_title / nat_get_digest may return an empty value on a failed
    # lookup, so fall back to '' before concatenating.
    text_dict = dict([(key, (nat_get_title(key) or '') + ' ' +
                       (nat_get_digest(key) or ''))
                      for key in ts_dict])
    return text_dict, ts_dict
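
# Minimal usage sketch for user_view_action_texts; the uid is hypothetical.
# text_dict maps an item key such as 'news_123' to "<title> <digest>", and
# ts_dict maps the same keys to view timestamps.
def _demo_user_view_action_texts(uid=3024070):
    text_dict, ts_dict = user_view_action_texts(uid, num=5)
    for key, ts in ts_dict.iteritems():
        print key, ts, text_dict[key]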
def g2():
    # Test recommend_news.
    '''
    :return:
    '''
    from recommend.manager.feed_data_helper import recommend_news_kernel
    from general_utils.solr_utils import nat_get_title
    test_uids = get_one_day_uid_from_file('log_event_20180222')
    print "test_uids num", len(test_uids)
    # Shuffle and take a sample of 1000 uids.
    random.shuffle(test_uids)
    selected_uids = test_uids[:1000]
    fo = open('20180321_rn_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'utags', 'user_bs', 'user_qa', 'user_look_title',
        'news_id', 'title', 'score'
    ]
    csvwriter.writerow(first_line)
    total_time = {}
    cnt_all = 0
    cnt_good = 0
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        if not is_app:
            continue
        cnt_all += 1
        t1 = time.time()
        recommend_res = recommend_news_kernel(uid, True)
        t2 = time.time()
        total_time[uid] = t2 - t1
        parsed_user_info = recommend_res['parsed_user_info']
        utags = parsed_user_info['weight_dict'].keys()
        user_info_list = recommend_res['user_info_list']
        bs_text_list = []
        qa_text_list = []
        view_news_title_list = []
        view_topic_title_list = []
        for ts, obj, action_type in user_info_list:
            if action_type in ('bs', 'sd'):
                bs_text_list.append(obj)
            elif action_type == 'qa':
                qa_text_list.append(obj)
            elif action_type == 'vt':
                title = nat_get_title('topic_' + str(obj))
                view_topic_title_list.append(title)
            elif action_type == 'vn':
                title = nat_get_title('news_' + str(obj))
                view_news_title_list.append(title)
        user_bs = '~'.join([str(item) for item in bs_text_list])
        user_qa = '~'.join([str(item) for item in qa_text_list])
        user_look_title = '~'.join([
            str(item)
            for item in view_news_title_list + view_topic_title_list
        ])
        title_dict = recommend_res['title_dict']
        ids_list = recommend_res['ids']
        score_dict = recommend_res['v_score_dict']
        # Flatten the per-batch id lists, prefixing each id with its batch index.
        ids = [['%s-news_' % i + str(x) for x in ids]
               for [i, ids] in enumerate(ids_list)]
        ids1 = []
        for x in ids:
            ids1.extend(x)
        ids = ids1
        tcnt = 0
        if ids:
            cnt_good += 1
        for id in ids:
            id0 = id.split('-')[1]
            title = title_dict[id0]
            score = score_dict[id0]
            if tcnt == 0:
                line = convert2gbk([
                    str(uid), username, '~'.join(utags), user_bs, user_qa,
                    user_look_title, str(id), title, score
                ])
            else:
                # Later rows for the same uid leave the uid/username cells blank.
                line = convert2gbk([
                    ' ', ' ', '~'.join(utags), user_bs, user_qa,
                    user_look_title, str(id), title, score
                ])
            csvwriter.writerow(line)
            tcnt += 1
    # Timing summary rows.
    min_t = min(total_time.values())
    max_t = max(total_time.values())
    mean_t = sum(total_time.values()) / len(total_time)
    line = ['min', 'max', 'mean']
    csvwriter.writerow(line)
    line = [str(min_t), str(max_t), str(mean_t)]
    csvwriter.writerow(line)
    sorted_total_time = sorted(total_time.iteritems(),
                               key=lambda x: x[1],
                               reverse=True)
    for uid, t in sorted_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)
    line = ['all_app_user_num', 'good_add_user_num']
    csvwriter.writerow(line)
    line = [str(cnt_all), str(cnt_good)]
    csvwriter.writerow(line)
    fo.close()
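
# The id-flattening loop in g2 can be written more compactly with itertools;
# a minimal sketch, assuming ids_list is a list of lists of ids as above.
# The helper name is hypothetical and g2 keeps its original inline form.
def _flatten_prefixed_ids_sketch(ids_list):
    from itertools import chain
    # e.g. [[12, 34], [56]] -> ['0-news_12', '0-news_34', '1-news_56']
    return list(chain.from_iterable(
        ['%s-news_%s' % (i, x) for x in ids]
        for i, ids in enumerate(ids_list)))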
def test_many():
    file_name = sys.argv[2]
    num = int(sys.argv[3])
    qs = []
    uids = set()
    with open(file_name, 'r') as f:
        for l in f:
            ll = l.strip('\n').split(',')
            # print ll
            if len(ll) != 12:
                continue
            if ll[0] == 'uid':  # skip the header row
                continue
            uid = int(ll[0])
            if uid in uids:
                continue
            uids.add(uid)
            ts = float(ll[5])
            info = ll[4].decode('gbk')
            qs.append([uid, ts, info])
    endpoint = RPC_LOCAL_PROXY
    get_fast_transport(endpoint)
    protocol = get_service_protocol(service, fast=True)
    client = Client(protocol)
    times_ar = {}
    times_topic = {}
    times_tags = {}
    times_list = {}
    times_news = {}
    ar_ecps = []
    rl_ecps = []
    # begin = True
    print len(qs)
    shuffle(qs)
    for uid, ts, info in qs[:num]:
        time.sleep(1)  # sleep to avoid overloading the service
        # if uid == 3024070:
        #     begin = True
        # if not begin:
        #     continue
        # test article_recommend
        input = json.dumps([{'user_id': uid, 'timestamp': ts}])
        t1 = time.time()
        if True:  # try:
            output = client.article_recommend(input)
        # except Exception, e:
        #     ar_ecps.append([uid, e])
        t2 = time.time()
        times_ar[uid] = t2 - t1
        print "========ar============"
        print uid, ts, info
        try:
            print json.loads(output)["output"][0]["title"]
        except:
            pass
        # test recommend_topn_topic
        input = json.dumps({'user_id': uid})
        t1 = time.time()
        output = json.loads(client.recommend_topn_topic(input))['output']
        t2 = time.time()
        times_topic[uid] = t2 - t1
        if output:
            print "==========recommend topics========"
            for id in output:
                title = nat_get_title('topic_' + str(id))
                print uid, id, title
        # test recommend_list
        input = json.dumps({'user_id': uid, 'timestamp': ts})
        t1 = time.time()
        if True:  # try:
            output = client.recommend_list(input)
        # except Exception, e:
        #     output = json.dumps({'output': []})
        #     rl_ecps.append([uid, e])
        t2 = time.time()
        times_list[uid] = t2 - t1
        output = json.loads(output)['output']
        if output:
            print "========recommend_list=========="
            for item in output:
                print uid, item['id'], item['type'], item['title']
        # test recommend_tags
        input = json.dumps({
            'user_id': uid,
        })
        t1 = time.time()
        output = client.recommend_tags(input)
        t2 = time.time()
        times_tags[uid] = t2 - t1
        output = json.loads(output)['output']
        words = output['words']
        plan = output['plan']
        print "=======recommend tags=========="
        print uid
        last_query = user_last_query(uid)
        print "last_query", last_query
        print "words", '-'.join(words)
        for item in plan:
            print item['name'], item['url']
        # test recommend_news
        input = json.dumps({'user_id': uid, 'top_n': 2})
        t1 = time.time()
        output = client.recommend_news(input)
        t2 = time.time()
        times_news[uid] = t2 - t1
        output = json.loads(output)
        ids = output['ids']
        titles = [nat_get_title('news_' + str(id)) for id in ids]
        print "=======recommend news=========="
        print uid
        for i, id in enumerate(ids):
            print id, titles[i]
    print "mean time ar", sum(times_ar.values()) / len(times_ar)
    s_times = sorted(times_ar.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t
    print '---------'
    print "mean time recommend topic", sum(
        times_topic.values()) / len(times_topic)
    s_times = sorted(times_topic.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t
    print '---------'
    print "mean time recommend list", sum(
        times_list.values()) / len(times_list)
    s_times = sorted(times_list.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t
    print '---------'
    print "mean time recommend tags", sum(
        times_tags.values()) / len(times_tags)
    s_times = sorted(times_tags.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t
    print '---------'
    print "mean time recommend news", sum(
        times_news.values()) / len(times_news)
    s_times = sorted(times_news.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_times[:10]:
        print uid, t
    print '---------'
    for u, e in ar_ecps:
        print "ar exceptions", u, e
    for u, e in rl_ecps:
        print "rl exceptions", u, e
def main1():
    uids = []
    # Collect all uids from the four partition files.
    for i in (0, 1, 2, 3):
        uid_filename = get_parti_uid_filename(part=i, mode='news')
        with open(uid_filename, 'r') as f:
            ls = f.readlines()
        t_uids = [int(item.strip('\n')) for item in ls]
        uids.extend(t_uids)
    output_filename = '20180312_user_event_and_recommend_news.csv'
    yesterday_begin, yesterday_end = get_yesterday_timestamp()
    # Convert seconds to milliseconds to match the event log timestamps.
    yesterday_begin = int(yesterday_begin * 1000)
    yesterday_end = int(yesterday_end * 1000)
    fo = open(output_filename, 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'is_app_user', 'event_datetime', 'event_type', 'event_obj',
        'recommended_news'
    ]
    csvwriter.writerow(first_line)
    all_cnt = 0
    good_cnt = 0
    shuffle(uids)
    for uid in uids[:1000]:
        all_cnt += 1
        is_app = is_app_user(uid)
        print '+' * 10, uid, '+' * 10
        user_action_list = cy_time_event_one_user_kernel2(
            uid, yesterday_begin, yesterday_end)
        recommended_news_ids = get_caled_user_topn_news(uid)
        recommended_news_ids = new_newsids_check(recommended_news_ids, 2)
        if recommended_news_ids:
            good_cnt += 1
        cnt = 0
        # Write events and recommendations side by side; the shorter list is
        # padded with blank cells via the try/except blocks below.
        for i in range(max([len(user_action_list),
                            len(recommended_news_ids)])):
            if cnt == 0:
                user_id = str(uid)
            else:
                user_id = ''
                is_app = ''
            try:
                event_datetime = timestamp2datetime(
                    user_action_list[i][0] / 1000.0)
                event_type = user_action_list[i][2]
                event_obj = user_action_list[i][1]
                if event_type == 'vn':
                    title = nat_get_title('news_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                elif event_type == 'vt':
                    title = nat_get_title('topic_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                else:
                    event_obj_str = event_obj
            except:
                event_datetime = ''
                event_obj_str = ''
                event_type = ''
            try:
                recommended_news_id = recommended_news_ids[i]
                title = nat_get_title('news_' + str(recommended_news_id))
                recommend_str = str(recommended_news_id) + '|' + title
            except:
                recommend_str = ''
            line = convert2gbk([
                user_id, str(is_app), event_datetime, event_type,
                event_obj_str, recommend_str
            ])
            csvwriter.writerow(line)
            cnt += 1
    line = ['all', 'good']
    csvwriter.writerow(line)
    csvwriter.writerow([str(all_cnt), str(good_cnt)])
    fo.close()
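
# The padded side-by-side loop in main1 can also be expressed with
# itertools.izip_longest; a minimal sketch, assuming the same two lists as
# above. The helper name is hypothetical and is not used by main1.
def _paired_rows_sketch(user_action_list, recommended_news_ids):
    from itertools import izip_longest
    # Yields (action_or_None, news_id_or_None), padding the shorter list.
    for action, news_id in izip_longest(user_action_list,
                                        recommended_news_ids):
        yield action, news_id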