def get_sp_duration_active_userid(begin, end):
    '''
    Collect the ids of users that have at least one row in the
    cy_real_time_event HBase table inside a time window.

    The whole table is scanned; row keys are split on '|' into
    (uid, timestamp, event_type). Rows whose uid is not an integer are
    skipped.

    :param begin: start of the window (normalised by ensure_second_timestamp)
    :param end: end of the window (normalised by ensure_second_timestamp)
    :return: list of distinct integer user ids (order not defined)
    '''
    # Normalise the input timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    uids = set()
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan():
            uid, timestamp, event_type = key.split('|')
            try:
                uid = int(uid)
            except (TypeError, ValueError):
                # Non-numeric uid in the row key: ignore the row.
                continue
            ts = ensure_second_timestamp(timestamp)
            if ts < begin or ts > end:
                continue
            uids.add(uid)
    finally:
        # Release the connection even when the scan fails part-way.
        connection.close()
    return list(uids)
def get_user_recent_views(uid, now=None, lookback=3 * 24 * 86400.0, num=None):
    '''
    Return a user's recent news/topic view events from cy_real_time_event.

    :param uid: user id; rows are selected with the row prefix "<uid>|"
    :param now: end of the window; defaults to the current time
    :param lookback: window length in seconds, counted back from ``now``
    :param num: if truthy, stop after this many actions were collected
    :return: list of [timestamp, action_type, item_id], in reverse scan order
    '''
    if now:
        end = ensure_second_timestamp(now)
    else:
        end = time.time()
    begin = end - lookback

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        # Materialise the scan so it can be walked backwards below.
        rows = list(table.scan(row_prefix=str(uid) + '|'))
    finally:
        connection.close()

    focused_type = ('view_news', 'view_topics', 'view_topic')
    actions = []
    for key, data in reversed(rows):
        if num and len(actions) >= num:
            # Only the requested number of actions is wanted.
            break
        _, ts, action_type = key.split('|')
        ts = ensure_second_timestamp(ts)
        # Keep only events inside [begin, end]. The previous condition
        # (`end < ts < begin`) could never be true because begin <= end,
        # so nothing was ever filtered out.
        if ts < begin or ts > end:
            continue
        if action_type in focused_type:
            actions.append([
                ts, action_type,
                int(data[CY_REAL_TIME_EVENT_ATTR_MAP[action_type]])
            ])
    return actions
def get_user_qa_content(uid, begin, end):
    '''
    Return the first text message of every problem a user created in a
    time window, read from the medical DB.

    The original author marked this function as no longer wanted in
    favour of reading the data from HBase.

    :param uid: user id, interpolated into the SQL text
    :param begin: window start (exclusive)
    :param end: window end (exclusive)
    :return: list of text values, possibly empty
    '''
    window_start = timestamp2datetime(ensure_second_timestamp(begin))
    window_end = timestamp2datetime(ensure_second_timestamp(end))
    texts = []

    problem_sql = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
                  % (uid, window_start, window_end)
    problem_rows = get_medicaldb_handler().do_one(problem_sql)
    if problem_rows is None or len(problem_rows) == 0:
        return texts

    for problem_row in problem_rows:
        problem_id = problem_row[0]
        content_sql = 'select content from ask_problemcontent where problem_id=%s;' % problem_id
        content_rows = get_medicaldb_handler().do_one(content_sql)
        if content_rows is None or len(content_rows) == 0:
            continue
        # The content column holds a JSON list; only its first entry is used.
        first_entry = json.loads(content_rows[0][0])[0]
        if first_entry['type'] == 'text':
            texts.append(first_entry['text'])
    return texts
def get_sp_duration_valid_user_id(begin, end):
    '''
    Collect the ids of users with a big_search or free_problem_create
    event in cy_real_time_event between begin and end.

    The whole table is scanned; row keys are split on '|' into
    (uid, timestamp, event_type).

    :param begin: start of the window (normalised by ensure_second_timestamp)
    :param end: end of the window (normalised by ensure_second_timestamp)
    :return: set of integer user ids
    '''
    # Normalise the input timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    event_type_list = ["big_search", "free_problem_create"]
    valid_uids = set()
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan():
            uid, timestamp, event_type = key.split('|')
            if event_type not in event_type_list:
                continue
            ts = ensure_second_timestamp(timestamp)
            if ts < begin or ts > end:
                continue
            valid_uids.add(int(uid))
    finally:
        # Release the connection even when the scan fails part-way.
        connection.close()
    return valid_uids
def get_feed_showlist_dict(begin, end):
    '''
    Build a date -> [news_id, ...] mapping of online news dated between
    begin and end whose view count reaches the display threshold.

    :param begin: timestamp of the first day (second precision expected)
    :param end: timestamp of the last day (second precision expected)
    :return: defaultdict(list) keyed by the row's date value; empty when
        the query returns nothing
    '''
    # Below 5000 views an item is treated as never shown, otherwise shown.
    view_time_th = 5000

    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    # Convert both bounds to dates for the query.
    begin_d = timestamp2date(begin)
    end_d = timestamp2date(end)

    sql = 'select id,date,view_times from news_healthnews where is_online=1 and date<="%s" and date>="%s";' % (
        end_d, begin_d)
    o = get_newsdb_handler().dbhandler.do_one(sql)

    date_newsid_dict = defaultdict(list)
    if o is None:
        # Other callers of do_one in this code base guard against None;
        # do the same here instead of failing in the loop.
        return date_newsid_dict
    for item in o:
        news_id = int(item[0])
        date = item[1]
        view_times = int(item[2])
        if view_times < view_time_th:
            continue
        date_newsid_dict[date].append(news_id)
    return date_newsid_dict
def get_qa_uids(begin, end):
    '''
    Return the ids of all users that created a problem (ask_problem row)
    strictly between begin and end.

    :param begin: window start timestamp
    :param end: window end timestamp
    :return: set of integer user ids; empty when the query returns nothing
    '''
    begin_dt = timestamp2datetime(ensure_second_timestamp(begin))
    end_dt = timestamp2datetime(ensure_second_timestamp(end))
    sql = 'select distinct user_id from ask_problem where created_time>"%s" and created_time<"%s";' % (
        begin_dt, end_dt)
    o = get_medicaldb_handler().dbhandler.do_one(sql)

    uids = set()
    if o is None:
        # Other callers of do_one in this code base guard against None;
        # do the same here instead of failing in the loop.
        return uids
    for item in o:
        uids.add(int(item[0]))
    return uids
def test3():
    '''
    Ad-hoc script: for every failed-recommendation log line in the file
    named by sys.argv[2], find the smallest lookback window (in minutes)
    within which the user had a trigger event, then print a summary.

    Expected log line shape (example from the original author):
    2017-11-12 17:05:33,289 INFO recommend_resource.Recommend Line:52 failed in recommend==user_info is None ===uid=128243057===========
    '''
    from collections import defaultdict

    log_path = sys.argv[2]
    lookback_list = [5, 10, 15, 20, 30, 60, 120]
    trigger_count = {"big_search": 0, "free_problem_create": 0}
    res = defaultdict(list)

    with open(log_path, 'r') as log_file:
        for log_line in log_file:
            # The text before the first comma is the log time, the digits
            # after "==uid=" are the user id.
            dt = log_line.split(',')[0]
            uid = log_line.split("==uid=")[1].split('=')[0]
            end = ensure_second_timestamp(dt)
            begin_list = [end - minutes * 61.0 for minutes in lookback_list]
            end += 5.0
            index, trigger = user_time_event2(uid, end, begin_list)
            if trigger:
                trigger_count[trigger] += 1
            res[index].append([uid, dt, trigger])

    # Index len(begin_list) means "no window matched"; it is reported as 0.
    lookback_list.append(0)
    print(res[0])
    for index in res:
        print("%s %s %s" % (lookback_list[index], "分钟内可以召回",
                            len(res[index])))
    for trigger in trigger_count:
        print("%s %s" % (trigger, trigger_count[trigger]))
def user_time_event(uid, begin, end):
    '''
    Debug helper: print every cy_real_time_event row of a user and mark
    the rows whose timestamp lies inside [begin, end].

    :param uid: user id; rows are selected with the row prefix "<uid>|"
    :param begin: window start (normalised by ensure_second_timestamp)
    :param end: window end (normalised by ensure_second_timestamp)
    :return: None
    '''
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=30000)
    try:
        table = connection.table("cy_real_time_event")
        for key, value in table.scan(row_prefix=str(uid) + '|'):
            print("all key %s" % (key,))
            _, ts, event_type = key.split('|')
            ts = ensure_second_timestamp(ts)
            print("all time %s" % (timestamp2datetime(ts),))
            if begin <= ts <= end:
                print("shoot key %s %s" % (key, value))
                print("shoot time %s" % (timestamp2datetime(ts),))
    finally:
        # The connection used to be left open.
        connection.close()
def user_time_event2(uid, end, begin_list):
    '''
    Find the first lookback window that contains a trigger event
    (big_search / free_problem_create) of the user.

    Rows are walked in reverse scan order and events later than ``end``
    are ignored. Callers in this file build the arguments like this:

        end = ensure_second_timestamp(end)
        begin_list = [end - x * 61.0 for x in lookback_list]
        end += 5.0

    where lookback_list must be increasing integers (original author's
    note), e.g. [5, 10, 15, 20, 30, 60].

    :param uid: user id; rows are selected with the row prefix "<uid>|"
    :param end: latest timestamp to consider
    :param begin_list: window start timestamps, tried in order
    :return: (index into begin_list, event_type) for the first match, or
        (len(begin_list), '') when no event falls inside any window
    '''
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=30000)
    try:
        table = connection.table("cy_real_time_event")
        rows = list(table.scan(row_prefix=str(uid) + '|'))
    finally:
        # The connection used to be left open.
        connection.close()

    for key, value in reversed(rows):
        _, ts, event_type = key.split('|')
        if event_type not in ("big_search", "free_problem_create"):
            continue
        ts = ensure_second_timestamp(ts)
        if ts > end:
            # Happened after the point of interest.
            continue
        for i, begin in enumerate(begin_list):
            if ts >= begin:
                return i, event_type
    return len(begin_list), ''
def cy_time_event_one_user_kernel(uid, begin, end, event_type_list=None):
    '''
    Collect one user's events between begin and end from
    cy_real_time_event (used online; the window is typically short).

    :param uid: user id; rows are selected with the row prefix "<uid>|"
    :param begin: window start, compared directly with the normalised
        row timestamp
    :param end: window end, compared the same way
    :param event_type_list: event types to keep; defaults to
        ["big_search", "free_problem_create"]
    :return: dict with "last_event" ([event_type, info] or None),
        "last_event_time" (0 when nothing matched) and, per event type
        returned by event_info2, a list of ``info + [timestamp]`` entries
    '''
    info = {"last_event": None, "last_event_time": 0}
    if not event_type_list:
        event_type_list = ["big_search", "free_problem_create"]

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=str(uid) + '|'):
            # The uid part of the key is not needed; do not clobber the
            # ``uid`` argument with it.
            _, timestamp, event_type = key.split('|')
            if event_type not in event_type_list:
                continue
            timestamp = ensure_second_timestamp(timestamp)
            if begin <= timestamp <= end:
                event_type, t_info = event_info2(data, event_type)
                if event_type in info:
                    info[event_type].append(t_info + [timestamp])
                else:
                    info[event_type] = [t_info + [timestamp]]
                if timestamp > info['last_event_time']:
                    info['last_event_time'] = timestamp
                    info['last_event'] = [event_type, t_info]
    finally:
        # Release the connection even when the scan or parsing fails.
        connection.close()
    return info
def get_last_login_uids(begin, end):
    '''
    Query Solr (solr_up) for the ids of documents whose last_login lies
    in [begin, end]. Speed is not a concern here (original author's note).

    :param begin: window start; converted to integer milliseconds
    :param end: window end; converted to integer milliseconds
    :return: list of the 'id' field of every hit (at most 1000000 rows)
    '''
    # Solr stores last_login in milliseconds.
    begin_ms = int(1000 * ensure_second_timestamp(begin))
    end_ms = int(1000 * ensure_second_timestamp(end))

    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.set('fl', ['id'])
    solr_query.add('fq', 'last_login:[%s TO %s]' % (begin_ms, end_ms))
    solr_query.set('rows', 1000000)

    uids = []
    for doc in solr_up.search(**solr_query.get_query_dict()):
        uids.append(doc['id'])
    return uids
def get_row_key_from_solr2(uid, begin, end, col_name):
    '''
    Return the ids of a user's cy_event entries of one kind whose
    event_time lies in [begin, end].

    :param uid: user id passed to the per-kind lookup helper
    :param begin: window start (normalised by ensure_second_timestamp)
    :param end: window end (normalised by ensure_second_timestamp)
    :param col_name: 'search_event', 'news_profile' or 'topic_profile'
    :return: list of item['id'] values inside the window
    :raises ValueError: if col_name is not one of the supported names
        (previously this surfaced as an UnboundLocalError)
    '''
    if col_name == 'search_event':
        res = get_cy_event_row_key_search(uid)
    elif col_name == 'news_profile':
        res = get_cy_event_row_key_news(uid)
    elif col_name == 'topic_profile':
        res = get_cy_event_row_key_topic(uid)
    else:
        raise ValueError('unsupported col_name: %r' % (col_name,))

    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    selected_rowkey_list = []
    for item in res:
        ts = ensure_second_timestamp(item['event_time'])
        if ts > end or ts < begin:
            continue
        selected_rowkey_list.append(item['id'])
    return selected_rowkey_list
def get_48h_data(now=None):
    '''
    Run cy_time_event_kernel over the window returned by
    get_48h_timestamps.

    Per the original author's notes, timestamps are HBase values divided
    by 1000, and with an explicit ``now`` the window covers the two days
    before it, excluding the day of ``now`` itself.

    :param now: optional reference timestamp; when falsy,
        get_48h_timestamps is called without arguments
    :return: whatever cy_time_event_kernel(begin, end) returns
    '''
    if not now:
        window = get_48h_timestamps()
    else:
        window = get_48h_timestamps(ensure_second_timestamp(now))
    begin, end = window
    return cy_time_event_kernel(begin, end)
def get_sp_duration_valid_user_data(begin, end, test_uid=None):
    '''
    Collect the big_search / free_problem_create events of every user
    active between begin and end.

    The window should not reach far back: per the original author,
    cy_real_time_event only keeps about 10 days of real-time data.

    :param begin: window start timestamp
    :param end: window end timestamp
    :param test_uid: if truthy, restrict the scan to this single user
    :return: dict uid (int) -> {'big_search': [...],
        'free_problem_create': [...]}, each entry being
        ``info + [timestamp]`` as produced with event_info2
    '''
    # Normalise the input timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    user_info0 = {}
    event_type_list = ["big_search", "free_problem_create"]
    # Include the '|' key delimiter so that the prefix "12" does not also
    # scan the rows of uid 123; the uid check below is kept as a safety net.
    row_prefix = str(test_uid) + '|' if test_uid else None

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=row_prefix):
            uid, timestamp, event_type = key.split('|')
            uid = int(uid)
            if test_uid and uid != int(test_uid):
                continue
            if event_type not in event_type_list:
                continue
            ts = ensure_second_timestamp(timestamp)
            if ts < begin or ts > end:
                continue
            # Gather the data per uid; no last_event is recorded here.
            if uid not in user_info0:
                user_info0[uid] = {'big_search': [], 'free_problem_create': []}
            event_type, t_info = event_info2(data, event_type)
            # NOTE(review): the raw key timestamp is stored here, not the
            # normalised ``ts`` -- confirm this is what consumers expect.
            user_info0[uid][event_type].append(t_info + [timestamp])
    finally:
        # Release the connection even when the scan or parsing fails.
        connection.close()
    return user_info0
def test1():
    '''
    Ad-hoc script: dump a user's events around a point in time.

    Command line: argv[2] = uid, argv[3] = centre timestamp,
    argv[4] = interval; the window is centre +/- interval * 61 seconds.
    '''
    uid = sys.argv[2]
    centre = ensure_second_timestamp(sys.argv[3])
    half_width = int(sys.argv[4]) * 61.0
    begin = centre - half_width
    end = centre + half_width
    print("begin %s" % (timestamp2datetime(begin),))
    print("end %s" % (timestamp2datetime(end),))
    user_time_event(uid, begin, end)
def Recommend_list(uid, num, end=None, pid=None, lookback=5 * 61.0):
    '''
    Recommend up to ``num`` materials for a user.

    :param uid: user id; must be convertible to int and not -1
    :param num: forwarded to Recommend_by_user_info as ``num``
    :param end: end of the event window; defaults to the current time
    :param pid: problem id; when truthy the user info comes from
        one_user_last_qa_info(pid) instead of the recent event window
    :param lookback: window length in seconds before ``end``
    :return: list like [{'id': 111, 'type': 'topic', 'title': 'xxxx'},
        {'id': 222, 'type': 'news', 'title': 'yyy'}, ...]; an empty list
        on a bad uid or when nothing could be recommended
    '''
    bad_return = []
    log_mark = "recommend_topn"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    try:
        uid = int(uid)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "bad uid"; a bare except would
        # also swallow KeyboardInterrupt / SystemExit.
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return
    if uid == -1:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    # Time window.
    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)
    begin = end - lookback
    # Push the end 5s forward in case HBase has not received the latest
    # real-time rows yet.
    end += 5.0

    if pid:
        # QA trigger: look the information up by the given problem id.
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0,
                                      uid,
                                      log_mark=log_mark,
                                      num=num)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    for item in res:
        best_id, title, mtype = item
        info_logger.info(
            "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
            log_mark, best_id, title, mtype, uid)
    return [{'id': item[0], 'title': item[1], 'type': item[2]} for item in res]
def cy_time_event_kernel_test(begin, end, test_uid=None):
    '''
    Test helper: for every user with a trigger event (big_search /
    free_problem_create) between begin and end, collect that user's
    events in the ``interval`` seconds leading up to the first such
    event seen in the scan.

    Each user is processed once; later trigger events of the same user
    are ignored.

    :param begin: window start timestamp
    :param end: window end timestamp
    :param test_uid: if truthy, restrict the scan to this single user
    :return: dict uid (string taken from the row key) -> result of
        cy_time_event_one_user_kernel
    '''
    # Length of the per-user window handed to the kernel: 5 minutes.
    interval = 5 * 60.0
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    row_prefix = str(test_uid) + '|' if test_uid else None

    data_dict = {}
    caled_uid = set()
    # Largest trigger-event timestamp seen; renamed from ``max`` so the
    # builtin is no longer shadowed.
    max_ts = 0.0

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=row_prefix):
            uid, timestamp, event_type = key.split('|')
            if test_uid and int(uid) != int(test_uid):
                continue
            if event_type not in ("big_search", "free_problem_create"):
                continue
            timestamp = ensure_second_timestamp(timestamp)
            if timestamp > max_ts:
                max_ts = timestamp
            if timestamp > end or timestamp < begin:
                continue
            if uid in caled_uid:
                continue
            caled_uid.add(uid)
            end_t = timestamp + 1.0
            begin_t = end_t - interval
            data_dict[uid] = cy_time_event_one_user_kernel(
                uid, begin_t, end_t)
    finally:
        # The connection used to be left open.
        connection.close()

    print("num of caled_uid %s" % (len(caled_uid),))
    print("max timestamp %s" % (max_ts,))
    return data_dict
def get_today_data(now=None):
    '''
    Run cy_time_event_kernel over the window returned by
    get_today_timestamp, i.e. the active-user data of the day of ``now``
    (original author's description).

    Per the original author's note, timestamps are HBase values divided
    by 1000.

    :param now: optional reference timestamp; when falsy,
        get_today_timestamp is called without arguments
    :return: whatever cy_time_event_kernel(begin, end) returns
    '''
    if not now:
        window = get_today_timestamp()
    else:
        window = get_today_timestamp(ensure_second_timestamp(now))
    begin, end = window
    return cy_time_event_kernel(begin, end)
def get_user_qa_content2(uid, begin, end):
    '''
    Return the QA texts of every problem a user created in a time
    window. Problem ids come from the medical DB; the texts come from
    get_qa_texts_by_pid (the HBase problem2 table, per the original
    author's note).

    :param uid: user id, interpolated into the SQL text
    :param begin: window start (exclusive)
    :param end: window end (exclusive)
    :return: flat list of texts, possibly empty
    '''
    window_start = timestamp2datetime(ensure_second_timestamp(begin))
    window_end = timestamp2datetime(ensure_second_timestamp(end))
    texts = []

    problem_sql = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
                  % (uid, window_start, window_end)
    problem_rows = get_medicaldb_handler().do_one(problem_sql)
    if problem_rows is None or len(problem_rows) == 0:
        return texts

    for problem_row in problem_rows:
        texts.extend(get_qa_texts_by_pid(problem_row[0]))
    return texts
def get_all_yesterday_user_id(now=None, test=False):
    '''
    Return the ids of users with a trigger event in the window that
    get_yesterday_timestamp derives from ``now``.

    :param now: reference timestamp; defaults to the current time
    :param test: when truthy, only the first 30 minutes of the window
        are used
    :return: result of get_sp_duration_valid_user_id(begin, end)
    '''
    now = ensure_second_timestamp(now) if now else time.time()
    begin, end = get_yesterday_timestamp(now)
    if test:
        # Test mode: shrink the window to half an hour.
        end = begin + 30 * 60
    return get_sp_duration_valid_user_id(begin, end)
def get_user_search_keys(uid, begin, end):
    '''
    Return the ids of a user's search events (Solr core solr_se) whose
    event_time lies strictly between begin and end.

    Per the original author, each id is a key under which the details
    of that event can be looked up in the HBase cy_event table.

    :param uid: user id used as the Solr filter query
    :param begin: window start; converted to integer milliseconds
    :param end: window end; converted to integer milliseconds
    :return: list of document ids (at most 100000 documents are fetched)
    '''
    begin_ms = int(1000 * ensure_second_timestamp(begin))
    end_ms = int(1000 * ensure_second_timestamp(end))

    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.set('fl', ['id', 'event_time'])
    # Only the uid is filtered server-side; the time window is applied
    # on the client below.
    solr_query.add('fq', 'uid:%s' % uid)
    solr_query.set('rows', 100000)

    hits = []
    for doc in solr_se.search(**solr_query.get_query_dict()):
        hits.append((doc['id'], doc['event_time']))

    keys = []
    for doc_id, event_time in hits:
        if begin_ms < event_time < end_ms:
            keys.append(doc_id)
    return keys
def get_qa_text(uid, begin, end, num):
    '''
    Return the opening question of a user's latest problems together
    with their creation times. Meant to be fast (original author's note).

    Only the ``num`` most recent problems (by id) are fetched; of those,
    the ones created inside [begin, end] are returned.

    :param uid: user id, interpolated into the SQL text
    :param begin: window start (normalised by ensure_second_timestamp)
    :param end: window end (normalised by ensure_second_timestamp)
    :param num: SQL row limit
    :return: (text_list, ts_list), two parallel lists; ([], []) when the
        query returns nothing
    '''
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    sql = 'select id,created_time,ask from ask_problem where user_id=%s order by id desc limit %s;' % (
        uid, num)
    rows = get_medicaldb_handler().do_one(sql)
    if rows is None or len(rows) == 0:
        return [], []

    texts = []
    stamps = []
    for row in rows:
        created_ts = datetime_str2timestamp(str(row[1]))
        outside_window = created_ts < begin or created_ts > end
        if not outside_window:
            texts.append(unicode(row[2]))
            stamps.append(created_ts)
    return texts, stamps
def get_all_yesterday_user_id(now=None, test=False):
    '''
    Return the ids of users that appear in cy_real_time_event in the
    window that get_yesterday_timestamp derives from ``now``.

    :param now: reference timestamp; defaults to the current time
    :param test: when truthy, only the first 30 minutes of the window
        are used
    :return: result of get_sp_duration_active_userid(begin, end)
    '''
    from general_utils.hbase_utils import get_sp_duration_active_userid
    from general_utils.time_utils import get_yesterday_timestamp

    now = ensure_second_timestamp(now) if now else time.time()
    begin, end = get_yesterday_timestamp(now)
    if test:
        # Test mode: shrink the window to half an hour.
        end = begin + 30 * 60
    return get_sp_duration_active_userid(begin, end)
def get_view_news_data(row_prefix):
    '''
    Dump, for every item viewed in the last 180 days, the set of users
    that viewed it, one JSON object per line, to
    'cy_event_<row_prefix>.json'.

    Rows are read from the HBase cy_event table; keys are split on '|'
    into (action_type, timestamp, uid). Progress and summary figures are
    printed to stdout.

    :param row_prefix: 'view_news' or 'view_topic'; for any other value
        the function returns without scanning or writing anything
    :return: None
    '''
    print(time.time())
    focused_type = ('view_news', 'view_topic')
    if row_prefix not in focused_type:
        return

    start = time.time() - 86400 * 180
    news_viwers = defaultdict(set)
    all_types = defaultdict(int)
    cnt = 0
    last_ts = None

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_event")
        for key, data in table.scan(row_prefix=row_prefix):
            try:
                action_type, ts, uid = key.split('|')
            except ValueError:
                # Key does not have exactly three '|'-separated parts.
                continue
            all_types[action_type] += 1
            if action_type not in focused_type:
                continue
            last_ts = ensure_second_timestamp(ts)
            if last_ts < start:
                continue
            news_id = data[CY_REAL_TIME_EVENT_ATTR_MAP[action_type]]
            news_viwers[news_id].add(uid)
            cnt += 1
            if cnt % 1000 == 0:
                print("%s %s %s" % (timestamp2datetime(time.time()), cnt,
                                    len(news_viwers)))
    finally:
        # The connection used to be left open.
        connection.close()

    print(time.time())
    print("last_ts %s" % (last_ts,))
    print(len(news_viwers))
    for x in all_types:
        print("%s %s" % (x, all_types[x]))

    with open('cy_event_%s.json' % row_prefix, 'w') as f:
        for news_id in news_viwers:
            # Named ``line`` so the builtin ``str`` is not shadowed.
            line = json.dumps({
                'id': news_id,
                'uids': list(news_viwers[news_id]),
                'len': len(news_viwers[news_id])
            }) + '\n'
            f.write(line)
def Recommend(uid, lookback, end=None, pid=None):
    '''
    Recommend the single best material for a user.

    :param uid: user id; must be convertible to int and not -1
    :param lookback: window length in seconds before ``end``
    :param end: end of the event window; defaults to the current time
    :param pid: problem id; when truthy the user info comes from
        one_user_last_qa_info(pid) instead of the recent event window
    :return: [material_id, title, material_type]; [-1, "", "nothing"] on
        a bad uid or when nothing could be recommended
    '''
    bad_return = [-1, "", "nothing"]  # material_id, title, material_type
    log_mark = "recommend_one"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    try:
        uid = int(uid)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "bad uid"; a bare except would
        # also swallow KeyboardInterrupt / SystemExit.
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return
    if uid == -1:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)
    begin = end - lookback
    # Push the end 5s forward in case HBase has not received the latest
    # real-time rows yet.
    end += 5.0

    if pid:
        # QA trigger: look the information up by the given problem id.
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0, uid, log_mark=log_mark)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    best_id, title, mtype = res[0]
    info_logger.info(
        "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
        log_mark, best_id, title, mtype, uid)
    return [int(best_id), title, mtype]
def cy_time_event_one_user_viewnews(uid, begin, end):
    '''
    Return the news a user viewed between begin and end, read from
    cy_real_time_event.

    :param uid: user id; rows are selected with the row prefix "<uid>|"
    :param begin: window start, compared directly with the normalised
        row timestamp
    :param end: window end, compared the same way
    :return: dict news_id (int) -> timestamp; when a news id occurs more
        than once, the entry from the later row in the scan wins
    '''
    res = {}
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=str(uid) + '|'):
            _, timestamp, event_type = key.split('|')
            if event_type != "view_news":
                continue
            timestamp = ensure_second_timestamp(timestamp)
            info_logger.info("real timestamp=%s", timestamp)
            if begin <= timestamp <= end:
                news_id = int(data["info:news_id"])
                res[news_id] = timestamp
    finally:
        # The connection used to be left open.
        connection.close()
    return res
def g1():
    '''
    Offline experiment: compare the coverage obtained without and with
    hot-sale tag expansion.

    Numerator: users that can be matched to a hot-sale tag.
    Denominator: users active within one day (cy_event).

    Samples users from a log file, calls get_relation_plan3 for each of
    them, and writes one CSV row per user plus summary rows (counts,
    min/max/mean call time, the ten slowest users) to '180129_rp_1.csv'.
    '''
    from general_utils.hbase_utils import get_user_query, get_user_query2
    from general_utils.solr_utils import get_last_login_uids
    from recommend.manager.recommend_tags_data_helper import get_relation_plan3
    from general_utils.db_utils import get_db_data_local_handler
    from general_utils.hbase_utils import get_sp_duration_active_userid
    from general_utils.time_utils import timestamp2datetime, ensure_second_timestamp

    # User sampling time window
    # User sampling hit rate
    end_ds0 = '2018-01-21 23:59:40'
    end0 = datetime_str2timestamp(end_ds0)
    begin0 = end0 - 86400 * 1

    # Data collection time window for each selected user
    end_ds = '2018-01-22 23:59:40'
    end = datetime_str2timestamp(end_ds)
    begin = end - 86400 * 180.0  # half a year

    # Users whose last login falls in the week before 2018-01-21 23:59:40
    # test_uids = get_last_login_uids(begin0, end0)
    # test_uids = get_sp_duration_active_userid(begin0,end0)
    test_uids = get_one_day_uid_from_file('log_event_20180122')
    print "test_uids num", len(test_uids)

    # Shuffle, then take a sample.
    # NOTE(review): the original comment said 1000 samples, the slice
    # takes 3000.
    random.shuffle(test_uids)
    selected_uids = test_uids[:3000]

    all_good_cnt = 0
    all_cnt = 0
    app_cnt = 0
    good_app_cnt = 0
    text_empty_cnt = 0
    # NOTE(review): the file is closed manually at the end; an exception
    # in the loop leaves it open.
    fo = open('180129_rp_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'is_app', 'last_info_time', 'use_tags',
        'systag_ids', 'tag_names', 't', 'is_tangsai'
    ]
    csvwriter.writerow(first_line)
    # status_dict = {
    # 1: "qa and query",
    # 2: "view actions",
    # 3: "search_doctor clinic_no",
    # 0: ""
    # }
    # uid -> seconds spent in get_relation_plan3
    total_time = {}
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        all_cnt += 1
        if is_app:
            app_cnt += 1
        t1 = time.time()
        res = get_relation_plan3(uid, test=True)
        t2 = time.time()
        t = t2 - t1
        total_time[uid] = t
        status = res['status']
        is_tangsai = False
        if status:
            all_good_cnt += 1
            if is_app:
                good_app_cnt += 1
            systag_ids = res['ids']
            # systag id 96 sets the is_tangsai flag written to the CSV.
            if 96 in systag_ids:
                is_tangsai = True
            tagnames = [
                get_db_data_local_handler().get_systagid_name(id)
                for id in systag_ids
            ]
            # NOTE(review): a truthy status outside (1, 2, 3, 4) leaves
            # record_info unassigned for this iteration.
            if status in (1, 2, 4):
                info0 = res['systag_id_dict']
                record_info = '~'.join(info0.keys())
            elif status == 3:
                info0 = res['clinic_no']
                record_info = '~'.join(info0)
            last_ts = res['last_ts']
            last_info_time = timestamp2datetime(
                ensure_second_timestamp(last_ts))
        else:
            systag_ids = []
            tagnames = []
            record_info = ''
            last_info_time = ''
        systag_ids_str = '~'.join([str(x) for x in systag_ids])
        tagnames_str = '~'.join(tagnames)
        line = convert2gbk([
            str(uid), username,
            str(is_app), last_info_time, record_info, systag_ids_str,
            tagnames_str,
            str(t),
            str(is_tangsai)
        ])
        csvwriter.writerow(line)

    # Summary row: total users, users with a result, app users, app users
    # with a result.
    line = [str(all_cnt), str(all_good_cnt), str(app_cnt), str(good_app_cnt)]
    csvwriter.writerow(line)

    # Timing summary: min / max / mean, then the ten slowest users.
    s_total_time = sorted(total_time.iteritems(),
                          key=lambda x: x[1],
                          reverse=True)
    times = total_time.values()
    line = [str(min(times)), str(max(times)), str(sum(times) / len(times))]
    csvwriter.writerow(line)
    for uid, t in s_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)
    fo.close()
    print str(max(times))
    print all_good_cnt
def main5(test_uid=None, now=None):
    '''
    Offline evaluation script: run Recommend_by_user_info for every user
    returned by cy_time_event_kernel_test over the 12000 seconds before
    ``now`` and write the candidates of each successful recommendation
    to a CSV file ('test.csv' for a single test uid, otherwise
    '20171220_1_res.csv').

    :param test_uid: restrict the run to one user; the string "n" means
        "no test user"
    :param now: end of the evaluation window
    '''
    if test_uid == "n":
        test_uid = None
        # NOTE(review): confirm this override is meant to apply only when
        # test_uid == "n".
        now = time.time()
    if not now:
        now = 1512379920.1
    else:
        now = float(ensure_second_timestamp(now))
    t10 = time.time()
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    t20 = time.time()
    print "len(data_dict)", len(data_dict)
    # NOTE(review): the output file is never closed in this function.
    if not test_uid:
        fo = open("20171220_1_res.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic", u"best_id", u"best_score", u"time"
    ]
    csvwriter.writerow(first_line)
    all_call_cnt = 0
    all_valid_res_cnt = 0
    exception_cnt = 0
    status_dict = defaultdict(int)
    total_time = []
    slow_case = []
    for uid in data_dict:
        all_call_cnt += 1
        user_info0 = data_dict[uid]
        try:
            # if True:
            t1 = time.time()
            res = Recommend_by_user_info(user_info0,
                                         uid,
                                         log_mark='testmain5',
                                         test=True)
            # return = {"user_info": None, "res": None, "topn_ids_scores": None, "only_topic": None,"status":"succeed"}
            t2 = time.time()
            print t2 - t1
            # A call of 3 seconds or more aborts the whole run.
            if t2 - t1 >= 3:
                break
            user_info = res['user_info']
            res1 = res['res']
            topn_ids_scores = res['topn_ids_scores']
            only_topic = res['only_topic']
            status = res['status']
            v_score_dict = res['v_score_dict']
            best_id, best_title, mtype = res1[0]
            this_time = t2 - t1
            # Calls of 1 second or more are recorded as slow cases.
            if this_time >= 1.0:
                slow_case.append([uid, this_time])
            total_time.append(t2 - t1)
        except Exception, e:
            print e
            # NOTE(review): "+= 0" never changes the counter; "+= 1" was
            # probably intended.
            exception_cnt += 0
            continue
        status_dict[status] += 1
        ####################
        # if not only_topic:
        # continue
        ####################
        if best_id == -1 or user_info is None:
            continue
        print '================='
        print uid
        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        trigger = user_info["trigger"]
        timestamp = user_info['timestamp']
        # Scores are keyed by "<material type>_<id>".
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        # if trigger == "big_search":
        # continue
        # NOTE(review): any other trigger value leaves trigger_info
        # unassigned for this iteration.
        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        print "u tags", "-".join(tags), special_population
        print trigger_info, best_id, best_score, best_title
        # One CSV row per candidate material.
        for unique_id, score in topn_ids_scores:
            material_type, id = unique_id.split('_')
            if material_type == "news":
                title, _ = get_newsdb_handler().get_title_digest_by_nid(id)
                m_tags = get_news_tags_from_solr("news_" + str(id))
            elif material_type == "topic":
                title = get_medicaldb_handler().get_topic_title(id)
                m_tags = get_news_tags_from_solr("r_topic_" + str(id))
            rows = [
                str(uid), "-".join(tags),
                str(special_population), trigger, trigger_info,
                str(timestamp),
                str(id), material_type,
                str(score), title, "-".join(m_tags),
                str(only_topic),
                str(best_id),
                str(best_score),
                str(this_time)
            ]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        all_valid_res_cnt += 1
def time_factor(ts, t0):
    '''
    Time-decay factor in the style of Newton's law of cooling:
    exp(-k * |t0 - ts|), with both timestamps normalised by
    ensure_second_timestamp and ``k`` taken from module scope.

    :param ts: timestamp of the event
    :param t0: reference timestamp
    :return: the decay factor as a float
    '''
    elapsed = abs(ensure_second_timestamp(t0) - ensure_second_timestamp(ts))
    return math.exp(-k * elapsed)