def get_query_uids(begin, end, mode='search'):
    """Collect the distinct user ids found in a Solr collection for a time window.

    :param begin: start of the window; normalised with ``ensure_m_timestamp``
        (presumably to a millisecond timestamp -- confirm against that helper).
    :param end: end of the window; normalised the same way.
    :param mode: selects the collection to query:
        ``'search'`` -> ``search_event``, ``'news'`` -> ``news_profile``,
        ``'topic'`` -> ``topic_profile``.
    :return: set of uids converted to ``int``; an empty set when ``mode`` is
        not one of the values above.
    """
    collection_by_mode = {
        'search': 'search_event',
        'news': 'news_profile',
        'topic': 'topic_profile',
    }
    collection = collection_by_mode.get(mode)
    if collection is None:
        # Unknown mode: there is nothing to query, so return before paying
        # for the imports, the timestamp conversion and the query setup.
        return set()

    from general_utils.solr_utils import zk_md4, SolrQuery, pysolr
    from general_utils.time_utils import ensure_m_timestamp

    # Convert both bounds to the timestamp format used in the filter below.
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)

    # Match every document, restricted to the event_time range; only the uid
    # field is fetched and at most 1,000,000 rows are requested.
    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end))
    solr_query.set('fl', ['uid'])
    solr_query.set('rows', 1000000)

    solr = pysolr.SolrCloud(zk_md4, collection, timeout=25)
    results = solr.search(**solr_query.get_query_dict())
    uid_set = set(int(item['uid']) for item in results)

    # Single pre-formatted argument: prints the same text under Python 2 and 3.
    print('len uids %d' % len(uid_set))
    return uid_set
def get_view_profile_rowkeys(begin, end, mode='news'):
    """Fetch the ids of profile documents whose event_time lies in a window.

    Volume note carried over from the original comment: topic is roughly
    300k documents per month and news roughly 800k, so this does not need to
    be fast.

    :param begin: start of the window; normalised with ``ensure_m_timestamp``.
    :param end: end of the window; normalised the same way.
    :param mode: ``'news'`` queries ``solr_np`` and sizes the request with
        ``every_day_news_cnt``; any other value queries ``solr_tp`` and sizes
        it with ``every_day_topic_cnt``.
    :return: list of the ``id`` field of every returned document.
    """
    print('begin %s %s' % (begin, timestamp2datetime(begin)))
    print('end %s %s' % (end, timestamp2datetime(end)))

    # Pick the Solr client and its per-day row estimate from ONE condition so
    # the two can never disagree (previously an unknown mode queried the topic
    # client with the news estimate).
    is_news = mode == 'news'
    solr = solr_np if is_news else solr_tp
    per_day_cnt = every_day_news_cnt if is_news else every_day_topic_cnt

    # Convert both bounds to millisecond timestamps.
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)
    print(begin)
    print(end)

    # Estimate how many rows to request. The day count is rounded UP and is
    # at least 1: truncating division turned a window shorter than one day
    # into rows=0 (no results) and under-counted partial days.
    ms_per_day = 86400 * 1000
    span_ms = end - begin
    days = max(1, int(-(-span_ms // ms_per_day)))  # ceiling division
    rows = days * per_day_cnt
    print('rows %s' % rows)

    # Match every document in the event_time range, fetching only the id.
    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end))
    solr_query.set('rows', rows)
    solr_query.set('fl', ['id'])

    results = solr.search(**solr_query.get_query_dict())
    rowkey_list = [item['id'] for item in results]
    print(len(rowkey_list))
    return rowkey_list
def get_cy_event_active_user(begin, end):
    """Sample the user ids that have ``cy_event`` rows inside a time window.

    Scans the whole ``cy_event`` HBase table. A row counts when its key has
    the form ``action_type|ts|uid`` and ``begin <= ts <= end``; each such row
    is then kept with a probability of about 1%, so the result is a random
    sample of the users in the window, not the complete set.

    Per the original comment, qa activity is not covered because it is not
    stored in this table.

    :param begin: start of the window; normalised with ``ensure_m_timestamp``.
    :param end: end of the window; normalised the same way.
    :return: set of uid strings exactly as they appear in the row keys
        (they are not converted to ``int``).
    """
    # Normalise the bounds before opening the connection so a bad argument
    # cannot leave a connection open.
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)

    uids = set()
    connection = happybase.Connection('hbase_server', compat='0.90', port=19090, timeout=BIG_TIMEOUT)
    try:
        table = connection.table("cy_event")
        for key, _value in table.scan(batch_size=10000):
            try:
                _action_type, ts, uid = key.split('|')
                ts = int(ts)
            except ValueError:
                # Key does not have exactly three '|'-separated parts, or the
                # timestamp part is not an integer: skip the row.
                continue
            if ts > end or ts < begin:
                continue
            # Keep roughly 1 in 100 of the in-window rows.
            if random.random() > 0.01:
                continue
            uids.add(uid)
    finally:
        # Release the connection even when the scan fails part-way through.
        connection.close()
    return uids
def from_file(file_name):
    """Read JSON-lines records from a file and hand them to ``add_all``.

    Every non-blank line must be a JSON object with a ``uid`` and an ``ids``
    entry. ``last_event_time`` is optional; when missing, the current time is
    used. Blank lines are skipped instead of aborting the load.

    :param file_name: path of the file to read, one JSON object per line.
    :return: None. The documents are passed to ``add_all`` together with
        ``solr``, a name this function expects to exist at module level.
    """
    docs = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines, e.g. stray blank lines in the file.
                continue
            item = json.loads(line)
            now = time.time()
            docs.append({
                'id': str(item['uid']),
                # The id list is stored as a JSON-encoded string.
                'news_ids': json.dumps(item['ids']),
                'last_event_time': ensure_m_timestamp(item.get('last_event_time', now)),
                'timestamp': ensure_m_timestamp(now),
            })
    add_all(docs, solr)
def get_views_news_rowskeys_from_solr(begin, end): begin = ensure_m_timestamp(begin) end = ensure_m_timestamp(end)