def get_query_uids(begin, end, mode='search'):
    """Collect the distinct uids of documents whose event_time is in a window.

    :param begin: lower bound of the window, passed through ensure_m_timestamp
    :param end: upper bound of the window, passed through ensure_m_timestamp
    :param mode: 'search', 'news' or 'topic'; selects the Solr collection
    :return: set of int uids; an empty set when ``mode`` is anything else
    """
    from general_utils.solr_utils import zk_md4, SolrQuery, pysolr
    from general_utils.time_utils import ensure_m_timestamp

    # Normalise the timestamp format of both bounds before building the filter.
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)

    # Match every document, restrict by event_time, and fetch only the uid field.
    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end))
    solr_query.set('fl', ['uid'])
    solr_query.set('rows', 1000000)

    # Each supported mode reads from its own Solr collection.
    collections = (
        ('search', 'search_event'),
        ('news', 'news_profile'),
        ('topic', 'topic_profile'),
    )
    collection = next(
        (name for key, name in collections if mode == key), None)
    if collection is None:
        return set()

    solr = pysolr.SolrCloud(zk_md4, collection, timeout=25)
    results = solr.search(**solr_query.get_query_dict())
    uid_set = set(int(doc['uid']) for doc in results)
    print('len uids %d' % len(uid_set))
    return uid_set
# Example #2
# 0
def get_view_profile_rowkeys(begin, end, mode='news'):
    """Return the ``id`` of every profile document whose event_time is in a window.

    The number of rows requested from Solr is estimated from the window
    length: whole days (rounded up, at least one) times a per-day count.

    :param begin: lower bound of the window, passed through ensure_m_timestamp
    :param end: upper bound of the window, passed through ensure_m_timestamp
    :param mode: 'news' queries solr_np; any other value queries solr_tp
    :return: list of the ``id`` values of the matching documents
    """
    import math

    # Original author's note: topic is roughly 300k rows per month and news
    # roughly 800k (presumably also per month), so speed is not a concern.
    print('begin %s %s' % (begin, timestamp2datetime(begin)))
    print('end %s %s' % (end, timestamp2datetime(end)))

    # Pick the Solr table for the requested mode.
    solr = solr_np if mode == 'news' else solr_tp

    # Convert both bounds with ensure_m_timestamp (millisecond format
    # according to the original comment).
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)
    print(begin)
    print(end)

    # Estimate rows. The window length is rounded UP to whole days and clamped
    # to at least one day: the previous integer division truncated, so a window
    # shorter than a day asked Solr for 0 rows and partial days were dropped.
    days = max(1, int(math.ceil((end - begin) / 86400000.0)))
    # NOTE(review): a mode other than 'news'/'topic' queries solr_tp but uses
    # the news per-day count -- kept as in the original; confirm intent.
    per_day = every_day_topic_cnt if mode == 'topic' else every_day_news_cnt
    rows = days * per_day
    print('rows %s' % (rows,))

    # Match every document, restrict by event_time, and fetch only the id field.
    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end))
    solr_query.set('rows', rows)
    solr_query.set('fl', ['id'])

    rowkey_list = [
        item['id'] for item in solr.search(**solr_query.get_query_dict())
    ]
    print(len(rowkey_list))
    return rowkey_list
def get_cy_event_active_user(begin, end):
    """Sample the user ids that have rows in the cy_event table within a window.

    Row keys are expected to look like ``action_type|ts|uid``; keys that do
    not split into exactly three parts, or whose ``ts`` is not an integer,
    are skipped. Only about 1% of the in-window rows are kept (random
    sampling), so the result is a sample rather than the complete set.
    According to the original author's note, qa events are not stored in
    this table and are therefore not covered.

    :param begin: lower bound of the window, passed through ensure_m_timestamp
    :param end: upper bound of the window, passed through ensure_m_timestamp
    :return: set of uid strings taken from the sampled row keys
    """
    # Normalise the bounds first so bad input fails before a connection opens.
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    uids = set()
    try:
        table = connection.table("cy_event")
        # Full table scan; only the row key is needed, the cell data is ignored.
        for key, _ in table.scan(batch_size=10000):
            try:
                _action_type, ts, uid = key.split('|')
                ts = int(ts)
            except ValueError:
                # Malformed key (wrong field count or non-numeric ts): skip it.
                continue
            if ts < begin or ts > end:
                continue
            # Keep roughly 1 in 100 of the matching rows.
            if random.random() > 0.01:
                continue
            uids.add(uid)
    finally:
        # Always release the HBase connection, even if the scan fails.
        connection.close()
    return uids
def from_file(file_name):
    '''
    Load JSON-lines records from a file and hand them to add_all as Solr docs.

    Each line must be a JSON object with ``uid`` and ``ids``; when a record
    has no ``last_event_time`` field, the current time is used instead.
    :param file_name: path of the file to read, one JSON object per line
    :return: None
    '''
    # Phase 1: parse every line up front.
    with open(file_name, 'r') as handle:
        records = [json.loads(line.strip('\n')) for line in handle]

    # Phase 2: turn each record into the document shape passed to add_all.
    docs = []
    for record in records:
        current = time.time()
        docs.append({
            'id': str(record['uid']),
            'news_ids': json.dumps(record['ids']),
            'last_event_time': ensure_m_timestamp(
                record.get('last_event_time', current)),
            'timestamp': ensure_m_timestamp(current),
        })

    add_all(docs, solr)
# Example #5
# 0
def get_views_news_rowskeys_from_solr(begin, end):
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)