Example #1
0
def get_show_trace_followers(xnr_user_no):
    """Return profile info for every user traced by the given XNR account.

    Reads the account's ``trace_follow_list`` from the fans/followers index
    and mget-s the matching user documents.  Uids that have no user document
    get a zero-filled placeholder whose nick_name is the uid itself.
    """
    fans_doc = es.get(index=tw_xnr_fans_followers_index_name,
                      doc_type=tw_xnr_fans_followers_index_type,
                      id=xnr_user_no)['_source']
    traced_uids = fans_doc['trace_follow_list']

    if not traced_uids:
        return []

    docs = es.mget(index=twitter_user_index_name,
                   doc_type=twitter_user_index_type,
                   body={'ids': traced_uids})['docs']

    followers_info = []
    for doc in docs:
        if doc['found']:
            followers_info.append(doc['_source'])
        else:
            # No profile document for this uid -> emit a placeholder record.
            missing_uid = doc['_id']
            followers_info.append({
                'uid': missing_uid,
                'statusnum': 0,
                'fansnum': 0,
                'friendsnum': 0,
                'photo_url': '',
                'sex': '',
                'nick_name': missing_uid,
                'user_location': ''
            })
    return followers_info
def my_topic_classfiy(uid_list, datetime_list):
    """Classify topics for each uid, caching results in the portrait index.

    Previously computed results are read back from ES for efficiency; only
    uids with no stored 'topic' field are recomputed from their flow text
    and the fresh results are saved.  Returns a pair
    ``(topic_dict_by_uid, topic_label_list_by_uid)``.
    """
    cached_topic_dicts = {}
    cached_topic_labels = {}
    pending_uids = []

    # Phase 1: reuse anything already stored in the portrait index.
    docs = es.mget(index=tw_portrait_index_name,
                   doc_type=tw_portrait_index_type,
                   body={'ids': uid_list})['docs']
    for doc in docs:
        uid = doc['_id']
        if 'found' in doc and doc['found'] and 'topic' in doc['_source']:
            source = doc['_source']
            cached_topic_dicts[uid] = json.loads(source['topic'])
            cached_topic_labels[uid] = [
                topic_ch2en_dict[ch]
                for ch in source['topic_string'].split('&')
            ]
        else:
            # No ES record at all, or a record without a topic -> recompute.
            pending_uids.append(uid)

    # Phase 2: compute and persist topics for the uids not found above.
    computed_topic_dict = {}
    computed_topic_list = {}
    if pending_uids:
        index_list = [flow_text_index_name_pre + dt for dt in datetime_list]
        keyword_data = get_filter_keywords(index_list, pending_uids)
        computed_topic_dict, computed_topic_list = topic_classfiy(
            pending_uids, keyword_data)

        # Build the '&'-joined Chinese topic string used for storage.
        topic_strings = {}
        for uid, labels in computed_topic_list.items():
            zh_labels = [zh_data[name_list.index(label)].decode('utf8')
                         for label in labels]
            topic_strings[uid] = '&'.join(zh_labels)

        # Save fresh results; uids the classifier produced nothing for get
        # an empty record so they are not recomputed next time.
        to_save = {}
        for uid in pending_uids:
            if uid in computed_topic_dict:
                to_save[uid] = {
                    'filter_keywords': json.dumps(keyword_data[uid]),
                    'topic': json.dumps(computed_topic_dict[uid]),
                    'topic_string': topic_strings[uid]
                }
            else:
                to_save[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(to_save)

    # Phase 3: merge cached and freshly computed results.
    computed_topic_dict.update(cached_topic_dicts)
    computed_topic_list.update(cached_topic_labels)
    return computed_topic_dict, computed_topic_list
Example #3
0
def get_recommend_at_user(xnr_user_no):
    """Recommend users to @-mention for an XNR account.

    Returns a dict mapping uid -> nick_name for authors of the most recent
    tweets from yesterday's flow-text index, capped at
    DAILY_AT_RECOMMEND_USER_TOP users with a non-empty name.
    """
    # Validate that the XNR account exists; es.get raises if the id is missing.
    # NOTE(review): the original also read 'daily_interests' and split it on
    # '&', but the query below is a match_all that never used it — removed as
    # dead code (along with an unused nest_query_list).
    es.get(index=tw_xnr_index_name,
           doc_type=tw_xnr_index_type,
           id=xnr_user_no)

    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)  # yesterday's index

    index_name = twitter_flow_text_index_name_pre + datetime

    # Latest 200 tweets, newest first.
    es_results_daily = es.search(index=index_name,
                                 doc_type=twitter_flow_text_index_type,
                                 body={'query': {'match_all': {}},
                                       'size': 200,
                                       'sort': {'timestamp': {'order': 'desc'}}})['hits']['hits']

    uid_list = []
    for result in es_results_daily:
        uid_list.append(result['_source']['uid'])

    # Map uid -> nick_name (uid is stable; nick_name may change over time).
    uid_nick_name_dict = dict()
    if not uid_list:
        # Guard: ES mget rejects an empty ids list.
        return uid_nick_name_dict

    es_results_user = es.mget(index=twitter_user_index_name,
                              doc_type=twitter_user_index_type,
                              body={'ids': uid_list})['docs']
    count = 0
    for result in es_results_user:
        if result['found'] == True:
            source = result['_source']
            # NOTE(review): this helper reads 'name' while the sibling
            # get_hot_sensitive_recommend_at_user reads 'username' — confirm
            # which field the twitter user index actually stores.
            nick_name = source['name']
            if nick_name:
                count += 1
                uid_nick_name_dict[source['uid']] = nick_name
        if count >= DAILY_AT_RECOMMEND_USER_TOP:
            break

    return uid_nick_name_dict
Example #4
0
def create_event_warning(xnr_user_no, today_datetime, write_mark):
    """Build (and optionally persist) sensitive-event warnings for one XNR.

    For every hashtag/event found for ``today_datetime``, collect its
    sensitive Facebook posts, score each post's influence, rank the
    participating users (posts by the XNR's friends count double), and
    assemble a warning record.  When ``write_mark`` is truthy each record is
    written to ES and the write result is collected; otherwise the record
    dict itself is collected.

    Returns a list with one entry per event that had matching sensitive
    posts (write-result marks or warning dicts, depending on write_mark).
    """
    # Candidate event names (hashtags) for the day.
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list/:',hashtag_list

    facebook_flow_text_index_name = get_timets_set_indexset_list(
        facebook_flow_text_index_name_pre, today_datetime, today_datetime)

    # Friend list of the virtual (XNR) account; used to weight participation
    # and influence below.
    friends_list = lookup_xnr_friends(xnr_user_no)

    event_warming_list = []
    for event_item in hashtag_list:
        # Per-event record: event name, main participating users,
        # representative posts, event influence, average event time.
        event_warming_content = dict()
        event_warming_content['event_name'] = event_item['event_name']
        event_influence_sum = 0
        event_time_sum = 0
        # Sensitive posts (sensitive >= 1) carrying this hashtag,
        # most sensitive first, capped at MAX_WARMING_SIZE.
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'hashtag': event_item['event_name']
                                }
                            }, {
                                'range': {
                                    'sensitive': {
                                        'gte': 1
                                    }
                                }
                            }]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {
                'sensitive': {
                    'order': 'desc'
                }
            }
        }
        event_results = es_xnr_2.search(index=facebook_flow_text_index_name,
                                        doc_type=facebook_flow_text_index_type,
                                        body=query_body)['hits']['hits']
        if event_results:
            facebook_result = []
            friends_num_dict = dict()  # NOTE(review): populated nowhere — apparently dead
            alluser_num_dict = dict()
            #print 'sencond_time:::',int(time.time())
            for item in event_results:
                # Look up the three engagement metrics for this post;
                # default them to 0 when no metric record exists.
                fid_result = lookup_fid_attend_index(item['_source']['fid'],
                                                     today_datetime)
                if fid_result:
                    item['_source']['comment'] = fid_result['comment']
                    item['_source']['share'] = fid_result['share']
                    item['_source']['favorite'] = fid_result['favorite']
                else:
                    item['_source']['comment'] = 0
                    item['_source']['share'] = 0
                    item['_source']['favorite'] = 0
                #print 'event_content:',item['_source']['text']
                # Tally posts per user; a post by one of the XNR's friends
                # counts double.
                # NOTE(review): set_intersection presumably returns a count
                # (> 0 means the uid is among friends) — confirm.
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    friends_mark = set_intersection(item['_source']['uid'],
                                                    friends_list)
                    if friends_mark > 0:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1 * 2
                    else:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1
                else:
                    alluser_num_dict[str(item['_source']['uid'])] = 1

                # Influence = (1 + comment + share + favorite)
                #             * (1 + sensitive), scaled by the user-type
                # weight returned by judge_user_type.
                origin_influence_value = (1 + item['_source']['comment'] +
                                          item['_source']['share'] +
                                          item['_source']['favorite']) * (
                                              1 + item['_source']['sensitive'])
                friends_value = judge_user_type(item['_source']['uid'],
                                                friends_list)
                item['_source'][
                    'facebook_influence_value'] = origin_influence_value * friends_value

                # Resolve the author's display name for the report.
                item['_source']['nick_name'] = get_user_nickname(
                    item['_source']['uid'])
                facebook_result.append(item['_source'])

                # Accumulate influence and timestamps for the event-level
                # averages computed after the loop.
                event_influence_sum = event_influence_sum + item['_source'][
                    'facebook_influence_value']
                event_time_sum = event_time_sum + item['_source']['timestamp']

            # print 'third_time:::',int(time.time())
            # Representative posts: highest influence first.
            facebook_result.sort(key=lambda k:
                                 (k.get('facebook_influence_value', 0)),
                                 reverse=True)
            event_warming_content['main_facebook_info'] = json.dumps(
                facebook_result)

            # Event-level averages.  NOTE(review): under Python 2 this is
            # integer division when both operands are ints — confirm that
            # truncation is intended.
            number = len(event_results)
            event_warming_content[
                'event_influence'] = event_influence_sum / number
            event_warming_content['event_time'] = event_time_sum / number

            # Rank users by (friend-weighted) participation count.
            alluser_num_dict = sorted(alluser_num_dict.items(),
                                      key=lambda d: d[1],
                                      reverse=True)
            main_userid_list = []
            for i in xrange(0, len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])

            # Profile info for the main participating users; missing user
            # documents get zero/empty placeholder fields.
            main_user_info = []
            user_es_result = es_xnr_2.mget(index=facebook_user_index_name,
                                           doc_type=facebook_user_index_type,
                                           body={'ids':
                                                 main_userid_list})['docs']
            # print 'user_es_result:',user_es_result
            for item in user_es_result:

                user_dict = dict()
                if item['found']:
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = item['_source']['username']
                    if item['_source'].has_key('talking_about_count'):
                        user_dict['talking_about_count'] = item['_source'][
                            'talking_about_count']
                    else:
                        user_dict['talking_about_count'] = 0
                    if item['_source'].has_key('likes'):
                        user_dict['likes'] = item['_source']['likes']
                    else:
                        user_dict['likes'] = 0
                    if item['_source'].has_key('category'):
                        user_dict['category'] = item['_source']['category']
                    else:
                        user_dict['category'] = ''
                else:
                    # user_dict['icon']=''
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = ''
                    user_dict['talking_about_count'] = 0
                    user_dict['likes'] = 0
                    user_dict['category'] = ''
                main_user_info.append(user_dict)
            event_warming_content['main_user_info'] = json.dumps(
                main_user_info)

            # print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no'] = xnr_user_no
            event_warming_content['validity'] = 0
            event_warming_content['timestamp'] = today_datetime
            now_time = int(time.time())
            # task_id=xnr_user_no+'_'+str(now_time)
            task_id = xnr_user_no + '_' + event_warming_content['event_name']

            # Persist the warning record, or hand it back to the caller.
            if write_mark:
                # print 'today_datetime:::',ts2datetime(today_datetime)
                print 'task_id_event:', task_id
                mark = write_envent_warming(today_datetime,
                                            event_warming_content, task_id)
                event_warming_list.append(mark)
            else:
                event_warming_list.append(event_warming_content)

        else:
            pass
        # print 'fifth_time:::',int(time.time())
    return event_warming_list
def detect_by_seed_users(seed_users):
    """Expand a list of seed uids to the group of users interacting with them.

    For each seed uid, union its retweet / be_retweet / comment / be_comment
    neighbour dicts and collect every neighbour uid.  Comment data and the
    forward-retweet table are currently unavailable, so those contributions
    are empty.  Returns a de-duplicated list of uids.
    """
    retweet_mark = 1  # only be_retweet data exists at present
    comment_mark = 0  # no comment data yet (step2 disabled below)

    all_union_result_dict = {}
    # Pick the retweet/comment ES db shard for the current time.
    now_ts = time.time()
    db_number = get_db_num(now_ts)

    # FIX: initialise all result lists up front.  The original left
    # retweet_result / comment_result / be_comment_result undefined and
    # relied on the bare excepts below swallowing NameError; empty lists
    # give the same fallback behaviour explicitly.
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []

    # step1: mget be_retweet (the forward-retweet mget is disabled until
    # that table has data).
    if retweet_mark == 1:
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        try:
            be_retweet_result = es.mget(index=be_retweet_index_name, doc_type=be_retweet_index_type, \
                                                body={'ids':seed_users} ,_source=True)['docs']
        except Exception:
            be_retweet_result = []

    # step2: comment / be_comment mgets are disabled (comment_mark == 0)
    # until comment data becomes available.

    # step3: union retweet/be_retweet/comment/be_comment per seed uid.
    # FIX: the original never advanced union_count, so every seed uid read
    # mget document 0.  ES mget returns docs in the same order as the
    # requested ids, so enumerate keeps docs aligned with seed_users.
    for union_count, iter_search_uid in enumerate(seed_users):
        try:
            uid_retweet_dict = json.loads(
                retweet_result[union_count]['_source']['uid_retweet'])
        except Exception:
            uid_retweet_dict = {}
        try:
            uid_be_retweet_dict = json.loads(
                be_retweet_result[union_count]['_source']['uid_be_retweet'])
        except Exception:
            uid_be_retweet_dict = {}
        try:
            uid_comment_dict = json.loads(
                comment_result[union_count]['_source']['uid_comment'])
        except Exception:
            uid_comment_dict = {}
        try:
            uid_be_comment_dict = json.loads(
                be_comment_result[union_count]['_source']['uid_be_comment'])
        except Exception:
            uid_be_comment_dict = {}
        # Union the four neighbour dicts for this seed uid.
        all_union_result_dict[iter_search_uid] = union_dict(
            uid_retweet_dict, uid_be_retweet_dict,
            uid_comment_dict, uid_be_comment_dict)

    # Extract every neighbour uid from the unioned per-seed dicts.
    group_uid_list = set()
    for inter_dict in all_union_result_dict.values():
        for uid in inter_dict:
            group_uid_list.add(uid)

    return list(group_uid_list)
def my_domain_classfiy(uid_list, datetime_list):
    """Classify the domain (user category) of Facebook uids, with ES caching.

    Cached domains are read from the portrait index; uids without one are
    recomputed from their base profile info (bio + category + text count)
    and the results are saved back to ES.

    Returns a dict mapping every uid to its domain label ('other' when the
    classifier produced nothing).
    """
    domain_results = {}
    unresolved_uids = []
    # Reuse previously stored results before recomputing anything.
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uid_list})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r and r['found'] and 'domain' in r['_source']:
            domain_results[uid] = r['_source']['domain']
        else:
            # No ES record, or a record without a domain field.
            unresolved_uids.append(uid)

    # Compute and persist domains for uids not found above.
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = [flow_text_index_name_pre + dt
                                   for dt in datetime_list]

        user_domain_data = {}
        # Number of texts per uid over the date range.
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        # Base profile info (bio string + category) for the unresolved uids.
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio_str", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'category': '',
                        'number_of_text': count_result[uid]
                    }
                # '____' is the sentinel the classifier expects for a
                # missing bio (preserved from the original).
                category = content.get('category', [''])[0]
                bio_str = content.get('bio_str', ['____'])[0]
                user_domain_data[uid]['bio_str'] = bio_str
                user_domain_data[uid]['category'] = category
        except Exception as e:
            # Best-effort: classify with whatever data was gathered.
            print(e)
        # Run the domain classifier; uids it produced nothing for fall
        # back to 'other'.
        user_domain_temp = domain_main(user_domain_data)
        user_domain = {}
        for uid in unresolved_uids:
            if uid not in user_domain_temp:
                user_domain_temp[uid] = 'other'
            user_domain[uid] = {'domain': user_domain_temp[uid]}
        save_data2es(user_domain)

    # FIX: the original computed results but never returned them; merge
    # cached and fresh results (mirrors my_topic_classfiy) and return.
    user_domain_temp.update(domain_results)
    return user_domain_temp
Example #7
0
def my_domain_classfiy(uid_list, datetime_list):
    """Classify the domain (user category) of Twitter uids, with ES caching.

    Cached domains are read from the portrait index; uids without one are
    recomputed from their base profile info (location, username,
    description, text count) and the results are saved back to ES.

    Returns a dict mapping every uid to its domain label ('other' when the
    classifier produced nothing).
    """
    domain_results = {}
    unresolved_uids = []
    # Reuse previously stored results before recomputing anything.
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uid_list})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r and r['found'] and 'domain' in r['_source']:
            domain_results[uid] = r['_source']['domain']
        else:
            # No ES record, or a record without a domain field.
            unresolved_uids.append(uid)

    # Compute and persist domains for uids not found above.
    user_domain_temp = {}
    if unresolved_uids:
        tw_flow_text_index_list = [flow_text_index_name_pre + dt
                                   for dt in datetime_list]

        user_domain_data = {}
        # Number of texts per uid over the date range.
        count_result = count_text_num(unresolved_uids, tw_flow_text_index_list)
        # Base profile info for the unresolved uids.
        # FIX: the loop below reads 'location_ch' / 'description_ch', but
        # the original query never requested those fields, so they were
        # always missing and location/description stayed empty; request
        # them alongside the raw fields.
        tw_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["location", "location_ch", "username",
                       "description", "description_ch", "uid"]
        }
        try:
            search_results = es.search(index=twitter_user_index_name,
                                       doc_type=twitter_user_index_type,
                                       body=tw_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    user_domain_data[uid] = {
                        'location': '',
                        'username': '',
                        'description': '',
                        'number_of_text': count_result[uid]
                    }
                # The *_ch (Chinese-translated) variants are preferred,
                # falling back to empty strings as before.
                location = content.get('location_ch', [''])[0]
                description = content.get('description_ch', [''])[0]
                username = content.get('username', [''])[0]
                user_domain_data[uid]['location'] = location
                user_domain_data[uid]['username'] = username
                user_domain_data[uid]['description'] = description
        except Exception as e:
            # Best-effort: classify with whatever data was gathered.
            print(e)
        # Run the domain classifier; uids it produced nothing for fall
        # back to 'other'.
        user_domain_temp = domain_main(user_domain_data)
        user_domain = {}
        for uid in unresolved_uids:
            if uid not in user_domain_temp:
                user_domain_temp[uid] = 'other'
            user_domain[uid] = {'domain': user_domain_temp[uid]}
        save_data2es(user_domain)

    # FIX: the original computed results but never returned them; merge
    # cached and fresh results (mirrors my_topic_classfiy) and return.
    user_domain_temp.update(domain_results)
    return user_domain_temp
Example #8
0
def get_hot_sensitive_recommend_at_user(sort_item):
    """Recommend users to @-mention from yesterday's hottest tweets.

    ``sort_item`` is the flow-text field used for the primary ES sort
    (e.g. 'sensitive'); the candidate authors are then re-ranked by
    timestamp, newest first.  Returns a dict mapping uid -> nick_name,
    capped at HOT_AT_RECOMMEND_USER_TOP users with a non-empty name.
    """
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)  # yesterday's index

    sort_item_2 = 'timestamp'
    index_name = twitter_flow_text_index_name_pre + datetime

    # Top tweets by sort_item, capped at HOT_EVENT_TOP_USER.
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': HOT_EVENT_TOP_USER,
        '_source': ['uid', 'user_fansnum', 'retweeted', 'timestamp']
    }

    es_results = es.search(index=index_name,
                           doc_type=twitter_flow_text_index_type,
                           body=query_body)['hits']['hits']

    # One entry per uid; as before, later hits overwrite earlier ones.
    uid_fansnum_dict = dict()
    for result in es_results:
        source = result['_source']
        uid_fansnum_dict[source['uid']] = {sort_item_2: source[sort_item_2]}

    # Re-rank uids by timestamp, newest first.
    uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(),
                                       key=lambda x: x[1][sort_item_2],
                                       reverse=True)

    # FIX: the original pushed the sorted uids through a set, which
    # discarded the ranking and made the TOP-N cutoff below arbitrary;
    # keep the sorted order (uids are already unique dict keys).
    uid_list = [item[0] for item in uid_fansnum_dict_sort_top]

    # Map uid -> nick_name (uid is stable; nick_name may change over time).
    uid_nick_name_dict = dict()
    if not uid_list:
        # Guard: ES mget rejects an empty ids list.
        return uid_nick_name_dict

    es_results_user = es.mget(index=twitter_user_index_name,
                              doc_type=twitter_user_index_type,
                              body={'ids': uid_list})['docs']
    count = 0
    for result in es_results_user:
        if result['found'] == True:
            source = result['_source']
            nick_name = source['username']
            if nick_name:
                count += 1
                uid_nick_name_dict[source['uid']] = nick_name
        if count >= HOT_AT_RECOMMEND_USER_TOP:
            break

    return uid_nick_name_dict