Exemple #1
0
def get_user_ip(uid):
    """Return the IP recorded on a recent flow-text weibo of *uid*.

    Searches the last 7 daily flow-text indices and returns the 'ip'
    field of the first hit.  Returns '' when the user posted nothing in
    that window (the original raised IndexError on weibo_all[0]).
    """
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        # test mode: pin "now" to the configured test date
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)

    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body={
                                        'query': {
                                            'filtered': {
                                                'filter': {
                                                    'term': {
                                                        'uid': uid
                                                    }
                                                }
                                            }
                                        },
                                        'size': 10,
                                    })['hits']['hits']
    # FIX: guard the empty-result case instead of crashing with IndexError
    if not weibo_all:
        return ''
    return weibo_all[0]["_source"]["ip"]
Exemple #2
0
def new_get_influence_trend(uid, time_segment):
    """Return the daily influence (BCI) trend for *uid*.

    Loads the stored influence history from the copy-portrait index,
    converts it to a {day_ts: value} trend via get_evaluate_trend('bci'),
    then pads the last *time_segment* days with 0 for days that have no
    value (e.g. server downtime).

    Returns {'timeline': [day_ts, ...], 'evaluate_index': [count, ...]}.

    FIX: removed leftover debug prints (one of which was tab-indented,
    which is fragile under strict tab/space rules).
    """
    try:
        influence_history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE,\
                id=uid)['_source']
    except:
        # no stored history for this uid (or ES unavailable)
        influence_history = {}
    if influence_history:
        results = get_evaluate_trend(influence_history, 'bci')
    else:
        results = {}
    #deal results for situation---server power off
    new_time_list = []
    new_count_list = []
    now_time_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_time_ts))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    for i in range(time_segment, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        try:
            date_count = results[iter_date_ts]
        except:
            # day missing from the trend: report zero
            date_count = 0
        new_time_list.append(iter_date_ts)
        new_count_list.append(date_count)
    return {'timeline': new_time_list, 'evaluate_index': new_count_list}
Exemple #3
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return *uid*'s weibos posted between timestamp_from and timestamp_to.

    The range is split into one sub-range per day so each daily flow-text
    index can be queried separately; all hits are merged and formatted.

    BUG FIX: the formatting loop previously iterated `flow_text_exist`
    (only the *last* day's hits) instead of the accumulated `iter_result`,
    silently dropping every earlier day's weibos.
    """
    weibo_list = []
    #split timestamp range to new_range_dict_list (one entry per day)
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({
                'range': {
                    'timestamp': {
                        'gte': iter_date_ts,
                        'lt': iter_next_date_ts
                    }
                }
            })
            iter_date_ts = iter_next_date_ts
        # clamp first/last day to the exact requested bounds
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp'][
                'gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{
            'range': {
                'timestamp': {
                    'gte': timestamp_from,
                    'lt': timestamp_to
                }
            }
        }]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = [{'term': {'uid': uid}}, range_item]
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            # daily index may not exist: treat that day as empty
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # format the merged hits (FIX: iterate iter_result, not the last day's hits)
    for item in iter_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'].split('&'))
        weibo_list.append(weibo)

    return weibo_list
Exemple #4
0
def get_db_num(timestamp):
    """Map *timestamp* to a retweet/comment DB shard number (1 or 2).

    Shards alternate every 7 days starting from R_BEGIN_TIME; test runs
    (RUN_TYPE == 0) always use shard 1.
    """
    day_start_ts = datetime2ts(ts2datetime(timestamp))
    epoch_ts = datetime2ts(R_BEGIN_TIME)
    week_index = (day_start_ts - epoch_ts) / (DAY * 7)
    shard = week_index % 2 + 1
    if RUN_TYPE == 0:
        shard = 1
    return shard
Exemple #5
0
def get_db_num(timestamp):
    """Shard selector: alternate between DB 1 and 2 every 7 days.

    The week count is measured from R_BEGIN_TIME; RUN_TYPE == 0 (test)
    forces shard 1.
    """
    midnight_ts = datetime2ts(ts2datetime(timestamp))
    base_ts = datetime2ts(R_BEGIN_TIME)
    elapsed_weeks = (midnight_ts - base_ts) / (DAY * 7)
    selected = elapsed_weeks % 2 + 1
    if RUN_TYPE == 0:
        selected = 1
    return selected
Exemple #6
0
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect interaction weibos between uid1 and uid2 over the last 7 days.

    Always matches weibos by uid1 directed at uid2; when type_mark == 'out'
    the reverse direction (uid2 -> uid1) is included too.  Each hit becomes
    a dict with timestamp/ip/geo/text/uid/uname/directed_uid/directed_uname;
    unames unresolvable from the portrait index fall back to 'unknown'.
    """
    weibo_list = []
    #get two type relation about uid1 and uid2
    #search weibo list
    now_ts = int(time.time())
    #run_type: production uses today, otherwise the pinned test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    #uid2uname: resolve display names for both uids from the portrait index
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type ,\
                                body={'ids': [uid1, uid2]}, _source=False, fields=['uid', 'uname'])['docs']
    except:
        portrait_result = []

    for item in portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #iter date to search weibo list, one daily flow-text index at a time
    for i in range(7, 0, -1):
        iter_date_ts = now_date_ts - i*DAY
        iter_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        query = []
        query.append({'bool':{'must':[{'term':{'uid':uid1}}, {'term':{'directed_uid': int(uid2)}}]}})
        if type_mark=='out':
            # include the reverse direction as well
            query.append({'bool':{'must':[{'term':{'uid':uid2}}, {'term':{'directed_uid': int(uid1)}}]}})
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query': {'bool':{'should': query}}, 'sort':[{'timestamp':{'order': 'asc'}}], 'size':MAX_VALUE})['hits']['hits']
        except:
            # missing daily index (or ES error): skip that day
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            weibo['geo'] = source['geo']
            # stored text uses '&' as separator; render as tabs
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] =  source['uid']
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uid'] = str(source['directed_uid'])
            # NOTE(review): KeyError if directed_uid is neither uid1 nor uid2;
            # the query terms should restrict hits to those two — confirm
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)

    return weibo_list
Exemple #7
0
def new_get_user_location(uid):
    """Fetch *uid*'s ip/time record for "today" from redis.

    NOTE(review): this function looks truncated — `results` is built but
    never returned and the fetched `ip_time_string` is unused; confirm
    against the original source.
    """
    results = {}
    now_date = ts2datetime(time.time())
    now_date_ts = datetime2ts(now_date)
    #jln
    #now_date_ts = 1378310400
    #run type: test mode pins "now" to the day before RUN_TEST_TIME
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME) - DAY
        now_date = ts2datetime(now_date_ts)
    #now ip: redis hash 'new_ip_<day_ts>' keyed by uid
    try:
        ip_time_string = r_cluster.hget('new_ip_'+str(now_date_ts), uid)
    except Exception, e:
        raise e
Exemple #8
0
def get_user_ip(uid):
    """Return the 'ip' of the first flow-text hit for *uid* in the past week."""
    if RUN_TYPE == 0:
        base_ts = datetime2ts(RUN_TEST_TIME)
    else:
        base_ts = datetime2ts(ts2datetime(time.time()))
    # one daily index name per day of the trailing week
    index_names = [flow_text_index_name_pre + ts2datetime(base_ts - DAY * back)
                   for back in range(7, 0, -1)]
    search_body = {'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                   'size': 10}
    hits = es_flow_text.search(index=index_names,
                               doc_type=flow_text_index_type,
                               body=search_body)['hits']['hits']
    # NOTE: raises IndexError when the user has no weibo in the window
    return hits[0]["_source"]["ip"]
Exemple #9
0
def new_get_activeness_trend(uid, time_segment):
    """Daily activeness trend for *uid* over the last *time_segment* days.

    Missing days (e.g. server downtime) are reported as 0.  Returns
    {'timeline': [day_ts, ...], 'evaluate_index': [value, ...]}.
    """
    try:
        history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_ACTIVENESS, doc_type=COPY_USER_PORTRAIT_ACTIVENESS_TYPE,\
                id=uid)['_source']
    except:
        history = {}
    trend = get_evaluate_trend(history, 'activeness') if history else {}
    # pad the requested window day by day, oldest first
    today_ts = datetime2ts(ts2datetime(time.time()))
    timeline = []
    values = []
    for back in range(time_segment, 0, -1):
        day_ts = today_ts - back * DAY
        try:
            day_value = trend[day_ts]
        except:
            day_value = 0
        timeline.append(day_ts)
        values.append(day_value)
    return {'timeline': timeline, 'evaluate_index': values}
Exemple #10
0
def get_group_user_track(uid):
    """Build a day-by-day geo track for *uid* from the portrait's
    activity_geo_dict.

    Returns a list of [date_string, top_geo] pairs (top_geo is '' for an
    empty day), or the string 'uid is not in user_portrait' when the uid
    has no portrait document.
    """
    results = []
    #step1:get user_portrait activity_geo_dict
    try:
        portrait_result = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type,\
                id=uid, _source=False, fields=['activity_geo_dict'])
    except:
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    # NOTE(review): the loop below calls geo_item.items(), so this is
    # expected to decode to a *list of dicts* (one per day) — confirm
    activity_geo_dict = json.loads(
        portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    # first tracked day: len(activity_geo_dict) days before today
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    #step2: iter date to get month track
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        # most frequent geo of that day first
        sort_day_dict = sorted(geo_item.items(),
                               key=lambda x: x[1],
                               reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY

    return results
Exemple #11
0
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """Collect the geo locations associated with *uid*.

    :param uid: user id
    :param dropped_geos: '&'-separated places to exclude (every geo
        contains the country name)
    :return: set of geo strings
    """
    excluded = set(dropped_geos.split("&"))
    # prefer the precomputed portrait preference
    try:
        portrait = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        portrait = None

    if portrait and len(portrait["activity_geo"]) > 0:
        # NOTE(review): assumes activity_geo supports set difference — confirm mapping
        geos = portrait["activity_geo"] - excluded
    else:
        # no portrait geo info: scan the user's weibos of the past week
        if RUN_TYPE == 0:
            day_ts = datetime2ts(RUN_TEST_TIME)
        else:
            day_ts = datetime2ts(ts2datetime(time.time()))
        index_names = [flow_text_index_name_pre + ts2datetime(day_ts - DAY * back)
                       for back in range(7, 0, -1)]
        hits = es_flow_text.search(index=index_names,
                                   doc_type=flow_text_index_type,
                                   body={
                                       'query': {
                                           'filtered': {
                                               'filter': {
                                                   'term': {
                                                       'uid': uid
                                                   }
                                               }
                                           }
                                       },
                                       'size': 2000,
                                   })['hits']['hits']
        # note: excluded geos are NOT removed on this fallback path
        geos = set()
        for hit in hits:
            geos |= set(hit["_source"]["geo"].split("&"))

    return geos
Exemple #12
0
def get_db_num(timestamp):
    """Pick DB shard 1 or 2 by the week containing *timestamp*.

    Depends on the module-level `r_beigin_ts` epoch; RUN_TYPE == 0 (test)
    forces shard 1.
    """
    day_start_ts = datetime2ts(ts2datetime(timestamp))
    # weeks elapsed since the epoch, alternating between the two shards
    shard = ((day_start_ts - r_beigin_ts) / (DAY * 7)) % 2 + 1
    if RUN_TYPE == 0:
        shard = 1
    #print 'db_number:', shard
    return shard
Exemple #13
0
def search_fans_new(uid, top_count):
    """Return up to *top_count* retweeting fans of *uid* as display dicts.

    Reads the precomputed 'uid_be_retweet' map from the fans index and
    converts each entry into {'uid', 'photo_url', 'uname', 'count',
    'fansnum', 'friendsnum', 'weibo_count'} with sensible fallbacks for
    missing values.

    FIXES: *top_count* was previously ignored (hard-coded 100) and the
    break was checked after appending, returning up to 101 entries;
    unused locals (results, now_ts, db_number) removed — get_db_num is a
    pure computation, so dropping the call is safe.
    """
    fan_doc = es_fans.get(index=fans_index_name, doc_type=fans_index_type, id=uid)['_source']
    fan_map = json.loads(fan_doc['uid_be_retweet'])

    out_portrait_list = []
    for key in fan_map:
        if len(out_portrait_list) >= top_count:
            break
        entry = fan_map[key]
        fan_uid = entry['uid']
        # fall back to a default avatar / the uid / zero counts when empty
        photo_url = entry['photo_url'] or "http://tp2.sinaimg.cn/1878376757/50/0/1"
        uname = entry['nick_name'] or fan_uid
        fans_count = entry['times'] or 0
        fansnum = entry['fansnum'] or 0
        user_friendsnum = entry['friendsnum'] or 0
        user_weibo_count = entry['statusnum'] or 0
        out_portrait_list.append({'uid': fan_uid, 'photo_url': photo_url, 'uname': uname,
                                  'count': fans_count, 'fansnum': fansnum,
                                  'friendsnum': user_friendsnum,
                                  'weibo_count': user_weibo_count})
    return out_portrait_list
Exemple #14
0
def cctv_video_rec(uid, k=10):
    """Recommend *k* 'tiger' videos and up to *k* 'rio' topic videos for *uid*.

    Topics are matched against the vocabulary of the user's last week of
    weibo text; when no topic matches, k random topics are used instead.
    """
    if RUN_TYPE == 0:
        base_ts = datetime2ts(RUN_TEST_TIME)
    else:
        base_ts = datetime2ts(ts2datetime(time.time()))
    index_names = [flow_text_index_name_pre + ts2datetime(base_ts - DAY * back)
                   for back in range(7, 0, -1)]
    hits = es_flow_text.search(index=index_names,
                               doc_type=flow_text_index_type,
                               body={
                                   'query': {
                                       'filtered': {
                                           'filter': {
                                               'term': {
                                                   'uid': uid
                                               }
                                           }
                                       }
                                   },
                                   'size': 100,
                               })['hits']['hits']
    # tokenize the user's recent weibo text into a vocabulary set
    user_words = set()
    for hit in hits:
        user_words.update(jieba.cut(hit["_source"]["text"]))

    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)

    recommendations = dict()
    recommendations["tiger"] = random.sample(tiger_videos, k)

    # topics the user mentioned; fall back to k random topics
    preferred_topics = set(rio_dict.keys()) & user_words
    if len(preferred_topics) == 0:
        preferred_topics = set(random.sample(rio_dict.keys(), k))
    rio_videos = list()
    for topic in preferred_topics:
        rio_videos.extend(rio_dict[topic])
        if len(rio_videos) >= k:
            rio_videos = rio_videos[:k]
            break
    recommendations["rio"] = rio_videos
    return recommendations
Exemple #15
0
def get_text_index(date):
    """List the 7 daily text index names ending at *date* (inclusive),
    newest first."""
    end_ts = datetime2ts(date)
    return [pre_text_index + ts2datetime(end_ts - back * DAY)
            for back in range(7)]
Exemple #16
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return *uid*'s weibos between timestamp_from and timestamp_to.

    Splits the requested range into one sub-range per day so each daily
    flow-text index can be queried separately, then merges and formats
    all hits.

    BUG FIX: the formatting loop iterated `flow_text_exist` (the *last*
    day's hits only) instead of the accumulated `iter_result`, dropping
    every earlier day's weibos.
    """
    weibo_list = []
    #split timestamp range to new_range_dict_list (one entry per day)
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range':{'timestamp':{'gte':iter_date_ts, 'lt':iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        # clamp first/last day to the exact requested bounds
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range':{'timestamp':{'gte':timestamp_from, 'lt':timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = [{'term':{'uid':uid}}, range_item]
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            # daily index may not exist: treat that day as empty
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # format the merged hits (FIX: iterate iter_result, not the last day's hits)
    for item in iter_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'].split('&'))
        weibo_list.append(weibo)

    return weibo_list
Exemple #17
0
def search_weibo(root_uid,uid,mtype):
    """Find *uid*'s weibos of type *mtype* involving *root_uid*, each paired
    with the original (root) weibo it refers to.

    Returns a list of dicts carrying 'last_text' and 'ori_text'.
    NOTE(review): relies on a module-level `now_date` when RUN_TYPE == 1;
    also `weibo` is only reset after a root match, so an unmatched hit's
    'last_text' can leak into the next appended item — confirm intent.
    """
    # filter: uid+mtype must both match; root_uid may appear as the root
    # author or as the directed target
    query_body = {
        #'query':{
            'filter':{
                'bool':{
                    'must':[{'term':{'uid':uid}},
                            {'term':{'message_type':mtype}}],
                    'should':[{'term':{'root_uid':root_uid}},
                              {'term':{'directed_uid':root_uid}}],
                }
            }
        #}
    }
    # the last 7 daily flow-text indices
    index_list = []
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - i * DAY) 
        index_list.append(flow_text_index_name_pre + iter_date)
    results = es_flow_text.search(index=index_list,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
    weibo = {}
    f_result = []

    if len(results) > 0:
        for result in results:
            #print type(result),result
            weibo['last_text'] = [result['_source']['text'],result['_source']['text'],result['_source']['timestamp']]
            mid = result['_source']['root_mid']
            # print mid
            # recover the hit's own day from its index name suffix
            len_pre = len(flow_text_index_name_pre)
            index = result['_index'][len_pre:]
            root_index = []
            for j in range(0,7):   # one week back from the hit's day (use 0,30 for a month)
                iter_date = ts2datetime(datetime2ts(index) - j * DAY) 
                root_index.append(flow_text_index_name_pre + iter_date)
            # look up the root weibo by its mid in that window
            results0 = es_flow_text.search(index=root_index,doc_type=flow_text_index_type,body={'query':{'term':{'mid':mid}}})['hits']['hits']
            if len(results0)>0:
                for result0 in results0:
                    weibo['ori_text'] = [result0['_source']['text'],result0['_source']['timestamp']]
                    f_result.append(weibo)
                    weibo={}
    return f_result
Exemple #18
0
def search_mention(uid):
    """Union of *uid*'s daily @-mention dicts over the past 7 days.

    Reads the redis hash 'at_<day_ts>' for each of the last 7 days;
    missing or unreadable days are skipped.  Returns the merged dict, or
    {} when no day had data.

    BUG FIXES: the redis key was built from an undefined name `ts`
    (NameError — should be `iter_ts`), and json.loads was called on the
    undefined `results_string` instead of `result_string`.
    """
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    #run type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    day_result_dict_list = []
    for i in range(7, 0, -1):
        iter_ts = now_date_ts - i * DAY
        try:
            result_string = r_cluster.hget('at_' + str(iter_ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        day_result_dict = json.loads(result_string)
        day_result_dict_list.append(day_result_dict)
    if day_result_dict_list:
        week_result_dict = union_dict(day_result_dict_list)
    else:
        week_result_dict = {}
    return week_result_dict
Exemple #19
0
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """
    :param uid: user id
    :param dropped_geos: '&'-separated places to exclude (every geo
        contains the country name)
    :return: set of geo locations
    """
    dropped_geos = set(dropped_geos.split("&"))
    # prefer the user's precomputed portrait preference
    try:
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        user_portrait_result = None

    # the portrait document carries geo info
    if user_portrait_result and len(user_portrait_result["activity_geo"]) > 0:
        # NOTE(review): assumes activity_geo supports set difference — confirm mapping
        geos = user_portrait_result["activity_geo"] - dropped_geos

    # no geo info in the portrait: extract from the last week of weibos
    else:
        flow_text_index_list = []
        now_timestamp = datetime2ts(ts2datetime(time.time()))
        if RUN_TYPE == 0:
            now_timestamp = datetime2ts(RUN_TEST_TIME)
        for i in range(7, 0, -1):
            iter_date = ts2datetime(now_timestamp - DAY * i)
            flow_text_index_list.append(flow_text_index_name_pre + iter_date)

        weibo_all = es_flow_text.search(index=flow_text_index_list,
                                        doc_type=flow_text_index_type,
                                        body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                              'size': 2000,
                                              })['hits']['hits']
        # note: dropped_geos is NOT removed on this fallback path
        geos = set()
        for temp in weibo_all:
            geos |= set(temp["_source"]["geo"].split("&"))

    return geos
Exemple #20
0
def cctv_video_rec(uid, k=10):
    """Recommend *k* 'tiger' videos and up to *k* 'rio' topic videos for *uid*.

    The user's last week of weibo text is tokenized (jieba) and matched
    against the rio topic keys; when nothing matches, k random topics are
    used instead.  Returns {'tiger': [...], 'rio': [...]}.
    """
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        # test mode: pin "now" to the configured test date
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)

    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                          'size': 100,
                                          })['hits']['hits']
    # vocabulary of the user's recent weibo text
    user_words = set()
    for weibo in weibo_all:
        weibo_text = weibo["_source"]["text"]
        user_words |= set(jieba.cut(weibo_text))

    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)

    ret_dict = dict()
    ret_dict["tiger"] = random.sample(tiger_videos, k)

    user_pref_topic = set(rio_dict.keys()) & user_words
    # no matching topic: assign k random topics instead
    if len(user_pref_topic) == 0:
        user_pref_topic = set(random.sample(rio_dict.keys(), k))
    ret_dict["rio"] = list()
    for topic in user_pref_topic:
        ret_dict["rio"].extend(rio_dict[topic])
        if len(ret_dict["rio"]) >= k:
            ret_dict["rio"] = ret_dict["rio"][:k]
            break
    return ret_dict
Exemple #21
0
def conclusion_on_activeness(uid):
    """Return a textual activeness conclusion for *uid*.

    Pulls the copy-portrait document, extracts the last N daily
    activeness values, and maps their mean onto the buckets defined by
    `activeness_level` (keys '1'..'5' of activeness_conclusion_dict).
    Key '0' is the "no data" conclusion.

    FIXES: the no-keys branch used `conclusion_dict['0']`, inconsistent
    with the no-document branch's `activeness_conclusion_dict['0']`
    (likely a NameError/wrong table); the dead `now_ts = time.time()`
    store (immediately overwritten) was removed.
    """
    index_name = copy_portrait_index_name
    index_type = copy_portrait_index_type
    try:
        influ_result = es.get(index=index_name, doc_type=index_type,
                              id=uid)['_source']
    except:
        # no portrait document: report the "no data" conclusion
        return activeness_conclusion_dict['0']

    # generate time series---keys
    # HACK: "now" is pinned to a fixed test date — TODO use real time in production
    now_ts = datetime2ts('2013-09-12')
    activeness_set = set()
    for i in range(N):
        ts = ts2datetime(now_ts - i * 3600 * 24)
        activeness_set.add(pre_activeness + ts)

    # the document mixes influence and activeness keys; keep only activeness
    keys_set = set(influ_result.keys())
    activeness_keys = keys_set & activeness_set

    if activeness_keys:
        activeness_value = [influ_result[key] for key in activeness_keys]
        mean, std_var = level(activeness_value)
        # bucket the mean against the ascending activeness_level thresholds
        if mean < activeness_level[0]:
            result = activeness_conclusion_dict['1']
        elif mean < activeness_level[1]:
            result = activeness_conclusion_dict['2']
        elif mean < activeness_level[2]:
            result = activeness_conclusion_dict["3"]
        elif mean < activeness_level[3]:
            result = activeness_conclusion_dict["4"]
        else:
            result = activeness_conclusion_dict["5"]
    else:
        # FIX: was conclusion_dict['0'] — inconsistent with the branch above
        result = activeness_conclusion_dict['0']

    return result
Exemple #22
0
def conclusion_on_activeness(uid):
    """Map *uid*'s mean activeness over the last N days to a conclusion text.

    '0' is the "no data" conclusion; '1'..'5' correspond to the buckets
    delimited by the ascending `activeness_level` thresholds.

    FIXES: the no-keys branch read `conclusion_dict['0']` while the
    no-document branch read `activeness_conclusion_dict['0']` — made
    consistent; removed the dead `now_ts = time.time()` store that was
    immediately overwritten.
    """
    index_name = copy_portrait_index_name
    index_type = copy_portrait_index_type
    try:
        influ_result = es.get(index=index_name, doc_type=index_type, id=uid)['_source']
    except:
        # no portrait document for this uid
        return activeness_conclusion_dict['0']

    # generate time series---keys
    # HACK: pinned to a fixed test date — TODO use real time in production
    now_ts = datetime2ts('2013-09-12')
    activeness_set = set()
    for i in range(N):
        ts = ts2datetime(now_ts - i*3600*24)
        activeness_set.add(pre_activeness+ts)

    # the document mixes influence and activeness keys; keep only activeness
    activeness_keys = set(influ_result.keys()) & activeness_set

    if not activeness_keys:
        # FIX: was conclusion_dict['0']
        return activeness_conclusion_dict['0']

    activeness_value = [influ_result[key] for key in activeness_keys]
    mean, std_var = level(activeness_value)
    if mean < activeness_level[0]:
        result = activeness_conclusion_dict['1']
    elif mean < activeness_level[1]:
        result = activeness_conclusion_dict['2']
    elif mean < activeness_level[2]:
        result = activeness_conclusion_dict["3"]
    elif mean < activeness_level[3]:
        result = activeness_conclusion_dict["4"]
    else:
        result = activeness_conclusion_dict["5"]
    return result
Exemple #23
0
def ajax_specified_user_active():
    """HTTP endpoint: influence ('bci') details + description for given uids.

    Query args: date (e.g. '2013-09-01') and uid (comma-separated uids).
    Returns a JSON array [detail, description], or [] when either arg is
    missing.

    FIXES: removed the leftover debug `print date`, the unused `list_1`
    local, and the redundant identity list comprehension over uid.split.
    """
    date = request.args.get('date', '')  # e.g. '2013-09-01'
    uid = request.args.get('uid', '')  # e.g. '123456,123456'
    date = str(date)

    results = []

    if date and uid:
        if RUN_TYPE == 0:
            # test mode: use the day before the pinned test time
            timetemp = datetime2ts(RUN_TEST_TIME) - DAY
            date = ts2datetime(timetemp)
        # daily influence index name carries the date without dashes
        index_name = pre_influence_index + date.replace('-', '')
        uid_list = uid.split(',')
        result = search_influence_detail(uid_list, index_name, "bci")

        description = influence_description(result)
        results.append(result)
        results.append(description)

    return json.dumps(results)
Exemple #24
0
def get_group_user_track(uid):
    """Day-by-day geo track for *uid* from the portrait's activity_geo_dict.

    Returns a list of [date_string, top_geo] pairs (top_geo is '' for an
    empty day), or 'uid is not in user_portrait' when the uid has no
    portrait document.
    """
    #step1:get user_portrait activity_geo_dict
    try:
        portrait_result = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type,\
                id=uid, _source=False, fields=['activity_geo_dict'])
    except:
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(portrait_result['fields']['activity_geo_dict'][0])
    # first tracked day: len(activity_geo_dict) days before today (00:00)
    day_ts = datetime2ts(ts2datetime(int(time.time()))) - DAY * len(activity_geo_dict)
    #step2: iter date to get month track
    track = []
    for geo_item in activity_geo_dict:
        # most frequent geo of that day first; '' when the day is empty
        ranked = sorted(geo_item.items(), key=lambda pair: pair[1], reverse=True)
        top_geo = ranked[0][0] if ranked else ''
        track.append([ts2datetime(day_ts), top_geo])
        day_ts += DAY

    return track
Exemple #25
0
def search_fans(uid,top_count):
    """Return up to 1000 fans of *uid* (by retweet + comment counts) as
    display dicts enriched from the profile and bci_history indices.

    NOTE(review): *top_count* is unused; the cap is hard-coded to 1000.
    """
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # retweet/comment data is sharded by week; pick the current shard
    db_number = get_db_num(now_date_ts)

    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    result = {}
    be_retweet_inter_dict = {}
    be_comment_inter_dict = {}
    center_uid = uid
    # who retweeted this user, as {uid: count}
    try:
        be_retweet_result = es_retweet.get(index = be_retweet_index_name,doc_type=be_retweet_index_type,id=uid)['_source']
    except:
        be_retweet_result = {}

    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    # print "be_retweet_uid_dict", be_retweet_uid_dict
    # who commented on this user, as {uid: count}
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}

    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    # print "be_comment_uid_dict", be_comment_uid_dict

    # merge the two per-uid counters into one fan score
    fans_result = union_dict(be_retweet_uid_dict,be_comment_uid_dict)
    fans_user_set = set(fans_result.keys())
    fans_list = list(fans_user_set)
    # print "fans_list", fans_list
    all_fans_dict = {}

    # drop the user himself, then rank fans by combined count
    for fans_user in fans_list:
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(), key=lambda x:x[1], reverse=True)
    all_fans_uid_list=[]
    all_fans_uid_list_all = [item[0] for item in sort_all_fans_dict]

    # keep at most the top 1000 fans
    count = 0
    for i in all_fans_uid_list_all:
        count += 1
        all_fans_uid_list.append(i)
        if count == 1000:
            break
    # print all_fans_uid_list

    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    # NOTE(review): `fields` is a module-level list of bci_history field
    # names — not defined in this function
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname =  u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history (rows align with out_user_result order)
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            # NOTE(review): `influence` is not assigned in this branch
            # (unused below, but inconsistent with the branch above)
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({'uid':uid,'photo_url':photo_url,'uname':uname, 'count':fans_count, 'fansnum':fansnum,'friendsnum': user_friendsnum,'weibo_count': user_weibo_count})
        iter_count += 1

    return out_portrait_list
Exemple #26
0
def search_mention(now_ts, uid, top_count):
    """Return the users that *uid* mentioned ('@') during the past 7 days.

    Per-uname mention counts are read from the redis hashes 'at_<day_ts>',
    each uname is then enriched with profile data (uid, location,
    friendsnum) from es_user_profile, and finally with fan/weibo/friend
    counts from es_bci_history.

    Parameters:
        now_ts    -- reference unix timestamp; counting walks back 7 days
        uid       -- uid whose outgoing mentions are aggregated
        top_count -- unused here; kept for interface compatibility

    Returns a list of dicts with keys:
        uid, uname, count, fansnum, weibo_count, friendsnum
    """
    ts = datetime2ts(ts2datetime(now_ts))
    #step1: aggregate mention counts over the previous 7 days
    stat_results = dict()
    for i in range(1, 8):
        ts = ts - DAY
        try:
            result_string = r_cluster.hget('at_' + str(ts), str(uid))
        except Exception:
            # redis unavailable -> treat the day as empty (best effort)
            result_string = ''
        if not result_string:
            continue
        result_dict = json.loads(result_string)
        for at_uname in result_dict:
            stat_results[at_uname] = stat_results.get(at_uname, 0) + result_dict[at_uname]

    out_portrait_list = []
    out_list = stat_results.keys()

    #step2: look up profile information by uname
    out_query_list = [{'match': {'uname': item}} for item in out_list]
    if len(out_query_list) != 0:
        query = [{'bool': {'should': out_query_list}}]
        try:
            out_profile_result = es_user_profile.search(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={
                    'query': {
                        'bool': {
                            'must': query
                        }
                    },
                    'size': 100
                })['hits']['hits']
        except Exception:
            out_profile_result = []
    else:
        out_profile_result = []
    out_in_profile_list = []
    bci_search_id_list = []

    for out_item in out_profile_result:
        source = out_item['_source']
        uname = source['nick_name']
        uid = source['uid']
        location = source['location']
        friendsnum = source['friendsnum']
        out_portrait_list.append(
            [uid, uname, stat_results[uname], '', location, friendsnum, ''])
        out_in_profile_list.append(uname)
        #collect uids for the bci_history lookup below
        bci_search_id_list.append(uid)
    #unames without a profile hit get placeholder rows (no uid known)
    out_out_profile_list = list(set(out_list) - set(out_in_profile_list))
    for out_out_item in out_out_profile_list:
        out_portrait_list.append(
            ['', out_out_item, stat_results[out_out_item], '', '', '', ''])

    #step3: add fansnum / weibo_count / friendsnum from bci_history.
    # bci_history_result is positionally aligned with the profile-found
    # prefix of out_portrait_list; placeholder rows run past its end and
    # therefore get '' for every metric.
    new_out_portrait_list = []
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': bci_search_id_list},
            fields=['user_fansnum', 'weibo_month_sum',
                    'user_friendsnum'])['docs']
    except Exception:
        bci_history_result = []
    iter_count = 0
    for out_portrait_item in out_portrait_list:
        append_dict = {}
        try:
            bci_history_item = bci_history_result[iter_count]
        except IndexError:
            bci_history_item = {}
        append_dict['uid'] = out_portrait_item[0]
        append_dict['uname'] = out_portrait_item[1]
        append_dict['count'] = out_portrait_item[2]
        if bci_history_item and bci_history_item['found'] == True:
            fansnum = bci_history_item['fields']['user_fansnum'][0]
            user_weibo_count = bci_history_item['fields']['weibo_month_sum'][0]
            user_friendsnum = bci_history_item['fields']['user_friendsnum'][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        append_dict['fansnum'] = fansnum
        append_dict['weibo_count'] = user_weibo_count
        append_dict['friendsnum'] = user_friendsnum
        new_out_portrait_list.append(append_dict)
        iter_count += 1
    return new_out_portrait_list  # uid, name, mention count, fans, location, friends, weibo count
Exemple #27
0
def localRec(uid, queryInterval=HOUR*25*7, k=200):
    """Recommend recent weibos posted from the same network as *uid*.

    The user's location is approximated by the first two octets of the IP
    on one of his recent weibos; up to 2000 weibos whose ip shares that
    prefix are fetched from the last 7 daily flow_text indices, filtered
    (one weibo per distinct IP, suitability check), and annotated with
    length, url and advertisement topic.

    Parameters:
        uid           -- user to recommend for
        queryInterval -- kept for interface compatibility (unused here)
        k             -- kept for interface compatibility (unused here)

    Returns a list of weibo '_source' dicts with extra keys:
        len, weibo_url, weibo_topic
    """
    # RUN_TYPE == 0 pins "now" to the fixed test date (2016-11-28 00:00:00)
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)

    flow_text_index_list = []
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)

    # NOTE: long texts could be selected directly with a regexp filter
    # ({"regexp": {"text": {"value": ".{100,}"}}}) but that is very slow,
    # so filtering happens client-side below instead.
    ip = get_user_ip(uid)
    # keep only the first two octets -> match users in the same /16 network
    ip = ".".join(ip.split(".")[:-2])
    weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=ads_weibo_index_type,
                                    body={"query": {"bool": {"must": [{"prefix": {"text.ip": ip}}]}},
                                          "size": 2000})["hits"]["hits"]

    local_weibo_rec = []
    exists_ip = set()
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)
    for weibo in weibo_all:
        weibo = weibo["_source"]
        weibo_text = weibo["text"]
        # keep at most one weibo per exact IP
        if weibo["ip"] in exists_ip:
            continue
        exists_ip.add(weibo["ip"])
        if not is_suit(weibo_text):
            continue
        weibo["len"] = len(weibo_text)
        try:
            mid = weibo["mid"]
            uid = weibo["uid"]
        except KeyError:
            # document lacks the keys needed to build a url -> skip it
            continue
        weibo["weibo_url"] = weiboinfo2url(uid, mid)
        weibo["weibo_topic"] = judge_ads_topic(list(jieba.cut(weibo_text)), topic_word_weight_dic)
        local_weibo_rec.append(weibo)
    return local_weibo_rec
Exemple #28
0
def search_mention(now_ts, uid, top_count):
    """Aggregate who *uid* mentioned ('@') over the last week and enrich
    each mentioned user with profile and bci-history data.

    Returns a list of dicts carrying uid, uname, count, fansnum,
    weibo_count and friendsnum.
    """
    day_ts = datetime2ts(ts2datetime(now_ts))
    mention_counts = dict()
    for _ in range(1, 8):
        day_ts = day_ts - DAY
        try:
            raw = r_cluster.hget('at_' + str(day_ts), str(uid))
        except:
            raw = ''
        if not raw:
            continue
        for name, times in json.loads(raw).items():
            mention_counts[name] = mention_counts.get(name, 0) + times

    rows = []
    mentioned_names = mention_counts.keys()

    # profile lookup by nick name
    name_queries = [{'match': {'uname': name}} for name in mentioned_names]
    profile_hits = []
    if name_queries:
        try:
            profile_hits = es_user_profile.search(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={'query': {'bool': {'must': [{'bool': {'should': name_queries}}]}},
                      'size': 100})['hits']['hits']
        except:
            profile_hits = []
    found_names = []
    bci_ids = []
    for hit in profile_hits:
        src = hit['_source']
        rows.append([src['uid'], src['nick_name'],
                     mention_counts[src['nick_name']], '',
                     src['location'], src['friendsnum'], ''])
        found_names.append(src['nick_name'])
        bci_ids.append(src['uid'])
    # names without a profile hit get placeholder rows
    for name in list(set(mentioned_names) - set(found_names)):
        rows.append(['', name, mention_counts[name], '', '', '', ''])

    # attach bci_history metrics; the docs are positionally aligned with
    # the rows built from profile hits, extra rows fall off the end
    try:
        bci_docs = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': bci_ids},
            fields=['user_fansnum', 'weibo_month_sum',
                    'user_friendsnum'])['docs']
    except:
        bci_docs = []
    enriched = []
    for pos, row in enumerate(rows):
        try:
            doc = bci_docs[pos]
        except:
            doc = {}
        if doc and doc['found'] == True:
            doc_fields = doc['fields']
            fans = doc_fields['user_fansnum'][0]
            weibos = doc_fields['weibo_month_sum'][0]
            friends = doc_fields['user_friendsnum'][0]
        else:
            fans = ''
            weibos = ''
            friends = ''
        enriched.append({'uid': row[0],
                         'uname': row[1],
                         'count': row[2],
                         'fansnum': fans,
                         'weibo_count': weibos,
                         'friendsnum': friends})
    return enriched  # uid, name, mention count, fans, reg. location, friends, weibo count
Exemple #29
0
def search_bidirect_interaction(uid, top_count):
    """Return users having bidirectional interaction with *uid*.

    A user qualifies when he appears both in uid's outgoing dicts
    (retweet + comment) and in uid's incoming dicts (be_retweet +
    be_comment); his score is the sum of both directions' counts.  Each
    qualifying user is enriched with profile data and bci_history metrics.

    Parameters:
        uid       -- center user
        top_count -- unused here; kept for interface compatibility

    Returns a list of dicts:
        uid, photo_url, uname, count, fansnum, friendsnum, weibo_count
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    center_uid = uid
    #outgoing and incoming retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name, doc_type=retweet_index_type, id=uid)['_source']
    except Exception:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except Exception:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}

    #outgoing and incoming comment
    try:
        comment_result = es_comment.get(index=comment_index_name, doc_type=comment_index_type, id=uid)['_source']
    except Exception:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    try:
        # fix: was 'be_coment_index_name' (typo); the resulting NameError
        # was silently swallowed by the bare except, so incoming comments
        # were always empty
        be_comment_result = es_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except Exception:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}

    #bidirectional: a user must appear in both the outgoing and the
    #incoming merged dicts; score = outgoing count + incoming count
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys())
    all_interaction_dict = {}
    for interaction_user in interaction_user_set:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[interaction_user] + be_retweet_comment_result[interaction_user]

    sort_all_interaction_dict = sorted(all_interaction_dict.items(), key=lambda x:x[1], reverse=True)
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]

    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':out_portrait_list})['docs']
    except Exception:
        out_user_result = []
    #add index from bci_history ('fields' is a module-level field-name list)
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except Exception:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #bci_history docs are positionally aligned with the profile mget
        try:
            bci_history_item = bci_history_result[iter_count]
        except IndexError:
            bci_history_item = {'found': False}
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({'uid':uid,'photo_url':photo_url,'uname':uname, 'count':interaction_count, 'fansnum':fansnum,'friendsnum': user_friendsnum,'weibo_count': user_weibo_count})
        iter_count += 1

    return out_portrait_list
Exemple #30
0
def group_user_weibo(task_name, submit_user, sort_type):
    """Collect the last 7 days of weibos posted by a user group.

    Parameters:
        task_name   -- group task name; the group doc id is
                       '<submit_user>-<task_name>'
        submit_user -- user who submitted the group task
        sort_type   -- 'retweet'/'retweeted', 'comment', 'sensitive' or
                       'timestamp'

    Returns the string 'group no exist' when the group doc is missing,
    otherwise a list of rows:
        [mid, uid, uname, text, ip, city, timestamp, date,
         retweet_count, comment_count, sensitive_score, weibo_url]
    sorted descending by the requested field.
    """
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type: test mode pins the date and always sorts by time
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user list
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except Exception:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibos from the last 7 daily flow_text indices
    uid_list = group_exist_result['uid_list']
    for i in range(6,-1,-1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'sort':[{sort_type: {'order': 'desc'}}], 'size':100})['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    sort_weibo_list = weibo_list
    #step3: map uid -> uname via the portrait index
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    #step4: build result rows
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type: counts only exist on production documents
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[10], reverse=True)
    else:
        # fix: an unknown sort_type used to raise NameError here; fall back
        # to the ES-provided order
        new_weibo_list = weibo_list
    return new_weibo_list
Exemple #31
0
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect one week of interaction weibos between *uid1* and *uid2*.

    The direction uid1 -> uid2 (uid1 authored, uid2 is the directed user)
    is always queried; when type_mark == 'out' the reverse direction
    uid2 -> uid1 is included as well.  Days are scanned oldest-first and
    each day's hits are sorted ascending by timestamp, so the returned
    list is in chronological order.

    Returns a list of dicts with keys: timestamp, ip, geo, text, uid,
    uname, directed_uid, directed_uname.
    """
    weibo_list = []
    #get two type relation about uid1 and uid2
    #search weibo list
    now_ts = int(time.time())
    #run_type: anything but RUN_TYPE == 1 pins "now" to the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    #uid2uname: resolve display names for both users from the portrait index
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type ,\
                                body={'ids': [uid1, uid2]}, _source=False, fields=['uid', 'uname'])['docs']
    except:
        portrait_result = []

    for item in portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #iter date to search weibo list, one daily flow_text index at a time
    for i in range(7, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        iter_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        query = []
        # direction uid1 -> uid2; directed_uid is cast to int — presumably
        # it is indexed as a number while uid is indexed as a string (TODO
        # confirm against the flow_text mapping)
        query.append({
            'bool': {
                'must': [{
                    'term': {
                        'uid': uid1
                    }
                }, {
                    'term': {
                        'directed_uid': int(uid2)
                    }
                }]
            }
        })
        if type_mark == 'out':
            # also include the reverse direction uid2 -> uid1
            query.append({
                'bool': {
                    'must': [{
                        'term': {
                            'uid': uid2
                        }
                    }, {
                        'term': {
                            'directed_uid': int(uid1)
                        }
                    }]
                }
            })
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query': {'bool':{'should': query}}, 'sort':[{'timestamp':{'order': 'asc'}}], 'size':MAX_VALUE})['hits']['hits']
        except:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            weibo['geo'] = source['geo']
            # stored text apparently uses '&' as a segment separator; it is
            # rejoined with tabs for display — TODO confirm with the indexer
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uid'] = str(source['directed_uid'])
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)

    return weibo_list
Exemple #32
0
def new_get_user_weibo(uid, sort_type):
    """Return *uid*'s weibos from the last 7 days, deduplicated by mid and
    sorted by the requested field.

    Parameters:
        uid       -- user whose weibos are fetched
        sort_type -- 'timestamp', 'retweet_count', 'comment_count' or
                     'sensitive' (forced to 'timestamp' in test mode)

    Returns rows:
        [mid, uid, text, ip, city, timestamp, date, retweet_count,
         comment_count, sensitive_score, weibo_url]
    """
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type: test mode pins the date and the sort order
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get user nick name from the profile index
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,\
                id=uid, _source=False, fields=['nick_name'])
    except Exception:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2: get user weibos from the daily flow_text indices
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            # test data lives in a single fixed index
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'term': {'uid': uid}}}}, 'size':MAX_VALUE})['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)

    #step3: build result rows, keeping only the first occurrence of each mid
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['ip']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type: counts only exist on production documents
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([mid, uid, text, ip, city,timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
            mid_set.add(mid)
    if sort_type == 'timestamp':
        sort_results = sorted(results, key=lambda x:x[5], reverse=True)
    elif sort_type == 'retweet_count':
        sort_results = sorted(results, key=lambda x:x[7], reverse=True)
    elif sort_type == 'comment_count':
        sort_results = sorted(results, key=lambda x:x[8], reverse=True)
    elif sort_type == 'sensitive':
        sort_results = sorted(results, key=lambda x:x[9], reverse=True)
    else:
        # fix: an unknown sort_type used to raise NameError; return unsorted
        sort_results = results
    return sort_results
Exemple #33
0
def search_weibo(root_uid, uid, mtype):
    """Find weibos of message type *mtype* sent by *uid* that target
    *root_uid* (as root_uid or directed_uid) in the last 7 days, and pair
    each with the root weibo it responds to (looked up by root_mid).

    Returns a list of dicts with keys:
        last_text -- [text, text, timestamp] of the responding weibo
                     (NOTE(review): the text appears twice; one slot may
                     have been meant for something else — confirm with
                     callers before changing)
        ori_text  -- [text, timestamp] of the root weibo
    """
    query_body = {
        'filter': {
            'bool': {
                'must': [{
                    'term': {
                        'uid': uid
                    }
                }, {
                    'term': {
                        'message_type': mtype
                    }
                }],
                'should': [{
                    'term': {
                        'root_uid': root_uid
                    }
                }, {
                    'term': {
                        'directed_uid': root_uid
                    }
                }],
            }
        }
    }
    # fix: now_date was read below but never defined in this scope, which
    # raised NameError whenever RUN_TYPE == 1; compute it the same way the
    # sibling functions in this module do
    now_date = ts2datetime(time.time())
    index_list = []
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - i * DAY)
        index_list.append(flow_text_index_name_pre + iter_date)
    results = es_flow_text.search(index=index_list,
                                  doc_type=flow_text_index_type,
                                  body=query_body)['hits']['hits']
    weibo = {}
    f_result = []

    if len(results) > 0:
        for result in results:
            weibo['last_text'] = [
                result['_source']['text'], result['_source']['text'],
                result['_source']['timestamp']
            ]
            mid = result['_source']['root_mid']
            # recover the hit's date from its index name, then search the
            # surrounding 7 daily indices for the root weibo by mid
            len_pre = len(flow_text_index_name_pre)
            index = result['_index'][len_pre:]
            root_index = []
            for j in range(0, 7):  # one week; use range(0, 30) for a month
                iter_date = ts2datetime(datetime2ts(index) - j * DAY)
                root_index.append(flow_text_index_name_pre + iter_date)
            results0 = es_flow_text.search(
                index=root_index,
                doc_type=flow_text_index_type,
                body={'query': {
                    'term': {
                        'mid': mid
                    }
                }})['hits']['hits']
            if len(results0) > 0:
                for result0 in results0:
                    weibo['ori_text'] = [
                        result0['_source']['text'],
                        result0['_source']['timestamp']
                    ]
                    f_result.append(weibo)
                    weibo = {}
    return f_result
Exemple #34
0
def search_fans(uid, top_count):
    """Return up to 1000 'fans' of *uid*: users who retweeted or commented
    on him, scored by how often, enriched with profile data and
    bci_history metrics.

    Parameters:
        uid       -- center user
        top_count -- unused here; kept for interface compatibility

    Returns a list of dicts:
        uid, photo_url, uname, count, fansnum, friendsnum, weibo_count
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)

    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    center_uid = uid
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name,
                                           doc_type=be_retweet_index_type,
                                           id=uid)['_source']
    except Exception:
        be_retweet_result = {}

    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name,
                                              doc_type=be_comment_index_type,
                                              id=uid)['_source']
    except Exception:
        be_comment_result = {}

    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}

    #merge both kinds of incoming interaction and drop the center user
    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    all_fans_dict = {}
    for fans_user in fans_result.keys():
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)
    #cap at the 1000 most frequent fans
    all_fans_uid_list = [item[0] for item in sort_all_fans_dict][:1000]

    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': out_portrait_list
                                                     })['docs']
    except Exception:
        out_user_result = []
    #add index from bci_history ('fields' is a module-level field-name list)
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': out_portrait_list},
            fields=fields)['docs']
    except Exception:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #bci_history docs are positionally aligned with the profile mget
        try:
            bci_history_item = bci_history_result[iter_count]
        except IndexError:
            bci_history_item = {'found': False}
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({
            'uid': uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': fans_count,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
        iter_count += 1

    return out_portrait_list
Exemple #35
0
def search_fans_new(uid, top_count):
    """Return portrait entries for up to 100 users who retweet *uid*.

    Reads the ``uid_be_retweet`` JSON blob from the user's fans document and
    turns each entry into a display dict (uid, photo_url, uname, count,
    fansnum, friendsnum, weibo_count), substituting defaults for falsy
    fields.  Propagates the ES exception when *uid* has no fans document
    (same as the original).

    NOTE(review): ``top_count`` is accepted for interface compatibility with
    the sibling ``search_*`` helpers but the cap is hard-coded at 100.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # NOTE(review): db_number is not used below (legacy of the sharded
    # retweet/comment indexes); kept because get_db_num is a cheap lookup.
    db_number = get_db_num(now_date_ts)

    fan_doc = es_fans.get(index=fans_index_name,
                          doc_type=fans_index_type,
                          id=uid)['_source']
    fan_result_new = json.loads(fan_doc['uid_be_retweet'])

    out_portrait_list = []
    for key in fan_result_new:
        # bug fix: the old loop tested its counter *after* appending, so it
        # returned 101 entries; enforce the 100-entry cap exactly.
        if len(out_portrait_list) >= 100:
            break
        item = fan_result_new[key]
        fan_uid = item['uid']
        out_portrait_list.append({
            'uid': fan_uid,
            # fall back to the stock avatar when no photo is recorded
            'photo_url': item['photo_url'] if item['photo_url'] else "http://tp2.sinaimg.cn/1878376757/50/0/1",
            # fall back to the uid when the nickname is empty
            'uname': item['nick_name'] if item['nick_name'] else fan_uid,
            'count': item['times'] if item['times'] else 0,
            'fansnum': item['fansnum'] if item['fansnum'] else 0,
            'friendsnum': item['friendsnum'] if item['friendsnum'] else 0,
            'weibo_count': item['statusnum'] if item['statusnum'] else 0
        })
    return out_portrait_list
Exemple #36
0
def adsRec(uid, queryInterval=HOUR * 24):
    """Recommend ad weibos for *uid*.

    Pulls ad weibos posted within *queryInterval* seconds before "now"
    (test-clock aware via RUN_TYPE), projects the user's portrait keywords
    onto the ad topic space, and returns up to 30 ads ranked by estimated
    interest via adsPreferred.  Returns None when the user has no portrait
    document.
    """
    # RUN_TYPE == 1 -> wall clock; otherwise the fixed test date minus a day.
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)

    # User preference keywords from the portrait index.
    try:
        print(uid)
        user_portrait_result = es_user_portrait.get_source(
            index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except:
        # no portrait document for this user
        return None

    user_key_words = set(user_portrait_result["keywords_string"].split("&"))

    # Candidate ads posted inside the lookback window.
    time_filter = {
        "range": {
            "timestamp": {"gte": datetime2ts(now_date) - queryInterval}
        }
    }
    ads_weibo_all = es_ads_weibo.search(
        index=ads_weibo_index_name,
        doc_type=ads_weibo_index_type,
        body={
            'query': {"filtered": {"filter": time_filter}},
            'size': 2000,
        })['hits']['hits']

    # Bound the scoring cost with a random sample of at most 800 candidates.
    random.shuffle(ads_weibo_all)
    ads_weibo_all = ads_weibo_all[:800]

    # Per-topic TF-IDF word weights of the ad topic taxonomy.
    topic_word_weight_dic = construct_topic_word_weight_dic(
        ADS_TOPIC_TFIDF_DIR)

    # Distribution of the user's keywords over the ad topics (the generic
    # portrait topics do not fit the ad taxonomy well).
    user_topic_dic = construct_topic_feature_dic(user_key_words,
                                                 topic_word_weight_dic)

    # Hard-coded topic profiles for a few demo accounts.
    test_user_topic = {
        "3069348215": {
            u'\u5a31\u4e50': 10.0,
            u'\u751f\u6d3b': 14.0,
            u'\u4f53\u80b2': 1.0,
            u'\u8d22\u7ecf': 0,
            u'\u6821\u56ed': 0,
            u'IT': 9.0,
            u'\u6c7d\u8f66': 1.0,
            u'\u6e38\u620f\u52a8\u6f2b': 6.0,
            u'\u6559\u80b2': 2.0
        },
        "2218894100": {
            u'\u5a31\u4e50': 0,
            u'\u751f\u6d3b': 5.0,
            u'\u4f53\u80b2': 16.0,
            u'\u8d22\u7ecf': 1.0,
            u'\u6821\u56ed': 0,
            u'IT': 0,
            u'\u6c7d\u8f66': 0,
            u'\u6e38\u620f\u52a8\u6f2b': 0,
            u'\u6559\u80b2': 0
        },
        "1035933493": {
            u'\u5a31\u4e50': 0,
            u'\u751f\u6d3b': 10.0,
            u'\u4f53\u80b2': 17.0,
            u'\u8d22\u7ecf': 0,
            u'\u6821\u56ed': 0,
            u'IT': 9.0,
            u'\u6c7d\u8f66': 8.0,
            u'\u6e38\u620f\u52a8\u6f2b': 0,
            u'\u6559\u80b2': 2.0
        },
    }
    # Demo accounts use the canned profile instead of the computed one.
    user_topic_dic = test_user_topic.get(uid, user_topic_dic)

    print(user_topic_dic)
    print("f**k")
    return adsPreferred(user_topic_dic, ads_weibo_all,
                        topic_word_weight_dic, uid, 30)
Exemple #37
0
from ruman.global_utils import ES_COPY_USER_PORTRAIT, COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE,\
              COPY_USER_PORTRAIT_IMPORTANCE, COPY_USER_PORTRAIT_IMPORTANCE_TYPE, COPY_USER_PORTRAIT_ACTIVENESS,\
              COPY_USER_PORTRAIT_ACTIVENESS_TYPE, COPY_USER_PORTRAIT_SENSITIVE, COPY_USER_PORTRAIT_SENSITIVE_TYPE
from ruman.global_utils import es_bci_history, bci_history_index_name, bci_history_index_type
from ruman.parameter import verified_num2ch_dict, IP_TIME_SEGMENT, DAY, MAX_VALUE
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.global_config import R_BEGIN_TIME
from ruman.time_utils import ts2datetime, datetime2ts, ts2date
from ruman.keyword_filter import keyword_filter

# Map each evaluation dimension name to its [index, doc_type] pair in the
# copy-of-user-portrait ES indexes.
evaluate_index_dict = {'bci': [COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE], \
                       'importance': [COPY_USER_PORTRAIT_IMPORTANCE, COPY_USER_PORTRAIT_IMPORTANCE_TYPE],\
                       'activeness': [COPY_USER_PORTRAIT_ACTIVENESS, COPY_USER_PORTRAIT_ACTIVENESS_TYPE],\
                       'sensitive': [COPY_USER_PORTRAIT_SENSITIVE, COPY_USER_PORTRAIT_SENSITIVE_TYPE ]}

# Timestamp of the configured history start date.  NOTE(review): the name
# keeps the historical "beigin" typo; renaming would break existing uses.
r_beigin_ts = datetime2ts(R_BEGIN_TIME)
# Batch size used when iterating filtered results.
FILTER_ITER_COUNT = 100

#use to get user profile information
def new_get_user_profile(uid):
    try:
    	#print 'trying',es_user_profile,profile_index_name
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,\
                id=uid)['_source']
        #print es_user_profile,profile_index_name
    except:
        results = {}
    #get new fansnum and statusnum
    try:
        bci_history_result = es_bci_history.get(index=bci_history_index_name, doc_type=bci_history_index_type, id=uid)['_source']
    except:
Exemple #38
0
def localRec(uid, queryInterval=HOUR * 25 * 7, k=200):
    # 运行状态,
    # 0 ->  当前为2016-11-28 00:00:00
    # 1 ->  当前时间
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)

    flow_text_index_list = []
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)

    # 获取用户地理位置
    # user_geos = get_user_geo(uid)
    # # 根据位置查询weibo
    # weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=ads_weibo_index_type,
    #                                 body={"query":{"bool":{"must":
    #                                                                 [{"match":{"keywords_string":"新闻"}},
    #                                                                  {"match":{"geo":"合肥"}}
    #                                                                  ]}},
    #                                            "size": 200
    #                                       })["hits"]["hits"]
    '''可以直接查询长度大于100的但是很慢
    {"query":{"filtered":{"query":{"bool":{"must":[{"match":{"keywords_string":"新闻"}},{"match":{"geo":"合肥"}}]}},"filter":{"regexp":{"text":{"value":".{100,}"}}}}}}
    '''
    ip = get_user_ip(uid)
    ip = ".".join(ip.split(".")[:-2])
    print '326'
    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=ads_weibo_index_type,
                                    body={
                                        "query": {
                                            "bool": {
                                                "must": [{
                                                    "prefix": {
                                                        "text.ip": ip
                                                    }
                                                }]
                                            }
                                        },
                                        "size": 2000
                                    })["hits"]["hits"]

    local_weibo_rec = []
    weibo_user_uids = [weibo["_source"]["uid"] for weibo in weibo_all]
    print '332', len(weibo_all)
    # user_profiles = search_user_profile_by_user_ids(weibo_user_uids)
    exists_ip = set()
    topic_word_weight_dic = construct_topic_word_weight_dic(
        ADS_TOPIC_TFIDF_DIR)
    for weibo in weibo_all:
        weibo = weibo["_source"]
        weibo_text = weibo["text"]
        if weibo["ip"] in exists_ip:
            continue
        # 一个ip只选一个
        exists_ip.add(weibo["ip"])
        if not is_suit(weibo_text):
            continue
        weibo["len"] = len(weibo_text)
        try:
            mid = weibo["mid"]
            uid = weibo["uid"]
        except:
            continue
        weibo["weibo_url"] = weiboinfo2url(uid, mid)
        weibo["weibo_topic"] = judge_ads_topic(list(jieba.cut(weibo_text)),
                                               topic_word_weight_dic)
        # 可能出现许多userprofile查不到的情况
        # if uid in user_profiles:
        #     weibo["photo_url"] = user_profiles[uid]["photo_url"]
        #     weibo["nick_name"] = user_profiles[uid]["nick_name"]
        # else:
        #     weibo["photo_url"] = "None"
        #     weibo["nick_name"] = "None"
        #     local_weibo_rec.append(weibo)
        local_weibo_rec.append(weibo)
    return local_weibo_rec
Exemple #39
0
def adsRec(uid, queryInterval=HOUR * 24):
    """Rank ad weibos by the user's topical interest.

    Candidate ads are those posted within *queryInterval* seconds before
    the (possibly simulated) current date.  The user's portrait keywords
    are mapped onto the ad topic taxonomy and the top 30 matches are
    returned via adsPreferred; None is returned when *uid* has no portrait.
    """
    # Simulated clock: when RUN_TYPE != 1, "today" is the test date minus a day.
    now_date = (ts2datetime(time.time()) if RUN_TYPE == 1
                else ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY))

    try:
        print(uid)
        user_portrait_result = es_user_portrait.get_source(
            index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except:
        # user has no portrait document -> nothing to recommend
        return None

    user_key_words = set(user_portrait_result["keywords_string"].split("&"))

    # Read candidates straight from the ads table.
    cutoff_ts = datetime2ts(now_date) - queryInterval
    search_body = {
        'query': {
            "filtered": {
                "filter": {"range": {"timestamp": {"gte": cutoff_ts}}}
            }
        },
        'size': 2000,
    }
    ads_weibo_all = es_ads_weibo.search(index=ads_weibo_index_name,
                                        doc_type=ads_weibo_index_type,
                                        body=search_body)['hits']['hits']

    # Random sample of at most 800 candidates keeps scoring cheap.
    random.shuffle(ads_weibo_all)
    del ads_weibo_all[800:]

    # TF-IDF word weights per ad topic.
    topic_word_weight_dic = construct_topic_word_weight_dic(
        ADS_TOPIC_TFIDF_DIR)

    # User's keyword mass distributed over the ad topics (the generic
    # portrait topics do not fit the ad taxonomy).
    user_topic_dic = construct_topic_feature_dic(user_key_words,
                                                 topic_word_weight_dic)

    # Canned topic profiles for demo accounts.
    test_user_topic = {
        "3069348215": {u'\u5a31\u4e50': 10.0, u'\u751f\u6d3b': 14.0,
                       u'\u4f53\u80b2': 1.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 1.0,
                       u'\u6e38\u620f\u52a8\u6f2b': 6.0, u'\u6559\u80b2': 2.0},
        "2218894100": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 5.0,
                       u'\u4f53\u80b2': 16.0, u'\u8d22\u7ecf': 1.0,
                       u'\u6821\u56ed': 0, u'IT': 0, u'\u6c7d\u8f66': 0,
                       u'\u6e38\u620f\u52a8\u6f2b': 0, u'\u6559\u80b2': 0},
        "1035933493": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 10.0,
                       u'\u4f53\u80b2': 17.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 8.0,
                       u'\u6e38\u620f\u52a8\u6f2b': 0, u'\u6559\u80b2': 2.0},
    }
    if uid in test_user_topic:
        user_topic_dic = test_user_topic[uid]

    print(user_topic_dic)
    print("f**k")
    ads_weibo_prefer = adsPreferred(user_topic_dic, ads_weibo_all,
                                    topic_word_weight_dic, uid, 30)
    return ads_weibo_prefer
Exemple #40
0
def search_bidirect_interaction(uid, top_count):
    """Return users with bidirectional retweet/comment interaction with *uid*.

    A user qualifies when *uid* both interacts with them (retweet/comment)
    and is interacted with by them (be_retweet/be_comment).  For each
    qualifying user a portrait dict is built from the profile index plus
    fan/weibo counters from bci_history, and the list is returned sorted by
    total interaction count (descending).

    NOTE(review): *top_count* is accepted for interface compatibility with
    the sibling ``search_*`` helpers; the full sorted list is returned.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # Interaction data is sharded by date; resolve the current shard number.
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    center_uid = uid

    # --- retweet / be_retweet side ---
    try:
        retweet_result = es_retweet.get(index=retweet_index_name,
                                        doc_type=retweet_index_type,
                                        id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name,
                                           doc_type=be_retweet_index_type,
                                           id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}

    # --- comment / be_comment side ---
    try:
        comment_result = es_comment.get(index=comment_index_name,
                                        doc_type=comment_index_type,
                                        id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    # bug fix: was `be_coment_index_name` (misspelled) -- the NameError was
    # silently swallowed by the bare except, so be_comment data was always
    # treated as empty.
    try:
        be_comment_result = es_comment.get(index=be_comment_index_name,
                                           doc_type=be_comment_index_type,
                                           id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}

    # Users present on BOTH sides, with interaction counts summed.
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict,
                                           be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(
        be_retweet_comment_result.keys())
    all_interaction_dict = {}
    for interaction_user in interaction_user_set:
        # exclude the center user himself
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[
                interaction_user] + be_retweet_comment_result[interaction_user]

    sort_all_interaction_dict = sorted(all_interaction_dict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]

    out_portrait_list = all_interaction_uid_list
    # User display information from the profile index.
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': out_portrait_list
                                                     })['docs']
    except:
        out_user_result = []
    # Fan/weibo counters from bci_history (positional match with the mget).
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': out_portrait_list},
            fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = 'unknown'
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({
            'uid': uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': interaction_count,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
        iter_count += 1

    return out_portrait_list
Exemple #41
0
def search_task(task_name, submit_date, state, status, submit_user):
    results = []
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard':{'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({'range':{'submit_date': {'gte': submit_date_start, 'lt': submit_date_end}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard':{'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'match':{'status': status}})
        condition_num += 1
    if submit_user:
        query.append({'term':{'submit_user': submit_user}})
        condition_num += 1
    print es_group_result,group_index_name,group_index_type
    if condition_num > 0:
        query.append({'term':{'task_type': 'analysis'}})
        try:
            source = es_group_result.search(
                    index = group_index_name,
                    doc_type = group_index_type,
                    body = {
                        'query':{
                            'bool':{
                                'must':query
                                }
                            },
                        'sort': [{'count':{'order': 'desc'}}],
                        'size': MAX_VALUE
                        }
                    )
        except Exception as e:
            raise e
    else:
        query.append({'term':{'task_type': 'analysis'}})
        source = es.search(
                index = group_index_name,
                doc_type = group_index_type,
                body = {
                    'query':{'bool':{
                        'must':query
                        }
                        },
                    'sort': [{'count': {'order': 'desc'}}],
                    'size': MAX_VALUE
                    }
                )

    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        #result.append([task_dict['_source']['task_name'], task_dict['_source']['submit_date'], task_dict['_source']['count'], state, status])
        result.append({'task_name':task_dict['_source']['task_name'],'submit_date':ts2date(task_dict['_source']['submit_date']), 'group_count':task_dict['_source']['count'], 'status':status})
    
    return result
Exemple #42
0
from ruman.global_config import R_BEGIN_TIME
from ruman.parameter import DAY, WEEK, MAX_VALUE, HALF_HOUR, FOUR_HOUR, GEO_COUNT_THRESHOLD, PATTERN_THRESHOLD
from ruman.parameter import PSY_DESCRIPTION_FIELD, psy_en2ch_dict, psy_description_dict
from ruman.search_user_profile import search_uid2uname
from ruman.filter_uid import all_delete_uid
from ruman.parameter import IP_TIME_SEGMENT, IP_TOP, DAY, IP_CONCLUSION_TOP, domain_en2ch_dict, topic_en2ch_dict
from ruman.parameter import INFLUENCE_TREND_SPAN_THRESHOLD, INFLUENCE_TREND_AVE_MIN_THRESHOLD,\
                                    INFLUENCE_TREND_AVE_MAX_THRESHOLD, INFLUENCE_TREND_DESCRIPTION_TEXT
from ruman.parameter import ACTIVENESS_TREND_SPAN_THRESHOLD, ACTIVENESS_TREND_AVE_MIN_THRESHOLD ,\
                                    ACTIVENESS_TREND_AVE_MAX_THRESHOLD, ACTIVENESS_TREND_DESCRIPTION_TEXT
from ruman.parameter import SENTIMENT_DICT,  ACTIVENESS_TREND_TAG_VECTOR
from ruman.parameter import SENTIMENT_SECOND
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.keyword_filter import keyword_filter

# Timestamp of the configured history start date.  NOTE(review): the name
# keeps the historical "beigin" typo; renaming would break existing uses.
r_beigin_ts = datetime2ts(R_BEGIN_TIME)

# NOTE(review): shadows the WEEK constant imported from ruman.parameter above.
WEEK = 7

# Weibo emotion code -> sentiment label.
emotion_mark_dict = {'126': 'positive', '127':'negative', '128':'anxiety', '129':'angry'}
link_ratio_threshold = [0, 0.5, 1]

# bci_history fields fetched per user; the test deployment (RUN_TYPE == 0)
# stores different column names than production.
if RUN_TYPE == 0:
    fields = ['bci_week_sum', 'bci_month_ave', 'bci_month_sum','bci_week_ave']
else:
    fields = ['user_fansnum', 'weibo_month_sum', 'user_friendsnum','bci_week_ave']

def search_follower(uid, top_count):

    results = {}
    now_ts = time.time()
Exemple #43
0
def group_user_weibo(task_name, submit_user, sort_type):
    """Collect the last week of weibos posted by a group task's members.

    Returns rows [mid, uid, uname, text, ip, city, timestamp, date,
    retweet_count, comment_count, sensitive_score, weibo_url] sorted by the
    requested metric ('timestamp' | 'retweet' | 'comment' | 'sensitive'),
    or the string 'group no exist' when the task document is missing.
    """
    weibo_list = []
    now_date = ts2datetime(time.time())
    # callers pass 'retweet'; the flow-text field is named 'retweeted'
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    # test mode: pin the date and force chronological sort (the retweet/
    # comment/sensitive counters are zeroed in the row loop below)
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    # scan the last 7 daily flow-text indexes; top 100 hits per day,
    # pre-sorted server-side by the requested metric
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'sort':[{sort_type: {'order': 'desc'}}], 'size':100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        # NOTE(review): the 'geo' field is used as an IP and fed to ip2city
        # below -- confirm the field actually holds an IP string
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type:
        # production documents may lack these optional counters
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([
            mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
            comment_count, sensitive_score, weibo_url
        ])
    # final in-memory sort by the requested column index
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    return new_weibo_list
Exemple #44
0
def search_task(task_name, submit_date, state, status, submit_user):
    """Search analysis-type group tasks matching the given filters.

    Each non-empty argument adds a query clause: *task_name* and *state*
    are space-separated wildcard terms, *submit_date* selects a one-day
    range, *status* is matched, *submit_user* is an exact term.  Returns a
    list of rows [task_name, submit_date, count, state, status] sorted by
    group count descending, or None when the response is malformed.
    """
    query = []
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        # one full day starting at the given date
        query.append({
            'range': {
                'submit_date': {
                    'gte': submit_date_ts,
                    'lt': submit_date_ts + DAY
                }
            }
        })
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
    if status:
        query.append({'match': {'status': status}})
    if submit_user:
        query.append({'term': {'submit_user': submit_user}})
    # Only analysis-type tasks are returned.
    query.append({'term': {'task_type': 'analysis'}})
    # bug fix: the original no-filter branch queried the same group index
    # through the unrelated `es` client; both branches now consistently use
    # es_group_result, so the branches collapse into one.
    try:
        source = es_group_result.search(index=group_index_name,
                                        doc_type=group_index_type,
                                        body={
                                            'query': {
                                                'bool': {
                                                    'must': query
                                                }
                                            },
                                            'sort': [{
                                                'count': {
                                                    'order': 'desc'
                                                }
                                            }],
                                            'size': MAX_VALUE
                                        })
    except Exception as e:
        raise e

    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        # state/status are optional fields on older documents
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        result.append([
            task_dict['_source']['task_name'],
            task_dict['_source']['submit_date'], task_dict['_source']['count'],
            state, status
        ])

    return result
Exemple #45
0
        new_get_influence_trend, new_get_sensitive_words
#from search_mid import index_mid
'''
from ruman.search_user_profile import es_get_source
from ruman.global_utils import es_user_portrait as es
from ruman.parameter import SOCIAL_DEFAULT_COUNT, SENTIMENT_TREND_DEFAULT_TYPE
from ruman.parameter import DEFAULT_SENTIMENT, DAY
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.time_utils import ts2datetime, datetime2ts

#from personal_influence import get_user_influence, influenced_detail, influenced_people, influenced_user_detail, statistics_influence_people, tag_vector, comment_on_influence, detail_weibo_influence, influence_summary



# use to test 13-09-08
# Fixed timestamp used when the test run type simulates "now".
test_time = datetime2ts(RUN_TEST_TIME)

# custom_attribute
# ES index/type storing user-defined custom attributes.
attribute_index_name = 'custom_attribute'
attribute_index_type = 'attribute'

# Flask blueprint for the personal social-info endpoints.
mod = Blueprint('info_person_social', __name__, url_prefix='/info_person_social')






#use to get user be_retweet from es:be_retweet_1 or be_retweet_2
#write in version:15-12-08
#input: uid, top_count
Exemple #46
0
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Find the users influenced (retweeters or commenters) by one weibo.

    Parameters:
        uid             -- author uid of the weibo being analysed
        mid             -- the weibo's message id; if it is itself a retweet,
                           its root_mid is used to collect the audience
        influence_style -- 0: collect retweeters, anything else: commenters
        date            -- 'YYYY-mm-dd' date string selecting the daily indices
        default_number  -- max number of user-url entries returned per group

    Returns a dict with:
        "influence_users"        -- [in-portrait urls, out-of-portrait urls]
        "influence_distribution" -- aggregated domain/topic/geo stats plus the
                                    average BCI influence of the audience
    """
    # NOTE(review): the original computed date1/index_name (pre_index + date)
    # here but never used them; the dead locals were removed.
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    # A non-empty root_mid means this weibo is itself a retweet, not an original.
    temp_mid = text_result.get("root_mid", '')
    if temp_mid:
        mid_type = 1  # retweeted (non-original) weibo
    else:
        mid_type = 0  # original weibo

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                        ]
                    }
                }
            }
        },
        "size": 30000
    }

    # message_type 3 = retweet, 2 = comment. For an original weibo the audience
    # is anchored on root_uid/root_mid of this mid; for a retweet it is anchored
    # on directed_uid and the original root_mid (temp_mid).
    if mid_type == 0:
        if int(influence_style) == 0:  # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}])
        else:  # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0:  # retweet of a retweet: people who retweeted it
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": temp_mid}}])
        else:  # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": temp_mid}}])

    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, fields=["uid"], timeout=30)["hits"]["hits"]

    # Collect the distinct audience uids, excluding the author himself.
    results = []  # uid_list
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) == int(uid):
                pass
            else:
                results.append(item["fields"]["uid"][0])
        results = list(set(results))
    else:
        results = []

    bci_index = "bci_" + date.replace('-', '')

    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results}, fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        bci_results = es_user_portrait.mget(index=bci_index, doc_type='bci', body={"ids": results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}

    in_portrait = []
    out_portrait = []
    in_portrait_info = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0

    # Average BCI influence over the whole audience (found docs only).
    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = total_influence / len(results)
    except ZeroDivisionError:  # no audience at all
        average_influence = 0

    # Split the audience into in-portrait (profiled) and out-of-portrait users,
    # aggregating the profiled users' domain/topic/geo distributions.
    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                # activity_geo_dict is a JSON list of period dicts; take the
                # latest period's location keys.
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        #try:
        #    average_influence = total_influence/count
        #except:
        #    average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x: x[1], reverse=True)

    retweeted_results = dict()
    # NOTE(review): "domian" is a typo for "domain", but it is part of the
    # returned payload consumed by callers/front-end — kept for compatibility.
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    # Rank profiled users by importance, descending.
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)

    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results

    return return_results