Example 1
def get_user_profile_weibo(user_list):
    user_info_dict = {}
    try:
        user_profile_dict = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, \
                body={'ids': user_list})['docs']
    except:
        user_profile_dict = []
    if user_profile_dict:
        for user_dict in user_profile_dict:
            source_dict = {}  # collect this user's profile fields in a fresh dict
            if user_dict['found'] == True:
                source = user_dict['_source']
                source_dict['uid'] = source['uid']
                source_dict['uname'] = source['nick_name']
                source_dict['location'] = source['location']
                source_dict['photo_url'] = source['photo_url']
                source_dict['fansnum'] = source['fansnum']
                source_dict['friendsnum'] = source['friendsnum']
                source_dict['statusnum'] = source['statusnum']
                source_dict['description'] = source['description']
            else:
                source_dict['uid'] = user_dict['_id']  # profile not found, fall back to the document id
                source_dict['uname'] = 'unknown'
                source_dict['location'] = 'unknown'
                source_dict['photo_url'] = ''
                source_dict['fansnum'] = 0
                source_dict['friendsnum'] = 0
                source_dict['statusnum'] = 0
                source_dict['description'] = ''

            user_info_dict[user_dict['_id']] = source_dict
    return user_info_dict
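
Every helper in this listing unpacks the same Elasticsearch mget response shape: a 'docs' list whose entries always carry '_id' and 'found', and carry '_source' only when the document exists. A minimal sketch of what get_user_profile_weibo expects back; the ids and field values below are invented placeholders, not data from the real profile index.

# Hypothetical mget response; the ids and field values are placeholders.
docs = [
    {'_id': '1111111111', 'found': True,
     '_source': {'uid': '1111111111', 'nick_name': 'demo_user', 'location': 'Beijing',
                 'photo_url': '', 'fansnum': 10, 'friendsnum': 5,
                 'statusnum': 100, 'description': ''}},
    {'_id': '2222222222', 'found': False},  # missing ids still appear, just with found=False
]

Missing ids are not dropped from 'docs', which is why every loop in these examples checks 'found' before touching '_source'.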
Example 2
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}]
    }

    result = es.search(index=index_name, doc_type=doctype, body=query_body)['hits']['hits']
    uid_list = []
    for item in result:
        uid_list.append(item['_id'])

    portrait_result = es_portrait.mget(index="user_portrait", doc_type="user", body={"ids":uid_list}, _source=True)['docs']
    profile_result = es_profile.mget(index="weibo_user",doc_type="user", body={"ids":uid_list}, _source=True)['docs']

    return_list = []
    rank = 1
    for i in range(len(result)):
        info = ['','','','','']
        info[0] = rank
        if profile_result[i]['found']:
            info[1] = profile_result[i]['_source'].get('photo_url','')
            info[3] = profile_result[i]['_source'].get('nick_name','')
        info[2] = result[i].get('_id','')
        info[4] = result[i]['_source']['vary']
        if portrait_result[i]['found']:
            info.append('1')
        else:
            info.append('0')
        return_list.append(info)
        rank += 1

    return return_list
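
Each row that query_vary_top_k returns is laid out as [rank, photo_url, uid, nick_name, vary, portrait_flag]. A hedged usage sketch; the index and doc-type names are placeholders, and it assumes the module-level es / es_portrait / es_profile clients used above.

# Assumed call site; 'user_activity' / 'activity' are invented index / type names.
rows = query_vary_top_k('user_activity', 'activity', top_k=10)
for rank, photo_url, uid, nick_name, vary, in_portrait in rows:
    print uid, vary, in_portrait  # in_portrait is '1' when the uid exists in user_portrait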
Example 3
def search_portrait_user(es,
                         number,
                         active_index,
                         active_type,
                         portrait_index,
                         portrait_type,
                         field="user_index"):

    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
        sys.exit(0)

    count_s = 0
    count_c = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:  # stop paging once the active index is exhausted
            return return_list
        start += 100
        for item in user_list:
            if field == "vary":
                uid = item.get('uid',
                               '0')  # obtain uid, notice "uid" or "user"
            else:
                uid = item.get('user', '0')
            search_list.append(uid)  # uid list

        search_result = es_portrait.mget(index=portrait_index,
                                         doc_type=portrait_type,
                                         body={"ids": search_list},
                                         _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user",
                                         doc_type="user",
                                         body={"ids": search_list},
                                         _source=True)["docs"]

        for item in search_result:
            if item["found"]:
                info = ['', '', '', '', '', '']
                info[0] = rank
                index = search_result.index(item)

                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get(
                        'photo_url', '')
                    info[3] = profile_result[index]['_source'].get(
                        'nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index][field]
                info[5] = "1"
                return_list.append(info)
                rank += 1
                count_c += 1

                if count_c >= int(number):
                    return return_list
Example 4
def search_portrait_user_in_activity(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):

    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
        sys.exit(0)

    count_s = 0
    count_c = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:  # stop paging once the active index is exhausted
            return return_list
        start += 100
        for item in user_list:
            if field == "vary":
                uid = item.get('uid', '0') # obtain uid, notice "uid" or "user"
            else:
                uid = item.get('user', '0')
            search_list.append(uid) # uid list

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type, body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user", body={"ids": search_list}, _source=True)["docs"]

        key_list = ["origin_weibo_retweeted_total_number", "origin_weibo_retweeted_average_number", "origin_weibo_retweeted_top_number", "origin_weibo_retweeted_brust_average", \
                   "origin_weibo_comment_total_number", "origin_weibo_comment_average_number", "origin_weibo_comment_top_number", "origin_weibo_retweeted_brust_average", \
                   "retweeted_weibo_retweeted_total_number", "retweeted_weibo_retweeted_average_number", "retweeted_weibo_retweeted_top_number", "retweeted_weibo_retweeted_brust_average", \
                   "retweeted_weibo_comment_total_number", "retweeted_weibo_comment_average_number", "retweeted_weibo_comment_top_number", "retweeted_weibo_retweeted_brust_average"]
        for item in search_result:
            if item["found"]:
                info = ['','','','','','']
                info[0] = rank
                index = search_result.index(item)

                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url','')
                    info[3] = profile_result[index]['_source'].get('nick_name','')
                info[2] = search_result[index].get('_id','')
                info[4] = user_list[index]['user_index']
                info[5] = "1"
                if field == 'origin_weibo_retweeted_brust_average':
                    info.append(user_list[index]['origin_weibo_retweeted_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                elif field == 'origin_weibo_comment_brust_average':
                    info.append(user_list[index]['origin_weibo_comment_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                else:
                    pass
                return_list.append(info)
                rank += 1
                count_c += 1

                if count_c >= int(number):
                    return return_list
Example 5
def search_user_info(es,index_name,doc_type,uid,result_name):
    try:
        retweet_result = es.get(index=index_name, doc_type=doc_type, id=uid)['_source']
    except:
        return None
    if retweet_result:
        retweet_dict = json.loads(retweet_result[result_name])
        sorted_list = sorted(retweet_dict.iteritems(),key=lambda x:x[1],reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':uid_list})['docs']
        except:
            user_result = []
        try:
            bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids':uid_list}, fields=fields)['docs']    
        except:
            bci_history_result = []
        #print bci_history_result
        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                photo_url = source['photo_url']
                if uname == '':
                    uname = u'未知'
                #location = source['user_location']
                friendsnum = source['friendsnum']
            else:
                uname = u'未知'
                location = ''
                friendsnum = ''
                photo_url = 'unknown'
            #add index from bci_history
            try:
                bci_history_item = bci_history_result[iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found']==True:
                fansnum = bci_history_item['fields'][fields[0]][0]
                user_weibo_count = bci_history_item['fields'][fields[1]][0]
                user_friendsnum = bci_history_item['fields'][fields[2]][0]
                influence = bci_history_item['fields'][fields[3]][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
                influence = ''
            #retweet_count = int(retweet_dict[uid])
            count = retweet_dict[uid]
            out_portrait_list.append({'uid':uid,'photo_url':photo_url,'count':count,'uname':uname,'influence':influence,'fansnum':fansnum, 'friendsnum':user_friendsnum,'weibo_count':user_weibo_count})#location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
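
search_user_info mixes two mget flavours: the profile lookup returns full documents under '_source', while the bci_history lookup is restricted with fields=fields, so each requested field comes back as a one-element list under 'fields'. A sketch of that second shape, with invented field names and values (the real names live in the module-level fields list), shows why the code indexes ['fields'][fields[0]][0]:

# Hypothetical fields-restricted mget doc; names and values are illustrative only.
bci_history_doc = {
    '_id': '1111111111',
    'found': True,
    'fields': {
        'user_fansnum': [1200],     # every requested field is wrapped in a list
        'weibo_month_sum': [87],
        'user_friendsnum': [300],
        'bci_week_ave': [56.4],
    },
}
fansnum = bci_history_doc['fields']['user_fansnum'][0]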
Example 6
def search_top_index(index_name, top_k=1, index_type="bci", top=False, sort_order="user_index"):
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{sort_order: {"order": "desc"}}]
    }

    if top:
        result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits'][0]['_source'][sort_order]
    else:
        search_result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']

        uid_list = []
        for item in search_result:
            uid_list.append(item['_id'])
        profile_result = es_profile.mget(index=profile_index_name,doc_type=profile_index_type, body={"ids":uid_list}, _source=True)['docs']
        portrait_result = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":uid_list}, _source=True)['docs']

        result = []
        rank = 1
        for i in range(len(search_result)):
            info = ['','','','']
            info[0] = rank
            if profile_result[i]['found']:
                info[1] = profile_result[i]['_source'].get('photo_url','')
                info[3] = profile_result[i]['_source'].get('nick_name','')

            info[2] = search_result[i].get('_id','')
            if sort_order in ["user_index","origin_weibo_retweeted_brust_average","origin_weibo_comment_brust_average"]:
                info.append(search_result[i]['_source'][sort_order])
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_retweeted_top_number":
                info.append(search_result[i]['_source']['origin_weibo_retweeted_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_retweeted_id']
                info.append(weiboinfo2url(info[2],mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_comment_top_number":
                info.append(search_result[i]['_source']['origin_weibo_comment_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_comment_id']
                info.append(weiboinfo2url(info[2],mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")

            rank += 1
            result.append(info)

    return result
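
search_top_index behaves differently depending on top: with top=True it returns only the maximum value of sort_order, otherwise it returns ranked rows whose tail depends on sort_order (the raw value plus a portrait flag, or the value, a weibo URL built with weiboinfo2url, and the flag). A hedged usage sketch; the index name is a placeholder and the helper globals are assumed from the surrounding module.

# Assumed call sites; 'bci_20130907' is an invented index name.
max_bci = search_top_index('bci_20130907', top_k=1, top=True)  # a single number
rows = search_top_index('bci_20130907', top_k=20, sort_order='origin_weibo_retweeted_top_number')
for row in rows:
    rank, photo_url, uid, nick_name = row[:4]
    top_retweeted, weibo_url, in_portrait = row[4:7]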
Example 7
def search_tag(es, number, active_index, active_type, portrait_index,
               portrait_type, tag):

    #field_dict = {"domain":"art"}
    return_list = []
    count_s = 0
    count_c = 0
    start = 0
    rank = 1

    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start,
                             "user_index", 10000)
        if not user_list:  # stop paging once the active index is exhausted
            return return_list
        start += 10000
        for item in user_list:
            uid = item.get('user', '0')
            search_list.append(uid)  # uid list

        search_result = es_portrait.mget(index=portrait_index,
                                         doc_type=portrait_type,
                                         body={"ids": search_list},
                                         _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user",
                                         doc_type="user",
                                         body={"ids": search_list},
                                         _source=True)["docs"]
        for item in search_result:
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = ['', '', '', '', '', '', '']
                info[0] = rank
                index = search_result.index(item)

                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get(
                        'photo_url', '')
                    info[3] = profile_result[index]['_source'].get(
                        'nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = search_result[index]['_source'].get('activeness', '')
                info[6] = search_result[index]['_source'].get('importance', '')

                rank += 1
                return_list.append(info)

                if rank >= int(number) + 1:
                    return return_list

        if count_s > 100000:
            return return_list
Example 8
def search_max_single_field(field, index_name, doctype, top_k=3):

    # field = "origin_weibo_retweeted_top_number", "origin_weibo_comment_top_number"
    query_body = {
        "query": {
            "match_all": {}
        },
        "sort": [{field: {"order": "desc"}}],
        "size": top_k
    }

    
    return_list = []
    rank = 1
    count_c = 0
    start = 0

    while 1:
        search_list = []
        user_list = search_k(es, index_name, doctype, start, field, 100)
        if not user_list:  # stop paging once the index is exhausted
            return return_list
        start += 100
        for item in user_list:
            uid = item.get('user','0')
            search_list.append(uid) # uid list

        search_result = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": search_list}, _source=True)["docs"]

        for i in range(len(search_result)):
            if search_result[i]['found']:
                info = ['','','','','','','1']
                info[0] = rank
                info[2] = search_result[i].get('_id','')

                if profile_result[i]['found']:
                    info[1] = profile_result[i]['_source'].get('photo_url','')
                    info[3] = profile_result[i]['_source'].get('nick_name','')

                if 'retweeted' in field:
                    temp_mid = user_list[i]['origin_weibo_top_retweeted_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_retweeted_top_number']
                else:
                    temp_mid = user_list[i]['origin_weibo_top_comment_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_comment_top_number']

                rank += 1
                return_list.append(info)

                if rank >= int(top_k)+1:
                    return return_list
Example 9
def search_yangshi_attention(uid, top_count):

    results = {}
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = retweet_index_name_pre + str(db_number)
    center_uid = uid
    # # print es_retweet,index_name,retweet_index_type,uid
    try:
        retweet_result = es_retweet.get(index=index_name,
                                        doc_type=retweet_index_type,
                                        id=uid)['_source']
    except:
        return None
    if retweet_result:
        retweet_dict = json.loads(retweet_result['uid_retweet'])
        sorted_list = sorted(retweet_dict.iteritems(),
                             key=lambda x: x[1],
                             reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
        except:
            user_result = []

        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                if uname == '':
                    uname = u'未知'

            else:
                uname = u'未知'

            count = retweet_dict[uid]
            out_portrait_list.append({
                'uid': uid,
                'count': count,
                'uname': uname,
            })  #location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
Example 10
def search_user_profile_by_user_ids(users):
    users = list(users)
    user_profile_return = dict()
    try:
        user_result = es_user_profile.mget(index=profile_index_name,
                                           doc_type=profile_index_type,
                                           body={'ids': users})['docs']
    except:
        user_result = []

    for out_user_item in user_result:
        if out_user_item['found']:
            uid = out_user_item['_id']
            user_profile_return[uid] = out_user_item['_source']
    return user_profile_return
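
Unlike the list-of-rows helpers above, search_user_profile_by_user_ids returns a dict keyed by uid and silently drops ids that are missing from the profile index. A minimal usage sketch with placeholder uids:

# Invented uids, purely illustrative.
profiles = search_user_profile_by_user_ids(['1111111111', '2222222222'])
for uid, profile in profiles.iteritems():  # only uids that were found are present
    print uid, profile.get('nick_name', '')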
Example 11
def search_user_profile_by_user_ids(users):
    users = list(users)
    user_profile_return = dict()
    try:
        user_result = es_user_profile.mget(index=profile_index_name,
                                           doc_type=profile_index_type,
                                           body={'ids': users})['docs']
    except:
        user_result = []

    for out_user_item in user_result:
        if out_user_item['found']:
            uid = out_user_item['_id']
            user_profile_return[uid] = out_user_item['_source']
    return user_profile_return
Example 12
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{
            sort_index: {
                "order": "desc"
            }
        }]
    }

    result = es.search(index=index_name, doc_type=doctype,
                       body=query_body)['hits']['hits']
    uid_list = []
    for item in result:
        uid_list.append(item['_id'])

    portrait_result = es_portrait.mget(index="user_portrait",
                                       doc_type="user",
                                       body={"ids": uid_list},
                                       _source=True)['docs']
    profile_result = es_profile.mget(index="weibo_user",
                                     doc_type="user",
                                     body={"ids": uid_list},
                                     _source=True)['docs']

    return_list = []
    rank = 1
    for i in range(len(result)):
        info = ['', '', '', '', '']
        info[0] = rank
        if profile_result[i]['found']:
            info[1] = profile_result[i]['_source'].get('photo_url', '')
            info[3] = profile_result[i]['_source'].get('nick_name', '')
        info[2] = result[i].get('_id', '')
        info[4] = result[i]['_source']['vary']
        if portrait_result[i]['found']:
            info.append('1')
        else:
            info.append('0')
        return_list.append(info)
        rank += 1

    return return_list
Example 13
def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"):

    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
        sys.exit(0)

    count_s = 0
    count_c = 0
    start = 0
    rank = 1
    try:
        while 1:
            search_list = []
            user_list = search_k(es, active_index, active_type, start, field, 100)
            if not user_list:  # stop paging once the active index is exhausted
                return return_list
            start += 100
            for item in user_list:
                uid = item.get('uid', '0') # obtain uid, notice "uid" or "user"
                search_list.append(uid) # uid list
            search_result = es_portrait.mget(index="user_portrait", doc_type="user", body={"ids": search_list}, _source=True)["docs"]
            profile_result = es_profile.mget(index="weibo_user", doc_type="user", body={"ids": search_list}, _source=True)["docs"]

            for item in search_result:
                count_c += 1
                if item["found"]:
                    info = ['','','','','','1']
                    info[0] = rank
                    index = search_result.index(item)

                    if profile_result[index]['found']:
                        info[1] = profile_result[index]['_source'].get('photo_url','')
                        info[3] = profile_result[index]['_source'].get('nick_name','')
                    info[2] = search_result[index].get('_id','')
                    info[4] = user_list[index]['vary']
                    return_list.append(info)
                    rank += 1
                    if rank == int(number)+1:
                        return return_list

            if count_c > 10000:
                break
    except RequestError:
        print "timeout"

    return return_list
Example 14
def get_user_url(uid_list):
    results = []
    try:
        es_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list})['docs']
    except:
        es_results = {}
    for item in es_results:
        temp = []
        if item['found']:
            temp.append(item['_source']["photo_url"])
            temp.append(item['_source']['nick_name'])
            temp.append(item['_id'])
        else:
            temp.append("unknown")
            temp.append("unknown")
            temp.append(item['_id'])
        results.append(temp)
    return results
Example 15
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):

    #field_dict = {"domain":"art"}
    return_list = []
    count_s = 0
    count_c = 0
    start = 0
    rank = 1

    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, "user_index", 10000)
        start += 10000
        for item in user_list:
            uid = item.get('user', '0')
            search_list.append(uid) # uid list

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type, body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user", body={"ids": search_list}, _source=True)["docs"]
        for item in search_result:
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = ['','','','','','','']
                info[0] = rank
                index = search_result.index(item)

                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url','')
                    info[3] = profile_result[index]['_source'].get('nick_name','')
                info[2] = search_result[index].get('_id','')
                info[4] = user_list[index]['user_index']
                info[5] = search_result[index]['_source'].get('activeness','')
                info[6] = search_result[index]['_source'].get('importance','')

                rank += 1
                return_list.append(info)

                if rank >= int(number)+1:
                    return return_list

        if count_s > 100000:
            return return_list
Example 16
def search_yangshi_follower(uid, top_count):
    results = {}
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = be_retweet_index_name_pre + str(db_number)
    center_uid = uid
    try:
        retweet_result = es_retweet.get(index=index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        return None
    # print retweet_result
    if retweet_result:
        retweet_dict = json.loads(retweet_result['uid_be_retweet'])
        sorted_list = sorted(retweet_dict.iteritems(),key=lambda x:x[1],reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':uid_list})['docs']
        except:
            user_result = []

        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                if uname == '':
                    uname = u'未知'

            else:
                uname = u'未知'


            #retweet_count = int(retweet_dict[uid])
            count = retweet_dict[uid]
            out_portrait_list.append({'uid':uid,'count':count,'uname':uname})#location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
Example 17
def get_sensitive_text_detail(task_name, ts, user, order):
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id,
                         id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "mid": mid_list
                    }
                }
            }
        }
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()  # text content, keyed by mid
    portrait_dict = dict()  # profile (background) info, keyed by uid
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {
                        "nick_name": item["fields"]["nick_name"][0],
                        "photo_url": item["fields"]["photo_url"][0]
                    }
                else:
                    portrait_dict[item['_id']] = {
                        "nick_name": item['_id'],
                        "photo_url": ""
                    }

        if order == "total":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[1],
                                 reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[2],
                                 reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[3],
                                 reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        for item in sorted_list:
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                count_n += 1
                results.append(temp)

        if results and order == "ts":
            results = sorted(results, key=lambda x: x[5], reverse=True)

        if results and order == "sensitive":
            results = sorted(results, key=lambda x: x[-1], reverse=True)

    return results
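
The query_body above wraps a terms filter in a "filtered" query, which is Elasticsearch 1.x/2.x syntax (the "filtered" query was removed in 5.0). If this helper were ever pointed at a newer cluster, the equivalent filter would look roughly like the sketch below; this is an assumption about a hypothetical port, not something the surrounding code requires.

# Rough ES >= 5 equivalent of the "filtered" terms query used above (assumption, untested here).
mid_list = ['3901234567890', '3909876543210']  # placeholder mids
query_body_es5 = {
    "query": {
        "bool": {
            "filter": {
                "terms": {"mid": mid_list}
            }
        }
    }
}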
Example 18
def get_positive_weibo_detail(ts,
                              social_sensors,
                              keywords_list,
                              size,
                              sentiment_type=1):
    former_mid_list = query_mid_list(ts - time_interval, keywords_list,
                                     time_segment,
                                     social_sensors)  # weibo mid list from the previous time segment
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,
                                      social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }
                            },
                        ],
                        "should": [{
                            "terms": {
                                "root_mid": mid_list
                            }
                        }, {
                            "terms": {
                                "mid": mid_list
                            }
                        }, {
                            "terms": {
                                "keywords_string": keywords_list
                            }
                        }]
                    }
                }
            }
        },
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "size": 100
    }

    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"term": {
                "sentiment": sentiment_type
            }})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{
            "terms": {
                "sentiment": ["2", "3"]
            }
        }]

    # check whether ts and ts - time_interval fall on the same day, to decide which index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate the original weibo mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
Example 19
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=_id,
                                       id=ts)['_source']
    print '37', index_sensing_task, _id
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    print len(mid_list)
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "mid": mid_list
                    }
                }
            }
        },
        "size": 1000,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        }
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    print es_text
    exist_es = es_text.indices.exists(index_name)
    print exist_es
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()  # text content, keyed by mid
    portrait_dict = dict()  # profile (background) info, keyed by uid
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {
                        "nick_name": item["fields"]["nick_name"][0],
                        "photo_url": item["fields"]["photo_url"][0]
                    }
                else:
                    portrait_dict[item['_id']] = {
                        "nick_name": item['_id'],
                        "photo_url": ""
                    }

        if order == "total":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[1],
                                 reverse=True)[:10]
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[2],
                                 reverse=True)[:10]
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[3],
                                 reverse=True)[:10]
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list:  # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                # jln: extract keywords from the text
                f_key = get_weibo_single(iter_text['text'])
                temp.append(
                    sorted(f_key.iteritems(), key=lambda x: x[1],
                           reverse=True))

                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)

                results.append(temp)
            count_n += 1

        results = sorted(results,
                         key=operator.itemgetter(-4, -2, -6),
                         reverse=True)  # -4 -2 -3
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values()  # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid_index_dict.get(mid, 0):
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)

    return sort_results
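
The de-duplication pass at the end of get_origin_weibo_detail leans on two structures: duplicate_dict maps each duplicate mid to a representative mid, and tmp_duplicate_dict inverts that into representative -> [members]; rows whose mids share a representative are then merged into the row at the smallest position. A tiny worked example of the inversion, with invented mids:

# Invented mids, only to illustrate the inversion done at the top of the function.
duplicate_dict = {'m2': 'm1', 'm3': 'm1', 'm5': 'm4'}
tmp_duplicate_dict = dict()
for k, v in duplicate_dict.iteritems():
    try:
        tmp_duplicate_dict[v].append(k)
    except KeyError:
        tmp_duplicate_dict[v] = [k, v]
# tmp_duplicate_dict now holds {'m1': ['m2', 'm1', 'm3'], 'm4': ['m5', 'm4']}
# (member order depends on dict iteration order)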
Example 20
def get_task_detail_2(task_name, ts, user):
    results = dict()
    index_name = task_name
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])
    start_time = task_detail['create_at']
    create_by = task_detail['create_by']
    stop_time = task_detail['stop_time']
    remark = task_detail.get('remark', '')
    portrait_detail = []
    count = 0 # counter

    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")

    if social_sensors:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":social_sensors}, fields=SOCIAL_SENSOR_INFO)['docs']
        for item in search_results:
            temp = []
            if item['found']:
                for iter_item in SOCIAL_SENSOR_INFO:
                    if iter_item == "topic_string":
                        temp.append(item["fields"][iter_item][0].split('&'))
                    elif iter_item == "activeness":
                        temp.append(math.log(item['fields']['activeness'][0]/float(top_activeness)*9+1, 10)*100)
                    elif iter_item == "importance":
                        temp.append(math.log(item['fields']['importance'][0]/float(top_importance)*9+1, 10)*100)
                    elif iter_item == "influence":
                        temp.append(math.log(item['fields']['influence'][0]/float(top_influence)*9+1, 10)*100)
                    else:
                        temp.append(item["fields"][iter_item][0])
                portrait_detail.append(temp)
        portrait_detail = sorted(portrait_detail, key=lambda x:x[5], reverse=True)

    time_series = [] # timestamps
    #positive_sentiment_list = [] # sentiment lists
    #neutral_sentiment_list = []
    #negetive_sentiment_list = []
    all_weibo_list = []
    origin_weibo_list = [] # weibo lists
    retweeted_weibo_list = []
    #retweeted_weibo_count = [] # number of times the user was retweeted by others
    #comment_weibo_count = []
    #total_number_count = []
    #burst_time_list = [] # burst time list
    important_user_set = set() # important users
    out_portrait_users = set() # users not yet in the portrait index

    ts = int(ts)
    time_series = history_status
    #for item in history_status:
    #    if int(item[0]) <= ts:
    #        time_series.append(item[0]) # all timestamps up to now

    # get detail task information from es
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']
            #sentiment_distribution = json.loads(item["sentiment_distribution"])
            #positive_sentiment_list.append(int(sentiment_distribution['1']))
            #negetive_sentiment_list.append(int(sentiment_distribution['2'])+int(sentiment_distribution['3']) \
            #        +int(sentiment_distribution['4'])+int(sentiment_distribution['5'])+int(sentiment_distribution['6']))
            #neutral_sentiment_list.append(int(sentiment_distribution['0']))
            origin_weibo_list.append(item["origin_weibo_number"]) # real
            retweeted_weibo_list.append(item['retweeted_weibo_number']) # real
            all_weibo_list.append(item["origin_weibo_number"]+item['retweeted_weibo_number'])
            #retweeted_weibo_count.append(item['retweeted_weibo_count'])
            #comment_weibo_count.append(item['comment_weibo_count'])
            #total_number_count.append(item['weibo_total_number'])
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list) # not yet in the portrait index
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)

            #burst_reason = item.get("burst_reason", "")
            #if burst_reason:
            #    burst_time_list.append([timestamp, count, burst_reason])
            count += 1

    ####################################################################################
    # tally burst reasons and draw the corresponding conclusions
    """
    weibo_variation_count = 0
    weibo_variation_time = []
    sentiment_variation_count = 0
    sentiment_variation_time = []
    sensitive_variation_count = 0 # sensitive
    sensitive_variation_time = [] # sensitive
    common_variation_count = 0
    common_variation_time = []
    if burst_time_list:
        for item in burst_time_list:
            tmp_common = 0
            x1 = 0
            x2 = 0
            x3 = 0
            if signal_count_varition in item[2]:
                weibo_variation_count += 1
                weibo_variation_time.append([ts2date_min(item[0]), total_number_count[item[1]]])
                x1 = total_number_count[item[1]]
                tmp_common += 1
            if signal_sentiment_varition in item[2]:
                tmp_common += 1
                sentiment_variation_count += 1
                x2 = negetive_sentiment_list[item[1]]
                sentiment_variation_time.append([ts2date_min(item[0]), negetive_sentiment_list[item[1]]])
            if signal_sensitive_variation in item[2]:
                tmp_common += 1
                sensitive_variation_count += 1
                x3 = sensitive_total_number_list[item[1]]
                sensitive_variation_time.append([ts2date_min(item[0]), all_weibo_list[item[1]]])
            if tmp_common >= 2:
                common_variation_count += 1
                common_variation_time.append([ts2date_min(item[0]), x1, x2, x3])

    warning_conclusion = remark
    variation_distribution = []
    if weibo_variation_count:
        variation_distribution.append(weibo_variation_time)
    else:
        variation_distribution.append([])

    if sentiment_variation_count:
        variation_distribution.append(sentiment_variation_time)
    else:
        variation_distribution.append([])

    if sensitive_variation_count:
        variation_distribution.append(sensitive_variation_time)
    else:
        variation_distribution.append([])

    if common_variation_count:
        variation_distribution.append(common_variation_time)
    else:
        variation_distribution.append([])

    results['warning_conclusion'] = warning_conclusion
    results['variation_distribution'] = variation_distribution

    # heat (activity level) of each user
    """

    # fetch profile information for the important users
    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    social_sensor_set = set(social_sensors)
    user_detail_info = [] #
    out_user_detail_info = []
    if important_uid_list:
        user_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":important_uid_list}, fields=['uid', 'uname', 'domain', 'topic_string', "photo_url", 'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if item['found']:
                temp = []
                #if int(item['fields']['importance'][0]) < IMPORTANT_USER_THRESHOULD:
                #    continue
                temp.append(item['fields']['uid'][0])
                uname = item['fields']['uname'][0]
                if not uname or uname == "未知":
                    uname = item['fields']['uid'][0]
                temp.append(uname)
                temp.append(item['fields']['photo_url'][0])
                temp.append(item['fields']['domain'][0])
                temp.append(item['fields']['topic_string'][0].split('&'))
                #hot_count = count_hot_uid(item['fields']['uid'][0], start_time, stop_time)
                #temp.append(hot_count)
                temp.append(math.log(item['fields']['importance'][0]/float(top_importance)*9+1, 10)*100)
                temp.append(math.log(item['fields']['influence'][0]/float(top_influence)*9+1, 10)*100)
                temp.append(math.log(item['fields']['activeness'][0]/float(top_activeness)*9+1, 10)*100)
                if item['fields']['uid'][0] in social_sensor_set:
                    temp.append(1)
                else:
                    temp.append(0)
                user_detail_info.append(temp)
    # sort (by influence, descending)
    if user_detail_info:
        user_detail_info = sorted(user_detail_info, key=lambda x:x[6], reverse=True)
    else:
        user_detail_info = []

    if out_portrait_users_list:
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts-DAY).replace('-','')
        influence_results = es.mget(index=bci_index, doc_type="bci", body={"ids":out_portrait_users_list}, fields=["user_index"])['docs']
        bci_results = es_profile.mget(index="bci_history", doc_type="bci", body={"ids":out_portrait_users_list}, fields=['user_fansnum'])['docs']
        top_influence = get_top_all_influence("user_index", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                    #temp.append(item['_source']['fansnum'])
                else:
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend([''])
                try:
                    user_fansnum = bci_results[count]["fields"]["user_fansnum"][0]
                except:
                    user_fansnum = 0
                temp.append(user_fansnum)
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['fields']['user_index'][0]
                    temp.append(math.log(user_index/float(top_influence)*9+1, 10)*100)
                else:
                    temp.append(0)
                count += 1
                out_user_detail_info.append(temp)
    print len(out_user_detail_info)
    if len(out_user_detail_info):
        print "sort"
        out_user_detail_info = sorted(out_user_detail_info, key=lambda x:x[4], reverse=True)


    revise_time_series = []
    for item in time_series:
        revise_time_series.append(ts2date_min(item))

    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    #results['burst_time'] = burst_time_list # burst timestamps and burst reasons
    results['time_series'] = revise_time_series
    #results['positive_sentiment_list'] = positive_sentiment_list
    #esults['negetive_sentiment_list'] = negetive_sentiment_list
    #results['neutral_sentiment_list'] = neutral_sentiment_list
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    #results['comment_weibo_count'] = comment_weibo_count
    #results['retweeted_weibo_count'] = retweeted_weibo_count
    #results['total_number_list'] = total_number_count
    results['social_sensors_detail'] = portrait_detail

    return results
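
The repeated expression math.log(x / float(top) * 9 + 1, 10) * 100 in get_task_detail_2 rescales a raw score x against the current maximum top onto a 0-100 logarithmic scale: x == 0 maps to 0 and x == top maps to 100 because log10(10) == 1. A quick check of the normalization, with made-up numbers:

import math

def log_scale(x, top):
    # same normalization used above for importance / influence / activeness
    return math.log(x / float(top) * 9 + 1, 10) * 100

print log_scale(0, 500)    # 0.0
print log_scale(500, 500)  # 100.0
print log_scale(250, 500)  # ~74.0 -- half of the maximum already lands high on the scale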
Example 21
def get_temporal_rank(task_type, sort="retweeted", number=100):
    number = int(number) - 1
    if int(task_type) == 0:  # cumulative, up to now
        sort_list = r.zrange("influence_%s" % sort,
                             0,
                             number,
                             withscores=True,
                             desc=True)
    elif int(task_type) == 1:
        sort_list = r.zrange("influence_%s_1" % sort,
                             0,
                             number,
                             withscores=True,
                             desc=True)
    elif int(task_type) == 2:
        sort_list = r.zrange("influence_%s_2" % sort,
                             0,
                             number,
                             withscores=True,
                             desc=True)
    elif int(task_type) == 3:
        sort_list = r.zrange("influence_%s_3" % sort,
                             0,
                             number,
                             withscores=True,
                             desc=True)
    else:
        sort_list = r.zrange("influence_%s_4" % sort,
                             0,
                             number,
                             withscores=True,
                             desc=True)

    uid_list = []
    for item in sort_list:
        uid_list.append(item[0])

    if sort == "retweeted":
        other = "comment"
    else:
        other = "retweeted"

    results = []
    # look up profile (background) info
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name,
                                              doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        bci_result = es_user_profile.mget(
            index="bci_history",
            doc_type="bci",
            body={"ids": uid_list},
            _source=False,
            fields=['user_fansnum', "weibo_month_sum"])["docs"]
        count = 0
        for item in profile_result:
            _id = item['_id']
            index = profile_result.index(item)
            tmp = []
            tmp.append(item['_id'])
            if item['found']:
                item = item['_source']
                tmp.append(item['nick_name'])
                tmp.append(item['statusnum'])
                tmp.append(item['user_location'])
                tmp.append(item['fansnum'])
            else:
                tmp.extend(['', 0, '', 0])
            try:
                user_fansnum = bci_result[count]['fields']['user_fansnum'][0]
                tmp[4] = user_fansnum
            except:
                pass
            try:
                weibo_number = bci_result[count]['fields']["weibo_month_sum"][
                    0]
                tmp[2] = weibo_number
            except:
                pass
            count_1 = int(sort_list[index][1])
            if int(task_type) == 0:
                tmp_count = r.zscore("influence_%s" % other, _id)
                if tmp_count:
                    count_2 = int(tmp_count)
                else:
                    count_2 = 0
            else:
                tmp_count = r.zscore("influence_%s_%s" % (other, task_type),
                                     _id)
                if tmp_count:
                    count_2 = int(tmp_count)
                else:
                    count_2 = 0
            if sort == "retweeted":
                tmp.append(count_1)
                tmp.append(count_2)
            else:
                tmp.append(count_2)
                tmp.append(count_1)
            results.append(tmp)
            count += 1

    if uid_list:
        count = 0
        portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                doc_type=portrait_index_type,
                                                body={"ids": uid_list})["docs"]
        for item in portrait_result:
            if item['found']:
                results[count].append("1")
            else:
                results[count].append("0")
            count += 1

    return results
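
A hedged usage sketch for get_temporal_rank, assuming r is a redis-py client (3.x mapping syntax for zadd) and the sorted sets are keyed exactly as they are read above; the uids and counts are made up:

import redis

r = redis.StrictRedis(host='localhost', port=6379)
# cumulative retweet-influence ranking: member is the uid, score is the retweet count
r.zadd("influence_retweeted", {"1234567890": 42, "2345678901": 17})
r.zadd("influence_comment", {"1234567890": 5})
# top 10 users by retweets for the cumulative window (task_type 0)
top_users = get_temporal_rank(task_type=0, sort="retweeted", number=10)
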
Esempio n. 22
0
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms":{text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. gather weibo-related information
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
Esempio n. 23
0
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=_id,
                                       id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "root_mid": mid_list
                            }
                        }]
                    }
                }
            }
        },
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append(
            {"term": {
                text_type: type_value
            }})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append(
                {"term": {
                    text_type: type_value
                }})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append(
                {"terms": {
                    text_type: type_value
                }})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. gather weibo-related information
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
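
The "filtered" query used in both versions above is Elasticsearch 1.x/2.x syntax; on a 5.x or newer cluster the same time-window and root_mid filter would be written with a bool/filter clause. A sketch of the equivalent body, under that assumption:

query_body = {
    "query": {
        "bool": {
            "filter": [
                {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
                {"terms": {"root_mid": mid_list}}
            ]
        }
    },
    "sort": {"timestamp": {"order": "desc"}},
    "size": 100
}
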
Esempio n. 24
0
def search_user_info(es, index_name, doc_type, uid, result_name):
    try:
        retweet_result = es.get(index=index_name, doc_type=doc_type,
                                id=uid)['_source']
    except:
        return None
    if retweet_result:
        retweet_dict = json.loads(retweet_result[result_name])
        sorted_list = sorted(retweet_dict.iteritems(),
                             key=lambda x: x[1],
                             reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
        except:
            user_result = []
        try:
            bci_history_result = es_bci_history.mget(
                index=bci_history_index_name,
                doc_type=bci_history_index_type,
                body={'ids': uid_list},
                fields=fields)['docs']
        except:
            bci_history_result = []
        ## print bci_history_result
        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                photo_url = source['photo_url']
                if uname == '':
                    uname = u'未知'
                #location = source['user_location']
                friendsnum = source['friendsnum']
            else:
                uname = u'未知'
                location = ''
                friendsnum = ''
                photo_url = 'unknown'
            #add index from bci_history
            try:
                bci_history_item = bci_history_result[iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found'] == True:
                fansnum = bci_history_item['fields'][fields[0]][0]
                user_weibo_count = bci_history_item['fields'][fields[1]][0]
                user_friendsnum = bci_history_item['fields'][fields[2]][0]
                influence = bci_history_item['fields'][fields[3]][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
                influence = ''
            #retweet_count = int(retweet_dict[uid])
            count = retweet_dict[uid]
            out_portrait_list.append({
                'uid': uid,
                'photo_url': photo_url,
                'count': count,
                'uname': uname,
                'influence': influence,
                'fansnum': fansnum,
                'friendsnum': user_friendsnum,
                'weibo_count': user_weibo_count
            })  #location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
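
The module-level fields list passed to the bci_history mget above is not shown in these snippets; judging from how fields[0]..fields[3] are unpacked (fansnum, weibo count, friendsnum, influence), a plausible definition, stated purely as an assumption, is:

# assumed from usage; the real module may define this differently
fields = ['user_fansnum', 'weibo_month_sum', 'user_friendsnum', 'user_index']
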
Esempio n. 25
0
def search_fans(uid, top_count):
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)

    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    result = {}
    be_retweet_inter_dict = {}
    be_comment_inter_dict = {}
    center_uid = uid
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name,
                                           doc_type=be_retweet_index_type,
                                           id=uid)['_source']
    except:
        be_retweet_result = {}

    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    # # print "be_retweet_uid_dict", be_retweet_uid_dict
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name,
                                              doc_type=be_comment_index_type,
                                              id=uid)['_source']
    except:
        be_comment_result = {}

    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    # # print "be_comment_uid_dict", be_comment_uid_dict

    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    fans_user_set = set(fans_result.keys())
    fans_list = list(fans_user_set)
    # # print "fans_list", fans_list
    all_fans_dict = {}

    for fans_user in fans_list:
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)
    all_fans_uid_list = []
    all_fans_uid_list_all = [item[0] for item in sort_all_fans_dict]

    count = 0
    for i in all_fans_uid_list_all:
        count += 1
        all_fans_uid_list.append(i)
        if count == 1000:
            break
    # # print all_fans_uid_list

    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': out_portrait_list
                                                     })['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': out_portrait_list},
            fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({
            'uid': uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': fans_count,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
        iter_count += 1

    return out_portrait_list
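
union_dict, used above to merge the be_retweet and be_comment neighbor dicts, is not defined in these snippets. A minimal sketch, assuming it merges dicts of uid -> interaction count by summing the counts of uids that appear in more than one dict:

def union_dict(*dicts):
    # merge any number of uid -> count dicts, summing counts for shared uids
    merged = {}
    for d in dicts:
        for uid, cnt in d.items():
            merged[uid] = merged.get(uid, 0) + int(cnt)
    return merged
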
Esempio n. 26
0
def search_follower(uid, top_count):

    results = {}
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = be_retweet_index_name_pre + str(db_number)
    # return search_user_info(es_retweet,index_name,retweet_index_type,uid,'uid_be_retweet')
    center_uid = uid
    try:
        retweet_result = es_retweet.get(index=index_name,
                                        doc_type=be_retweet_index_type,
                                        id=uid)['_source']
    except:
        return None
    retweet_dict = {}
    if retweet_result:
        retweet_dict_old = json.loads(retweet_result['uid_be_retweet'])
        for key in retweet_dict_old:
            retweet_dict[key] = int(retweet_dict_old[key])
        sorted_list = sorted(retweet_dict.iteritems(),
                             key=lambda x: x[1],
                             reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
        except:
            user_result = []

        try:
            bci_history_result = es_bci_history.mget(
                index=bci_history_index_name,
                doc_type=bci_history_index_type,
                body={'ids': uid_list},
                fields=fields)['docs']
        except:
            bci_history_result = []
        # # print bci_history_result
        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                photo_url = source['photo_url']
                if uname == '':
                    uname = u'未知'
                #location = source['user_location']
                friendsnum = source['friendsnum']

            else:
                uname = u'未知'
                location = ''
                friendsnum = ''
                photo_url = ''

            #add index from bci_history
            try:
                bci_history_item = bci_history_result[iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found'] == True:
                fansnum = bci_history_item['fields'][fields[0]][0]
                user_weibo_count = bci_history_item['fields'][fields[1]][0]
                user_friendsnum = bci_history_item['fields'][fields[2]][0]
                influence = bci_history_item['fields'][fields[3]][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
                influence = ''
            #retweet_count = int(retweet_dict[uid])
            # print uid
            count = retweet_dict[uid]
            # print count
            out_portrait_list.append({
                'uid': uid,
                'photo_url': photo_url,
                'count': count,
                'uname': uname,
                'influence': influence,
                'fansnum': fansnum,
                'friendsnum': user_friendsnum,
                'weibo_count': user_weibo_count
            })  #location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
Esempio n. 27
0
def search_bidirect_interaction(uid, top_count):

    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    results = {}
    retweet_inter_dict = {}
    comment_inter_dict = {}
    center_uid = uid
    #bidirect interaction in retweet and be_retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name,
                                        doc_type=retweet_index_type,
                                        id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    retweet_uid_list = retweet_uid_dict.keys()
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name,
                                           doc_type=be_retweet_index_type,
                                           id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}

    #bidirect interaction in comment and be_comment
    try:
        comment_result = es_comment.get(index=comment_index_name,
                                        doc_type=comment_index_type,
                                        id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    comment_uid_list = comment_uid_dict.keys()
    try:
        be_comment_result = es_comment.get(index=be_comment_index_name,
                                           doc_type=be_comment_index_type,
                                           id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    #get bidirect_interaction dict
    #all_interaction_dict = union_dict(retweet_inter_dict, comment_inter_dict)
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict,
                                           be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(
        be_retweet_comment_result.keys())
    interaction_user_list = list(interaction_user_set)
    all_interaction_dict = {}
    for interaction_user in interaction_user_list:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[
                interaction_user] + be_retweet_comment_result[interaction_user]

    sort_all_interaction_dict = sorted(all_interaction_dict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    #get in_portrait_list, in_portrait_results and out_portrait_list
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]
    ## print all_interaction_uid_list

    # if RUN_TYPE == 0:
    # all_interaction_dict = {'2029036025':3,'1282005885':2,'2549228714':2,'1809833450':1}
    # all_interaction_uid_list = ['2029036025', '1282005885', '2549228714', '1809833450']

    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': out_portrait_list
                                                     })['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': out_portrait_list},
            fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({
            'uid': uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': interaction_count,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
        iter_count += 1

    return out_portrait_list
Esempio n. 28
0
File: utils.py Progetto: SwoJa/ruman
def get_sensitive_text_detail(task_name, ts, user, order):
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        }
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict() # text info, keyed by mid
    portrait_dict = dict() # background (profile) info, keyed by uid
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        for item in sorted_list:
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                count_n += 1
                results.append(temp)

        if results and order == "ts":
            results = sorted(results, key=lambda x:x[5], reverse=True)

        if results and order == "sensitive":
            results = sorted(results, key=lambda x:x[-1], reverse=True)

    return results
Esempio n. 29
0
def search_bidirect_interaction(uid, top_count):

    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    results = {}
    retweet_inter_dict = {}
    comment_inter_dict = {}
    center_uid = uid
    #bidirect interaction in retweet and be_retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    retweet_uid_list = retweet_uid_dict.keys()
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}

    #bidirect interaction in comment and be_comment
    try:
        comment_result = es_comment.get(index=comment_index_name, doc_type=comment_index_type, id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    comment_uid_list = comment_uid_dict.keys()
    try:
        be_comment_result = es_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    #get bidirect_interaction dict
    #all_interaction_dict = union_dict(retweet_inter_dict, comment_inter_dict)
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys())
    interaction_user_list = list(interaction_user_set)
    all_interaction_dict = {}
    for interaction_user in interaction_user_list:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[interaction_user] + be_retweet_comment_result[interaction_user]
            
    sort_all_interaction_dict = sorted(all_interaction_dict.items(), key=lambda x:x[1], reverse=True)
    #get in_portrait_list, in_portrait_results and out_portrait_list
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]
    #print all_interaction_uid_list

    # if RUN_TYPE == 0:
        # all_interaction_dict = {'2029036025':3,'1282005885':2,'2549228714':2,'1809833450':1}
        # all_interaction_uid_list = ['2029036025', '1282005885', '2549228714', '1809833450']

    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname =  u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({'uid':uid,'photo_url':photo_url,'uname':uname, 'count':interaction_count, 'fansnum':fansnum,'friendsnum': user_friendsnum,'weibo_count': user_weibo_count})
        iter_count += 1

    return out_portrait_list
Esempio n. 30
0
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors) # weibo mid list from the previous time interval
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,  social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms":{"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }



    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"sentiment":sentiment_type}})
    else:
        # keep the timestamp range filter and add the negative-sentiment terms filter
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"sentiment": ["2", "3"]}})

    # check whether ts and ts-time_interval fall on the same day, to decide which es index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate the original weibo mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
Esempio n. 31
0
def search_follower(uid, top_count):

    results = {}
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = be_retweet_index_name_pre + str(db_number)
    # return search_user_info(es_retweet,index_name,retweet_index_type,uid,'uid_be_retweet')
    center_uid = uid
    try:
        retweet_result = es_retweet.get(index=index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        return None
    retweet_dict={}
    if retweet_result:
        retweet_dict_old = json.loads(retweet_result['uid_be_retweet'])
        for key in retweet_dict_old:
            retweet_dict[key]=int(retweet_dict_old[key])
        sorted_list = sorted(retweet_dict.iteritems(),key=lambda x:x[1],reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        portrait_result = []
        try:
            user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':uid_list})['docs']
        except:
            user_result = []

        try:
            bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids':uid_list}, fields=fields)['docs']    
        except:
            bci_history_result = []
        # print bci_history_result
        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                photo_url = source['photo_url']
                if uname == '':
                    uname = u'未知'
                #location = source['user_location']
                friendsnum = source['friendsnum']

            else:
                uname = u'未知'
                location = ''
                friendsnum = ''
                photo_url = ''

            #add index from bci_history
            try:
                bci_history_item = bci_history_result[iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found']==True:
                fansnum = bci_history_item['fields'][fields[0]][0]
                user_weibo_count = bci_history_item['fields'][fields[1]][0]
                user_friendsnum = bci_history_item['fields'][fields[2]][0]
                influence = bci_history_item['fields'][fields[3]][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
                influence = ''
            #retweet_count = int(retweet_dict[uid])
            print uid
            count = retweet_dict[uid]
            print count
            out_portrait_list.append({'uid':uid,'photo_url':photo_url,'count':count,'uname':uname,'influence':influence,'fansnum':fansnum, 'friendsnum':user_friendsnum,'weibo_count':user_weibo_count})#location,
            iter_count += 1
        return out_portrait_list
    else:
        return None
Esempio n. 32
0
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    print '37',index_sensing_task,_id
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k,v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]


    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    print len(mid_list)
    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }


    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    print es_text
    exist_es = es_text.indices.exists(index_name)
    print exist_es
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict() # text info, keyed by mid
    portrait_dict = dict() # background (profile) info, keyed by uid
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}


        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)[:10]
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)[:10]
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)[:10]
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list: # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                #jln: extract keywords from the weibo text
                f_key = get_weibo_single(iter_text['text'])
                temp.append(sorted(f_key.iteritems(),key=lambda x:x[1],reverse=True))
                
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1

        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True) # sort by sensitive flag, topic value, retweeted count
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        
        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values() # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid_index_dict.get(mid, 0):
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)
        

    return sort_results
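
The duplicate-folding step above groups weibo rows whose mids share the same value in duplicate_dict, then merges the later-ranked rows into the earliest-ranked one. A small illustrative walk-through with hypothetical mids (an equivalent construction of tmp_duplicate_dict, not the source's exact loop):

duplicate_dict = {"m2": "m1", "m3": "m1"}   # mid -> representative mid
tmp_duplicate_dict = {}
for k, v in duplicate_dict.items():
    tmp_duplicate_dict.setdefault(v, [v]).append(k)
# tmp_duplicate_dict == {"m1": ["m1", "m2", "m3"]}
# in sort_results, the rows for m2 and m3 are appended onto m1's row and then removed
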
Esempio n. 33
0
def search_fans(uid,top_count):
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)

    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    result = {}
    be_retweet_inter_dict = {}
    be_comment_inter_dict = {}
    center_uid = uid
    try:
        be_retweet_result = es_retweet.get(index = be_retweet_index_name,doc_type=be_retweet_index_type,id=uid)['_source']
    except:
        be_retweet_result = {}

    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    # print "be_retweet_uid_dict", be_retweet_uid_dict
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}

    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    # print "be_comment_uid_dict", be_comment_uid_dict

    fans_result = union_dict(be_retweet_uid_dict,be_comment_uid_dict)
    fans_user_set = set(fans_result.keys())
    fans_list = list(fans_user_set)
    # print "fans_list", fans_list
    all_fans_dict = {}

    for fans_user in fans_list:
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(), key=lambda x:x[1], reverse=True)
    all_fans_uid_list=[]
    all_fans_uid_list_all = [item[0] for item in sort_all_fans_dict]

    count = 0
    for i in all_fans_uid_list_all:
        count += 1
        all_fans_uid_list.append(i)
        if count == 1000:
            break
    # print all_fans_uid_list

    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids':out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname =  u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({'uid':uid,'photo_url':photo_url,'uname':uname, 'count':fans_count, 'fansnum':fansnum,'friendsnum': user_friendsnum,'weibo_count': user_weibo_count})
        iter_count += 1

    return out_portrait_list
Esempio n. 34
0
def portrait_user_vary(es,
                       number,
                       active_index,
                       active_type,
                       portrait_index,
                       portrait_type,
                       field="vary"):

    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
        sys.exit(0)

    count_s = 0
    count_c = 0
    start = 0
    rank = 1
    try:
        while 1:
            search_list = []
            user_list = search_k(es, active_index, active_type, start, field,
                                 100)
            start += 100
            for item in user_list:
                uid = item.get('uid',
                               '0')  # obtain uid, notice "uid" or "user"
                search_list.append(uid)  # uid list
            search_result = es_portrait.mget(index="user_portrait",
                                             doc_type="user",
                                             body={"ids": search_list},
                                             _source=True)["docs"]
            profile_result = es_profile.mget(index="weibo_user",
                                             doc_type="user",
                                             body={"ids": search_list},
                                             _source=True)["docs"]

            for item in search_result:
                count_c += 1
                if item["found"]:
                    info = ['', '', '', '', '', '1']
                    info[0] = rank
                    index = search_result.index(item)

                    if profile_result[index]['found']:
                        info[1] = profile_result[index]['_source'].get(
                            'photo_url', '')
                        info[3] = profile_result[index]['_source'].get(
                            'nick_name', '')
                    info[2] = search_result[index].get('_id', '')
                    info[4] = user_list[index]['vary']
                    return_list.append(info)
                    rank += 1
                    if rank == int(number) + 1:
                        return return_list

            if count_c > 10000:
                break
    except RequestError:
        print "timeout"

    return return_list
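
search_k, which drives the paging loop above, is not included in these snippets. A plausible sketch inferred from its call sites, assuming it pages through an index sorted by the given field in descending order and returns the document sources:

def search_k(es, index_name, doc_type, start, field, size=100):
    # page through documents sorted by `field` descending, `size` at a time
    query_body = {
        "query": {"match_all": {}},
        "sort": [{field: {"order": "desc"}}],
        "from": start,
        "size": size
    }
    hits = es.search(index=index_name, doc_type=doc_type, body=query_body)["hits"]["hits"]
    return [hit["_source"] for hit in hits]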