Beispiel #1
0
def get_community_userinfo(uid_list, core_uidlist, outer_uidlist):
    """Fetch profile info for community members and for outside users.

    Parameters:
        uid_list: uids inside the community
        core_uidlist: uids regarded as core members of the community
        outer_uidlist: uids outside the community

    Returns:
        tuple of three JSON strings: (all community users, core users
        only, outer users); each is a list of flat profile dicts.
    """
    def _profile_dict(item):
        # Map one ES mget doc to a flat profile dict; docs that were not
        # found yield empty-string fields so every row has the same keys.
        user_dict = {'uid': item['_id']}
        fields = ('photo_url', 'nick_name', 'sex', 'friendsnum',
                  'fansnum', 'user_location')
        if item['found']:
            for f in fields:
                user_dict[f] = item['_source'][f]
        else:
            for f in fields:
                user_dict[f] = ''
        return user_dict

    user_list = []
    core_user = []
    # hoisted: the original rebuilt set(core_uidlist) on every iteration
    # and intersected it with set(uid.split()); plain membership is
    # equivalent for whitespace-free uids and O(1) per user
    core_set = set(core_uidlist)

    user_result = es_user_profile.mget(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       body={'ids': uid_list})['docs']
    for item in user_result:
        user_dict = _profile_dict(item)
        # core-member flag
        if item['_id'] in core_set:
            user_dict['core_user'] = 1
            core_user.append(user_dict)
        else:
            user_dict['core_user'] = 0
        user_list.append(user_dict)

    # profiles of users outside the community
    outeruser_result = es_user_profile.mget(index=profile_index_name,
                                            doc_type=profile_index_type,
                                            body={'ids': outer_uidlist})['docs']
    outer_userlist = [_profile_dict(item) for item in outeruser_result]

    return json.dumps(user_list), json.dumps(core_user), json.dumps(outer_userlist)
Beispiel #2
0
def group_evaluate(xnr_user_no, nodes, all_influence, all_sensitive, G=None):
    """Evaluate a user group: graph structure plus influence/sensitivity.

    Parameters:
        xnr_user_no: id of the virtual user that owns the group.
        nodes: list of member uids.
        all_influence: global influence maximum used to normalize scores.
        all_sensitive: global sensitivity maximum used to normalize scores.
        G: optional pre-built networkx graph; when absent the member
           subgraph is loaded via get_users().

    Returns:
        dict with member list/count, density, clustering, transitivity,
        and normalized max/mean influence and sensitivity percentages.
    """
    result = {}
    result['xnr_user_no'] = xnr_user_no
    result['nodes'] = nodes
    result['num'] = len(nodes)
    if G:
        sub_g = G.subgraph(nodes)
    else:
        sub_g = get_users(xnr_user_no, nodes)
    # structural metrics of the member subgraph, rounded to 4 places
    result['density'] = round(nx.density(sub_g), 4)
    result['cluster'] = round(nx.average_clustering(sub_g), 4)
    result['transitivity'] = round(nx.transitivity(sub_g), 4)

    # weekly influence (bci) per member; uids missing from ES count as 0
    influence_result = [
        float(i['fields']['bci_week_ave'][0]) if i['found'] else 0
        for i in es_user_profile.mget(index=influence_index,
                                      doc_type=influence_type,
                                      body={'ids': nodes},
                                      fields=['bci_week_ave'],
                                      _source=False)['docs']
    ]
    # weekly sensitivity per member; same missing-doc fallback
    sensitive_result = [
        float(i['fields']['sensitive_week_ave'][0]) if i['found'] else 0
        for i in es_user_profile.mget(index=sensitive_index,
                                      doc_type=sensitive_type,
                                      body={'ids': nodes},
                                      fields=['sensitive_week_ave'],
                                      _source=False)['docs']
    ]

    # normalize against the global maxima, expressed as percentages
    result['max_influence'] = round(
        (max(influence_result) / float(all_influence)) * 100, 4)
    result['mean_influence'] = round(
        ((sum(influence_result) / len(influence_result)) /
         float(all_influence)) * 100, 4)

    max_sensitive = round((max(sensitive_result) / float(all_sensitive)) * 100,
                          4)
    result['mean_sensitive'] = round(
        ((sum(sensitive_result) / len(sensitive_result)) /
         float(all_sensitive)) * 100, 4)
    # only the max sensitivity is capped at 100; NOTE(review): max_influence
    # gets no such cap -- confirm whether that asymmetry is intended
    if max_sensitive > 100:
        result['max_sensitive'] = 100.0000
    else:
        result['max_sensitive'] = max_sensitive

    return result
def get_user_profile(uid_list, specify_field=None):
    """Return per-uid profile rows of the form [uid, field1, field2, ...].

    Parameters:
        uid_list: uids to look up; an empty/falsy input returns [].
        specify_field: optional list of profile fields to extract; when
            omitted (or empty) the default 13-field list below is used.
            (Was a mutable default argument ``[]``; ``None`` avoids the
            shared-default pitfall and is backward compatible.)

    Returns:
        list of lists.  Rows for uids not found in ES are padded with
        empty strings; the uid is then placed at both index 0 and 1,
        matching the original behavior.
    """
    if not uid_list:
        return []

    default_fields = [
        "nick_name", "fansnum", "friendsnum", "photo_url", 'description',
        "statusnum", "sp_type", "user_location", "create_at", "sex",
        "verified_type", "isreal", "user_email"
    ]
    # choose the field set once instead of rebinding inside the loop,
    # so not-found rows are always padded to the same width
    field_list = specify_field if specify_field else default_fields

    results = []
    search_results = es_user_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list})['docs']
    for item in search_results:
        iter_result = [item['_id']]
        if item['found']:
            for iter_field in field_list:
                iter_result.append(item['_source'][iter_field])
        else:
            iter_result.extend([''] * len(field_list))
            iter_result[1] = item['_id']  # original quirk preserved
        results.append(iter_result)

    return results
Beispiel #4
0
def es_mget_source(ids):
    """Return the ``_source`` dict of each found id, skipping missing docs.

    ES errors propagate unchanged.  (The original wrapped the call in a
    try/except that only re-raised -- a no-op that additionally destroys
    the traceback under Python 2's ``raise e``.)
    """
    response = es.mget(index=INDEX_NAME, doc_type=DOC_TYPE, body={'ids': ids})
    return [doc['_source'] for doc in response['docs'] if doc['found']]
def get_profile_information(uid_list):
    """Fetch profile information for every uid, keyed by uid.

    Fields are taken from the module-level ``fields_dict`` mapping
    (output field -> ES source field).  Missing fields default to 0,
    except ``uname`` which defaults to "unknown" (this collapses the
    original if/elif chain, whose branches all assigned 0 apart from
    uname).
    """
    result_dict = dict()
    search_result = es.mget(index=index_name,
                            doc_type=index_type,
                            body={'ids': uid_list},
                            _source=True)['docs']
    for item in search_result:
        # not-found docs lack '_source'; an empty dict makes every field
        # fall through to its default, as the original bare except did
        source = item.get('_source', {})
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = source[fields_dict[field]]
            except KeyError:
                user_dict[field] = "unknown" if field == 'uname' else 0
        result_dict[item['_id']] = user_dict
    return result_dict
def get_user_info(uid_list):
    """Return a JSON list of basic profile dicts for the given uids.

    Uids missing from the profile index get empty-string fields so the
    shape of every entry is identical.
    """
    profile_fields = ('photo_url', 'nick_name', 'sex',
                      'friendsnum', 'fansnum', 'user_location')
    docs = es_user_profile.mget(index=profile_index_name,
                                doc_type=profile_index_type,
                                body={'ids': uid_list})['docs']
    user_list = []
    for doc in docs:
        entry = {'uid': doc['_id']}
        source = doc['_source'] if doc['found'] else None
        for field in profile_fields:
            entry[field] = source[field] if source is not None else ''
        user_list.append(entry)
    return json.dumps(user_list)
def get_profile_information(uid_list):
    """Fetch profile information keyed by uid; every missing field
    defaults to 0.

    (The original if/elif/else chain assigned 0 in every branch --
    including uname and the final else -- so it collapses to a single
    default assignment.)
    """
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        # not-found docs lack '_source'; the empty dict routes every
        # field to the default, matching the original bare except
        source = item.get('_source', {})
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = source[fields_dict[field]]
            except KeyError:
                user_dict[field] = 0
        result_dict[item['_id']] = user_dict
    return result_dict
def get_profile_information(uid_list):
    """Fetch profile information keyed by uid.

    Missing numeric fields (statusnum / fansnum / friendsnum / gender)
    default to 0, a missing uname falls back to the uid itself, and any
    other missing field defaults to ''.
    """
    numeric_defaults = ('statusnum', 'fansnum', 'friendsnum', 'gender')
    result_dict = dict()
    search_result = es.mget(index=index_name,
                            doc_type=index_type,
                            body={'ids': uid_list},
                            _source=True)['docs']
    for item in search_result:
        # not-found docs lack '_source'; the empty dict routes every
        # field to its default, matching the original bare except
        source = item.get('_source', {})
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = source[fields_dict[field]]
            except KeyError:
                if field in numeric_defaults:
                    user_dict[field] = 0
                elif field == 'uname':
                    user_dict[field] = item['_id']
                else:
                    user_dict[field] = ''
        result_dict[item['_id']] = user_dict
    return result_dict
Beispiel #9
0
def main():
    """Dump the bci/sensitive history docs of the uids listed in
    uid_list_0520.txt to two flat files, one JSON object per line."""
    with open("uid_list_0520.txt", 'rb') as f:
        uid_list = [line.strip() for line in f]

    bci_results = es_user_profile.mget(index="bci_history",
                                       doc_type="bci",
                                       body={"ids": uid_list})["docs"]
    sen_results = es_user_profile.mget(index="sensitive_history",
                                       doc_type="sensitive",
                                       body={"ids": uid_list})["docs"]

    # only docs that actually exist in the index are written out
    with open("bci_history.txt", 'wb') as f_bci:
        f_bci.writelines(json.dumps(doc['_source']) + "\n"
                         for doc in bci_results if doc['found'])

    with open("sen_history.txt", "wb") as f_sen:
        f_sen.writelines(json.dumps(doc['_source']) + "\n"
                         for doc in sen_results if doc['found'])
Beispiel #10
0
def main():
    """Export bci and sensitive history records for the uids listed in
    uid_list_0520.txt, one JSON object per output line."""
    uid_list = []
    with open("uid_list_0520.txt", 'rb') as src:
        for line in src:
            uid_list.append(line.strip())

    # both mget calls run before any file is written, as before
    queries = [("bci_history", "bci"), ("sensitive_history", "sensitive")]
    all_docs = [es_user_profile.mget(index=idx,
                                     doc_type=dt,
                                     body={"ids": uid_list})["docs"]
                for idx, dt in queries]

    outputs = ("bci_history.txt", "sen_history.txt")
    for out_path, docs in zip(outputs, all_docs):
        with open(out_path, 'wb') as out:
            for doc in docs:
                if doc['found']:
                    out.write(json.dumps(doc['_source']) + "\n")
def search_hot_mid(task_name, ts):
    """Find hot root weibos (>= 500 occurrences) before ``ts`` and attach
    retweet/comment counts plus author profile info.

    Parameters:
        task_name: ES index holding the task's weibo 'text' docs.
        ts: upper timestamp bound (exclusive).

    Returns:
        list of weibo detail dicts enriched with 'retweet', 'comment',
        'uname', 'photo_url', 'fansnum' and 'statusnum'.
    """
    # aggregate the 100 most frequent root_mid values in the window
    query_body = {
        "query": {
            "range":{
                "timestamp":{
                    "lt": ts
                }
            }
        },
        "aggs":{
            "hot_mid":{
                "terms":{"field": "root_mid", "size": 100}
            }
        }
    }

    mid_list = []
    return_list = [] # return hot mid
    uid_list = []
    es_results = es.search(index=task_name, doc_type="text", body=query_body)["aggregations"]["hot_mid"]["buckets"]
    for item in es_results:
        # only mids seen at least 500 times count as "hot"
        if item["doc_count"] >= 500:
            mid_list.append(item["key"])

    if mid_list:
        weibo_results = es.mget(index=task_name, doc_type="text", body={"ids":mid_list})["docs"]
        for item in weibo_results:
            if item["found"]:
                mid = item["_id"]
                retweet, comment = search_retweet_comment(task_name, mid)
                detail = item["_source"]
                detail["retweet"] = retweet
                detail["comment"] = comment
                # uid_list and return_list are appended in lockstep, so
                # index i in the profile loop below refers to the same
                # weibo in both lists
                uid_list.append(detail["uid"])
                return_list.append(detail)
        if uid_list:
            profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list})["docs"]
            for i in range(len(uid_list)):
                detail = profile_results[i]
                if detail["found"]:
                    return_list[i]["uname"] = detail["_source"]["nick_name"]
                    return_list[i]["photo_url"] = detail["_source"]["photo_url"]
                    return_list[i]["fansnum"] = detail["_source"]["fansnum"]
                    return_list[i]["statusnum"] = detail["_source"]["statusnum"]
                else:
                    # no profile: fall back to the uid as display name
                    return_list[i]["uname"] = detail["_id"]
                    return_list[i]["photo_url"] = ""
                    return_list[i]["fansnum"] = ""
                    return_list[i]["statusnum"] = ""
    return return_list
Beispiel #12
0
def get_user_name(uid_list):
    """Map each uid to its nick_name from the profile index.

    Uids missing from the index map to the uid itself.  If the ES call
    fails entirely, an empty mapping is returned (best-effort, as in the
    original).  A leftover debug print of the raw ES response was
    removed, and the bare except / ``== True`` comparisons tightened.
    """
    try:
        portrait_exist_result = es_user_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found']:
            uid2uname_dict[uid] = portrait_item['_source']['nick_name']
        else:
            uid2uname_dict[uid] = uid
    return uid2uname_dict
Beispiel #13
0
def get_profile_information(uid_list):
    """Fetch profile information keyed by uid; any missing field
    defaults to ''.

    (Removed an ``iter_count`` counter the original incremented but
    never used, and narrowed the bare except to KeyError.)
    """
    result_dict = dict()
    search_result = es.mget(index=index_name,
                            doc_type=index_type,
                            body={'ids': uid_list},
                            _source=True)['docs']
    for item in search_result:
        # not-found docs lack '_source'; the empty dict routes every
        # field to the '' default, matching the original bare except
        source = item.get('_source', {})
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = source[fields_dict[field]]
            except KeyError:
                user_dict[field] = ''
        result_dict[item['_id']] = user_dict
    return result_dict
def get_future_user(uid_list):
    """Return [uid, uname, photo_url, fansnum, statusnum] rows for the
    given uids.

    Uids without a profile doc yield [uid, uid, '', '', ''] so every row
    has the same width.  An empty uid_list returns [].
    """
    if not uid_list:
        return []
    docs = es_user_profile.mget(index=profile_index_name,
                                doc_type=profile_index_type,
                                body={"ids": uid_list})["docs"]
    rows = []
    for doc in docs:
        user_id = doc["_id"]
        if doc["found"]:
            src = doc["_source"]
            row = [user_id, src["nick_name"], src["photo_url"],
                   src["fansnum"], src["statusnum"]]
        else:
            row = [user_id, user_id, "", "", ""]
        rows.append(row)
    return rows
Beispiel #15
0
def search_attention(uid):
    """Aggregate the retweet targets of ``uid`` across all redis DBs and
    attach profile/portrait info for the top 20.

    Returns:
        [result_dict, total] where result_dict maps
        ruid -> [ruid, [uname, count, in_portrait_flag]], or [None, 0]
        when nothing was found or sorting failed.
    """
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall('retweet_' + str(uid))
        if not ruid_results:
            continue
        for ruid in ruid_results:
            if ruid == uid:
                continue
            # NOTE(review): redis hgetall returns strings, so '+=' may
            # concatenate rather than add -- behavior preserved; confirm
            # the stored values are numeric upstream.
            try:
                stat_results[ruid] += ruid_results[ruid]
            except KeyError:
                stat_results[ruid] = ruid_results[ruid]
    if not stat_results:
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(),
                                    key=lambda x: x[1], reverse=True)[:20]
    except Exception:
        return [None, 0]
    # (removed a leftover debug print of the sorted results)
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(
        index='weibo_user', doc_type='user', body={'ids': uid_list})['docs']
    es_portrait_results = es_user_portrait.mget(
        index='user_portrait', doc_type='user', body={'ids': uid_list})['docs']
    result_list = dict()
    for profile_item, portrait_item in zip(es_profile_results,
                                           es_portrait_results):
        ruid = profile_item['_id']
        try:
            uname = profile_item['_source']['nick_name']
        except KeyError:
            uname = u'未知'
        # BUG FIX: the original read portrait_item[i] -- indexing the
        # portrait doc (a dict) with the loop integer -- which always
        # raised and forced in_status to 0; use the mget 'found' flag.
        in_status = 1 if portrait_item.get('found') else 0
        result_list[ruid] = [ruid, [uname, stat_results[ruid], in_status]]

    return [result_list, len(stat_results)]
Beispiel #16
0
def search_attention(uid):
    """Aggregate the retweet targets of ``uid`` across all redis DBs and
    attach profile/portrait info for the top 20.

    Returns:
        [result_dict, total] where result_dict maps
        ruid -> [ruid, [uname, count, in_portrait_flag]], or [None, 0]
        when nothing was found or sorting failed.
    """
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall("retweet_" + str(uid))
        if not ruid_results:
            continue
        for ruid in ruid_results:
            if ruid == uid:
                continue
            # NOTE(review): redis hgetall returns strings, so '+=' may
            # concatenate rather than add -- behavior preserved; confirm
            # the stored values are numeric upstream.
            try:
                stat_results[ruid] += ruid_results[ruid]
            except KeyError:
                stat_results[ruid] = ruid_results[ruid]
    if not stat_results:
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(),
                                    key=lambda x: x[1], reverse=True)[:20]
    except Exception:
        return [None, 0]
    # (removed a leftover debug print of the sorted results)
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(
        index="weibo_user", doc_type="user", body={"ids": uid_list})["docs"]
    es_portrait_results = es_user_portrait.mget(
        index="user_portrait", doc_type="user", body={"ids": uid_list})["docs"]
    result_list = dict()
    for profile_item, portrait_item in zip(es_profile_results,
                                           es_portrait_results):
        ruid = profile_item["_id"]
        try:
            uname = profile_item["_source"]["nick_name"]
        except KeyError:
            uname = u"未知"
        # BUG FIX: the original read portrait_item[i] -- indexing the
        # portrait doc (a dict) with the loop integer -- which always
        # raised and forced in_status to 0; use the mget 'found' flag.
        in_status = 1 if portrait_item.get("found") else 0
        result_list[ruid] = [ruid, [uname, stat_results[ruid], in_status]]

    return [result_list, len(stat_results)]
Beispiel #17
0
def search_profile(uid_list):
    """Look up the topic_string of every uid in user_portrait_1222.

    Input: a list of uids.  Output: dict mapping the utf-8 encoded uid
    to its topic string ('' when the uid has no portrait doc).
    """
    docs = es_user_profile.mget(index='user_portrait_1222',
                                doc_type='user',
                                body={'ids': uid_list})['docs']
    result_list = dict()
    for doc in docs:
        key = doc['_id'].encode('utf-8')
        result_list[key] = doc['_source']['topic_string'] if doc['found'] else ''
    return result_list
def search_user_type(uid_list):
    """Split uids into personal users and organization accounts based on
    the ES 'verified_type' field.

    Returns (user_list, org_list).  Uids with no portrait doc or no
    verified_type field are treated as personal users.
    """
    type_list = es_user_profile.mget(
        index=profile_index_name, doc_type=profile_index_type,
        body={'ids': uid_list}, _source=False,
        fields=['id', 'verified_type'])['docs']
    user_list = []
    org_list = []
    for doc in type_list:
        if not doc['found']:
            user_list.append(doc['_id'])
            continue
        # BUG FIX: with _source=False the requested fields live under
        # doc['fields']; the original tested the top-level doc (via the
        # py2-only has_key), so it always bailed out here and the
        # verified_type branch below was unreachable.
        fields = doc.get('fields', {})
        if 'verified_type' not in fields:
            user_list.append(doc['_id'])
            continue
        verified_type = fields['verified_type'][0]
        # NOTE(review): testing verified_type against org_list (which
        # accumulates uids) looks wrong -- presumably it should be a
        # fixed set of organization type codes; logic preserved as-is.
        if verified_type in org_list:
            org_list.append(doc['_id'])
        else:
            user_list.append(doc['_id'])
    return user_list, org_list
def get_user_profile(uid_list, specify_field=None):
    """Return per-uid profile rows of the form [uid, field1, field2, ...].

    Parameters:
        uid_list: uids to fetch; an empty/falsy input returns [].
        specify_field: optional subset of fields to extract; defaults to
            the full 13-field list.  (Was a mutable default argument.)

    Not-found uids are padded with '' to the chosen field width.  (The
    original padded such rows to the full 13 fields even when
    specify_field was shorter, producing inconsistent row widths.)
    """
    if not uid_list:
        return []

    field_list = specify_field if specify_field else [
        "nick_name", "fansnum", "friendsnum", "photo_url", 'description',
        "statusnum", "sp_type", "user_location", "create_at", "sex",
        "verified_type", "isreal", "user_email"]

    results = []
    search_results = es_user_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list})['docs']
    for item in search_results:
        iter_result = [item['_id']]
        if item['found']:
            for iter_field in field_list:
                iter_result.append(item['_source'][iter_field])
        else:
            iter_result.extend([''] * len(field_list))
        results.append(iter_result)

    return results
Beispiel #20
0
def search_profile(uid_list):
    """Fetch each uid's topic_string from the user_portrait_1222 index.

    Input: uid list.  Returns {utf-8 encoded uid: topic string},
    with '' for uids that have no portrait doc.
    """
    hits = es_user_profile.mget(index='user_portrait_1222',
                                doc_type='user',
                                body={'ids': uid_list})['docs']

    def _topic(hit):
        # missing portrait docs contribute an empty topic string
        if hit['found']:
            return hit['_source']['topic_string']
        return ''

    return dict((hit['_id'].encode('utf-8'), _topic(hit)) for hit in hits)
def reducer():
    """Drain the 'update_bci_list' redis queue, batching uids into ES
    bci-history lookups until the queue is empty."""
    count = 0
    ts = time.time()
    while 1:
        # one queued batch per iteration; None when the queue is empty
        user_set = r_flow.rpop('update_bci_list')
        bulk_action = []  # NOTE(review): never used below -- dead variable?
        if user_set:
            items = json.loads(user_set)
            uid_list = []
            for item in items:
                uid_list.append(item['id'])
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE, body={"ids":uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                # progress checkpoint every 10k processed uids
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" %(count, te-ts)
                    ts = te
        else:
            print count
            break
Beispiel #22
0
def get_profile_information(uid_list):
    """Fetch profile information keyed by uid.

    Missing numeric fields (statusnum / fansnum / friendsnum / gender)
    default to 0; a missing uname defaults to u'unknown' and any other
    missing field to 'unknown', exactly as in the original branch chain.
    """
    numeric_fields = ('statusnum', 'fansnum', 'friendsnum', 'gender')
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        # not-found docs lack '_source'; the empty dict routes every
        # field to its default, matching the original bare except
        source = item.get('_source', {})
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = source[fields_dict[field]]
            except KeyError:
                if field in numeric_fields:
                    user_dict[field] = 0
                elif field == 'uname':
                    user_dict[field] = u'unknown'
                else:
                    user_dict[field] = 'unknown'
        result_dict[item['_id']] = user_dict
    return result_dict
Beispiel #23
0
def get_topic_top_user(topic_top):
    """For each topic in topic_top, look up [uid, uname, photo_url] rows
    for a hard-coded test batch of five users per topic.
    """
    index_name = 'weibo_user'
    index_type = 'user'
    # test user list: five uids per topic, in topic order
    test_user_list = [['1499104401', '1265965213', '3270699555', '2073915493', '1686474312'],\
                      ['2803301701', '2105426467', '1665372775', '3716504593', '2892376557'],\
                      ['1457530250', '1698513182', '2793591492', '2218894100', '1737961042'],\
                      ['1656818110', '1660127070', '1890124610', '1182391230', '1243861100'],\
                      ['1680430844', '2998045524', '2202896360', '1639498782', '3494698730'],\
                      ['2587093162', '1677675054', '1871767009', '1193111400', '1672418622'],\
                      ['1730726640', '1752502540', '1868725480', '1262486750', '1235733080'],\
                      ['1250041100', '2275231150', '1268642530', '1658606270', '1857599860'],\
                      ['1929496477', '2167425990', '1164667670', '2417139911', '1708853044'],\
                      ['1993292930', '1645823930', '1890926610', '1641561810', '2023833990'],\
                      ['2005471590', '1233628160', '2074684140', '1396715380', '1236762250'],\
                      ['1423592890', '2612799560', '1926127090', '2684951180', '1760607220']]
    result = {}
    for position, entry in enumerate(topic_top):
        topic = entry[0]
        user_list = test_user_list[position]  # test data
        rows = []
        profile_result = es_user_profile.mget(index=index_name,
                                              doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except:
                # missing doc or fields: placeholder values
                uname = 'unknown'
                photo_url = 'unknown'
            rows.append([profile['_id'], uname, photo_url])
        result[topic] = rows
    return result
Beispiel #24
0
def reducer():
    """Drain the 'update_bci_list' redis queue, batching uids into ES
    bci-history lookups until the queue is empty."""
    count = 0
    ts = time.time()
    while 1:
        # one queued batch per iteration; None when the queue is empty
        user_set = r_flow.rpop('update_bci_list')
        bulk_action = []  # NOTE(review): never used below -- dead variable?
        if user_set:
            items = json.loads(user_set)
            uid_list = []
            for item in items:
                uid_list.append(item['id'])
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME,
                                         doc_type=BCIHIS_INDEX_TYPE,
                                         body={"ids": uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                # progress checkpoint every 10k processed uids
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" %(count, te-ts)
                    ts = te
        else:
            print count
            break
Beispiel #25
0
def get_domain_top_user(domain_top):
    """For each domain in domain_top, look up [uid, uname, photo_url]
    rows for a hard-coded test batch of five users per domain.
    """
    index_name = 'weibo_user'
    index_type = 'user'
    # test user list: five uids per domain, in domain order
    test_user_list = [['2803301701', '1639498782', '2656274875', '1402977920', '3114175427'], \
                      ['3575186384', '1316683401', '1894603174', '1641542052', '1068248497'], \
                      ['1729736051', '1396715380', '2377610962', '1828183230', '2718018210'], \
                      ['1250748474', '3270699555', '1417037145', '1193111400', '1403915120'], \
                      ['1671342103', '1255849511', '1647497355', '1989660417', '1189729754'], \
                      ['1182391231', '1670071920', '1689618340', '1494850741', '1708942053'],\
                      ['3400918220', '2685504141', '2056115850', '1768001547', '3317008062'],\
                      ['2001627641', '1773489534', '2458194884', '1822155333', '1799201635'],\
                      ['1709157165', '2074370833', '2167425990', '3204839810', '3690518992'],\
                      ['1664065962', '3299094722', '1942531237', '2799434700', '1784404677'],\
                      ['1218353337', '1761179351', '3482911112', '1220291284', '2504433601'],\
                      ['3682473195', '1627673351', '1779065471', '3316144700', '1896701827']]
    result = {}
    for position, entry in enumerate(domain_top):
        domain = entry[0]
        user_list = test_user_list[position]  # test data
        rows = []
        profile_result = es_user_profile.mget(index=index_name,
                                              doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except:
                # missing doc or fields: placeholder values
                uname = 'unknown'
                photo_url = 'unknown'
            rows.append([profile['_id'], uname, photo_url])
        result[domain] = rows
    return result
Beispiel #26
0
 query_sensitive_body = {
 "query":{
     "match_all":{}
     },
     "size":1,
     "sort":{sensitive_string:{"order":"desc"}}
     }
 try:
     top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
     top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
 except Exception, reason:
     print Exception, reason
     top_sensitive = 400
 index_type = 'bci'
 user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
 user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
 bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids":uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
 sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids':uid_list}, fields=[sensitive_string], _source=False)['docs']
 max_evaluate_influ = get_evaluate_max(index_name)
 for i in range(0, len(uid_list)):
     uid = uid_list[i]
     bci_dict = user_bci_result[i]
     profile_dict = user_profile_result[i]
     bci_history_dict = bci_history_result[i]
     sensitive_history_dict = sensitive_history_result[i]
     #print sensitive_history_dict
     try:
         bci_source = bci_dict['_source']
     except:
         bci_source = None
     if bci_source:
Beispiel #27
0
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # 前一天
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # 更新的键
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # 要删除的键

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # 字典形式,uid:sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(
                        sensitive_info[uid])  # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[
                                str(tmp_stage)]
                    if item['found']:  # 之前存在相关信息
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            item.remove(del_sensitive_key)
                        revise_item['uid'] = uid
                        # 新更新的敏感度
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        # 新更新的敏感词
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # 新更新的string
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        # 当天和之前一天、一周和一月均值的差异
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score - revise_item.get(
                                former_sensitive_key, 0)
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_week_ave', 0)
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_month_ave', 0)
                        # 更新后week、month的均值和方差
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action,
                                index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action,
                index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count
Beispiel #28
0
def key_words_search(task_id,
                     search_type,
                     pre,
                     during,
                     start_time,
                     keyword_list,
                     search_key='',
                     sort_norm='',
                     sort_scope='',
                     time=1,
                     isall=False,
                     number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
            start_time = start_time + DAY
            date = ts2datetime(start_time)
            index_name = pre + date
            during -= 1

    print index_list
    uid_set = set()
    text_results = []

    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }

    results = es_flow_text.search(index=index_list,
                                  doc_type='text',
                                  body=query_body,
                                  _source=False,
                                  fields=[
                                      "uid", "user_fansnum", "text",
                                      "message_type", "sentiment", "timestamp",
                                      "geo", "retweeted", "comment"
                                  ])["hits"]["hits"]

    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1

    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # 库内
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME,
                                                 doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list},
                                                 _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0],
                                          results[index]['_id'])
                text_results.extend([
                    results[index]['fields']['uid'][0],
                    results[index]['fields']['user_fansnum'][0],
                    results[index]['fields']['text'][0],
                    results[index]['fields']['message_type'][0],
                    results[index]['fields']['sentiment'][0],
                    ts2date(results[index]['fields']['timestamp'][0]),
                    results[index]['fields']['geo'][0],
                    results[index]['fields']['retweeted'][0],
                    results[index]['fields']['comment'][0], nick_name,
                    weibo_url
                ])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user",
                                              doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0],
                                      results[index]['_id'])
            text_results.append([
                item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                item['fields']['text'][0], item['fields']['message_type'][0],
                item['fields']['sentiment'][0],
                ts2date(item['fields']['timestamp'][0]),
                results[index]['fields']['geo'][0],
                results[index]['fields']['retweeted'][0],
                results[index]['fields']['comment'][0], nick_name, weibo_url
            ])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True,
                                   number)

    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # 修改状态
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)

    return "1"
def get_retweet_weibo_detail(ts, size, text_type, type_value):
    """Fetch recent retweets/comments of a sensing task's tracked weibos.

    :param ts: task timestamp; also the id of the task document.
    :param size: maximum number of texts to return.
    :param text_type: extra filter field, "message_type" or "sentiment".
    :param type_value: value (or list of values for sentiment) to filter on.
    :returns: list of [uid, nick_name, photo_url, text, sentiment,
        date, geo, message_type] rows.
    """
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    # root mids whose propagation we want to collect
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        # honor the caller's size; the original hardcoded 100 and ignored
        # the `size` parameter entirely
        "size": size
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
    if text_type == "sentiment":
        # single sentiment value vs a list of values
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms":{text_type: type_value}})

    # pick the daily index covering [ts - time_interval, ts)
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query the texts
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    # 2. join in user profile info (nick name, photo)
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        # mget preserves order, so portrait_result[i] matches uid_list[i]
        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # row: uid, nick_name, photo_url, text, sentiment, date, geo, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
def get_origin_weibo_detail(ts, size, order, message_type=1):
    """Return ranked weibo detail rows for a sensing task, with duplicate
    (near-identical) weibos merged under their best-ranked representative.

    :param ts: task timestamp / id of the task document.
    :param size: unused here (result size is bounded by the 1000-doc query).
    :param order: "total" / "retweeted" / "comment" pre-sort key.
    :param message_type: 1 origin, 2 retweeted, otherwise sensitive weibos.
    :returns: list of groups; each group is a list of rows, the first row
        being the representative weibo, the rest its duplicates.
    """
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # invert duplicate mapping: representative mid -> [member mids..., rep]
    tmp_duplicate_dict = dict()
    for k,v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except KeyError:
            tmp_duplicate_dict[v] = [k, v]


    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    # rows of [mid, total_count, retweeted, comment]
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }


    # search today's and yesterday's flow-text indices
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # mid -> text source
    portrait_dict = dict()  # uid -> {nick_name, photo_url}
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        mid_index_dict = dict()
        for item in sorted_list:
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # row: uid, nick_name, photo_url, text, sentiment, date, geo,
            #      message_type, retweeted, comment, sensitive, timestamp,
            #      topic_value, mid
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                # normalize the displayed message type per request kind
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)

        # final ranking: sensitive desc, topic value desc, retweeted desc
        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count  # mid -> rank position
            count += 1


        if tmp_duplicate_dict:
            # merge duplicate weibos into the best-ranked member of each group
            remove_list = []
            value_list = tmp_duplicate_dict.values() # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    # membership test, not .get(mid, 0): index 0 is a valid
                    # position and the original falsy check silently skipped
                    # the top-ranked weibo
                    if mid in mid_index_dict:
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)


    return sort_results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Fetch recent weibos matching a task's mids/keywords, filtered by
    sentiment, and join in each author's nick name and photo url.

    Returns rows of [uid, nick_name, photo_url, text, sentiment, date,
    geo, common_keywords, message_type].
    """
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors) # weibo mids from the previous window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,  social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms":{"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }



    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"sentiment":sentiment_type}})
    else:
        # NOTE(review): this REPLACES the must list, discarding the
        # timestamp range filter for negative sentiments ("2"/"3") — the
        # sibling branch appends instead; confirm whether dropping the
        # time window here is intentional
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms":{"sentiment": ["2", "3"]}}]

    # decide whether ts and ts-time_interval fall on the same day, to pick the ES index
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. collect the matching weibo texts
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    # 2. join author profile info; mget preserves input order so
    # portrait_result[i] matches uid_list[i] / search_results[i]
    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # row: uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            # intersection of the weibo's keywords with the task's keywords
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
Beispiel #32
0
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = ''  ,time = 7 , isall = False, number = 100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix":{"text": "#" +  key + "#"}})
        else:    
            should.append({"wildcard":{"text": "*" +key + "*"}})    
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
            start_time = start_time + DAY
            date = ts2datetime(start_time)
            index_name = pre + date
            during -= 1

    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []

    query_body = {
        "query":{
            "bool":{
                "must":should
             }
        },
        "sort":{"user_fansnum":{"order":"desc"}},
        "size":5000
    }
                    
    results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results :
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    
    #get_all_filed(sort_norm , time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list : # 库内
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])    
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.extend([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[i])

    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid)+"\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list,isall,time,sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # 修改状态
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id,  body=item)

    return "1"
def potential_user(task_name, ts):
    """Find users whose origin weibos are predicted to propagate widely.

    Scores every qualifying origin weibo (fansnum >= 10000, before ``ts``)
    with pre-trained prediction models, keeps those predicted to exceed 500
    retweets while currently at or under 500, and attaches weibo details
    plus the author's profile.

    :returns: (results_dict keyed by uid, future_total, current_total)
        where the totals accumulate |predicted - current| counts.
    """
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"

    # query qualifying origin weibos posted before ts
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }},
                    {"term":{"message_type":1}},
                    {"range":{
                        "user_fansnum":{
                            "gte": 10000
                        }
                    }}
                ]
            }
        },
        "size": 10000
    }

    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]

    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []
    # NOTE(review): pickle.load on these model files assumes they are
    # trusted local artifacts
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)

    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name,item["_id"], ts)
        feature_list.append(tmp_feature_list)
    # Predict once over the complete feature matrix. The original called
    # predict() inside the loop above, re-scoring the growing list on every
    # iteration (quadratic work, only the final call mattered).
    if feature_list:
        weibo_prediction_result = weibo_model.predict(feature_list)
        uid_prediction_result = uid_model.predict(feature_list)
    else:
        weibo_prediction_result = []
        uid_prediction_result = []

    future_total = 0
    current_total = 0

    results_dict = dict()
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        # current propagation count of this weibo
        iter_count = es.count(index=task_name, doc_type="text", body={"query":{"term":{"root_mid":mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count-iter_count)
        # keep weibos predicted to break 500 that haven't yet
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count-iter_count)
            if uid not in results_dict:
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp


    # attach user profile: [nick_name, photo_url, fansnum, statusnum]
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["","",""])
            results_dict[uid]["user_profile"] = tmp


    return results_dict, future_total, current_total
Beispiel #34
0
def create_event_warning(xnr_user_no,start_time,end_time):
    #获取事件名称
    today_datetime = start_time
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list::',hashtag_list

    flow_text_index_name = get_day_flow_text_index_list(today_datetime)

    #虚拟人的粉丝列表和关注列表
    try:
        es_xnr_result=es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,id=xnr_user_no)['_source']
        followers_list=es_xnr_result['followers_list']
        fans_list=es_xnr_result['fans_list']
    except:
        followers_list=[]
        fans_list=[]

    event_warming_list=[]
    event_num=0
    for event_item in hashtag_list:
        event_sensitive_count=0
        event_warming_content=dict()     #事件名称、主要参与用户、典型微博、事件影响力、事件平均时间
        event_warming_content['event_name']=event_item['event_name']
        print 'event_name:',event_item
        event_num=event_num+1
        print 'event_num:::',event_num
        print 'first_time:::',int(time.time())
        event_influence_sum=0
        event_time_sum=0       
        query_body={
            'query':{
                # 'bool':{
                #     'must':[{'wildcard':{'text':'*'+event_item[0]+'*'}},
                #     {'range':{'sensitive':{'gte':1}}}]
                # }
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'term':{'hashtag':event_item['event_name']}},
                                {'range':{'sensitive':{'gte':1}}},
                                {'range':{'timestamp':{'gte':start_time,'lte':end_time}}}
                            ]
                        }
                    }
                }
            },
            'size':MAX_WARMING_SIZE,
            'sort':{'sensitive':{'order':'desc'}}
        }
        #try:         
        event_results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
        print 'event:::',len(event_results),start_time,end_time
        if event_results:
            weibo_result=[]
            fans_num_dict=dict()
            followers_num_dict=dict()
            alluser_num_dict=dict()
            print 'sencond_time:::',int(time.time())
            for item in event_results:
                #print 'event_content:',item['_source']['text']          
                
                #统计用户信息
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    followers_mark=set_intersection(item['_source']['uid'],followers_list)
                    if followers_mark > 0:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1*2
                    else:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1
                else:
                    alluser_num_dict[str(item['_source']['uid'])]=1                

                #计算影响力
                origin_influence_value=(1+item['_source']['comment']+item['_source']['retweeted'])*(1+item['_source']['sensitive'])
                # fans_value=judge_user_type(item['_source']['uid'],fans_list)
                followers_value=judge_user_type(item['_source']['uid'],followers_list)
                item['_source']['weibo_influence_value']=origin_influence_value*(followers_value)
                
                item['_source']['nick_name']=get_user_nickname(item['_source']['uid'])

                weibo_result.append(item['_source'])

                #统计影响力、时间
                event_influence_sum=event_influence_sum+item['_source']['weibo_influence_value']
                event_time_sum=event_time_sum+item['_source']['timestamp']            
        
            print 'third_time:::',int(time.time())
            #典型微博信息
            the_weibo_result=remove_repeat_v2(weibo_result)
            the_weibo_result.sort(key=lambda k:(k.get('weibo_influence_value',0)),reverse=True)
            event_warming_content['main_weibo_info']=json.dumps(the_weibo_result)

            #事件影响力和事件时间
            number=len(event_results)
            event_warming_content['event_influence']=event_influence_sum/number
            event_warming_content['event_time']=event_time_sum/number

        # except:
        #     event_warming_content['main_weibo_info']=[]
        #     event_warming_content['event_influence']=0
        #     event_warming_content['event_time']=0
        
        # try:
            #对用户进行排序
            alluser_num_dict=sorted(alluser_num_dict.items(),key=lambda d:d[1],reverse=True)
            main_userid_list=[]
            for i in xrange(0,len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])

        #主要参与用户信息
            main_user_info=[]
            user_es_result=es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':main_userid_list})['docs']
            for item in user_es_result:

                user_dict=dict()
                if item['found']:
                    user_dict['photo_url']=item['_source']['photo_url']
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=item['_source']['nick_name']
                    user_dict['favoritesnum']=item['_source']['favoritesnum']
                    user_dict['fansnum']=item['_source']['fansnum']
                else:
                    user_dict['photo_url']=''
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=''
                    user_dict['favoritesnum']=0
                    user_dict['fansnum']=0
                main_user_info.append(user_dict)
            event_warming_content['main_user_info']=json.dumps(main_user_info)


        # except:
            # event_warming_content['main_user_info']=[]
            print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no']=xnr_user_no
            event_warming_content['validity']=0
            event_warming_content['timestamp']=today_datetime

            event_warming_list.append(event_warming_content)
        else:
        	pass
        print 'fifth_time:::',int(time.time())
    return event_warming_list
Beispiel #35
0
def get_profile_information(uid_list):
    result_dict = dict()
    search_result = es.mget(index=index_name,
                            doc_type=index_type,
                            body={'ids': uid_list},
                            _source=True)['docs']
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': uid_list},
            fields=['user_fansnum', 'weibo_month_sum',
                    'user_friendsnum'])['docs']
    except:
        bci_history_result = []
    iter_count = 0
    for item in search_result:
        user_dict = {}
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {}
        for field in fields_dict:
            try:
                if field == 'statusnum':
                    if bci_history_item and bci_history_item['found'] == True:
                        if isinstance(
                                bci_history_item['fields']['weibo_month_num']
                            [0], int):
                            user_dict[field] = bci_history_item['fields'][
                                'weibo_month_sum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                elif field == 'fansnum':
                    if bci_history_item and bci_history_item['found'] == True:
                        if isinstance(
                                bci_history_item['fields']['user_fansnum'][0],
                                int):
                            user_dict[field] = bci_history_item['fields'][
                                'user_fansnum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                elif field == 'friendsnum':
                    if bci_history_item and bci_history_item['found'] == True:
                        if isinstance(
                                bci_history_item['fields']['user_friendsnum']
                            [0], int):
                            user_dict[field] = bci_history_item['fields'][
                                'user_friendsnum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                else:
                    try:
                        user_dict[field] = item['_source'][field]
                    except:
                        user_dict[field] = ''
            except:
                if field == 'statusnum':
                    user_dict[field] = 0
                elif field == 'fansnum':
                    user_dict[field] = 0
                elif field == 'friendsnum':
                    user_dict[field] = 0
                elif field == 'gender':
                    user_dict[field] = 0
                elif field == 'uname':
                    user_dict[field] = u'unknown'
                else:
                    user_dict[field] = 'unknown'
        result_dict[item['_id']] = user_dict
        iter_count += 1
    return result_dict
def get_profile_information(uid_list):
    """Fetch profile fields for every uid in uid_list.

    Text fields come from the profile index; the statusnum/fansnum/
    friendsnum counters are read from the bci_history index when the
    history document exists and carries an int value, else 0.
    Returns {uid: {field: value}}.
    """
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': uid_list}, fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs']
    except:
        # history index unavailable -- counters fall back to 0 below
        bci_history_result = []
    iter_count = 0
    for item in search_result:
        user_dict = {}
        try:
            # mget preserves request order, so hits align by position
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {}
        for field in fields_dict:
            try:
                if field == 'statusnum':
                    if bci_history_item and bci_history_item['found']==True:
                        # BUGFIX: previously probed 'weibo_month_num', a key never
                        # requested from the mget above, so this always raised and
                        # statusnum silently became 0; check the field actually used.
                        if isinstance(bci_history_item['fields']['weibo_month_sum'][0], int):
                            user_dict[field] = bci_history_item['fields']['weibo_month_sum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                elif field == 'fansnum':
                    if bci_history_item and bci_history_item['found']==True:
                        if isinstance(bci_history_item['fields']['user_fansnum'][0], int):
                            user_dict[field] = bci_history_item['fields']['user_fansnum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                elif field == 'friendsnum':
                    if bci_history_item and bci_history_item['found']==True:
                        if isinstance(bci_history_item['fields']['user_friendsnum'][0], int):
                            user_dict[field] = bci_history_item['fields']['user_friendsnum'][0]
                        else:
                            user_dict[field] = 0
                    else:
                        user_dict[field] = 0
                else:
                    try:
                        user_dict[field] = item['_source'][field]
                    except:
                        user_dict[field] = ''
            except:
                # per-field fallbacks for unexpected errors
                if field=='statusnum':
                    user_dict[field] = 0
                elif field=='fansnum':
                    user_dict[field] =0
                elif field=='friendsnum':
                    user_dict[field] = 0
                elif field=='gender':
                    user_dict[field] = 0
                elif field=='uname':
                    user_dict[field] = u'unknown'
                else:
                    user_dict[field] = 'unknown'
        result_dict[item['_id']] = user_dict
        iter_count += 1
    return result_dict
def main():
    if RUN_TYPE:
        now_ts = time.time()-DAY # 前一天
        now_ts = datetime2ts('2016-03-24')
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # 更新的键
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # 要删除的键

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)]
                    if item['found']: # 之前存在相关信息
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            item.remove(del_sensitive_key)
                        revise_item['uid'] = uid
                        # 新更新的敏感度
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # 新更新的敏感词
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # 新更新的string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # 当天和之前一天、一周和一月均值的差异
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # 更新后week、month的均值和方差
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count