Example #1
def co_search(es, user_list, bulk_action, count_n, tb):
    search_list = []
    for item in user_list:
        uid = item.get('uid', '0')  # obtain uid, notice "uid" or "user"
        search_list.append(uid)

    search_result = es.mget(index=index_destination,
                            doc_type=index_destination_doctype,
                            body={"ids": search_list},
                            _source=False)["docs"]
    search_list = []

    for item in search_result:
        if not item['found']:
            user_info = {}
            user_info['uid'] = item['_id']
            user_info['low_number'] = 0
            xdata = expand_index_action(user_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_n += 1
            if count_n % 1000 == 0:
                es.bulk(bulk_action,
                        index=index_destination,
                        doc_type=index_destination_doctype,
                        timeout=30)
                bulk_action = []
                print count_n

            if count_n % 10000 == 0:
                ts = time.time()
                print "count_n %s  per  %s  second" % (count_n, ts - tb)
                print "count %s " % count
                tb = ts

    return bulk_action, count_n, tb
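The globals `index_destination` and `index_destination_doctype` and the helper `expand_index_action` are defined elsewhere in the original module. Below is a minimal sketch of how a batching function like this might be driven, streaming users from a source index and flushing whatever is left in `bulk_action` at the end; the driver loop, batch size, client address and index names are assumptions for illustration, not part of the original.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import time

def run_co_search(es, source_index, source_doctype, batch_size=1000):
    # hypothetical driver: stream users from a source index and feed them to co_search in batches
    bulk_action, count_n, tb = [], 0, time.time()
    batch = []
    for hit in scan(es, query={"query": {"match_all": {}}}, index=source_index, doc_type=source_doctype):
        batch.append(hit["_source"])
        if len(batch) == batch_size:
            bulk_action, count_n, tb = co_search(es, batch, bulk_action, count_n, tb)
            batch = []
    if batch:
        bulk_action, count_n, tb = co_search(es, batch, bulk_action, count_n, tb)
    if bulk_action:
        # flush the tail that never reached the 1000-action threshold inside co_search
        es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)

if __name__ == "__main__":
    es_client = Elasticsearch(["127.0.0.1:9200"])          # assumed address
    run_co_search(es_client, "source_user_index", "user")  # hypothetical index/type names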
Example #2
def co_search(es, user_list, bulk_action, count_n, tb):
    search_list = []
    for item in user_list:
        uid = item.get('uid', '0') # obtain uid, notice "uid" or "user"
        search_list.append(uid)

    search_result = es.mget(index=index_destination, doc_type=index_destination_doctype, body={"ids": search_list}, _source=False)["docs"]
    search_list = []

    for item in search_result:
        if not item['found']:
            user_info = {}
            user_info['uid'] = item['_id']
            user_info['low_number'] = 0
            xdata = expand_index_action(user_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_n += 1
            if count_n % 1000 == 0:
                es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)
                bulk_action = []
                print count_n

            if count_n % 10000 == 0:
                ts = time.time()
                print "count_n %s  per  %s  second"  %(count_n, ts-tb)
                print "count %s " % count
                tb = ts

    return bulk_action, count_n, tb
Example #3
def filter_in(top_user_set):
    results = []
    try:
        in_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={'ids':list(top_user_set)})
    except Exception as e:
        print 'cron/recommend_in/recommend_in.py&error-2&', e
        return results  # bail out if the mget failed, otherwise in_results is undefined below
    filter_list = [item['_id'] for item in in_results['docs'] if item['found'] is True]
    results = set(top_user_set) - set(filter_list)
    return results
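All of the `filter_in`-style helpers on this page rely on the same property of `mget`: the `docs` list in the response preserves the order of the requested ids, and each doc carries a `found` flag. A small self-contained sketch of that partitioning pattern follows; the client address and index names are placeholders, not from the original code.

from elasticsearch import Elasticsearch

def partition_by_found(es, index, doc_type, id_list):
    # one mget round trip, then split the ids by the per-doc "found" flag
    docs = es.mget(index=index, doc_type=doc_type, body={"ids": id_list}, _source=False)["docs"]
    found_ids = [doc["_id"] for doc in docs if doc["found"]]
    missing_ids = [doc["_id"] for doc in docs if not doc["found"]]
    return found_ids, missing_ids

if __name__ == "__main__":
    es = Elasticsearch(["127.0.0.1:9200"])  # placeholder address
    in_portrait, out_portrait = partition_by_found(es, "user_portrait", "user", ["uid_1", "uid_2"])
    print in_portrait, out_portrait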
Example #4
def filter_in(top_user_set):
    results = []
    try:
        in_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={'ids':list(top_user_set)})
    except Exception as e:
        print 'cron/recommend_in/recommend_in.py&error-2&', e
        return results  # bail out if the mget failed, otherwise in_results is undefined below
    filter_list = [item['_id'] for item in in_results['docs'] if item['found'] is True]
    results = set(top_user_set) - set(filter_list)
    return results
Example #5
def all_makeup_info(uid_list , sort_norm , time):
    es = es_user_profile
    field_bci ,field_sen, field_weibo = get_all_filed(sort_norm , time) 
    search_result = es.mget(index=WEBUSER_INDEX_NAME , doc_type=WEBUSER_INDEX_TYPE, body={"ids":uid_list})["docs"]
    current_ts = datetime2ts(ts2datetime(TIME.time()-DAY))
    bci_result = es.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, fields=[field_bci, "user_fansnum", field_weibo, "weibo_month_sum"])["docs"]
    sen_result = es.mget(index=SESHIS_INDEX_NAME, doc_type=SESHIS_INDEX_TYPE, body={"ids":uid_list}, fields=[field_sen])["docs"]
    in_portrait = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":uid_list}, _source=False)["docs"]
    results = []
    #fans_result = es_user_profile.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, fields=["user_fansnum"], _source=False)["docs"]
    bci_max = get_max_value(es_user_profile, "bci_history", "bci", field_bci)
    sen_max = get_max_value(es_user_profile, "sensitive_history", "sensitive", field_sen)
    for i in range(len(uid_list)):
        tmp = dict()
        tmp['uid'] = uid_list[i]
        if search_result[i]['found']:
            iter_item = search_result[i]['_source']
            tmp['location'] = iter_item['user_location']
            tmp['uname'] = iter_item['nick_name']
            tmp['photo_url'] = iter_item['photo_url']
        else:
            tmp['location'] = None
            tmp['uname'] = tmp['uid']
            tmp['photo_url'] = 'unknown'
        if in_portrait[i]['found']:
            tmp['is_warehousing'] = True
        else:
            tmp['is_warehousing'] = False
        if bci_result[i]['found']:
            try:
                bci_value = bci_result[i]['fields'][field_bci][0]
                tmp['bci'] = math.log(bci_value/float(bci_max)*9+1,10)*100
            except:
                tmp['bci'] = 0
            try:
                tmp['fans'] = bci_result[i]['fields']["user_fansnum"][0]
            except:
                tmp['fans'] = ''
            try:
                tmp["weibo_count"] = bci_result[i]['fields']["weibo_month_sum"][0]
            except:
                tmp["weibo_count"] = ''
        else:
            tmp['bci'] = None
            tmp['fans'] = None
            tmp["weibo_count"] = None
        if sen_result[i]['found']:
            try:
                sen_value = sen_result[i]['fields'][field_sen][0]
                tmp['sen'] = math.log(sen_value/float(sen_max)*9+1,10)*100
            except:
                tmp['sen'] = 0
        else:
            tmp['sen'] = None

        results.append(tmp)
    return results
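The `bci` and `sen` scores above are rescaled with `math.log(value/float(max_value)*9+1, 10)*100`, which maps the raw range [0, max_value] onto [0, 100]: a zero score stays 0 and the maximum becomes 100, with intermediate values compressed logarithmically. A tiny standalone sketch of that normalization (the function name is illustrative):

import math

def log_normalize(value, max_value):
    # log10 rescaling used above: 0 -> 0, max_value -> 100
    if not max_value:
        return 0
    return math.log(value / float(max_value) * 9 + 1, 10) * 100

assert log_normalize(0, 500) == 0
assert abs(log_normalize(500, 500) - 100) < 1e-9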
Example #6
def all_makeup_info(uid_list , sort_norm , time):
    es = es_user_profile
    field_bci ,field_sen, field_weibo = get_all_filed(sort_norm , time) 
    search_result = es.mget(index=WEBUSER_INDEX_NAME , doc_type=WEBUSER_INDEX_TYPE, body={"ids":uid_list})["docs"]
    current_ts = datetime2ts(ts2datetime(TIME.time()-DAY))
    bci_result = es.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, fields=[field_bci, "user_fansnum", field_weibo, "weibo_month_sum"])["docs"]
    sen_result = es.mget(index=SESHIS_INDEX_NAME, doc_type=SESHIS_INDEX_TYPE, body={"ids":uid_list}, fields=[field_sen])["docs"]
    in_portrait = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":uid_list}, _source=False)["docs"]
    results = []
    #fans_result = es_user_profile.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, fields=["user_fansnum"], _source=False)["docs"]
    bci_max = get_max_value(es_user_profile, "bci_history", "bci", field_bci)
    sen_max = get_max_value(es_user_profile, "sensitive_history", "sensitive", field_sen)
    for i in range(len(uid_list)):
        tmp = dict()
        tmp['uid'] = uid_list[i]
        if search_result[i]['found']:
            iter_item = search_result[i]['_source']
            tmp['location'] = iter_item['user_location']
            tmp['uname'] = iter_item['nick_name']
        else:
            tmp['location'] = None
            tmp['uname'] = tmp['uid']
        if in_portrait[i]['found']:
            tmp['is_warehousing'] = True
        else:
            tmp['is_warehousing'] = False
        if bci_result[i]['found']:
            try:
                bci_value = bci_result[i]['fields'][field_bci][0]
                tmp['bci'] = math.log(bci_value/float(bci_max)*9+1,10)*100
            except:
                tmp['bci'] = 0
            try:
                tmp['fans'] = bci_result[i]['fields']["user_fansnum"][0]
            except:
                tmp['fans'] = 0
            try:
                tmp["weibo_count"] = bci_result[i]['fields']["weibo_month_sum"][0]
            except:
                tmp["weibo_count"] = 0
        else:
            tmp['bci'] = None
            tmp['fans'] = None
            tmp["weibo_count"] = None
        if sen_result[i]['found']:
            try:
                sen_value = sen_result[i]['fields'][field_sen][0]
                tmp['sen'] = math.log(sen_value/float(sen_max)*9+1,10)*100
            except:
                tmp['sen'] = 0
        else:
            tmp['sen'] = None

        results.append(tmp)
    return results
Example #7
def in_makeup_info(uid_list , sort_norm , time):
    es = es_user_portrait
    search_results = []
    results = []
    ts = datetime2ts(ts2datetime(TIME.time()-DAY))
    field_bci , field_sen ,field_imp ,field_act = get_in_filed(sort_norm,time)
    field_dict = {"uid":"uid","uname":"uname","location":"location","topic":"topic_string","domain":"domain","fans":"fansnum", "act":"activeness", "imp":"importance", "bci":"influence", "sen":"sensitive"}
    if uid_list:
        search_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=["uid","uname","location","topic_string","domain","fansnum", "influence", "importance", "activeness", "sensitive"])["docs"]
        
        bci_results = es.mget(index=BCI_INDEX_NAME, doc_type=BCI_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_bci,"user_fansnum", "weibo_month_sum"])["docs"]
        imp_results = es.mget(index=IMP_INDEX_NAME, doc_type=IMP_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_imp])["docs"]
        act_results = es.mget(index=ACT_INDEX_NAME, doc_type=ACT_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_act])["docs"]
        sen_results = es.mget(index=SES_INDEX_NAME, doc_type=SES_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_sen])["docs"]
        
        results = []
        for i in range(len(uid_list)):
            item = dict()
            if not search_results[i].get('found', 0):
                continue
            for k,v in field_dict.iteritems():
                item[k] = search_results[i]["fields"][v][0]
                if k == "uname" and not item[k]:
                    item[k] = uid_list[i]


            try:
                act_value = act_results[i]['fields'][field_act][0]
                item['act'] = act_value
            except:
                item['act'] = 0
            try:
                imp_value = imp_results[i]['fields'][field_imp][0]
                item['ipm'] = imp_value
            except:
                item['ipm'] = 0
            try:
                user_fansnum = bci_results[i]['fields']['user_fansnum'][0]
                item['fans'] = user_fansnum
            except:
                item['fans'] = 0
            try:
                bci_value = bci_results[i]['fields'][field_bci][0]
                item['bci'] = bci_value
            except:
                item['bci'] = 0 
            try:
                sen_value = sen_results[i]['fields'][field_sen][0]
                item['sen'] = sen_value
            except:
                item['sen'] = 0
            
            results.append(item)

    return results
Example #8
def in_makeup_info(uid_list , sort_norm , time):
    es = es_user_portrait
    search_results = []
    results = []
    ts = datetime2ts(ts2datetime(TIME.time()-DAY))
    field_bci , field_sen ,field_imp ,field_act = get_in_filed(sort_norm,time)
    field_dict = {"uid":"uid","uname":"uname","location":"location","topic":"topic_string","domain":"domain","fans":"fansnum", "act":"activeness", "imp":"importance", "bci":"influence", "sen":"sensitive"}
    if uid_list:
        search_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=["uid","uname","location","topic_string","domain","fansnum", "influence", "importance", "activeness", "sensitive"])["docs"]
        
        bci_results = es.mget(index=BCI_INDEX_NAME, doc_type=BCI_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_bci,"user_fansnum", "weibo_month_sum"])["docs"]
        imp_results = es.mget(index=IMP_INDEX_NAME, doc_type=IMP_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_imp])["docs"]
        act_results = es.mget(index=ACT_INDEX_NAME, doc_type=ACT_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_act])["docs"]
        sen_results = es.mget(index=SES_INDEX_NAME, doc_type=SES_INDEX_TYPE, body={"ids":uid_list}, _source=False, fields=[field_sen])["docs"]
        
        results = []
        for i in range(len(uid_list)):
            item = dict()
            if not search_results[i].get('found', 0):
                continue
            for k,v in field_dict.iteritems():
                item[k] = search_results[i]["fields"][v][0]
                if k == "uname" and not item[k]:
                    item[k] = uid_list[i]


            try:
                act_value = act_results[i]['fields'][field_act][0]
                item['act'] = act_value
            except:
                item['act'] = 0
            try:
                imp_value = imp_results[i]['fields'][field_imp][0]
                item['ipm'] = imp_value
            except:
                item['ipm'] = 0
            try:
                user_fansnum = bci_results[i]['fields']['user_fansnum'][0]
                item['fans'] = user_fansnum
            except:
                item['fans'] = ''
            try:
                bci_value = bci_results[i]['fields'][field_bci][0]
                item['bci'] = bci_value
            except:
                item['bci'] = 0 
            try:
                sen_value = sen_results[i]['fields'][field_sen][0]
                item['sen'] = sen_value
            except:
                item['sen'] = 0
            
            results.append(item)

    return results
Example #9
def filter_in(top_user_set):
    results = []
    try:
        in_results = es_user_portrait.mget(index='user_portrait', doc_type='user', body={'ids':list(top_user_set)})
    except Exception as e:
        raise e
    filter_list = [item['_id'] for item in in_results['docs'] if item['found'] is True]
    print 'before filter in:', len(top_user_set)
    print 'filter_list:', len(filter_list)
    results = set(top_user_set) - set(filter_list)
    print 'after filter in:', len(results)
    return results
Example #10
def get_bci_detail():
    uid_list = []
    with open("uid_list_0520.txt", 'rb') as f:
        for item in f:
            uid_list.append(item.strip())
    
    print uid_list
    index_name = "bci_20160522"
    bci_results = es_user_portrait.mget(index=index_name, doc_type="bci", body={"ids":uid_list})["docs"]
    with open("bci_detail_0522.txt", "wb") as f:
       for item in bci_results:
           if item["found"]:
               f.write(json.dumps(item["_source"])+"\n")
Example #11
def get_bci_detail():
    uid_list = []
    with open("uid_list_0520.txt", 'rb') as f:
        for item in f:
            uid_list.append(item.strip())

    print uid_list
    index_name = "bci_20160522"
    bci_results = es_user_portrait.mget(index=index_name,
                                        doc_type="bci",
                                        body={"ids": uid_list})["docs"]
    with open("bci_detail_0522.txt", "wb") as f:
        for item in bci_results:
            if item["found"]:
                f.write(json.dumps(item["_source"]) + "\n")
Example #12
def get_temporal_rank(timestamp):
    index = get_queue_index(timestamp)
    index_ts = "influence_timestamp_" + str(index)
    
    uid_list = r.zrange(index_ts, 0, 10000, desc=True)
    user_info = []
    in_portrait = [] # uids already stored in the portrait index
    if uid_list:
        search_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":uid_list}, fields=SOCIAL_SENSOR_INFO)["docs"]
        for item in search_result:
            if item["found"]:
                temp = []
                in_portrait.append(item['_id'])
                for iter_key in SOCIAL_SENSOR_INFO:
                   
    """
Example #13
def es_km_storage(uid_list):
    es_results = es_user_portrait.mget(index="user_portrait_1222", doc_type="user", body={"ids":uid_list})["docs"]
    in_list = []
    out_list = []
    bulk_action = []
    for item in es_results:
        if item["found"]:
            in_list.append(item["_id"])
            bulk_action.append(item["_source"])
        else:
            out_list.append(item["_id"])

    if bulk_action:
        es_km.bulk(bulk_action, index='user_portrait', doc_type="user", timeout=60) 

    return in_list, out_list
Example #14
def get_forward_numerical_info(task_name, ts, create_by):
    results = []
    ts_series = []
    for i in range(1, forward_n + 1):
        ts_series.append(ts - i * time_interval)

    # check if detail es of task exists
    doctype = create_by + "-" + task_name
    index_exist = es_user_portrait.indices.exists_type(index_sensing_task, doctype)
    if not index_exist:
        print "new create task detail index"
        mappings_sensing_task(doctype)

    if ts_series:
        search_results = es_user_portrait.mget(index=index_sensing_task, doc_type=doctype, body={"ids": ts_series})[
            "docs"
        ]
        found_count = 0
        average_origin = []
        average_retweeted = []
        average_commet = []
        average_total = []
        average_negetive = []
        for item in search_results:
            if item["found"]:
                temp = item["_source"]
                sentiment_dict = json.loads(temp["sentiment_distribution"])
                average_total.append(int(temp["weibo_total_number"]))
                average_negetive.append(
                    int(sentiment_dict["2"])
                    + int(sentiment_dict["3"])
                    + int(sentiment_dict["4"])
                    + int(sentiment_dict["5"])
                    + int(sentiment_dict["6"])
                )
                found_count += 1

        if found_count > initial_count:
            number_mean = np.mean(average_total)
            number_std = np.std(average_total)
            sentiment_mean = np.mean(average_negetive)
            sentiment_std = np.std(average_negetive)
            results = [1, number_mean, number_std, sentiment_mean, sentiment_std]
        else:
            results = [0]

    return results
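`get_forward_numerical_info` builds a baseline over the previous `forward_n` windows and returns `[1, mean, std, sentiment_mean, sentiment_std]`; the burst-detection examples further down then flag an anomaly when the current count exceeds `mean + 1.96*std`, roughly a one-sided 95% band under a normality assumption. A standalone sketch of that check, with illustrative numbers only:

import numpy as np

def is_burst(history_counts, current_count, z=1.96):
    # flag a burst when the current count exceeds mean + z * std of the history
    mean = np.mean(history_counts)
    std = np.std(history_counts)
    return current_count > mean + z * std

print is_burst([120, 135, 128, 140, 131], 300)   # True: well above the baseline band
print is_burst([120, 135, 128, 140, 131], 138)   # False: within the baseline band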
Example #15
def get_forward_numerical_info(task_name, ts, create_by):
    results = []
    ts_series = []
    for i in range(1, forward_n + 1):
        ts_series.append(ts - i * time_interval)

    # check if detail es of task exists
    doctype = create_by + "-" + task_name
    index_exist = es_user_portrait.indices.exists_type(index_sensing_task,
                                                       doctype)
    if not index_exist:
        print "new create task detail index"
        mappings_sensing_task(doctype)

    if ts_series:
        search_results = es_user_portrait.mget(index=index_sensing_task,
                                               doc_type=doctype,
                                               body={"ids": ts_series})['docs']
        found_count = 0
        average_origin = []
        average_retweeted = []
        average_commet = []
        average_total = []
        average_negetive = []
        for item in search_results:
            if item['found']:
                temp = item['_source']
                sentiment_dict = json.loads(temp['sentiment_distribution'])
                average_total.append(int(temp['weibo_total_number']))
                average_negetive.append(
                    int(sentiment_dict["2"]) + int(sentiment_dict['3']) +
                    int(sentiment_dict['4']) + int(sentiment_dict['5']) +
                    int(sentiment_dict['6']))
                found_count += 1

        if found_count > initial_count:
            number_mean = np.mean(average_total)
            number_std = np.std(average_total)
            sentiment_mean = np.mean(average_negetive)
            sentiment_std = np.std(average_negetive)
            results = [
                1, number_mean, number_std, sentiment_mean, sentiment_std
            ]
        else:
            results = [0]

    return results
Example #16
def get_influence_value(date_time, field_name, uid_list):
    datename = ts2datetime(date_time - DAY)
    new_datetime = datename[0:4] + datename[5:7] + datename[8:10]
    bci_index_name = weibo_bci_index_name_pre + new_datetime
    index_value_list = []
    try:
        result = es_user_portrait.mget(index=bci_index_name,
                                       doc_type=weibo_bci_index_type,
                                       body={'ids': uid_list},
                                       _source=True)['docs']
        for item in result:
            # print 'item_influence::',item
            # print 'item_type::',type(item)
            if item['found']:
                index_value_list.append(item['_source']['user_index'])
    except Exception, e:
        print 'influence query error::', e
    return index_value_list
Example #17
def search_attention(uid):
    stat_results = dict()
    results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall('retweet_'+str(uid))
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    try:
                        stat_results[ruid] += ruid_results[ruid]
                    except:
                        stat_results[ruid] = ruid_results[ruid]
    # print 'results:', stat_results
    if not stat_results:
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(), key=lambda x:x[1], reverse=True)[:20]
    except:
        return [None, 0]
    print 'sort_state_results:', sort_state_results
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list})['docs']
    es_portrait_results = es_user_portrait.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    result_list = dict()
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'unknown'
        # identify uid is in the user_portrait
        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']
            in_status = 1
        except:
            in_status = 0

        result_list[uid] = [uid,[uname, stat_results[uid], in_status]]
       
    return [result_list, len(stat_results)]
Example #18
def search_attention(uid):
    stat_results = dict()
    results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall("retweet_" + str(uid))
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    try:
                        stat_results[ruid] += ruid_results[ruid]
                    except:
                        stat_results[ruid] = ruid_results[ruid]
    # print 'results:', stat_results
    if not stat_results:
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    except:
        return [None, 0]
    print "sort_state_results:", sort_state_results
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": uid_list})["docs"]
    es_portrait_results = es_user_portrait.mget(index="user_portrait", doc_type="user", body={"ids": uid_list})["docs"]
    result_list = dict()
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item["_id"]
        try:
            source = item["_source"]
            uname = source["nick_name"]
        except:
            uname = u"未知"
        # identify uid is in the user_portrait
        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item["_source"]
            in_status = 1
        except:
            in_status = 0

        result_list[uid] = [uid, [uname, stat_results[uid], in_status]]

    return [result_list, len(stat_results)]
Example #19
def co_search(add_info, update_bci_key, former_bci_key, now_ts):
    uid_list = add_info.keys()
    evaluate_history_results = es_user_portrait.mget(index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE,body={'ids':uid_list})['docs']
    iter_count = 0
    bulk_action = []
    for uid in uid_list:
        item = evaluate_history_results[iter_count]
        if item['found']:
            user_history_item = item['_source']
            # update with the new fields
            user_history_item.update(add_info[uid])
            user_history_item['bci_day_change'] = user_history_item[update_bci_key] - user_history_item.get(former_bci_key, 0)
            user_history_item['bci_week_change'] = user_history_item[update_bci_key] - user_history_item.get('bci_week_ave', 0)
            user_history_item['bci_month_change'] = user_history_item[update_bci_key] - user_history_item.get('bci_month_ave', 0)
            user_history_item['bci_week_ave'], user_history_item['bci_week_var'], user_history_item['bci_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['bci_month_ave'], user_history_item['bci_month_var'], user_history_item['bci_month_sum'] = compute_month(user_history_item, now_ts)
            if user_history_item[update_bci_key] < LOW_INFLUENCE_THRESHOULD:
                user_history_item['low_number'] += 1
            else:
                user_history_item['low_number'] = 0
        else:
            user_history_item = dict()
            user_history_item.update(add_info[uid])
            user_history_item["uid"] = uid
            user_history_item.update(add_info[uid])
            user_history_item['bci_day_change'] = user_history_item[update_bci_key]
            user_history_item['bci_week_change'] = user_history_item[update_bci_key]
            user_history_item['bci_month_change'] = user_history_item[update_bci_key]
            user_history_item['bci_week_ave'], user_history_item['bci_week_var'], user_history_item['bci_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['bci_month_ave'], user_history_item['bci_month_var'], user_history_item['bci_month_sum'] = compute_month(user_history_item, now_ts)
            if user_history_item[update_bci_key] < LOW_INFLUENCE_THRESHOULD:
                user_history_item['low_number'] = 1
        iter_count += 1

        try:
            user_history_item.pop(del_bci_key)
        except:
            pass

        action = {'index':{'_id': uid}}
        bulk_action.extend([action, user_history_item])
    if bulk_action:
        es_cluster.bulk(bulk_action, index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE,timeout=600)
    print iter_count
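Both history-updating `co_search` variants (this one and the sensitive-history one below) build the bulk body as flat pairs: an action line `{'index': {'_id': uid}}` immediately followed by the document, which is the shape the low-level `bulk` call expects when `index` and `doc_type` are passed as parameters. A minimal sketch of that pairing; the client address and index names are placeholders:

from elasticsearch import Elasticsearch

def bulk_index_docs(es, index, doc_type, docs_by_id):
    # build flat action/document pairs for the low-level bulk API
    bulk_action = []
    for doc_id, doc in docs_by_id.iteritems():
        bulk_action.append({"index": {"_id": doc_id}})
        bulk_action.append(doc)
    if bulk_action:
        es.bulk(bulk_action, index=index, doc_type=doc_type, timeout=600)

if __name__ == "__main__":
    es = Elasticsearch(["127.0.0.1:9200"])  # placeholder address
    bulk_index_docs(es, "copy_user_portrait_influence", "bci",     # placeholder index/type
                    {"123456": {"uid": "123456", "low_number": 0}})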
Example #20
def es_km_storage(uid_list):
    es_results = es_user_portrait.mget(index=remote_portrait_name,
                                       doc_type=portrait_type,
                                       body={"ids": uid_list})["docs"]
    in_list = []
    out_list = []
    bulk_action = []
    for item in es_results:
        if item["found"]:
            in_list.append(item["_id"])
            bulk_action.append(item["_source"])
        else:
            out_list.append(item["_id"])

    if bulk_action:
        es_km.bulk(bulk_action,
                   index=portrait_name,
                   doc_type=portrait_type,
                   timeout=60)

    return out_list
Example #21
def filter_out(all_user_set):
    out_results = []
    all_user_list = list(all_user_set)
    all_count = len(all_user_set)
    out_count = 0
    iter_count = 0
    while out_count < RECOMMEND_IN_OUT_SIZE:
        iter_user_list = all_user_list[iter_count: iter_count + RECOMMEND_IN_ITER_COUNT]
        if iter_user_list == []:
            break
        #out portrait
        try:
            in_portrait_result = es_user_portrait.mget(index=portrait_index_name, \
                    doc_type=portrait_index_type, body={'ids': iter_user_list})['docs']
        except:
            in_portrait_result = []
        for in_item in in_portrait_result:
            if in_item['found'] == False:
                out_count += 1
                out_results.append(in_item['_id'])
        iter_count += RECOMMEND_IN_ITER_COUNT
    return out_results
Example #22
def filter_out(all_user_set):
    out_results = []
    all_user_list = list(all_user_set)
    all_count = len(all_user_set)
    out_count = 0
    iter_count = 0
    while out_count < RECOMMEND_IN_OUT_SIZE:
        iter_user_list = all_user_list[iter_count: iter_count + RECOMMEND_IN_ITER_COUNT]
        if iter_user_list == []:
            break
        #out portrait
        try:
            in_portrait_result = es_user_portrait.mget(index=portrait_index_name, \
                    doc_type=portrait_index_type, body={'ids': iter_user_list})['docs']
        except:
            in_portrait_result = []
        for in_item in in_portrait_result:
            if in_item['found'] == False:
                out_count += 1
                out_results.append(in_item['_id'])
        iter_count += RECOMMEND_IN_ITER_COUNT
    return out_results
Example #23
def co_search(add_info, update_bci_key, former_bci_key, now_ts):
    uid_list = add_info.keys()
    evaluate_history_results = es_user_portrait.mget(index=COPY_USER_PORTRAIT_SENSITIVE, doc_type=COPY_USER_PORTRAIT_SENSITIVE_TYPE,body={'ids':uid_list})['docs']
    iter_count = 0
    bulk_action = []
    for uid in uid_list:
        item = evaluate_history_results[iter_count]
        if item['found']:
            user_history_item = item['_source']
            # update with the new fields
            user_history_item.update(add_info[uid])
            user_history_item['sensitive_day_change'] = user_history_item[update_bci_key] - user_history_item.get(former_bci_key, 0)
            user_history_item['sensitive_week_change'] = user_history_item[update_bci_key] - user_history_item.get('sensitive_week_ave', 0)
            user_history_item['sensitive_month_change'] = user_history_item[update_bci_key] - user_history_item.get('sensitive_month_ave', 0)
            user_history_item['sensitive_week_ave'], user_history_item['sensitive_week_var'], user_history_item['sensitive_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['sensitive_month_ave'], user_history_item['sensitive_month_var'], user_history_item['sensitive_month_sum'] = compute_month(user_history_item, now_ts)
        else:
            user_history_item = dict()
            user_history_item.update(add_info[uid])
            user_history_item["uid"] = uid
            user_history_item.update(add_info[uid])
            user_history_item['sensitive_day_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_week_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_month_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_week_ave'], user_history_item['sensitive_week_var'], user_history_item['sensitive_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['sensitive_month_ave'], user_history_item['sensitive_month_var'], user_history_item['sensitive_month_sum'] = compute_month(user_history_item, now_ts)
        iter_count += 1

        try:
            user_history_item.pop(del_bci_key)
        except:
            pass

        action = {'index':{'_id': uid}}
        bulk_action.extend([action, user_history_item])
    if bulk_action:
        es_cluster.bulk(bulk_action, index=COPY_USER_PORTRAIT_SENSITIVE, doc_type=COPY_USER_PORTRAIT_SENSITIVE_TYPE,timeout=600)
    print iter_count
Example #24
def specific_keywords_burst_dection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = int(task_detail[7])
    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # original weibo mids from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range)
    # original weibo mids in the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    print "all mid list: ", len(all_mid_list)
    # count, within the current window, the retweets and comments of the current and the previous 12 hours' original weibo, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list)
    current_total_count = statistics_count['total_count']
    # total number of weibo in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # sensitive-weibo monitoring: given the sensors and the sensitive words, any sensor weibo that mentions a sensitive word is treated as a warning

    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    if datetime != datetime_1:
        index_name = flow_text_index_name_pre + datetime_1
    else:
        index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list)

        sentiment_count = search_results
        print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # aggregate the important users in the current window
    important_uid_list = []
    if exist_es:
        #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
        search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
        important_uid_list = search_results.keys()
        if datetime != datetime_1:
            index_name_1 = flow_text_index_name_pre + datetime_1
            if es_text.indices.exists(index_name_1):
                #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
                search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
                if search_results_1:
                    for item in search_results_1:
                        important_uid_list.append(item['key'])
    # match important users from the portrait library using the uids obtained above
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                    filter_important_list.append(item['_id'])
    print filter_important_list

    # 6. sensitive word detection: raise a warning as soon as a sensitive word appears in a sensor's weibo
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0

    if sensitive_words:
        query_sensitive_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"range":{
                                    "timestamp":{
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }}
                                },
                                {"terms": {"keywords_string": sensitive_words}}
                            ]
                        }
                    }
                }
            },
            "aggs":{
                "all_list":{
                    "terms":{"field": "message_type"}
                }
            }
        }
        if social_sensors:
            query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass

            sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number




    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"
    process_status = "1"

    if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT: # abnormal number of sensitive weibo
        print "======================"
        if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # use the moving-average baseline to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count+1.96*std_count: # anomalous weibo volume
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # abnormal count
        if negetive_count > mean_sentiment+1.96*std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # abnormal negative sentiment; "12" means both count and sentiment are abnormal
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # 7. sensed events, based on all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # if sensitive weibo appeared, aggregate them (and strip that flag from burst_reason); otherwise aggregate ordinary weibo
    if burst_reason: # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "bool":{
                                "must":[
                                    {"range":{
                                        "timestamp":{
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }}
                                    },
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }

            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits']
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict) # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")

        current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1)
        print "current_origin_mid_list:", len(current_origin_mid_list)
        if burst_reason and current_mid_list:
            origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"]
            if origin_sensing_text:
                for item in origin_sensing_text:
                    if item["found"]:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict) # cleaned text: mid, text
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = "" # no related weibo, reset
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list) # generate feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            print "========================================================================================"
            print "========================================================================================="
            sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list, ", topic_list

    if not topic_list:
        warning_status = signal_nothing
        tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store the results for the current window in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)

    return "1"
Example #25
                 new_user_item['aver_influence'] = aver_influence
                 new_user_item['aver_importance'] = aver_importance
                 action = {'index':{'_id': uid}}
                 bulk_action.extend([action, new_user_item])
                 iter_count += 1
             es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
             bulk_action = []
             add_info = {}
             iter_count = 0
         break
     except Exception, e:
         raise e
 
 if len(add_info)!=0:
     uid_list = add_info.keys()
     evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids':uid_list})['docs']
     '''
     del_date = ts2datetime(time.time() - DAY*31)
     del_activeness_key = 'activeness_'+del_date
     del_influence_key = del_date
     '''
     iter_count = 0
     for uid in uid_list:
         try:
             user_history_item = evaluate_history_results[iter_count]['_source']
         except:
             user_history_item = {}
         try:
             user_history_item.pop(del_activeness_key)
             user_history_item.pop(del_influence_key)
             user_history_item.pop(del_importance_key)
Example #26
def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])

    print ts2date(ts)
    # PART 1
    
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mids from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mids in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list) # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count the retweets and comments of these weibo within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library using the uids obtained above
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # decide the sensing result
    finish = unfinish_signal # "0"
    process_status = "1"


    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, based on all_mid_list
    sensitive_text_list = []

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict() # text info
            mid_value = dict() # topic value per mid
            duplicate_dict = dict() # duplicate mapping
            portrait_dict = dict() # background info
            classify_text_dict = dict() # text for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation # weibo involving sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classify
                if classify_text_dict:
                     classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                     mid_value = dict()
                     #print "classify_results: ", classify_results
                     for k,v in classify_results.iteritems(): # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)


    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the results for the current window in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
Example #27
def scan_index_history():
    s_re = scan(es_user_portrait, query={'query':{'match_all':{}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_'+now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY*31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_'+del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']

            activeness_key = 'activeness_'+now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)

            add_info[uid] = {activeness_key:normal_activeness, influence_key:normal_influence, importance_key:normal_importance}
            if count % 1000==0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids':uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD: # update the low-influence streak; such users become candidates for removal
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index':{'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1  # advance to the next mget result
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys() 
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids':uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise e
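The loop above follows a common mget-merge-bulk pattern: fetch each batch's existing history documents by id, drop the expired day keys, merge in the new day's values, and re-index everything in one bulk call. A minimal, self-contained sketch of that pattern (the client, index, and doc_type arguments here are placeholders rather than the names used in the example):

def merge_and_reindex(es_client, index, doc_type, add_info, expired_keys):
    # add_info: {uid: {new_key: value, ...}}; expired_keys: history keys to drop
    uid_list = list(add_info.keys())
    history = es_client.mget(index=index, doc_type=doc_type,
                             body={'ids': uid_list})['docs']
    bulk_action = []
    for i, uid in enumerate(uid_list):
        old_item = history[i].get('_source', {}) if history[i].get('found') else {}
        for key in expired_keys:
            old_item.pop(key, None)  # drop values that fell out of the retention window
        new_item = dict(old_item, **add_info[uid])
        bulk_action.extend([{'index': {'_id': uid}}, new_item])
    if bulk_action:
        es_client.bulk(bulk_action, index=index, doc_type=doc_type)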
Ejemplo n.º 28
0
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = ''  ,time = 7 , isall = False, number = 100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix":{"text": "#" +  key + "#"}})
        else:    
            should.append({"wildcard":{"text": "*" +key + "*"}})    
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1

    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []

    query_body = {
        "query":{
            "bool":{
                "must":should
             }
        },
        "sort":{"user_fansnum":{"order":"desc"}},
        "size":5000
    }
                    
    results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results :
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    
    #get_all_filed(sort_norm , time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list : # 库内
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])    
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.append([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[iter_index])

    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid)+"\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list,isall,time,sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id,  body=item)

    return "1"
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])

    print ts2date(ts)
    # PART 1
    
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list) # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets and comments of these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # PART 2
    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count.get(key, 0)  # missing buckets count as zero


    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the obtained uid_list against the user-portrait library for important users
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])


    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(all_mid_list)*AVERAGE_COUNT: # anomaly detected
            if forward_warning_status == signal_brust: # an event already exists, switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # abnormal weibo count

        if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list)*AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # abnormal negative sentiment; "12" means both count and sentiment are abnormal
            if forward_warning_status == signal_brust: # an event already exists, switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed content, based on all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    sensitive_words_dict = dict()  # defaults, in case no flow-text index or mid list is found below
    sensitive_weibo_detail = {}

    # start once an event is detected
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            tmp_sensitive_warning = ""
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

            """
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = "" #没有相关微博,归零
                print "no relate weibo text"
            else:
                feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据
                word_label, evaluation_results = kmeans(feature_words, text_list) #聚类
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "clustering weibo topic"
                sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        if item[0] != "other":
                            topic_list.append(word_label[item[0]])
                print "topic list: ", len(topic_list)
            """

    results = dict()
    if sensitive_weibo_detail:
        print "sensitive_weibo_detail: ", sensitive_weibo_detail
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the current-period results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
Ejemplo n.º 30
0
def get_scan_results():
    result_dict = {}
    gender_result = {'1':0, '2':0}
    verified_result = {'yes':0, 'no':0}
    location_result = {}
    activity_geo_result = {}
    keywords_result = {}
    hashtag_result = {}
    topic_result = {}
    online_pattern_result = {}
    domain_result = {}
    no_gender_count = 0
    no_verified_count = 0
    no_location_count = 0
    no_activity_geo_count = 0
    no_keywords_count = 0
    no_hashtag_count = 0
    no_topic_count = 0
    no_online_pattern_count = 0
    no_domain_count = 0
    s_re = scan(es, query={'query':{'match_all':{}}, 'size':100}, index=index_name, doc_type=index_type)
    print 's_re:', s_re
    activity_count = 0
    while True:
        portrait_uid_list = []
        while True:
            try:
                scan_re = s_re.next()['_source']
                # gender ratio count
                portrait_uid_list.append(scan_re['uid'])
                #print 'portrait_uid_list:', len(portrait_uid_list)
                try:
                    gender_result[str(scan_re['gender'])] += 1
                except:
                    no_gender_count += 1
                # verified ratio count
                try:
                    verified_result[str(scan_re['verified'])] += 1
                except:
                    no_verified_count += 1
                # location top
                try:
                    location = scan_re['location']
                    if len(location.split(' '))>1:
                        location = location.split(' ')[0]
                    try:
                        location_result[location] += 1
                    except:
                        location_result[location] = 1
                except:
                    no_location_count += 1
                # activity geo
                try:
                    activity_geo = scan_re['activity_geo_dict']
                    if scan_re:
                        activity_geo_dict = json.loads(activity_geo)
                        for geo in activity_geo_dict:
                            geo_list = geo.split('\t')
                            if geo_list[0]==u'中国' and len(geo_list)>=2:
                                province = geo_list[1]
                                try:
                                    activity_geo_result[province] += activity_geo_dict[geo]
                                except:
                                    activity_geo_result[province] = activity_geo_dict[geo]
                except:
                    no_activity_geo_count += 1
                # keywords
                try:
                    keywords = json.loads(scan_re['keywords'])
                    if keywords:
                        for word in keywords:
                            try:
                                keywords_result[word] += keywords[word]
                            except:
                                keywords_result[word] = keywords[word]
                except:
                    no_keywords_count += 1
                # hashtag top
                try:
                    hashtag_dict = json.loads(scan_re['hashtag_dict'])
                    if hashtag_dict:
                        for tag in hashtag_dict:
                            try:
                                hashtag_result[tag] += hashtag_dict[tag]
                            except:
                                hashtag_result[tag] = hashtag_dict[tag]
                except:
                    no_hashtag_count += 1
                # topic top
                try:
                    topic = json.loads(scan_re['topic'])
                    if topic:
                        for item in topic:
                            try:
                                topic_result[item] += 1
                            except:
                                topic_result[item] = 1
                except:
                    no_topic_count += 1
                # online pattern top
                try:
                    online_pattern = json.loads(scan_re['online_pattern'])
                    if online_pattern:
                        for item in online_pattern:
                            try:
                                online_pattern_result[item] += online_pattern[item]
                            except:
                                online_pattern_result[item] = online_pattern[item]
                except:
                    no_online_pattern_count += 1
                # domain top
                try:
                    domain = scan_re['domain']
                    if domain:
                        domain_list = domain.split('_')
                        for item in domain_list:
                            try:
                                domain_result[item] += 1
                            except:
                                domain_result[item] = 1
                except:
                    no_domain_count += 1
                 
            except StopIteration:
                print 'all done'
                # gender ratio count
                count = sum(gender_result.values())
                gender_ratio = {'1':float(gender_result['1']) / count, '2':float(gender_result['2']) / count}
                #print 'gender ratio:', gender_ratio
                activity_result = es.mget(index='20130907', doc_type='bci', body={'ids':portrait_uid_list})['docs']
                for activity_item in activity_result:
                    if activity_item['found']:
                        activity_count += 1
                #print 'activity_count:', activity_count
                result_dict['activity_count'] = float(activity_count) / count
                result_dict['gender_ratio'] = json.dumps(gender_ratio)
                # verified ratio count
                count = sum(verified_result.values())
                if count==0:
                    verified_ratio = {'yes':0.5, 'no':0.5}
                else:
                    verified_ratio = {'yes':float(verified_result['yes']) / count, 'no':float(verified_result['no'])/count}
                #print 'verified ratio:', verified_ratio
                result_dict['verified_ratio'] = json.dumps(verified_ratio)
                # location top
                if location_result:
                    sort_location = sorted(location_result.items(), key=lambda x:x[1], reverse=True)
                    location_top = sort_location[:5]
                else:
                    location_top = {}
                #print 'location top:', location_top
                result_dict['location_top'] = json.dumps(location_top)
                # activity geo top
                if activity_geo_result:
                    sort_activity_geo = sorted(activity_geo_result.items(), key=lambda x:x[1], reverse=True)
                    activity_geo_top = sort_activity_geo[:50]
                else:
                    activity_geo_top = {}
                #print 'activity_geo_top:', activity_geo_top
                result_dict['activity_geo_top'] = json.dumps(activity_geo_top)
                # keywords top
                if keywords_result:
                    sort_keywords = sorted(keywords_result.items(), key=lambda x:x[1], reverse=True)
                    keywords_top = sort_keywords[:50]
                else:
                    keywords_top = {}
                #print 'keywords_top:', keywords_top
                result_dict['keywords_top'] = json.dumps(keywords_top)
                # hashtag top
                if hashtag_result:
                    sort_hashtag = sorted(hashtag_result.items(), key=lambda x:x[1], reverse=True)
                    hashtag_top = sort_hashtag[:50]
                else:
                    hashtag_top = {}
                #print 'hashtag top:', hashtag_top
                result_dict['hashtag_top'] = json.dumps(hashtag_top)
                # topic top
                if topic_result:
                    sort_topic = sorted(topic_result.items(), key=lambda x:x[1], reverse=True)
                    topic_top = sort_topic[:50]
                else:
                    topic_top = {}
                #print 'topic top:', topic_top
                result_dict['topic_top'] = json.dumps(topic_top)
                # online_pattern top
                if online_pattern_result:
                    sort_online_pattern = sorted(online_pattern_result.items(), key=lambda x:x[1], reverse=True)
                    online_pattern_top = sort_online_pattern[:50]
                else:
                    online_pattern_top = {}
                #print 'online pattern top:', online_pattern_top
                result_dict['online_pattern_top'] = json.dumps(online_pattern_top)
                # domain top
                if domain_result:
                    sort_domain = sorted(domain_result.items(), key=lambda x:x[1], reverse=True)
                    domain_top = sort_domain[:20]
                    #test:
                    domain_top = [('education',50), ('art', 40), ('lawyer', 30), ('student', 20), ('media', 10), ('oversea',1)]
                else:
                    domain_top = {}
                #print 'domain top:', domain_top
                result_dict['domain_top'] = json.dumps(domain_top)
                #test need to add domain top user
                domain_top = [[u'媒体',1],[u'法律人士',1], [u'政府机构人士',1], [u'活跃人士',1], [u'媒体人士',1], [u'商业人士',1],\
                              [u'高校微博', 1], [u'境内机构', 1], [u'境外机构', 1], [u'民间组织',1], [u'草根',1], [u'其他', 1]]
                result_dict['domain_top_user'] = json.dumps(get_domain_top_user(domain_top))
                #test need to add topic user
                topic_top = [[u'军事', 1], [u'政治',1], [u'体育',1], [u'计算机',1], [u'民生',1], [u'生活',1],\
                              [u'娱乐',1], [u'健康',1], [u'交通',1], [u'经济',1], [u'教育',1], [u'自然',1]]
                result_dict['topic_top_user'] = json.dumps(get_topic_top_user(topic_top))
                return result_dict 
            except Exception, r:
                print Exception, r
                return result_dict
        #print 'portrait_uid_list:', len(portrait_uid_list)
        activity_result = es.mget(index='20130907', doc_type='bci', body={'ids':portrait_uid_list})['docs']
        for activity_item in activity_result:
            if activity_item['found']:
                activity_count += 1
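Each of the "top" fields above comes from the same counting idiom: accumulate a frequency dict while scanning, then sort its items by value and keep the first N. A compact sketch of that idiom (returning an empty list when nothing was counted):

def top_n(counter_dict, n):
    # sort (key, count) pairs by count, descending, and keep the first n
    if not counter_dict:
        return []
    return sorted(counter_dict.items(), key=lambda x: x[1], reverse=True)[:n]

# e.g. result_dict['location_top'] = json.dumps(top_n(location_result, 5))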
Ejemplo n.º 31
0
def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    structure_user_dict = {}
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])
    retweet_user_dict = {}
    comment_user_dict = {}
    #get retweet/comment es db_number
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    #iter to find seed uid list retweet/be_retweet/comment/be_comment user set by hop
    iter_hop_user_list = seed_uid_list
    iter_count = 0
    all_union_result = dict()
    while iter_count < hop:   # hop number control
        iter_count += 1
        search_user_count = len(iter_hop_user_list)
        hop_union_result = dict()
        iter_search_count = 0
        while iter_search_count < search_user_count:
            iter_search_user_list = iter_hop_user_list[iter_search_count: iter_search_count + DETECT_ITER_COUNT]
            #step1: mget retweet and be_retweet
            if retweet_mark == 1:
                retweet_index_name = retweet_index_name_pre + str(db_number)
                be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
                #mget retwet
                try:
                    retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type, \
                                                     body={'ids':iter_search_user_list}, _source=True)['docs']
                except:
                    retweet_result = []
                #mget be_retweet
                try:
                    be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type, \
                                                        body={'ids':iter_search_user_list} ,_source=True)['docs']
                except:
                    be_retweet_result = []
            #step2: mget comment and be_comment
            if comment_mark == 1:
                comment_index_name = comment_index_name_pre + str(db_number)
                be_comment_index_name = be_comment_index_name_pre + str(db_number)
                #mget comment
                try:
                    comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type, \
                                                     body={'ids':iter_search_user_list}, _source=True)['docs']
                except:
                    comment_result = []
                #mget be_comment
                try:
                    be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type, \
                                                    body={'ids':iter_search_user_list}, _source=True)['docs']
                except:
                    be_comment_result = []
            #step3: union retweet/be_retweet/comment/be_comment result
            union_count = 0
            
            for iter_search_uid in iter_search_user_list:
                try:
                    uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
                except:
                    uid_retweet_dict = {}
                try:
                    uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
                except:
                    uid_be_retweet_dict = {}
                try:
                    uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
                except:
                    uid_comment_dict = {}
                try:
                    uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
                except:
                    uid_be_comment_dict = {}
                #union four type user set
                union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict, uid_comment_dict, uid_be_comment_dict)
                hop_union_result = union_dict(hop_union_result, union_result)
            #step4: add iter search count
            iter_search_count += DETECT_ITER_COUNT

        #pop seed uid self
        for iter_hop_user_item in iter_hop_user_list:
            try:
                hop_union_result.pop(iter_hop_user_item)
            except:
                pass
        #get new iter_hop_user_list
        iter_hop_user_list = hop_union_result.keys()
        #get all union result
        all_union_result = union_dict(all_union_result, hop_union_result)
    #step5: identify the who is in user_portrait
    sort_all_union_result = sorted(all_union_result.items(), key=lambda x:x[1], reverse=True)
    iter_count = 0
    all_count = len(sort_all_union_result)
    in_portrait_result = []
    filter_importance_from = filter_dict['importance']['gte']
    filter_importance_to = filter_dict['importance']['lt']
    filter_influence_from = filter_dict['influence']['gte']
    filter_influence_to = filter_dict['influence']['lt']
    while iter_count < all_count:
        iter_user_list = [item[0] for item in sort_all_union_result[iter_count:iter_count + DETECT_ITER_COUNT]]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                    body={'ids':iter_user_list}, _source=True)['docs']
        except:
            portrait_result = []
        for portrait_item in portrait_result:
            if portrait_item['found'] == True:
                if portrait_item['_source']['importance'] >= filter_importance_from and portrait_item['_source']['importance'] <= filter_importance_to:
                    if portrait_item['_source']['influence'] >= filter_influence_from and portrait_item['_source']['influence'] <= filter_influence_to:
                        uid = portrait_item['_id']
                        in_portrait_result.append(uid)
        if len(in_portrait_result) > (filter_dict['count'] * DETECT_COUNT_EXPAND):
            break
        iter_count += DETECT_ITER_COUNT

    return in_portrait_result
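get_structure_user leans on a union_dict helper to merge per-user interaction counters at each hop; its definition is not shown in the example. A plausible sketch, assuming it sums counts for uids shared across any number of {uid: count} dicts:

def union_dict(*count_dicts):
    # merge {uid: count} dicts by summing counts for shared uids
    union_result = {}
    for count_dict in count_dicts:
        for uid, count in count_dict.items():
            union_result[uid] = union_result.get(uid, 0) + int(count)
    return union_result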
Ejemplo n.º 32
0
def get_attr_social(uid_list):
    #test
    '''
    uid_list = ['1514608170', '2729648295', '3288875501', '1660612723', '1785934112',\
                '2397686502', '1748065927', '2699434042', '1886419032', '1830325932']
    '''
    result = {}
    union_dict = {}
    union_edge_count = 0
    union_weibo_count = 0
    union_user_set = set()
    group_user_set = set(uid_list)
    be_retweeted_out = 0
    be_retweeted_count_out = 0
    retweet_relation = []
    out_beretweet_relation = []
    for uid in uid_list:
        in_stat_results = dict()
        out_stat_results = dict()
        for db_num in r_dict:
            r_db = r_dict[db_num]
            ruid_results = r_db.hgetall('retweet_'+str(uid))
            #print 'len ruid_result:', len(ruid_results)
            if ruid_results:
                for ruid in ruid_results:
                    try:
                        in_stat_results[ruid] += int(ruid_results[ruid])
                    except:
                        in_stat_results[ruid] = int(ruid_results[ruid])
            br_uid_results = r_db.hgetall('be_retweet_'+str(uid))
            #print 'len br_uid_results:', len(br_uid_results)
            if br_uid_results:
                for br_uid in br_uid_results:
                    try:
                        out_stat_results[br_uid] += int(br_uid_results[br_uid])
                    except:
                        out_stat_results[br_uid] = int(br_uid_results[br_uid])
        # record the retweet relation in group uid
        uid_retweet_relation = [[uid, user, int(in_stat_results[user])] for user in in_stat_results if user in uid_list and user != uid]
        retweet_relation.extend(uid_retweet_relation)
        
        # record the be_retweet relation out group uid but in user_portrait
        uid_beretweet_relation = []
        uid_beretweet = [user for user in out_stat_results if user not in uid_list]
        es_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_beretweet})['docs']
        for be_retweet_item in es_portrait_result:
            br_uid = be_retweet_item['_id']
            beretweet_count = int(out_stat_results[br_uid])
            try:
                be_retweet_source = be_retweet_item['_source']
                if be_retweet_source['influence']>=900:
                    uid_beretweet_relation.append([uid, br_uid, be_retweet_source['uname'], beretweet_count, be_retweet_source['influence']])
            except:
                continue
        out_beretweet_relation.extend(uid_beretweet_relation)

        retweet_user_set = set(in_stat_results.keys())
        union_set = retweet_user_set & (group_user_set - set([uid]))
        union_edge_count += len(union_set) # count the retweet edge number
        if union_set:
            for ruid in union_set:
                union_weibo_count += int(in_stat_results[ruid])
        union_user_set = union_user_set | union_set

        #use to count the beretweeted by user who is out of the group
        be_retweeted_user_set = set(out_stat_results.keys())
        subtract_set = be_retweeted_user_set - set(uid_list)
        be_retweeted_out += len(subtract_set)
        be_retweeted_count_out_list = [int(out_stat_results[br_uid]) for br_uid in subtract_set]
        #print 'be_retweeted_count_out_list:', be_retweeted_count_out_list
        be_retweeted_count_out += sum(be_retweeted_count_out_list)

    result['density'] = float(union_edge_count) / (len(uid_list) * (len(uid_list)-1))
    result['retweet_weibo_count'] = float(union_weibo_count) / len(uid_list)
    result['retweet_user_count'] = float(len(union_user_set)) / len(uid_list)
    result['be_retweeted_count_out'] = be_retweeted_count_out
    result['be_retweeted_out'] = be_retweeted_out
    if retweet_relation!=[]:
        sort_retweet_relation = sorted(retweet_relation, key=lambda x:x[2], reverse=True)
    else:
        sort_retweet_relation = []
    result['retweet_relation'] = json.dumps(sort_retweet_relation)
    
    if out_beretweet_relation!=[]:
        sort_out_beretweet_relation = sorted(out_beretweet_relation, key=lambda x:x[4], reverse=True)
    else:
        sort_out_beretweet_relation = []
    result['out_beretweet_relation'] = json.dumps(sort_out_beretweet_relation)
    #print 'be_retweeted_out, be_retweeted_count_out:', be_retweeted_out, be_retweeted_count_out
    #print 'result:', result
    #print 'out_beretweet_relation:', sort_out_beretweet_relation
    return result
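The "density" returned by get_attr_social divides the number of in-group retweet edges by the number of possible directed edges, n*(n-1). A worked example for a hypothetical group of 5 users with 6 retweet edges among them:

n_users = 5
in_group_edges = 6
density = float(in_group_edges) / (n_users * (n_users - 1))  # 6 / 20 = 0.3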
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])

    print ts2date(ts)
    # PART 1

    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval,
                                               social_sensors,
                                               forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval,
                                                  social_sensors,
                                                  forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors,
                                                time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(
        forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets and comments of these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list,
                                              time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list,
                                                 time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(
        ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count.get(key, 0)  # missing buckets count as zero

    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the obtained uid_list against the user-portrait library for important users
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name,
            doc_type=portrait_index_type,
            body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(
                all_mid_list) * AVERAGE_COUNT:  # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists, switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # abnormal weibo count

        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(
                all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # abnormal negative sentiment; "12" means both count and sentiment are abnormal
            if forward_warning_status == signal_brust:  # an event already exists, switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed content, based on all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    mid_value = dict()  # defaults, in case no flow-text index or mid list is found below
    duplicate_dict = dict()
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event is detected
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {
                                "mid": all_mid_list
                            }
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list,
                                            doc_type="text",
                                            body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-mid topic value
            duplicate_dict = dict()  # duplicates map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode(
                        'utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({
                        "_id": iter_mid,
                        "title": "",
                        "content": iter_text
                    })

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(
                        item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list,
                                                      classify_text_dict)
                    mid_value = dict()
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(
                    ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the current-period results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task,
                           doc_type=doctype,
                           id=ts,
                           body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                               doc_type=task_doc_type,
                                               id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task,
                               doc_type=task_doc_type,
                               id=doctype,
                               body=temporal_result)
    else:
        print "test"
    return "1"
Ejemplo n.º 34
0
def get_seed_user_attribute(seed_user_list, attribute_list):
    results = {}
    attribute_query_list = []
    #step1: mget user result from user_portrait
    try:
        seed_user_portrait = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':seed_user_list}, _source=True)['docs']
    except:
        seed_user_portrait = []
    #init results dict---result={'location':{}, 'domain':{}, ...}
    for attribute_item in attribute_list:
        results[attribute_item] = {}
    #step2: compute attribute result about attribute_list
    for seed_user_item in seed_user_portrait:
        uid = seed_user_item['_id']
        if seed_user_item['found'] == True:
            source = seed_user_item['_source']
            #tally the attributes
            #step2.1: location
            if 'location' in attribute_list:
                location_value = source['location']
                try:
                    results['location'][location_value] += 1
                except:
                    results['location'][location_value] = 1
            #step2.2: domain
            if 'domain' in attribute_list:
                domain_value = source['domain']
                try:
                    results['domain'][domain_value] += 1
                except:
                    results['domain'][domain_value] = 1
            #step2.3: topic_string
            if 'topic_string' in attribute_list:
                topic_value_string = source['topic_string']
                topic_value_list = topic_value_string.split('&')
                for topic_item in topic_value_list:
                    try:
                        results['topic_string'][topic_item] += 1
                    except:
                        results['topic_string'][topic_item] = 1
            #step2.4: keywords_string
            if 'keywords_string' in attribute_list:
                keywords_value_string = source['keywords_string']
                keywords_value_list = keywords_value_string.split('&')
                for keywords_item in keywords_value_list:
                    try:
                        results['keywords_string'][keywords_item] += 1
                    except:
                        results['keywords_string'][keywords_item] = 1
            #step2.5: hashtag
            if 'hashtag' in attribute_list:
                hashtag_value_string = source['hashtag']
                hashtag_value_list = hashtag_value_string.split('&')
                for hashtag_item in hashtag_value_list:
                    try:
                        results['hashtag'][hashtag_item] += 1
                    except:
                        results['hashtag'][hashtag_item] = 1
            #step2.6: activity_geo
            if 'activity_geo' in attribute_list:
                activity_geo_dict = json.loads(source['activity_geo_dict'])[-1]
                for activity_geo_item in activity_geo_dict:
                    try:
                        results['activity_geo'][activity_geo_item] += 1
                    except:
                        results['activity_geo'][activity_geo_item] = 1
            #step2.7: tendency
            #step2.8: tag
            #step2.9: remark
    #step3: get searched attribute values -- new attribute query condition
    new_attribute_query_condition = []
    for item in results:
        iter_dict = results[item]
        sort_item_dict = sorted(iter_dict.items(), key=lambda x:x[1], reverse=True)
        nest_body_list = []
        for query_item in sort_item_dict[:3]:
            item_value = query_item[0]
            nest_body_list.append({'wildcard':{item: '*'+item_value+'*'}})
        new_attribute_query_condition.append({'bool':{'should': nest_body_list}})

    return new_attribute_query_condition
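get_seed_user_attribute returns a list of bool/should wildcard clauses, one per attribute, built from each attribute's three most frequent values. A hedged sketch of how such clauses might feed a follow-up portrait search; the surrounding query shape and the function name are assumptions, only the clause format comes from the example:

def search_similar_users(es_client, index, doc_type, attribute_conditions, size=100):
    # require every attribute clause to match at least one of its top values
    query_body = {
        "query": {"bool": {"must": attribute_conditions}},
        "size": size,
    }
    return es_client.search(index=index, doc_type=doc_type, body=query_body)['hits']['hits']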
Ejemplo n.º 35
0
def get_tweets_distribute(xnr_user_no):

    topic_distribute_dict = {}
    topic_distribute_dict['radar'] = {}

    uid = xnr_user_no2uid(xnr_user_no)

    if xnr_user_no:
        es_results = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                                id=xnr_user_no)["_source"]
        followers_list = es_results['followers_list']

    if S_TYPE == 'test':
        uid=PORTRAI_UID
        followers_list=PORTRAIT_UID_LIST

    # topic distribution of followers

    results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type,\
        body={'ids':followers_list})['docs']

    topic_list_followers = []

    for result in results:
        if result['found'] == True:
            result = result['_source']
            topic_string_first = result['topic_string'].split('&')
            topic_list_followers.extend(topic_string_first)

    topic_list_followers_count = Counter(topic_list_followers)

    #topic_distribute_dict['topic_follower'] = topic_list_followers_count
    # topic distribution of the virtual user (xnr)
    try:
        xnr_results = es_user_portrait.get(index=portrait_index_name,doc_type=portrait_index_type,\
            id=uid)['_source']
        topic_string = xnr_results['topic_string'].split('&')
        topic_xnr_count = Counter(topic_string)
        #topic_distribute_dict['topic_xnr'] = topic_xnr_count

    except:
        topic_xnr_count = {}
        #topic_distribute_dict['topic_xnr'] = topic_xnr_count

    # assemble the radar chart data
    # if topic_xnr_count:
    #     for topic, value in topic_xnr_count.iteritems():
    #         try:
    #             topic_value = float(value)/(topic_list_followers_count[topic])
    #         except:
    #             continue
    #         topic_distribute_dict['radar'][topic] = topic_value
    if topic_xnr_count:
        for topic, value in topic_list_followers_count.iteritems():
            try:
                topic_value = float(topic_xnr_count[topic])/value
            except:
                continue
            topic_distribute_dict['radar'][topic] = topic_value
            
    # assemble the dashboard (gauge) data
    mark = 0
    
    if topic_xnr_count:
        n_topic = len(topic_list_followers_count.keys())
        for topic,value in topic_xnr_count.iteritems():
            try:
                mark += float(value)/(topic_list_followers_count[topic]*n_topic)
                print topic 
                print mark
            except:
                continue
    topic_distribute_dict['mark'] = mark

    return topic_distribute_dict
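For reference, each radar value above is simply the per-topic ratio of the virtual user's topic count to the followers' topic count. A self-contained sketch of that arithmetic with made-up topic counts:

from collections import Counter

topic_list_followers_count = Counter({'economy': 4, 'sport': 2})  # made-up follower counts
topic_xnr_count = Counter({'economy': 1})                         # made-up xnr counts

radar = {}
for topic, value in topic_list_followers_count.iteritems():
    radar[topic] = float(topic_xnr_count[topic]) / value
# radar == {'economy': 0.25, 'sport': 0.0}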
Ejemplo n.º 36
0
def get_follow_group_distribute(xnr_user_no):
    
    domain_distribute_dict = {}
    domain_distribute_dict['radar'] = {}

    if S_TYPE == 'test':
        followers_list=PORTRAIT_UID_LIST
        followers_list_today = FOLLOWERS_TODAY
    else:
        # fetch all followers
        es_results = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                                id=xnr_user_no)["_source"]
        followers_list = es_results['followers_list']

        # fetch today's new followers
        current_time = int(time.time()-DAY)
        current_date = ts2datetime(current_time)
        r_uid_list_datetime_index_name = r_followers_uid_list_datetime_pre + current_date
        followers_results = r_fans_followers.hget(r_uid_list_datetime_index_name,xnr_user_no)
        followers_list_today = json.loads(followers_results)

    # domain distribution of all followers

    results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type,\
        body={'ids':followers_list})['docs']
    
    domain_list_followers = []

    for result in results:
        if result['found'] == True:
            result = result['_source']
            domain_name = result['domain']
            domain_list_followers.append(domain_name)

    domain_list_followers_count = Counter(domain_list_followers)

    #domain_distribute_dict['domain_follower'] = domain_list_followers_count
    
    # today's new followers
    
    try:
        today_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type,\
            body={'ids':followers_list_today})['docs']

        domain_list_followers_today = []

        for result in today_results:
            if result['found'] == True:
                result = result['_source']
                domain_name = result['domain']
                domain_list_followers_today.append(domain_name)

        domain_list_followers_today_count = Counter(domain_list_followers_today)

    except:
        domain_list_followers_today_count = {}


    # assemble the radar chart data
    # if domain_list_followers_today_count:
    #     for domain, value in domain_list_followers_today_count.iteritems():
    #         try:
    #             domain_value = float(value)/(domain_list_followers_count[domain])
    #         except:
    #             continue
    #         domain_distribute_dict['radar'][domain] = domain_value

    if domain_list_followers_today_count:
        for domain, value in domain_list_followers_today_count.iteritems():
            try:
                domain_value = float(value)/domain_list_followers_count[domain]  # today's followers as a share of all followers in this domain
            except:
                continue
            domain_distribute_dict['radar'][domain] = domain_value

    # assemble the dashboard (gauge) data
    mark = 0
    print 'domain_list_followers_today_count::',domain_list_followers_today_count
    print 'domain_distribute_dict::',domain_distribute_dict
    if domain_list_followers_today_count:
        n_domain = len(domain_list_followers_count.keys())
        for domain,value in domain_list_followers_today_count.iteritems():
            try:
                mark += float(value)/(domain_list_followers_count[domain]*n_domain)
            except:
                continue
    domain_distribute_dict['mark'] = mark

    return domain_distribute_dict
Ejemplo n.º 37
0
def scan_index_history():
    s_re = scan(es_user_portrait,
                query={
                    'query': {
                        'match_all': {}
                    },
                    'size': 1000
                },
                index=portrait_index_name,
                doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_' + now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY * 31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_' + del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']

            activeness_key = 'activeness_' + now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)

            add_info[uid] = {
                activeness_key: normal_activeness,
                influence_key: normal_influence,
                importance_key: normal_importance
            }
            if count % 1000 == 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[
                            iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][
                            influence_key] < LOW_INFLUENCE_THRESHOULD:  # update activity status; mark for removal from the library
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(
                        new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index': {'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[
                            iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(
                        new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise e
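The bulk_action list built above follows the Elasticsearch bulk API convention of alternating action metadata and document bodies; a minimal standalone sketch of that layout (the uids and field values are made up):

# each document is preceded by an action dict carrying its _id;
# es_user_portrait.bulk(bulk_action, index=..., doc_type=...) then ships the flat list
bulk_action = []
for uid, doc in [('1111111111', {'activeness_2013-09-06': 0.8, 'low_number': 0}),
                 ('2222222222', {'activeness_2013-09-06': 0.1, 'low_number': 1})]:
    action = {'index': {'_id': uid}}
    bulk_action.extend([action, doc])
# bulk_action now holds [action, doc, action, doc, ...] ready for one bulk call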
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])

    print ts2date(ts)
    # PART 1

    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of the retweeted weibos
    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts for these weibos in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count["total_count"]

    # total number of weibos in the current window
    current_retweeted_count = statistics_count["retweeted"]
    current_comment_count = statistics_count["comment"]

    # PART 2
    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library against the obtained uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list}
        )["docs"]
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item["found"]:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item["_id"])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (
            mean_count >= MEAN_COUNT
            and current_total_count > mean_count + 1.96 * std_count
            or current_total_count >= len(all_mid_list) * AVERAGE_COUNT
        ):  # anomaly detected
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly

        if (
            negetive_count > mean_sentiment + 1.96 * std_sentiment
            and mean_sentiment >= MEAN_COUNT
            or negetive_count >= len(all_mid_list) * AVERAGE_COUNT
        ):
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative sentiment anomaly; "12" means both count and sentiment are anomalous
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events: all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []

    # start once an event has occurred
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000}
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
            tmp_sensitive_warning = ""
            text_dict = dict()  # text information
            mid_value = dict()  # per-mid topic values
            duplicate_dict = dict()  # duplicate mapping
            portrait_dict = dict()  # portrait / background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item["_source"]["uid"]
                    iter_mid = item["_source"]["mid"]
                    iter_text = item["_source"]["text"].encode("utf-8", "ignore")
                    iter_sensitive = item["_source"].get("sensitive", 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo contains sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item["_source"]["keywords_dict"])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode("utf-8", "ignore")
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item["duplicate"]:
                            duplicate_dict[item["_id"]] = item["same_from"]

                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    # print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results["mid_topic_value"] = json.dumps(mid_value)
    results["duplicate_dict"] = json.dumps(duplicate_dict)
    results["sensitive_words_dict"] = json.dumps(sensitive_words_dict)
    results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail)
    results["origin_weibo_number"] = len(all_origin_list)
    results["retweeted_weibo_number"] = len(all_retweeted_list)
    results["origin_weibo_detail"] = json.dumps(origin_weibo_detail)
    results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail)
    results["retweeted_weibo_count"] = current_retweeted_count
    results["comment_weibo_count"] = current_comment_count
    results["weibo_total_number"] = current_total_count
    results["sentiment_distribution"] = json.dumps(sentiment_count)
    results["important_users"] = json.dumps(filter_important_list)
    results["unfilter_users"] = json.dumps(important_uid_list)
    results["burst_reason"] = tmp_burst_reason
    results["timestamp"] = ts
    # results['clustering_topic'] = json.dumps(topic_list)
    # store this window's results in ES
    doctype = create_by + "-" + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[
            "_source"
        ]
        temporal_result["warning_status"] = warning_status
        temporal_result["burst_reason"] = tmp_burst_reason
        temporal_result["finish"] = finish
        temporal_result["processing_status"] = process_status
        history_status = json.loads(temporal_result["history_status"])
        history_status.append([ts, task_name, warning_status])
        temporal_result["history_status"] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
Ejemplo n.º 39
0
def get_attr_bci(uid_list):
    results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts - 24*3600)
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-07')
    user_results = {} # {'uid':{'origin_max..':[], ''}}
    total_weibo_number = 0
    
    fans_number = 0
    origin_weibo_number = 0
    retweeted_weibo_number = 0
    origin_weibo_retweeted_total_number = 0
    origin_weibo_comment_total_number = 0
    retweeted_weibo_retweeted_total_number = 0
    retweeted_weibo_comment_total_number = 0

    origin_weibo_retweeted_top = 0
    origin_weibo_comment_top = 0
    retweeted_weibo_retweeted_top = 0
    retweeted_weibo_comment_top = 0
    influence_dict = {}

    for i in range(0, 7):
        timestamp = ts - i*24*3600
        date = ts2datetime(timestamp)
        hash_key = ''.join(date.split('-'))
        es_user_results = es_cluster.mget(index=hash_key, doc_type='bci', body={'ids':uid_list})['docs']
        for user_dict in es_user_results:
            try:
                user_item = user_dict['_source']
            except:
                continue
            uid = user_item['user']
            total_weibo_number += user_item['origin_weibo_number']
            total_weibo_number += user_item['retweeted_weibo_number']
            
            # yuankun revise
            origin_weibo_number += user_item['origin_weibo_number']
            retweeted_weibo_number += user_item['retweeted_weibo_number']
            origin_weibo_retweeted_top += user_item['origin_weibo_retweeted_top_number']
            origin_weibo_comment_top += user_item['origin_weibo_comment_top_number']
            retweeted_weibo_retweeted_top += user_item['retweeted_weibo_retweeted_top_number']
            retweeted_weibo_comment_top += user_item['retweeted_weibo_comment_top_number']
            #print 'user_item:', user_item
            if uid in user_results:
                try:
                    user_results[uid]['origin_weibo_retweeted_top'].append([user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']])
                    user_results[uid]['origin_weibo_comment_top'].append([user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']])
                    user_results[uid]['retweeted_weibo_retweeted_top'].append([user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']])
                    user_results[uid]['retweeted_weibo_comment_top'].append([user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']])
                except:
                    user_results[uid]['origin_weibo_retweeted_top'] = [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]
                    user_results[uid]['origin_weibo_comment_top'] = [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]]
                    user_results[uid]['retweeted_weibo_retweeted_top'] = [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]]
                    user_results[uid]['retweeted_weibo_comment_top'] = [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]
            else:
                #print 'user_item:', [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]
                # build the per-user dict once with all four keys (repeated assignments would overwrite each other)
                user_results[uid] = {
                    'origin_weibo_retweeted_top': [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]],
                    'origin_weibo_comment_top': [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]],
                    'retweeted_weibo_retweeted_top': [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]],
                    'retweeted_weibo_comment_top': [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]
                }
            
            # yuankun need
            #print 'fan_num:', user_item['user_fansnum'], type(user_item['user_fansnum']), type(fans_number)
            fans_number += int(user_item['user_fansnum'])
            origin_weibo_retweeted_total_number += user_item['origin_weibo_retweeted_total_number']
            origin_weibo_comment_total_number += user_item['origin_weibo_comment_total_number']
            retweeted_weibo_retweeted_total_number += user_item['retweeted_weibo_retweeted_total_number']
            retweeted_weibo_comment_total_number += user_item['retweeted_weibo_comment_total_number']

    user_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    #print 'user_portrait_result:', user_portrait_result[0]

    # get activeness max & importance max & influence max to normalize
    evaluate_max_result = get_evaluate_max()

    for user_portrait in user_portrait_result:
        #print 'user_portrait:', user_portrait
        try:
            user_portrait_dict = user_portrait['_source']
            #print 'user_portrait_dict:', user_portrait_dict
            uname = user_portrait_dict['uname']
            importance = user_portrait_dict['importance']
            normal_importance = math.log((importance / evaluate_max_result['importance']) * 9 + 1, 10) * 100
            activeness = user_portrait_dict['activeness']
            normal_activeness = math.log(activeness / evaluate_max_result['activeness'] * 9 + 1, 10) * 100
            influence = user_portrait_dict['influence']
            normal_influence = math.log(influence / evaluate_max_result['influence'] * 9 + 1, 10) * 100
        except:
            uname = ''
            normal_importance = ''
            normal_activeness = ''
            normal_influence = ''
        #print 'user_portrait_dict:', user_portrait_dict
        uid = user_portrait['_id']
        user_item_dict = user_results[uid]
        origin_weibo_retweeted_top_item = sorted(user_item_dict['origin_weibo_retweeted_top'], key=lambda x:x[0], reverse=True)[0]
        origin_weibo_comment_top_item = sorted(user_item_dict['origin_weibo_comment_top'], key=lambda x:x[0], reverse=True)[0]
        retweeted_weibo_retweeted_top_item = sorted(user_item_dict['retweeted_weibo_retweeted_top'], key=lambda x:x[0], reverse=True)[0]
        retweeted_weibo_comment_top_item = sorted(user_item_dict['retweeted_weibo_comment_top'], key=lambda x:x[0], reverse=True)[0]
        
        results.append([uid, uname, normal_activeness, normal_importance, normal_influence, origin_weibo_retweeted_top_item ,\
                        origin_weibo_comment_top_item, retweeted_weibo_retweeted_top_item, \
                        retweeted_weibo_comment_top_item])

    #yuankun need
    influence_dict['origin_weibo_retweeted_average_number'] = origin_weibo_retweeted_total_number/origin_weibo_number/7
    influence_dict['origin_weibo_comment_average_number'] = origin_weibo_comment_total_number/origin_weibo_number/7
    influence_dict['retweeted_weibo_retweeted_average_number'] = retweeted_weibo_retweeted_total_number/retweeted_weibo_number/7
    influence_dict['retweeted_weibo_comment_average_number'] = retweeted_weibo_comment_total_number/retweeted_weibo_number/7
    influence_dict['origin_weibo_retweeted_top_number'] = origin_weibo_retweeted_top/len(uid_list)/7
    influence_dict['origin_weibo_comment_top_number'] = origin_weibo_comment_top/len(uid_list)/7
    influence_dict['retweeted_weibo_retweeted_top_number'] = retweeted_weibo_retweeted_top/len(uid_list)/7
    influence_dict['retweeted_weibo_comment_top_number'] = retweeted_weibo_comment_top/len(uid_list)/7
    influence_dict['fans_number'] = fans_number
    influence_dict['total_weibo_number'] = total_weibo_number
    #print 'results:', results
    return {'user_influence_list': json.dumps(results), 'total_weibo_number': total_weibo_number}, influence_dict
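The normal_importance/activeness/influence values above come from a logarithmic rescaling: the raw score is divided by the corpus maximum, passed through log10(9x + 1) and stretched onto 0-100, so the maximum maps exactly to 100. A small sketch of that formula on its own:

import math

def normalize_index(value, max_value):
    # maps value in (0, max_value] onto (0, 100]; value == max_value gives 100.0
    return math.log((float(value) / max_value) * 9 + 1, 10) * 100

# normalize_index(100, 100) == 100.0
# normalize_index(50, 100)  ~= 74.04
# normalize_index(10, 100)  ~= 27.88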
Ejemplo n.º 40
0
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list) # mid/root-mid of the retweeted weibos
    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", all_origin_list
    print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts for these weibos in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # total number of weibos in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # PART 2
    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]


    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results.keys()
    # match important users from the portrait library against the obtained uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print filter_important_list


    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(social_sensors)*0.2*AVERAGE_COUNT: # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # count anomaly

        if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors)*0.2*AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # negative sentiment anomaly; "12" means both count and sentiment are anomalous
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events: all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []

    # start once an event has occurred
    if warning_status:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            for item in text_list:
                print item['text']
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = "" #没有相关微博,归零
                print "***********************************"
            else:
                feature_words, input_word_dict = tfidf(text_list) # generate feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "==============================================================="
                print "==============================================================="
                sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        topic_list.append(word_label[item[0]])
            print "topic_list, ", topic_list

    #if not topic_list:
    #    warning_status = signal_nothing
    #    tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store this window's results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, task_name, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)

    return "1"
Ejemplo n.º 41
0
def get_attr_portrait(uid_list):
    result = {}
    index_name = 'user_portrait'
    index_type = 'user'
    user_dict_list = es.mget(index=index_name, doc_type=index_type, body={'ids':uid_list})['docs']
    #print 'user_dict:', user_dict_list
    gender_ratio = dict()
    verified_ratio = dict()
    online_pattern_ratio = dict()
    domain_ratio = dict()
    topic_ratio = dict()
    emoticon_ratio = dict()
    keyword_ratio = dict()
    importance_list = []
    activeness_list = []
    influence_list = []
    psycho_status_ratio  = dict()
    psycho_feature_ratio = dict()
    hashtag_ratio = dict()
    activity_geo_ratio = dict()
    for user_dict in user_dict_list:
        user_dict = user_dict['_source']
        #attr1 gender ratio
        gender = user_dict['gender']
        if gender:
            try:
                gender_ratio[gender] += 1
            except:
                gender_ratio[gender] = 1
        #attr2 verified ratio
        verified = user_dict['verified']
        if verified:
            try:
                verified_ratio[verified] += 1
            except:
                verified_ratio[verified] = 1
        #attr3 online pattern
        online_pattern = user_dict['online_pattern']
        if online_pattern:
            online_pattern = json.loads(online_pattern)
            for pattern in online_pattern:
                try:
                    online_pattern_ratio[pattern] += 1
                except:
                    online_pattern_ratio[pattern] = 1
        #attr4 domain
        domain_string = user_dict['domain']
        if domain_string:
            domain_list = domain_string.split('_')
            for domain in domain_list:
                try:
                    domain_ratio[domain] += 1
                except:
                    domain_ratio[domain] = 1
        #attr5 topic
        topic_string = user_dict['topic']
        if topic_string:
            topic_dict = json.loads(topic_string)
            for topic in topic_dict:
                try:
                    topic_ratio[topic] += 1
                except:
                    topic_ratio[topic] = 1
        #attr6 emoticon
        emoticon_string = user_dict['emoticon']
        if emoticon_string:
            emoticon_dict = json.loads(emoticon_string)
            for emoticon in emoticon_dict:
                try:
                    emoticon_ratio[emoticon] += 1
                except:
                    emoticon_ratio[emoticon] = 1
        #attr7 keywords
        keyword_string = user_dict['keywords']
        if keyword_string:
            keyword_dict = json.loads(keyword_string)
            for keyword in keyword_dict:
                try:
                    keyword_ratio[keyword] += keyword_dict[keyword]
                except:
                    keyword_ratio[keyword] = keyword_dict[keyword]
        #attr8 importance distribution
        importance = user_dict['importance']
        importance_rank = get_index_rank(importance, 'importance')
        importance_list.append(int(importance_rank))
        #attr9 activeness distribution
        activeness = user_dict['activeness']
        activeness_rank = get_index_rank(activeness, 'activeness')
        activeness_list.append(int(activeness_rank))
        #attr10 influence distribution
        influence = user_dict['influence']
        influence_rank = get_index_rank(influence, 'influence')
        influence_list.append(int(influence_rank))
        #attr11 psycho_status ratio
        psycho_status_string = user_dict['psycho_status']
        if psycho_status_string:
            psycho_status_dict = json.loads(psycho_status_string)
            for psycho_status in psycho_status_dict:
                try:
                    psycho_status_ratio[psycho_status] += psycho_status_dict[psycho_status]
                except:
                    psycho_status_ratio[psycho_status] = psycho_status_dict[psycho_status]
        #attr12 psycho_feature ratio
        psycho_feature_string = user_dict['psycho_feature']
        if psycho_feature_string:
            psycho_feature_list = psycho_feature_string.split('_')
            for psycho_feature in psycho_feature_list:
                try:
                    psycho_feature_ratio[psycho_feature] += 1
                except:
                    psycho_feature_ratio[psycho_feature] = 1
        #attr13 activity geo ratio
        activity_geo_string = user_dict['activity_geo_dict']
        if activity_geo_string:
            activity_geo_dict = json.loads(activity_geo_string)
            for activity_geo in activity_geo_dict:
                city_list = activity_geo.split('\t')
                city = city_list[len(city_list)-1]
                try:
                    activity_geo_ratio[city] += activity_geo_dict[activity_geo]
                except:
                    activity_geo_ratio[city] = activity_geo_dict[activity_geo]
        #attr14 hashtag
        hashtag_string = user_dict['hashtag_dict']
        if hashtag_string:
            hashtag_dict = json.loads(hashtag_string)
            for hashtag in hashtag_dict:
                try:
                    hashtag_ratio[hashtag] += hashtag_dict[hashtag]
                except:
                    hashtag_ratio[hashtag] = hashtag_dict[hashtag]
    #print 'importance_list:', importance_list
    p, t = np.histogram(importance_list, bins=5, normed=False)
    importance_his = [p.tolist(), t.tolist()]
    #print 'importance_his:', importance_his
    p, t = np.histogram(activeness_list, bins=5, normed=False)
    activeness_his = [p.tolist(), t.tolist()]
    p, t = np.histogram(influence_list, bins=5, normed=False)
    influence_his = [p.tolist(), t.tolist()]
    result['gender'] = json.dumps(gender_ratio)
    result['verified'] = json.dumps(verified_ratio)
    result['online_pattern'] = json.dumps(online_pattern_ratio)
    result['domain'] = json.dumps(domain_ratio)
    result['topic'] = json.dumps(topic_ratio)
    result['psycho_status'] = json.dumps(psycho_status_ratio)
    result['psycho_feature'] = json.dumps(psycho_feature_ratio)
    result['emoticon'] = json.dumps(emoticon_ratio)
    result['keywords'] = json.dumps(keyword_ratio)
    result['hashtag'] = json.dumps(hashtag_ratio)
    result['activity_geo'] = json.dumps(activity_geo_ratio)
    result['importance_his'] = json.dumps(importance_his)
    result['activeness_his'] = json.dumps(activeness_his)
    result['influence_his'] = json.dumps(influence_his)
    return result
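The *_his fields above serialize numpy histograms: np.histogram returns a pair (counts per bin, bin edges), and the code stores both as plain lists. A minimal sketch of that output shape with made-up rank values:

import numpy as np
import json

importance_list = [1, 2, 2, 3, 4, 5, 5, 5]    # made-up importance ranks
p, t = np.histogram(importance_list, bins=5)   # counts per bin, bin edges
importance_his = [p.tolist(), t.tolist()]
# importance_his == [[1, 2, 1, 1, 3], [1.0, 1.8, 2.6, 3.4, 4.2, 5.0]]
print json.dumps(importance_his)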
Ejemplo n.º 42
0
def key_words_search(task_id,
                     search_type,
                     pre,
                     during,
                     start_time,
                     keyword_list,
                     search_key='',
                     sort_norm='',
                     sort_scope='',
                     time=1,
                     isall=False,
                     number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        # advance one day per iteration; only existing indices are collected
        # (advancing the date only inside the exists() branch would loop forever on a missing index)
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1

    print index_list
    uid_set = set()
    text_results = []

    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }

    results = es_flow_text.search(index=index_list,
                                  doc_type='text',
                                  body=query_body,
                                  _source=False,
                                  fields=[
                                      "uid", "user_fansnum", "text",
                                      "message_type", "sentiment", "timestamp",
                                      "geo", "retweeted", "comment"
                                  ])["hits"]["hits"]

    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1

    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME,
                                                 doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list},
                                                 _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0],
                                          results[index]['_id'])
                text_results.extend([
                    results[index]['fields']['uid'][0],
                    results[index]['fields']['user_fansnum'][0],
                    results[index]['fields']['text'][0],
                    results[index]['fields']['message_type'][0],
                    results[index]['fields']['sentiment'][0],
                    ts2date(results[index]['fields']['timestamp'][0]),
                    results[index]['fields']['geo'][0],
                    results[index]['fields']['retweeted'][0],
                    results[index]['fields']['comment'][0], nick_name,
                    weibo_url
                ])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user",
                                              doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0],
                                      results[index]['_id'])
            text_results.append([
                item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                item['fields']['text'][0], item['fields']['message_type'][0],
                item['fields']['sentiment'][0],
                ts2date(item['fields']['timestamp'][0]),
                results[index]['fields']['geo'][0],
                results[index]['fields']['retweeted'][0],
                results[index]['fields']['comment'][0], nick_name, weibo_url
            ])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True,
                                   number)

    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)

    return "1"
Ejemplo n.º 43
0
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise e

    if len(add_info) != 0:
        uid_list = add_info.keys()
        evaluate_history_results = es_user_portrait.mget(
            index=copy_portrait_index_name,
            doc_type=copy_portrait_index_type,
            body={'ids': uid_list})['docs']
        '''
        del_date = ts2datetime(time.time() - DAY*31)
        del_activeness_key = 'activeness_'+del_date
        del_influence_key = del_date
        '''
        iter_count = 0
        for uid in uid_list:
            try:
                user_history_item = evaluate_history_results[iter_count][
                    '_source']
            except:
                user_history_item = {}
            try:
                user_history_item.pop(del_activeness_key)
Ejemplo n.º 44
0
def get_scan_results():
    result_dict = {}
    gender_result = {'1': 0, '2': 0}
    verified_result = {'yes': 0, 'no': 0}
    location_result = {}
    activity_geo_result = {}
    keywords_result = {}
    hashtag_result = {}
    topic_result = {}
    online_pattern_result = {}
    domain_result = {}
    no_gender_count = 0
    no_verified_count = 0
    no_location_count = 0
    no_activity_geo_count = 0
    no_keywords_count = 0
    no_hashtag_count = 0
    no_topic_count = 0
    no_online_pattern_count = 0
    no_domain_count = 0
    s_re = scan(es_user_portrait,
                query={
                    'query': {
                        'match_all': {}
                    },
                    'size': 100
                },
                index=portrait_index_name,
                doc_type=portrait_index_type)
    print 's_re:', s_re
    activity_count = 0
    while True:
        portrait_uid_list = []
        while True:
            try:
                scan_re = s_re.next()['_source']
                # gender ratio count
                portrait_uid_list.append(scan_re['uid'])
                try:
                    gender_result[str(scan_re['gender'])] += 1
                except:
                    no_gender_count += 1
                # verified ratio count
                try:
                    verified_result[str(scan_re['verified'])] += 1
                except:
                    no_verified_count += 1
                # loation top
                try:
                    location = scan_re['location']
                    if len(location.split(' ')) > 1:
                        location = location.split(' ')[0]
                    try:
                        location_result[location] += 1
                    except:
                        location_result[location] = 1
                except:
                    no_location_count += 1
                # activity geo
                try:
                    activity_geo = scan_re['activity_geo_dict']
                    if activity_geo:
                        activity_geo_dict = json.loads(activity_geo)[-1]
                        for geo in activity_geo_dict:
                            geo_list = geo.split('\t')
                            if geo_list[0] == u'中国' and len(geo_list) >= 2:
                                province = geo_list[1]
                                try:
                                    activity_geo_result[
                                        province] += activity_geo_dict[geo]
                                except:
                                    activity_geo_result[
                                        province] = activity_geo_dict[geo]
                except:
                    no_activity_geo_count += 1
                # keywords
                try:
                    keywords = json.loads(scan_re['keywords'])
                    if keywords:
                        for word in keywords:
                            try:
                                keywords_result[word] += keywords[word]
                            except:
                                keywords_result[word] = keywords[word]
                except:
                    no_keywords_count += 1
                # hashtag top
                try:
                    hashtag_dict = json.loads(scan_re['hashtag_dict'])
                    if hashtag_dict:
                        for tag in hashtag_dict:
                            try:
                                hashtag_result[tag] += hashtag_dict[tag]
                            except:
                                hashtag_result[tag] = hashtag_dict[tag]
                except:
                    no_hashtag_count += 1
                # topic top
                try:
                    topic = scan_re['topic_string']
                    if topic:
                        topic_list = topic.split('&')
                        for item in topic_list:
                            try:
                                topic_result[item] += 1
                            except:
                                topic_result[item] = 1
                except:
                    no_topic_count += 1
                # online pattern top
                try:
                    online_pattern = json.loads(scan_re['online_pattern'])
                    if online_pattern:
                        for item in online_pattern:
                            try:
                                online_pattern_result[item] += online_pattern[
                                    item]
                            except:
                                online_pattern_result[item] = online_pattern[
                                    item]
                except:
                    no_online_pattern_count += 1
                # domain top
                try:
                    domain = scan_re['domain']
                    if domain:
                        try:
                            domain_result[domain] += 1
                        except:
                            domain_result[domain] = 1
                except:
                    no_domain_count += 1
            except StopIteration:
                print 'all done'
                now_ts = time.time()
                now_date = ts2datetime(now_ts - DAY)
                index_time = ''.join(now_date.split('-'))
                #test
                index_time = '20130907'
                # gender ratio count
                #count = sum(gender_result.values())
                all_count = es_user_portrait.count(index=portrait_index_name,doc_type=portrait_index_type,\
                    body={'query':{'match_all':{}}})['count']
                count = all_count
                print "count:", count
                gender_ratio = {
                    '1': float(gender_result['1']) / count,
                    '2': float(gender_result['2']) / count
                }
                #print 'gender ratio:', gender_ratio
                activity_result = es_user_portrait.mget(
                    index='bci_' + index_time,
                    doc_type='bci',
                    body={'ids': portrait_uid_list})['docs']
                for activity_item in activity_result:
                    if activity_item['found']:
                        activity_count += 1
                #print 'activity_count:', activity_count
                result_dict['activity_count'] = float(activity_count) / count
                result_dict['gender_ratio'] = json.dumps(gender_ratio)
                # verified ratio count
                count = sum(verified_result.values())
                if count == 0:
                    verified_ratio = {'yes': 0.5, 'no': 0.5}
                else:
                    verified_ratio = {
                        'yes': float(verified_result['yes']) / count,
                        'no': float(verified_result['no']) / count
                    }
                #print 'verified ratio:', verified_ratio
                result_dict['verified_ratio'] = json.dumps(verified_ratio)
                # location top
                if location_result:
                    sort_location = sorted(location_result.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
                    location_top = sort_location[:5]
                else:
                    location_top = {}
                #print 'location top:', location_top
                result_dict['location_top'] = json.dumps(location_top)
                # activity geo top
                if activity_geo_result:
                    sort_activity_geo = sorted(activity_geo_result.items(),
                                               key=lambda x: x[1],
                                               reverse=True)
                    activity_geo_top = sort_activity_geo[:50]
                else:
                    activity_geo_top = {}
                #print 'activity_geo_top:', activity_geo_top
                result_dict['activity_geo_top'] = json.dumps(activity_geo_top)
                # keywords top
                if keywords_result:
                    sort_keywords = sorted(keywords_result.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
                    keywords_top = sort_keywords[:50]
                else:
                    keywords_top = {}
                #print 'keywords_top:', keywords_top
                result_dict['keywords_top'] = json.dumps(keywords_top)
                # hashtag top
                if hashtag_result:
                    sort_hashtag = sorted(hashtag_result.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
                    hashtag_top = sort_hashtag[:50]
                else:
                    hashtag_top = {}
                #print 'hashtag top:', hashtag_top
                result_dict['hashtag_top'] = json.dumps(hashtag_top)
                # topic top
                if topic_result:
                    sort_topic = sorted(topic_result.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
                    topic_top = sort_topic[:50]
                else:
                    topic_top = {}
                #print 'topic top:', topic_top
                result_dict['topic_top'] = json.dumps(topic_top)
                # online_pattern top
                if online_pattern_result:
                    sort_online_pattern = sorted(online_pattern_result.items(),
                                                 key=lambda x: x[1],
                                                 reverse=True)
                    online_pattern_top = sort_online_pattern[:50]
                else:
                    online_pattern_top = {}
                #print 'online pattern top:', online_pattern_top
                result_dict['online_pattern_top'] = json.dumps(
                    online_pattern_top)
                # domain top
                if domain_result:
                    sort_domain = sorted(domain_result.items(),
                                         key=lambda x: x[1],
                                         reverse=True)
                    domain_top = sort_domain[:20]
                else:
                    domain_top = {}
                result_dict['domain_top'] = json.dumps(domain_top)
                result_dict['domain_top_user'] = json.dumps(
                    get_domain_top_user(domain_top))
                result_dict['topic_top_user'] = json.dumps(
                    get_topic_top_user(topic_top))
                return result_dict
            except Exception as r:
                print Exception, r
                return result_dict
        # note: this block is unreachable as written -- the inner loop above only
        # exits through the return statements in its StopIteration/Exception handlers
        activity_result = es.mget(index='20130907',
                                  doc_type='bci',
                                  body={'ids': portrait_uid_list})['docs']
        for activity_item in activity_result:
            if activity_item['found']:
                activity_count += 1
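A side note on the example above: every per-field tally in get_scan_results repeats the same try/except increment pattern on a plain dict before sorting it. The sketch below is not part of the original example; it shows the same aggregation idea with collections.Counter from the standard library. count_top_fields and scan_docs are hypothetical names, and the field names mirror the portrait fields used above, which are assumed to be stored as JSON-encoded term-to-weight dicts.

import json
from collections import Counter

def count_top_fields(scan_docs, top_n=50):
    # scan_docs: an iterable of '_source' dicts as yielded by the scan helper
    keywords_counter = Counter()
    hashtag_counter = Counter()
    for source in scan_docs:
        # Counter.update accepts a mapping and adds the weights per key
        keywords_counter.update(json.loads(source.get('keywords', '{}')))
        hashtag_counter.update(json.loads(source.get('hashtag_dict', '{}')))
    # most_common returns (term, weight) pairs sorted by weight, descending,
    # like the sorted(..., reverse=True)[:50] calls in the example above
    return {
        'keywords_top': keywords_counter.most_common(top_n),
        'hashtag_top': hashtag_counter.most_common(top_n),
    }

Because Counter handles missing keys and the descending sort, the nested try/except blocks and explicit sorted() calls are no longer needed for these fields.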
Ejemplo n.º 45
0
def get_influence_vary_top():
    result = []
    query_body = {
        'query':{
            'match_all':{}
            },
        'size': 10000,
        'sort':[{'vary':{'order': 'desc'}}]
        }
    try:
        es_result =  es.search(index='vary', doc_type='bci', body=query_body)['hits']['hits']
    except Exception as e:
        raise e
    uid_list = [user_dict['_id'] for user_dict in es_result]
    #print 'uid_list:', uid_list
    portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
    #print 'portrait_result:', portrait_result
    count = 0
    for i in range(len(portrait_result)):
        if count >=100:
            break
        #print 'portrait_result:', portrait_result
        if portrait_result[i]['found']:
            uid = portrait_result[i]['_source']['uid']
            uname = portrait_result[i]['_source']['uname']
            vary = es_result[i]['_source']['vary']
            result.append([uid, uname, vary])
            count += 1
        else:
            continue
    #print 'result:', result