def tag_vector(uid, date):
    """Return a one-element list holding the influence tag of *uid* on *date*.

    The user's BCI document is read from the index named ``pre_index`` plus
    *date* with its dashes removed.  When that document cannot be fetched,
    ``[influence_tag["0"]]`` is returned.  Otherwise two sums are taken from
    the document's JSON-encoded detail fields:

    * origin total: values of ``origin_weibo_retweeted_detail`` plus values
      of ``origin_weibo_comment_detail``, compared with
      ``retweeted_threshold``;
    * retweeted total: values of ``retweeted_weibo_retweeted_detail`` plus
      values of ``retweeted_weibo_comment_detail``, compared with
      ``comment_threshold``.

    The tag key is '3' when both thresholds are reached, '1' when only the
    origin total reaches its threshold, '2' when only the retweeted total
    does, and '4' when neither does.

    Args:
        uid: document id of the user in the BCI index.
        date: date of the index; ``str(date)`` has its '-' removed.

    Returns:
        A list containing exactly one entry of ``influence_tag``.
    """
    index_name = pre_index + str(date).replace('-', '')

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        # Best effort: any lookup failure yields the default tag, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return [influence_tag["0"]]

    def detail_total(field):
        # Each detail field is a JSON object; only its values are summed.
        return sum(json.loads(bci_result[field]).values())

    # NOTE(review): the sums are grouped by origin/retweeted weibo while the
    # thresholds are named retweeted/comment -- confirm this pairing is
    # intended; it is kept exactly as the original computed it.
    origin_total = (detail_total("origin_weibo_retweeted_detail")
                    + detail_total("origin_weibo_comment_detail"))
    retweeted_total = (detail_total("retweeted_weibo_retweeted_detail")
                       + detail_total("retweeted_weibo_comment_detail"))

    origin_reached = origin_total >= retweeted_threshold
    retweeted_reached = retweeted_total >= comment_threshold
    if origin_reached:
        key = '3' if retweeted_reached else '1'
    else:
        key = '2' if retweeted_reached else '4'
    return [influence_tag[key]]
def _influence_flow_text_query(uid, message_type, sensitive):
    """Build the flow-text filter query for *uid*'s weibo of *message_type*."""
    must = []
    if sensitive:
        must.append({"range": {"sensitive": {"gt": 0}}})
    must.extend([{"term": {"message_type": message_type}}, {"term": {"uid": uid}}])
    return {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must
                    }
                }
            }
        },
        "size": 1000
    }


def statistics_influence_people(uid, date, style, sensitive=0):
    """Return details of the users who retweeted or commented on *uid*'s weibo.

    The function first checks that a BCI document exists for *uid* in the
    index ``pre_index`` + *date* (dashes removed); if the lookup fails an
    empty dict is returned.  It then collects, from the flow-text index
    ``pre_text_index`` + *date*:

    * the ``_id`` of every hit with ``message_type`` 1 for this user, and
    * the non-empty ``root_mid`` of every hit with ``message_type`` 3 for
      this user,

    and hands both lists to ``influenced_user_detail``.

    Args:
        uid: user id, used both as BCI document id and as ``uid`` term.
        date: date string appended to ``pre_text_index``.
        style: ``int(style) == 0`` passes 3 as the last argument of
            ``influenced_user_detail`` (retweeted); any other value passes
            2 (comment).
        sensitive: when truthy, only hits with ``sensitive`` > 0 are used.

    Returns:
        Whatever ``influenced_user_detail`` returns, or ``{}`` when the BCI
        document cannot be fetched.
    """
    results = {}
    index_name = pre_index + str(date).replace('-', '')
    # Debug output kept from the original; the parenthesised form prints the
    # same text under Python 2 and is valid Python 3.
    print(index_name)
    index_flow_text = pre_text_index + date

    try:
        # Only the existence of the BCI document matters here.
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return results

    origin_query = _influence_flow_text_query(uid, 1, sensitive)
    origin_hits = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=origin_query)["hits"]["hits"]
    origin_mid = [hit['_id'] for hit in origin_hits]

    # The retweeted search needs its own message_type/uid filter; without it
    # the search would match weibo of every user in the index.
    retweeted_query = _influence_flow_text_query(uid, 3, sensitive)
    retweeted_hits = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=retweeted_query)["hits"]["hits"]
    retweeted_mid = [
        hit['_source']['root_mid']
        for hit in retweeted_hits
        if hit['_source'].get('root_mid', '')
    ]

    if int(style) == 0:  # retweeted
        results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 3)
    else:  # comment
        results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 2)
    return results
def comment_on_influence(uid, date):
    """Return textual conclusions about the influence of *uid* on *date*.

    The user's BCI document is read from the index ``pre_index`` + *date*
    (dashes removed).

    Args:
        uid: document id of the user in the BCI index.
        date: date of the index; ``str(date)`` has its '-' removed.

    Returns:
        A two-element list ``[result, underline]``.

        * When the document cannot be fetched: ``result`` holds only
          ``CURRENT_INFLUENCE_CONCLUSION['0']`` and ``underline`` is empty.
        * Otherwise ``result[0]`` is the conclusion for the bucket that
          ``user_index`` falls into, followed by two entries for each of
          the four indicators (total conclusion, burst conclusion), with
          ``''`` as placeholder for anything not reached.  ``underline``
          gets one entry per indicator: ``UNDERLINE_CONCLUSION[i]`` when
          both the total and the burst threshold are exceeded, else ``''``.
    """
    result = []
    underline = []
    index_name = pre_index + str(date).replace('-', '')

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        # Best effort: any lookup failure yields the lowest conclusion, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        result.append(CURRENT_INFLUENCE_CONCLUSION['0'])
        return [result, underline]

    user_index = bci_result['user_index']
    # Bucket k is the first threshold that user_index stays below; values at
    # or above the fifth threshold land in bucket 5.
    level = 5
    for candidate in range(5):
        if user_index < CURRNET_INFLUENCE_THRESHOULD[candidate]:
            level = candidate
            break
    result.append(CURRENT_INFLUENCE_CONCLUSION[str(level)])

    for i in range(4):
        total_reached = bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]
        # The burst value is only looked at once the total threshold is exceeded.
        burst_reached = total_reached and bci_result[INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]
        result.append(INFLUENCE_TOTAL_CONCLUSION[i] if total_reached else '')
        result.append(INFLUENCE_BRUST_CONCLUSION[i] if burst_reached else '')
        underline.append(UNDERLINE_CONCLUSION[i] if burst_reached else '')

    return [result, underline]
def _bci_search_weibo(text_index, message_type, uid, sensitive):
    """Return the search hits for *uid*'s weibo of *message_type* in *text_index*."""
    query = query_body(message_type, uid)
    if sensitive:
        # list.append returns None, so the filter is added in place and the
        # query dict itself is what gets sent as the search body.
        query["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"range": {"sensitive": {"gt": 0}}}
        )
    return es_text.search(index=text_index, doc_type="text", body=query)["hits"]["hits"]


def _bci_summarize_weibo(hits, sensitive):
    """Return (retweeted total, comment total, retweeted top, comment top, sensitive word counts) of *hits*."""
    retweeted_total = 0
    comment_total = 0
    retweeted_top = 0
    comment_top = 0
    sensitive_words = dict()
    for hit in hits:
        source = hit["_source"]
        retweeted = source.get("retweeted", 0)
        comment = source.get("comment", 0)
        retweeted_total += retweeted
        comment_total += comment
        if retweeted_top < retweeted:
            retweeted_top = retweeted
        if comment_top < comment:
            comment_top = comment
        if sensitive:
            words = json.loads(source["sensitive_words_dict"])
            if words:
                for word, count in words.items():
                    if word in sensitive_words:
                        sensitive_words[word] += count
                    else:
                        sensitive_words[word] = count
    return retweeted_total, comment_total, retweeted_top, comment_top, sensitive_words


def _bci_average(total, count):
    """Return ``total / count``, or 0 when *count* is zero."""
    # NOTE: `/` is kept as written in the original; under Python 2 without
    # `from __future__ import division` it floors integer operands.
    return total / count if count else 0


def bci_detail(date, uid, sensitive=0):
    """Collect retweet/comment statistics for the weibo of *uid* on *date*.

    Two searches are run against the index ``"flow_text_" + date``: one with
    ``query_body(1, uid)`` (original weibo) and one with
    ``query_body(3, uid)`` (retweeted weibo).  For each group the number of
    hits and the total, average and maximum of the hits' ``retweeted`` and
    ``comment`` fields are reported.

    Args:
        date: date string; appended as-is to ``"flow_text_"`` and, with its
            dashes removed, to ``"bci_"``.
        uid: user id passed to ``query_body`` and used as BCI document id.
        sensitive: when falsy, the user's BCI document is read and its
            burst averages and ``user_index`` are added to the result
            (defaulting to 0 if the document or a field is missing).  When
            truthy, both searches are restricted to hits with
            ``sensitive`` > 0 and the per-word counts from each hit's
            ``sensitive_words_dict`` are added instead; the BCI document is
            not read.

    Returns:
        A dict with the keys ``origin_weibo_number``,
        ``retweeted_weibo_number`` and, for each of the prefixes
        ``origin_weibo_retweeted``, ``origin_weibo_comment``,
        ``retweeted_weibo_retweeted`` and ``retweeted_weibo_comment``, the
        suffixes ``_total_number``, ``_average_number`` and ``_top_number``.
        Non-sensitive mode adds the four ``*_brust_average`` keys and
        ``user_index``; sensitive mode adds
        ``origin_sensitive_words_list`` / ``retweeted_sensitive_words_list``
        (``(word, count)`` pairs sorted by count, descending) and the
        matching ``*_sensitive_words_number`` keys.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace("-", "")
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)["_source"]
        except Exception:
            # Best effort: a missing BCI document leaves every BCI field at 0.
            bci_result = dict()

    text_index = "flow_text_" + date
    origin_text = _bci_search_weibo(text_index, 1, uid, sensitive)
    retweeted_text = _bci_search_weibo(text_index, 3, uid, sensitive)

    origin_weibo_number = len(origin_text)
    retweeted_weibo_number = len(retweeted_text)

    (origin_retweet_total, origin_comment_total,
     origin_retweet_top, origin_comment_top,
     origin_sensitive_words) = _bci_summarize_weibo(origin_text, sensitive)
    (retweet_retweet_total, retweet_comment_total,
     retweet_retweet_top, retweet_comment_top,
     retweeted_sensitive_words) = _bci_summarize_weibo(retweeted_text, sensitive)

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    result["origin_weibo_retweeted_total_number"] = origin_retweet_total
    result["origin_weibo_comment_total_number"] = origin_comment_total
    result["retweeted_weibo_retweeted_total_number"] = retweet_retweet_total
    result["retweeted_weibo_comment_total_number"] = retweet_comment_total
    result["origin_weibo_retweeted_average_number"] = _bci_average(origin_retweet_total, origin_weibo_number)
    result["origin_weibo_comment_average_number"] = _bci_average(origin_comment_total, origin_weibo_number)
    result["retweeted_weibo_retweeted_average_number"] = _bci_average(retweet_retweet_total, retweeted_weibo_number)
    result["retweeted_weibo_comment_average_number"] = _bci_average(retweet_comment_total, retweeted_weibo_number)
    result["origin_weibo_retweeted_top_number"] = origin_retweet_top
    result["origin_weibo_comment_top_number"] = origin_comment_top
    result["retweeted_weibo_retweeted_top_number"] = retweet_retweet_top
    result["retweeted_weibo_comment_top_number"] = retweet_comment_top
    if not sensitive:
        for key in (
            "origin_weibo_comment_brust_average",
            "origin_weibo_retweeted_brust_average",
            "retweeted_weibo_comment_brust_average",
            "retweeted_weibo_retweeted_brust_average",
            "user_index",
        ):
            result[key] = bci_result.get(key, 0)
    else:
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted_sensitive_words.items(), key=lambda x: x[1], reverse=True
        )
        result["origin_sensitive_words_list"] = sorted(
            origin_sensitive_words.items(), key=lambda x: x[1], reverse=True
        )
        result["retweeted_sensitive_words_number"] = len(retweeted_sensitive_words)
        result["origin_sensitive_words_number"] = len(origin_sensitive_words)

    return result
# Example #5
# 0
def _bci_search_weibo(text_index, message_type, uid, sensitive):
    """Return the search hits for *uid*'s weibo of *message_type* in *text_index*."""
    query = query_body(message_type, uid)
    if sensitive:
        # list.append returns None, so the filter is added in place and the
        # query dict itself is what gets sent as the search body.
        query["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"range": {"sensitive": {"gt": 0}}}
        )
    return es_text.search(index=text_index, doc_type="text", body=query)["hits"]["hits"]


def _bci_summarize_weibo(hits, sensitive):
    """Return (retweeted total, comment total, retweeted top, comment top, sensitive word counts) of *hits*."""
    retweeted_total = 0
    comment_total = 0
    retweeted_top = 0
    comment_top = 0
    sensitive_words = dict()
    for hit in hits:
        source = hit["_source"]
        retweeted = source.get("retweeted", 0)
        comment = source.get("comment", 0)
        retweeted_total += retweeted
        comment_total += comment
        if retweeted_top < retweeted:
            retweeted_top = retweeted
        if comment_top < comment:
            comment_top = comment
        if sensitive:
            words = json.loads(source["sensitive_words_dict"])
            if words:
                for word, count in words.items():
                    if word in sensitive_words:
                        sensitive_words[word] += count
                    else:
                        sensitive_words[word] = count
    return retweeted_total, comment_total, retweeted_top, comment_top, sensitive_words


def _bci_average(total, count):
    """Return ``total / count``, or 0 when *count* is zero."""
    # NOTE: `/` is kept as written in the original; under Python 2 without
    # `from __future__ import division` it floors integer operands.
    return total / count if count else 0


def bci_detail(date, uid, sensitive=0):
    """Collect retweet/comment statistics for the weibo of *uid* on *date*.

    Two searches are run against the index ``"flow_text_" + date``: one with
    ``query_body(1, uid)`` (original weibo) and one with
    ``query_body(3, uid)`` (retweeted weibo).  For each group the number of
    hits and the total, average and maximum of the hits' ``retweeted`` and
    ``comment`` fields are reported.

    Args:
        date: date string; appended as-is to ``"flow_text_"`` and, with its
            dashes removed, to ``"bci_"``.
        uid: user id passed to ``query_body`` and used as BCI document id.
        sensitive: when falsy, the user's BCI document is read and its
            burst averages and ``user_index`` are added to the result
            (defaulting to 0 if the document or a field is missing).  When
            truthy, both searches are restricted to hits with
            ``sensitive`` > 0 and the per-word counts from each hit's
            ``sensitive_words_dict`` are added instead; the BCI document is
            not read.

    Returns:
        A dict with the keys ``origin_weibo_number``,
        ``retweeted_weibo_number`` and, for each of the prefixes
        ``origin_weibo_retweeted``, ``origin_weibo_comment``,
        ``retweeted_weibo_retweeted`` and ``retweeted_weibo_comment``, the
        suffixes ``_total_number``, ``_average_number`` and ``_top_number``.
        Non-sensitive mode adds the four ``*_brust_average`` keys and
        ``user_index``; sensitive mode adds
        ``origin_sensitive_words_list`` / ``retweeted_sensitive_words_list``
        (``(word, count)`` pairs sorted by count, descending) and the
        matching ``*_sensitive_words_number`` keys.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace('-', '')
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)['_source']
        except Exception:
            # Best effort: a missing BCI document leaves every BCI field at 0.
            bci_result = dict()

    text_index = "flow_text_" + date
    origin_text = _bci_search_weibo(text_index, 1, uid, sensitive)
    retweeted_text = _bci_search_weibo(text_index, 3, uid, sensitive)

    origin_weibo_number = len(origin_text)
    retweeted_weibo_number = len(retweeted_text)

    (origin_retweet_total, origin_comment_total,
     origin_retweet_top, origin_comment_top,
     origin_sensitive_words) = _bci_summarize_weibo(origin_text, sensitive)
    (retweet_retweet_total, retweet_comment_total,
     retweet_retweet_top, retweet_comment_top,
     retweeted_sensitive_words) = _bci_summarize_weibo(retweeted_text, sensitive)

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    result["origin_weibo_retweeted_total_number"] = origin_retweet_total
    result["origin_weibo_comment_total_number"] = origin_comment_total
    result["retweeted_weibo_retweeted_total_number"] = retweet_retweet_total
    result["retweeted_weibo_comment_total_number"] = retweet_comment_total
    result["origin_weibo_retweeted_average_number"] = _bci_average(
        origin_retweet_total, origin_weibo_number)
    result["origin_weibo_comment_average_number"] = _bci_average(
        origin_comment_total, origin_weibo_number)
    result["retweeted_weibo_retweeted_average_number"] = _bci_average(
        retweet_retweet_total, retweeted_weibo_number)
    result["retweeted_weibo_comment_average_number"] = _bci_average(
        retweet_comment_total, retweeted_weibo_number)
    result["origin_weibo_retweeted_top_number"] = origin_retweet_top
    result["origin_weibo_comment_top_number"] = origin_comment_top
    result["retweeted_weibo_retweeted_top_number"] = retweet_retweet_top
    result["retweeted_weibo_comment_top_number"] = retweet_comment_top
    if not sensitive:
        for key in (
            "origin_weibo_comment_brust_average",
            "origin_weibo_retweeted_brust_average",
            "retweeted_weibo_comment_brust_average",
            "retweeted_weibo_retweeted_brust_average",
            "user_index",
        ):
            result[key] = bci_result.get(key, 0)
    else:
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted_sensitive_words.items(),
            key=lambda x: x[1],
            reverse=True)
        result["origin_sensitive_words_list"] = sorted(
            origin_sensitive_words.items(),
            key=lambda x: x[1],
            reverse=True)
        result["retweeted_sensitive_words_number"] = len(
            retweeted_sensitive_words)
        result["origin_sensitive_words_number"] = len(
            origin_sensitive_words)

    return result