Code example #1
def get_community_keyword(uid_list, date_time):
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                }
            }
        },
        'aggs': {
            'keywords': {
                'terms': {
                    'field': 'keywords_string',
                    'size': 1000
                }
            }
        }
    }
    flow_text_index_name_list = get_flow_text_index_list(date_time)
    flow_text_exist = es_flow_text.search(index = flow_text_index_name_list,doc_type = flow_text_index_type,\
               body = query_body)['aggregations']['keywords']['buckets']

    word_dict = dict()
    word_dict_new = dict()

    keyword_list = []
    for item in flow_text_exist:
        word = item['key']
        count = item['doc_count']
        word_dict[word] = count
        keyword_list.append(word)

    # Join on '&'; plain concatenation would leave a stray leading '&'
    keywords_string = '&'.join(keyword_list)

    k_dict = extract_keywords(keywords_string)

    for item_item in k_dict:
        keyword = item_item.word
        # print 'keyword::',keyword,type(keyword)
        if keyword in word_dict:
            word_dict_new[keyword] = word_dict[keyword]
        else:
            word_dict_new[keyword] = 1

    keyword_dict = sorted(word_dict_new.items(),
                          key=lambda d: d[1],
                          reverse=True)
    #print 'keyword_dict',keyword_dict,keyword_dict[0],type(keyword_dict[0])
    try:
        keyword_name = keyword_dict[0][0] + '_' + keyword_dict[1][0]
    except IndexError:
        # fewer than two keywords were extracted
        keyword_name = 'X'
    return json.dumps(keyword_dict), keyword_name
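A minimal usage sketch; the uids, timestamp, and returned shapes below are illustrative, and the index helpers and ES handles come from the surrounding project:

# Rank the shared keywords of a community of uids for one day
keyword_json, community_name = get_community_keyword(['123456', '654321'], 1526169600)
# keyword_json: JSON-encoded list of [keyword, count] pairs sorted by count, descending
# community_name: the top two keywords joined as 'kw1_kw2', or 'X' if fewer than two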
Code example #2
File: target_user.py  Project: yuanhuiru/xnr2
def get_user_keywords(uid,today_datetime):
    flow_text_index_list = get_flow_text_index_list(today_datetime)
    query_body={
        '_source':['keywords_string'],
        'query':{
            'filtered':{
                'filter':{
                    'bool':{
                        'must':[
                            {'term':{'uid':uid}}
                        ]
                    }
                }
            }
        },
        'size':MAX_SEARCH_SIZE
    }
    results = es_flow_text.search(index=flow_text_index_list,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
    # print results
    keywords_list = []
    for item in results:
        keywords_list.extend(item['_source']['keywords_string'].split('&'))
    temp_keywords = list(set(keywords_list))
    keywords = '&'.join(temp_keywords)
    # print keywords
    return keywords
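Usage sketch; the uid and timestamp are placeholders:

# Deduplicated '&'-joined keywords for one user across the day's flow-text indices
keywords = get_user_keywords('2803301701', 1526169600)
keyword_list = keywords.split('&') if keywords else []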
Code example #3
def followers_domain_update():

    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE)

    else:
        current_time = int(time.time())

    flow_text_index_name_list = get_flow_text_index_list(current_time)

    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}

    search_results = es_xnr.search(index=weibo_xnr_fans_followers_index_name,\
     doc_type=weibo_xnr_fans_followers_index_type,body=query_body)['hits']['hits']
    followers_list_all = []
    for result in search_results:
        result = result['_source']
        followers_list = result['followers_list']
        followers_list_all.extend(followers_list)

    followers_list_all_set_list = list(set(followers_list_all))

    uid_weibo_keywords_dict, keywords_dict_all_users = uid_list_2_uid_keywords_dict(
        followers_list_all_set_list, flow_text_index_name_list)
    # Skip invalid uids in the followers list; only recently active uids are meaningful
    uids_active_list = uid_weibo_keywords_dict.keys()

    ## Domain classification
    r_domain = dict()
    print 'uids_active_list::', uids_active_list

    domain, r_domain = domain_classfiy(uids_active_list,
                                       uid_weibo_keywords_dict)
    print 'r_domain::', r_domain

    for uid, domain in r_domain.iteritems():
        domain_name = domain_en2ch_dict[domain]
        _id = uid
        try:
            print '_id:::', _id
            get_result = es_xnr.get(index=user_domain_index_name,doc_type=user_domain_index_type,\
             id=_id)['_source']

            get_result['domain_name'] = domain_name
            get_result['update_time'] = int(time.time())
            es_xnr.update(index=user_domain_index_name,doc_type=user_domain_index_type,\
             id=_id,body={'doc':get_result})

        except:
            item_dict = {}
            item_dict['uid'] = uid
            item_dict['domain_name'] = domain_name
            item_dict['update_time'] = int(time.time())

            es_xnr.index(index=user_domain_index_name,doc_type=user_domain_index_type,\
             id=_id,body=item_dict)
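The try/except above is an upsert: update the user's domain document if it exists, otherwise create it. A sketch of the same effect in a single elasticsearch-py call, assuming the same index constants and the loop variables above:

es_xnr.update(index=user_domain_index_name, doc_type=user_domain_index_type, id=uid,
              body={'doc': {'uid': uid, 'domain_name': domain_name,
                            'update_time': int(time.time())},
                    'doc_as_upsert': True})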
Code example #4
File: target_user.py  Project: yuanhuiru/xnr2
def caculate_sensitive_user(today_datetime):
    flow_text_index_list = get_flow_text_index_list(today_datetime)
    # print flow_text_index_list
    # Find the users with the highest sensitivity
    query_body={
        'query':{
            'filtered':{
                'filter':{
                    'bool':{
                        'must':[
                            {'range':{'sensitive':{'gt':0}}}
                        ]
                    }
                }
            }
        },
        'aggs':{
            'user_sensitive_sum':{
                'terms':{'field':'uid','size':MAX_CACULATE_USER_NUM,'order':[{'sensitive_sum':'desc'}]},
                'aggs':{
                    'sensitive_sum':{
                        'sum':{'field':'sensitive'}
                    }
                }
            }
        }
    }

    sensitive_uid_info = []
    sensitive_uidlist = []
    try:
        sensitive_result = es_user_portrait.search(index=flow_text_index_list,doc_type=flow_text_index_type,\
            body=query_body)['aggregations']['user_sensitive_sum']['buckets']
        # print sensitive_result
        for bucket in sensitive_result:
            user_dict = dict()
            user_dict['uid'] = bucket['key']
            user_dict['sensitive'] = bucket['sensitive_sum']['value']
            sensitive_uid_info.append(user_dict)
            sensitive_uidlist.append(bucket['key'])
    except Exception:
        # e.g. the daily index does not exist yet
        sensitive_uid_info = []
        sensitive_uidlist = []

    return sensitive_uidlist,sensitive_uid_info
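Usage sketch; the timestamp and return values are illustrative:

sensitive_uidlist, sensitive_uid_info = caculate_sensitive_user(1526169600)
# sensitive_uidlist -> ['uid1', 'uid2', ...], ordered by summed sensitivity, descending
# sensitive_uid_info -> [{'uid': 'uid1', 'sensitive': 12.5}, ...]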
Code example #5
File: fb_domain_role_base.py  Project: yuanhuiru/xnr2
def get_psy_feature_sort(uids_list, create_time):

    index_name_list = get_flow_text_index_list(create_time)

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        'uid': uids_list
                    }
                }
            }
        },
        'aggs': {
            'sentiment_all': {
                'terms': {
                    'field': 'sentiment',
                    'size': MAX_SEARCH_SIZE
                }
            }
        }
    }

    es_sentiment_counts = es_flow_text.search(index=index_name_list,doc_type=flow_text_index_type,\
                                body=query_body)['aggregations']['sentiment_all']['buckets']
    sentiment_dict = dict()

    for item in es_sentiment_counts:
        sen_no = str(item['key'])
        sen_count = item['doc_count']
        sen_zh = SENTIMENT_DICT_NEW[sen_no]
        sentiment_dict[sen_zh] = sen_count

    sentiment_sort = sorted(sentiment_dict.items(),
                            key=lambda x: x[1],
                            reverse=True)

    return sentiment_sort
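Usage sketch; the uids and timestamp are placeholders, and the labels come from SENTIMENT_DICT_NEW:

sentiment_sort = get_psy_feature_sort(['uid1', 'uid2'], 1526169600)
# -> [(label_1, 40), (label_2, 12), ...], sentiment labels sorted by post count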
Code example #6
import re
import sys
import time

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from textrank4zh import TextRank4Keyword, TextRank4Sentence
sys.path.append('../')
from global_utils import es_flow_text, flow_text_index_name_pre, flow_text_index_type
from global_config import S_DATE
from time_utils import get_flow_text_index_list

abs_path = './'
K1 = 1.5
B = 0.75
K3 = 500
MAX_SIZE = 999999
OPINION_CLUSTER = 5

index_list = get_flow_text_index_list(
    int(time.mktime(time.strptime(S_DATE, "%Y-%m-%d"))))

##对微博文本进行预处理


def cut_filter(text):
    pattern_list = [r'\(分享自 .*\)', r'http://\w*']
    for i in pattern_list:
        p = re.compile(i)
        text = p.sub('', text)
    return text
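# A quick check of cut_filter (sample text and result are illustrative):
#   cut_filter(u'转发微博 http://abcdef (分享自 新浪微博)') -> u'转发微博  '
# Note: the link pattern uses \w*, so URLs containing '.' or '/' are only partially stripped.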


def re_cut(w_text):  # Filter out irrelevant content according to a few rules

    w_text = cut_filter(w_text)
    return w_text
Code example #7
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):

    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    if task_source == 'weibo':

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)

        else:
            current_time = int(time.time())

        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
        uid_list = []

        SHOULD_PERCENT = 1  # at least one keyword clause must match

        if intel_type == 'all':
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'follow':

            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                    id=xnr_user_no)['_source']

                # follow_results is the '_source' dict itself; read the follower uids directly
                if follow_results:
                    uid_list = follow_results['followers']
            except:
                uid_list = []

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)

            if S_TYPE == 'test':
                date = S_DATE_BCI

            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[
                5:7] + date[8:10]

            query_body_bci = {
                'query': {
                    'match_all': {}
                },
                'sort': {
                    'user_index': {
                        'order': 'desc'
                    }
                },
                'size': 500
            }

            weibo_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name,
                doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weino_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        else:
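            # Rank uids by mean sensitivity: a terms agg on uid, ordered by the
            # avg_sensitive sub-aggregation (descending)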

            query_sensitive = {
                'query': {
                    'match_all': {}
                },
                "aggs": {
                    "uids": {
                        "terms": {
                            "field": "uid",
                            "order": {
                                "avg_sensitive": "desc"
                            }
                        },
                        "aggs": {
                            "avg_sensitive": {
                                "avg": {
                                    "field": "sensitive"
                                }
                            }
                        }
                    }
                },
                'size': 500000
            }

            es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        # Collect tweets_list

        tweets_results = es_flow_text.search(index=index_name_list,
                                             doc_type='text',
                                             body=query_body)['hits']['hits']

        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [
            word.encode('utf-8') for word in opinion_keywords_list
        ]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation didn't fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

        SHOULD_PERCENT = 1  # at least one keyword clause must match

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    # follow_results is the '_source' dict itself; read the fan uids directly
                    if follow_results:
                        uid_list = follow_results['fans_list']
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name,
                    doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                #print 'fb_bci_results...',len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                #print 'es_sensitive_result...',len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            #print 'query_body...',query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,doc_type=tw_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    # follow_results is the '_source' dict itself; read the follower uids directly
                    if follow_results:
                        uid_list = follow_results['followers_list']
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name,
                    doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        topic_keywords_list = []
        summary_text_list = []

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)

        #try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        #summary = summary_main(summary_text_list)
        #except:
        #    summary = ''

    else:
        sub_opinion_results = {}
        summary = ''

    print 'Saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)

    return mark
Code example #8
def find_flow_texts(task_source, task_id, event_keywords):

    # Build nest_query_list
    nest_query_list = []

    keywords_list = event_keywords.split('&')
    keywords_list = [word.encode('utf-8') for word in keywords_list]
    query_item = 'text'
    if task_source != 'weibo':
        # The text may contain English or traditional Chinese, so match all variants

        en_keywords_list = trans(keywords_list, target_language='en')
        for i in range(len(keywords_list)):
            keyword = keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(keywords_list):  # make sure the translation didn't fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

    else:
        for keyword in keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
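    # e.g. for event_keywords u'kw1&kw2' (hypothetical), nest_query_list becomes
    # [{'wildcard': {'text': u'*kw1*'}}, {'wildcard': {'text': u'*kw2*'}}]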

    SHOULD_PERCENT = 1  # at least one keyword clause must match

    # Match the text
    if task_source == 'weibo':
        sort_item = 'retweeted'

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time() + 24 * 3600)
        # test:
        # current_time = int(datetime2ts("2018-05-13"))
        index_name_list = get_flow_text_index_list(current_time, days=2)
        es_name = es_flow_text

    elif task_source == 'facebook':
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time() + 24 * 3600)

        index_name_list = fb_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr

    else:
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_TW)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = tw_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT
            }
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': 100000
    }
    print 'es_name...', es_name
    print 'index_name_list..', index_name_list

    search_results = es_name.search(index=index_name_list,
                                    doc_type='text',
                                    body=query_body)['hits']['hits']
    print 'len..search_results..', len(search_results)
    save2topic_es(task_source, task_id, search_results)
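Usage sketch; the task id and keywords are placeholders:

# Match the last two days of weibo flow text against '&'-joined event keywords;
# the hits, sorted by retweet count, are handed to save2topic_es
find_flow_texts('weibo', 'task_001', u'kw1&kw2')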
Code example #9
def get_related_recommendation(task_detail):
    
    avg_sort_uid_dict = {}

    xnr_user_no = task_detail['xnr_user_no']
    sort_item = task_detail['sort_item']
    es_result = es.get(index=weibo_xnr_index_name,doc_type=weibo_xnr_index_type,id=xnr_user_no)['_source']
    uid = es_result['uid']

    monitor_keywords = es_result['monitor_keywords']
    
    monitor_keywords_list = monitor_keywords.split(',')

    nest_query_list = []
    #print 'monitor_keywords_list::',monitor_keywords_list
    for monitor_keyword in monitor_keywords_list:
        #print 'monitor_keyword::::',monitor_keyword
        nest_query_list.append({'wildcard':{'keywords_string':'*'+monitor_keyword+'*'}})
    
    try:
        recommend_list = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,id=xnr_user_no)['_source']['followers_list']
    except:
        recommend_list = []

    recommend_set_list = list(set(recommend_list))

    if S_TYPE == 'test':
        current_date = S_DATE
    else:
        current_date = ts2datetime(int(time.time()-24*3600))
    
    flow_text_index_name = flow_text_index_name_pre + current_date

    if sort_item != 'friend':

        uid_list = []
        #uid_list = recommend_set_list
        if sort_item == 'influence':
            sort_item = 'user_fansnum'
        query_body_rec = {
            'query':{
                
                'bool':{
                    'should':nest_query_list
                }
            },
            'aggs':{
                'uid_list':{
                    'terms':{'field':'uid','size':TOP_ACTIVE_SOCIAL,'order':{'avg_sort':'desc'} },
                    'aggs':{'avg_sort':{'avg':{'field':sort_item}}}

                }
            }
        }

        es_rec_result = es_flow_text.search(index=flow_text_index_name,doc_type='text',body=query_body_rec)['aggregations']['uid_list']['buckets']
        #print 'es_rec_result///',es_rec_result
        for item in es_rec_result:
            uid = item['key']
            uid_list.append(uid)
            
            avg_sort_uid_dict[uid] = {}

            if sort_item == 'user_fansnum':
                avg_sort_uid_dict[uid]['sort_item_value'] = int(item['avg_sort']['value'])
            else:
                avg_sort_uid_dict[uid]['sort_item_value'] = round(item['avg_sort']['value'],2)

    else:
        if S_TYPE == 'test':
            uid_list = FRIEND_LIST
            #sort_item = 'sensitive'
        else:
            uid_list = []
            '''
            friends_list_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':recommend_set_list})['docs']
            for result in friends_list_results:
                friends_list = friends_list + result['friend_list']
            '''
            friends_list = get_friends_list(recommend_set_list)

            friends_set_list = list(set(friends_list))

            #uid_list = friends_set_list

            sort_item_new = 'fansnum'

            query_body_rec = {
                'query':{
                    'bool':{
                        'must':[
                            {'terms':{'uid':friends_set_list}},
                            {'bool':{
                                'should':nest_query_list
                            }}
                        ]
                    }
                },
                'aggs':{
                    'uid_list':{
                        'terms':{'field':'uid','size':TOP_ACTIVE_SOCIAL,'order':{'avg_sort':'desc'} },
                        'aggs':{'avg_sort':{'avg':{'field':sort_item_new}}}

                    }
                }
            }
            es_friend_result = es_flow_text.search(index=flow_text_index_name,doc_type='text',body=query_body_rec)['aggregations']['uid_list']['buckets']
            
            for item in es_friend_result:
                uid = item['key']
                uid_list.append(uid)
                
                avg_sort_uid_dict[uid] = {}
                
                if not item['avg_sort']['value']:
                    avg_sort_uid_dict[uid]['sort_item_value'] = 0
                else:
                    avg_sort_uid_dict[uid]['sort_item_value'] = int(item['avg_sort']['value'])
                
    results_all = []

    for uid in uid_list:
        #if sort_item == 'friend':
        query_body = {
            'query':{
                'filtered':{
                    'filter':{
                        'term':{'uid':uid}
                    }
                }
            }
        }

        es_results = es_user_portrait.search(index=portrait_index_name,doc_type=portrait_index_type,body=query_body)['hits']['hits']

        if es_results:
            #print 'portrait--',es_results[0]['_source'].keys()
            for item in es_results:
                uid = item['_source']['uid']
                #nick_name,photo_url = uid2nick_name_photo(uid)
                item['_source']['nick_name'] = uid #nick_name
                item['_source']['photo_url'] = ''#photo_url
                weibo_type = judge_follow_type(xnr_user_no,uid)
                sensor_mark = judge_sensing_sensor(xnr_user_no,uid)

                item['_source']['weibo_type'] = weibo_type
                item['_source']['sensor_mark'] = sensor_mark
                try:
                    del item['_source']['group']
                    del item['_source']['activity_geo_dict']
                except:
                    pass


                if sort_item == 'friend':
                    if S_TYPE != 'test':  # in test mode keep the portrait fansnum as-is
                        item['_source']['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']
                elif sort_item == 'sensitive':
                    item['_source']['sensitive'] = avg_sort_uid_dict[uid]['sort_item_value']
                else:
                    item['_source']['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']

                if S_TYPE == 'test':
                    current_time = datetime2ts(S_DATE)
                else:
                    current_time = int(time.time())

                index_name = get_flow_text_index_list(current_time)

                query_body = {
                    'query':{
                        'bool':{
                            'must':[
                                {'term':{'uid':uid}},
                                {'terms':{'message_type':[1,3]}}
                            ]
                        }
                    },
                    'sort':{'retweeted':{'order':'desc'}},
                    'size':5
                }

                es_weibo_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']

                weibo_list = []
                for weibo in es_weibo_results:
                    weibo = weibo['_source']
                    weibo_list.append(weibo)
                item['_source']['weibo_list'] = weibo_list
                item['_source']['portrait_status'] = True
                results_all.append(item['_source'])
        else:
            item_else = dict()
            item_else['uid'] = uid
            #nick_name,photo_url = uid2nick_name_photo(uid)
            item_else['nick_name'] = uid#nick_name
            item_else['photo_url'] = ''#photo_url
            weibo_type = judge_follow_type(xnr_user_no,uid)
            sensor_mark = judge_sensing_sensor(xnr_user_no,uid)
            item_else['weibo_type'] = weibo_type
            item_else['sensor_mark'] = sensor_mark
            item_else['portrait_status'] = False
            #if sort_item != 'friend':
            #item_else['sort_item_value'] = avg_sort_uid_dict[uid]['sort_item_value']
            # else:
            #     item_else['sort_item_value'] = ''
            

            if S_TYPE == 'test':
                current_time = datetime2ts(S_DATE)
            else:
                current_time = int(time.time())

            index_name = get_flow_text_index_list(current_time)

            query_body = {
                'query':{
                    'term':{'uid':uid}
                },
                'sort':{'retweeted':{'order':'desc'}}
            }

            es_weibo_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']

            weibo_list = []
            for weibo in es_weibo_results:
                item_else['fansnum'] = weibo['_source']['user_fansnum']
                weibo = weibo['_source']
                weibo_list.append(weibo)
            item_else['weibo_list'] = weibo_list
            item_else['friendsnum'] = 0
            item_else['statusnum'] = 0
            if sort_item == 'sensitive':
                item_else['sensitive'] = avg_sort_uid_dict[uid]['sort_item_value']
            else:
                item_else['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']

            results_all.append(item_else)

    return results_all
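The expected task_detail shape, inferred from the keys the function reads; the values here are placeholders:

task_detail = {
    'xnr_user_no': 'WXNR0001',  # hypothetical virtual-user id
    'sort_item': 'influence',   # 'influence', 'sensitive', or 'friend'
}
results_all = get_related_recommendation(task_detail)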
Code example #10
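# Fragment: body of a per-uid loop over user-portrait hits, parallel to the loop
# in Code example #9; the enclosing function is not part of this listing.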
                sensor_mark = ''

                item['_source']['weibo_type'] = weibo_type
                item['_source']['sensor_mark'] = sensor_mark
                try:
                    del item['_source']['group']
                    del item['_source']['activity_geo_dict']
                except:
                    pass


                item['_source']['user_index'] = avg_sort_uid_dict[uid]['sort_item_value']

                current_time = datetime2ts(current_date)

                index_name = get_flow_text_index_list(current_time)

                query_body = {
                    'query':{
                        'bool':{
                            'must':[
                                {'term':{'uid':uid}},
                                {'terms':{'message_type':[1,3]}}
                            ]
                        }
                    },
                    'sort':{'retweeted':{'order':'desc'}},
                    'size':5
                }

                es_weibo_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
Code example #11
def compute_penetration_num(xnr_user_no):

    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE) - DAY
    else:
        current_time = int(time.time()) - DAY
    
    current_date = ts2datetime(current_time)
    timestamp = datetime2ts(current_date)

    # Find the top sensitive users
    query_body = {
        'query':{
            'match_all':{}
        },
        'sort':{'sensitive':{'order':'desc'}},
        'size':TOP_ASSESSMENT_NUM
    }

    top_sensitive_users = es_user_portrait.search(index=portrait_index_name,doc_type=portrait_index_type,\
                            body=query_body)['hits']['hits']
    top_sensitive_uid_list = []
    for user in top_sensitive_users:
        user = user['_source']
        top_sensitive_uid_list.append(user['uid'])

    # Average weibo sensitivity of the top sensitive users
    query_body_count = {
        'query':{
            'filtered':{
                'filter':{
                    'terms':{'uid':top_sensitive_uid_list}
                }
            }
        },
        'aggs':{
            'avg_sensitive':{
                'avg':{
                    'field':'sensitive'
                }
            }
        }
    }
    if S_TYPE == 'test':
        index_name = get_flow_text_index_list(timestamp)
    else:
        index_name = flow_text_index_name_pre + current_date

    es_sensitive_result = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,\
        body=query_body_count)['aggregations']
    sensitive_value_top_avg = es_sensitive_result['avg_sensitive']['value']

    if S_TYPE == 'test':
        if not sensitive_value_top_avg:
            sensitive_value_top_avg = 1
    print 'es_sensitive_result::',es_sensitive_result
    # Compute the sensitivity of the xnr's feedback groups

    #follow_group_mark = get_pene_follow_group_sensitive(xnr_user_no)['sensitive_info'][timestamp]
    #fans_group_mark = get_pene_fans_group_sensitive(xnr_user_no)['sensitive_info'][timestamp]
    try:
        feedback_mark_at = get_pene_feedback_sensitive(xnr_user_no,'be_at')['sensitive_info'][timestamp]
        feedback_mark_retweet = get_pene_feedback_sensitive(xnr_user_no,'be_retweet')['sensitive_info'][timestamp]
        feedback_mark_comment = get_pene_feedback_sensitive(xnr_user_no,'be_comment')['sensitive_info'][timestamp]
    except:
        # Fallback constants when no feedback data is available
        feedback_mark_at = 0.0839
        feedback_mark_retweet = 0.1199
        feedback_mark_comment = 0.01311
    # try:
    #   report_management_mark_tweet = get_pene_warning_report_sensitive(xnr_user_no)['tweet'][timestamp]
    #   report_management_mark_event = get_pene_warning_report_sensitive(xnr_user_no)['event'][timestamp]
    # except:
    #   report_management_mark_tweet = 0
    #   report_management_mark_event = 0
    # pene_mark = 100*float(follow_group_mark+fans_group_mark+feedback_mark_at+feedback_mark_retweet+\
    #             feedback_mark_comment+report_management_mark_tweet+report_management_mark_event)/sensitive_value_top_avg
    pene_mark = 100 * float(feedback_mark_at+feedback_mark_retweet+feedback_mark_comment)/sensitive_value_top_avg
    pene_mark = round(pene_mark,2)
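    # Worked example with the fallback feedback values above and sensitive_value_top_avg = 1:
    #   100 * (0.0839 + 0.1199 + 0.01311) / 1 = 21.691 -> round(..., 2) = 21.69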

    return pene_mark
Code example #12
File: cron_compute_hot.py  Project: zhhhzhang/xnr1
def compute_recommend_subopnion(task_detail):

    print 'Starting analysis......'

    task_id = task_detail['task_id'].strip('"')

    keywords_string = task_detail['keywords_string']

    keywords_list = keywords_string.split('&')  ## split the keyword string on '&' into a list

    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']

    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    '''
    ## Focus on the users followed by the current virtual human (xnr)
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms':followers_list})
        print 'all users'
    else:
        get_result = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
        id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms':followers_list})
    '''

    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))

    index_name_list = get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list
    es_results = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body={'query':{'bool':{'must':nest_query_list}},'size':MAX_SEARCH_SIZE})['hits']['hits']

    weibo_list = []  ## input to content recommendation and sub-opinion analysis

    if es_results:
        for item in es_results:
            item = item['_source']
            weibo = item['text']
            weibo_list.append(weibo)

    ## Content recommendation

    ## Build the list of recommended sentences
    print 'weibo_list::::::', weibo_list
    print 'Starting content recommendation......'
    if weibo_list:
        content_results = summary_main(weibo_list)
    else:
        content_results = []

    print 'Saving content recommendation results......'

    mark = save_content_recommendation_results(xnr_user_no, mid,
                                               task_id.encode('utf-8'),
                                               content_results)
    print 'mark_content:::', mark
    if mark == False:
        print 'Error while saving content recommendation results; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Content recommendation results saved......'

    ## Sub-opinion analysis
    '''
    Input:
    weibo_data: list of weibo texts, [weibo1, weibo2, ...]
    k_cluster: number of sub-topics (default 5)
    Output:
    opinion_name: sub-topic names, {topic1:name1, topic2:name2, ...}
    word_result: keywords per sub-topic, {topic1:[w1,w2,...], topic2:[w1,w2,...], ...}
    text_list: texts per sub-topic, {topic1:[text1,text2,...], topic2:[text1,text2,..], ..}
    '''
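    # Shape sketch with hypothetical values:
    #   opinion_name -> {'topic1': u'kw1&kw2', ...}
    #   word_result  -> {'topic1': [u'kw1', u'kw2'], ...}
    #   text_list    -> {'topic1': [u'text1', u'text2', ...], ...}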

    print 'Starting sub-opinion analysis......'
    if weibo_list:
        opinion_name, word_result, text_list = opinion_main(weibo_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

    else:
        sub_opinion_results = {}

    print 'Saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id,
                                  sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:
        print 'Error while saving sub-opinion results; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Sub-opinion results saved......'