Example #1
def get_recommend_follows(task_detail):
    recommend_results = dict()
    # daily_interests_list = task_detail['daily_interests'].split(',')
    monitor_keywords_list = task_detail['monitor_keywords'].split(',')
    create_time = time.time()
    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    index_name_list = get_flow_text_index_list(create_time)
    '''# Disabled: the FB flow_text index has no daily_interests field
    ## Daily-interest follows
    try:
        query_body = {
            'query':{
                'filtered':{
                    'filter':{
                        'terms':{'daily_interests':daily_interests_list}
                    }
                }
            },
            # 'sort':{'user_fansnum':{'order':'desc'}},
            'size':DAILY_INTEREST_TOP_USER,
            '_source':['uid']
        }
        es_results = es_flow_text.search(index=index_name_list,doc_type='text',body=query_body)['hits']['hits']
        daily_interest_uid_set = set()
        for result in es_results:
            daily_interest_uid_set.add(result['_source']['uid'])
        daily_interest_uid_list = list(daily_interest_uid_set)
        es_daily_interests_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,\
                                                body={'ids':daily_interest_uid_list})['docs']
        nick_name_dict = {} 
        es_daily_interests_results = es_daily_interests_results[:min(NICK_NAME_TOP,len(es_daily_interests_results))]
        for result in es_daily_interests_results:
            if result['found'] == True:
                result = result['_source']
                nick_name_dict[result['uid']] = result['nick_name']
            else:
                continue
        recommend_results['daily_interests'] = nick_name_dict

    except Exception as e:
        print e
        print 'No users matching the daily interests were found'
        recommend_results['daily_interests'] = {}
    '''
    ## Monitor-keyword follows
    nest_query_list = []
    # The text may contain English or traditional-Chinese variants, so match all three
    monitor_en_keywords_list = trans(monitor_keywords_list,
                                     target_language='en')
    for i in range(len(monitor_keywords_list)):
        monitor_keyword = monitor_keywords_list[i]
        monitor_traditional_keyword = simplified2traditional(monitor_keyword)

        if len(monitor_en_keywords_list) == len(
                monitor_keywords_list):  # make sure the translation did not fail
            monitor_en_keyword = monitor_en_keywords_list[i]
            nest_query_list.append({
                'wildcard': {
                    'keywords_string': '*' + monitor_en_keyword + '*'
                }
            })

        nest_query_list.append(
            {'wildcard': {
                'keywords_string': '*' + monitor_keyword + '*'
            }})
        nest_query_list.append({
            'wildcard': {
                'keywords_string': '*' + monitor_traditional_keyword + '*'
            }
        })
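
    # Sketch of the clauses this loop accumulates (example values hypothetical):
    #   {'wildcard': {'keywords_string': u'*关税*'}}    # simplified keyword
    #   {'wildcard': {'keywords_string': u'*關稅*'}}    # traditional variant
    #   {'wildcard': {'keywords_string': '*tariff*'}}   # English translation
    # They are OR-ed together below inside a bool 'should' query.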
    try:
        query_body_monitor = {
            'query': {
                'bool': {
                    # 'must':nest_query_list
                    'should': nest_query_list
                }
            },
            # 'sort':{'user_fansnum':{'order':'desc'}},
            'size': MONITOR_TOP_USER,
            '_source': ['uid']
        }
        es_results = es_flow_text.search(
            index=index_name_list, doc_type='text',
            body=query_body_monitor)['hits']['hits']
        monitor_keywords_uid_set = set()
        for result in es_results:
            monitor_keywords_uid_set.add(result['_source']['uid'])
        monitor_keywords_uid_list = list(monitor_keywords_uid_set)

        es_monitor_keywords_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,\
                                                body={'ids':monitor_keywords_uid_list})['docs']
        nick_name_dict = {}
        # keep at most the top NICK_NAME_TOP profiles (max() would defeat the cap)
        es_monitor_keywords_results = es_monitor_keywords_results[:min(
            NICK_NAME_TOP, len(es_monitor_keywords_results))]
        for result in es_monitor_keywords_results:
            if result['found']:
                result = result['_source']
                nick_name_dict[result['uid']] = result['name']
            else:
                continue
        recommend_results['monitor_keywords'] = nick_name_dict
    except Exception as e:
        print e
        print 'No users matching the monitor keywords were found'
        recommend_results['monitor_keywords'] = {}
    return recommend_results
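
A minimal invocation sketch for the example above, assuming a task document shaped like the fields the function reads (monitor_keywords as a comma-separated string); the payload values are hypothetical:

# Hypothetical task payload; only 'monitor_keywords' is read by the body above.
task_detail = {'monitor_keywords': u'关税,贸易'}
follows = get_recommend_follows(task_detail)
# follows == {'monitor_keywords': {uid: name, ...}} on success, {} on failure
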
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):

    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    if task_source == 'weibo':

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)

        else:
            current_time = int(time.time())

        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
        uid_list = []

        SHOULD_PERCENT = 1  # require at least one keyword clause to match

        if intel_type == 'all':
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'follow':

            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                            doc_type=weibo_xnr_fans_followers_index_type,
                                            id=xnr_user_no)['_source']
                # _source is a single document dict, not a hit list,
                # so read the follower uids from it directly
                uid_list = follow_results.get('followers', [])
            except Exception:
                uid_list = []

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)

            if S_TYPE == 'test':
                date = S_DATE_BCI

            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[
                5:7] + date[8:10]

            query_body_bci = {
                'query': {
                    'match_all': {}
                },
                'sort': {
                    'user_index': {
                        'order': 'desc'
                    }
                },
                'size': 500
            }

            weibo_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name,
                doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weibo_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        else:

            query_sensitive = {
                'query': {
                    'match_all': {}
                },
                "aggs": {
                    "uids": {
                        "terms": {
                            "field": "uid",
                            "order": {
                                "avg_sensitive": "desc"
                            }
                        },
                        "aggs": {
                            "avg_sensitive": {
                                "avg": {
                                    "field": "sensitive"
                                }
                            }
                        }
                    }
                },
                'size': 500000
            }
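
            # The terms/avg_sensitive aggregation ranks uids by their average
            # 'sensitive' score across the window; only the bucket keys (uids)
            # are consumed below.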

            es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        # Collect tweets_list

        tweets_results = es_flow_text.search(index=index_name_list,
                                             doc_type='text',
                                             body=query_body)['hits']['hits']

        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [
            word.encode('utf-8') for word in opinion_keywords_list
        ]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

        SHOULD_PERCENT = 1  # require at least one keyword clause to match

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,
                                                doc_type=fb_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    # _source is a single document dict; read the fan uids directly
                    uid_list = follow_results.get('fans_list', [])
                except Exception:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name,
                    doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                #print 'fb_bci_results...',len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                #print 'es_sensitive_result...',len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            #print 'query_body...',query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,
                                                doc_type=tw_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    # _source is a single document dict; read the follower uids directly
                    uid_list = follow_results.get('followers_list', [])
                except Exception:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name,
                    doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        topic_keywords_list = []
        summary_text_list = []

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)

        #try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        #summary = summary_main(summary_text_list)
        #except:
        #    summary = ''

    else:
        sub_opinion_results = {}
        summary = ''

    print 'Saving sub-opinion computation results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)

    return mark
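
A hedged usage sketch; the argument values are placeholders, and the ES clients and constants (S_TYPE, MAX_SEARCH_SIZE, ...) are assumed to come from the surrounding project's config:

# Hypothetical call: cluster recent weibo posts matching the keywords,
# restricted to high-influence authors, then persist the sub-opinions.
mark = get_opinions(task_source='weibo',
                    task_id='task-001',        # placeholder id
                    xnr_user_no='WXNR0001',    # placeholder virtual-user id
                    opinion_keywords_list=[u'疫苗'],
                    opinion_type='all',        # accepted but unused in the body above
                    intel_type='influence')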
Example #3
def detect_by_keywords(keywords, datetime_list):
    keywords_list = []
    model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH,
                                                            binary=True)
    for word in keywords:
        simi_list = model.most_similar(word, topn=20)
        for simi_word in simi_list:
            keywords_list.append(simi_word[0])

    group_uid_list = set()
    if datetime_list == []:
        return []

    query_item = 'text'
    flow_text_index_name_list = []
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        flow_text_index_name_list.append(flow_text_index_name)

    nest_query_list = []
    # The text may contain English or traditional-Chinese variants, so match all three
    en_keywords_list = trans(keywords_list, target_language='en')
    for i in range(len(keywords_list)):
        keyword = keywords_list[i]
        traditional_keyword = simplified2traditional(keyword)

        if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
            en_keyword = en_keywords_list[i]
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + en_keyword + '*'
                }})

        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        nest_query_list.append(
            {'wildcard': {
                query_item: '*' + traditional_keyword + '*'
            }})

    count = MAX_DETECT_COUNT
    if len(nest_query_list) == 1:
        SHOULD_PERCENT = 1  # absolute count: guarantee at least one keyword matches
    else:
        SHOULD_PERCENT = '3'  # relative intent: with 2 keywords match 2, with 3 keywords match 2

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT,
                # 'must_not':{'terms':{'uid':white_uid_list}}
            }
        },
        'aggs': {
            'all_uids': {
                'terms': {
                    'field': 'uid',
                    'order': {
                        '_count': 'desc'
                    },
                    'size': count
                }
            }
        }
    }
    es_results = es_flow_text.search(index=flow_text_index_name_list,doc_type=flow_text_index_type,\
                body=query_body,request_timeout=999999)['aggregations']['all_uids']['buckets']

    for i in range(len(es_results)):
        uid = es_results[i]['key']
        group_uid_list.add(uid)
    group_uid_list = list(group_uid_list)
    return group_uid_list
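
A minimal sketch of calling the detector, assuming the seed words exist in the word2vec vocabulary and that the date strings match the flow_text index-name suffixes; both values are hypothetical:

# Expand two seed words to ~40 related terms via word2vec, then return the
# uids posting most frequently about them across the two daily indices.
uids = detect_by_keywords([u'安全', u'攻击'], ['2018-05-12', '2018-05-13'])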
def find_flow_texts(task_source, task_id, event_keywords):

    # Build nest_query_list
    nest_query_list = []

    keywords_list = event_keywords.split('&')
    keywords_list = [word.encode('utf-8') for word in keywords_list]
    query_item = 'text'
    if task_source != 'weibo':
        # The text may contain English or traditional-Chinese variants, so match all three

        en_keywords_list = trans(keywords_list, target_language='en')
        for i in range(len(keywords_list)):
            keyword = keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

    else:
        for keyword in keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})

    SHOULD_PERCENT = 1  # require at least one keyword clause to match

    # Match texts against the right source
    if task_source == 'weibo':
        sort_item = 'retweeted'

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time() + 24 * 3600)
        # test override, kept for reference:
        # current_time = int(datetime2ts("2018-05-13"))
        index_name_list = get_flow_text_index_list(current_time, days=2)
        es_name = es_flow_text

    elif task_source == 'facebook':
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time() + 24 * 3600)

        index_name_list = fb_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr

    else:
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_TW)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = tw_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT
            }
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': 100000
    }
    print 'es_name...', es_name
    print 'index_name_list..', index_name_list

    search_results = es_name.search(index=index_name_list,
                                    doc_type='text',
                                    body=query_body)['hits']['hits']
    print 'len..search_results..', len(search_results)
    save2topic_es(task_source, task_id, search_results)
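
A usage sketch; note that event_keywords is a single '&'-joined string (it is split on '&' above), and the task id is a placeholder:

# Hypothetical call: pull the most-shared facebook posts matching either
# keyword over the last two daily indices and hand them to save2topic_es.
find_flow_texts('facebook', 'fb-task-001', u'选举&投票')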
Example #5
        except Exception as e:
            print u'keyword expansion Exception:', str(e)

    group_uid_list = set()
    if datetime_list == []:
        return []

    query_item = 'text'
    flow_text_index_name_list = []
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        flow_text_index_name_list.append(flow_text_index_name)

    nest_query_list = []
    # The text may contain English or traditional-Chinese variants, so match all three
    en_keywords_list = trans(keywords_list, target_language='en')
    for i in range(len(keywords_list)):
        keyword = keywords_list[i]
        traditional_keyword = simplified2traditional(keyword)

        if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
            en_keyword = en_keywords_list[i]
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + en_keyword + '*'
                }})

        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        nest_query_list.append(
            {'wildcard': {
                query_item: '*' + traditional_keyword + '*'
            }})