def search_task(task_name, submit_date, state, status):
    """Search the 'group_result' ES index for tasks matching the given filters.

    Each non-empty argument adds a condition; whitespace-separated words in
    `task_name` and `state` each become their own wildcard sub-query.  With no
    conditions at all, the whole index is fetched (up to 10000 hits).

    Returns a list of [task_name, submit_date, count, state, status] rows, or
    None when the ES response carries no 'hits' section.
    """
    query = []
    condition_num = 0
    if task_name:
        # every word must appear somewhere inside the stored task_name
        for item in task_name.split(' '):
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        query.append({'match': {'submit_date': submit_date}})
        condition_num += 1
    if state:
        for item in state.split(' '):
            query.append({'wildcard': {'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'match': {'status': status}})
        condition_num += 1
    # Removed the unused `results` local and the no-op try/raise wrapper;
    # only the query body differs between the two branches.
    if condition_num > 0:
        body = {'query': {'bool': {'must': query}}, 'size': 10000}
    else:
        body = {'query': {'match_all': {}}, 'size': 10000}
    source = es.search(index='group_result', doc_type='group', body=body)
    try:
        task_dict_list = source['hits']['hits']
    except (KeyError, TypeError):
        return None
    result = []
    for task_dict in task_dict_list:
        src = task_dict['_source']
        result.append([src['task_name'], src['submit_date'], src['count'],
                       src['state'], src['status']])
    return result
# Example #2
# 0
def search_portrait(condition_num, query, sort, size):
    """Query the sensitive_user_portrait index and return user rows.

    With condition_num > 0 the caller-built bool/must `query` and `sort` are
    used verbatim; otherwise every document is returned, sorted descending on
    the single field named by `sort`.

    Returns rows of [uid, uname, location, activeness, importance,
    influence, score]; empty uname/location become 'unknown'.
    """
    if condition_num > 0:
        body = {'query': {'bool': {'must': query}}, 'sort': sort, 'size': size}
    else:
        body = {'query': {'match_all': {}},
                'sort': [{sort: {"order": "desc"}}],
                'size': size}
    hits = es.search(index='sensitive_user_portrait', doc_type='user',
                     body=body)['hits']['hits']
    user_result = []
    for hit in hits:
        source = hit['_source']
        if not source['uname']:
            source['uname'] = 'unknown'
        if not source['location']:
            source['location'] = 'unknown'
        user_result.append([source['uid'], source['uname'], source['location'],
                            source['activeness'], source['importance'],
                            source['influence'], hit['_score']])
    return user_result
# Example #3
# 0
def search_track_task(task_name, submit_date, state, status):
    """Search tracking tasks in the 'group_result' index.

    Each non-empty argument adds a condition (wildcard per word for
    task_name/state, exact term for submit_date/status); with no conditions
    the whole index is fetched (up to 100000 hits).

    Returns rows of [task_name, submit_date, end_date, count, state, status]
    -- end_date defaults to u'至今' ("until now") when absent -- or None when
    the ES response carries no 'hits' section.
    """
    query = []
    condition_num = 0
    if task_name:
        for item in task_name.split(' '):
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        query.append({'term': {'submit_date': submit_date}})
        condition_num += 1
    if state:
        for item in state.split(' '):
            query.append({'wildcard': {'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'term': {'status': status}})
        condition_num += 1
    if condition_num > 0:
        body = {'query': {'bool': {'must': query}}, 'size': 100000}
    else:
        body = {'query': {'match_all': {}}, 'size': 100000}
    source = es.search(index='group_result', doc_type='group', body=body)
    try:
        task_dict_list = source['hits']['hits']
    except (KeyError, TypeError):
        return None
    result = []
    for task_dict in task_dict_list:
        src = task_dict['_source']
        # BUG FIX: the original tested membership on the hit wrapper
        # (`'end_date' not in task_dict`), so the default was always applied,
        # and the third row column repeated submit_date instead of using the
        # end_date it had just computed.
        if 'end_date' not in src:
            src['end_date'] = u'至今'
        result.append([src['task_name'], src['submit_date'], src['end_date'],
                       src['count'], src['state'], src['status']])
    return result
def search_track_task(task_name, submit_date, state, status):
    """Search tracking tasks in the 'group_result' index.

    Each non-empty argument adds a condition (wildcard per word for
    task_name/state, exact term for submit_date/status); with no conditions
    the whole index is fetched (up to 100000 hits).

    Returns rows of [task_name, submit_date, end_date, count, state, status]
    -- end_date defaults to u'至今' ("until now") when absent -- or None when
    the ES response carries no 'hits' section.
    """
    query = []
    condition_num = 0
    if task_name:
        for item in task_name.split(' '):
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        query.append({'term': {'submit_date': submit_date}})
        condition_num += 1
    if state:
        for item in state.split(' '):
            query.append({'wildcard': {'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'term': {'status': status}})
        condition_num += 1
    if condition_num > 0:
        body = {'query': {'bool': {'must': query}}, 'size': 100000}
    else:
        body = {'query': {'match_all': {}}, 'size': 100000}
    source = es.search(index='group_result', doc_type='group', body=body)
    try:
        task_dict_list = source['hits']['hits']
    except (KeyError, TypeError):
        return None
    result = []
    for task_dict in task_dict_list:
        src = task_dict['_source']
        # BUG FIX: the original tested membership on the hit wrapper
        # (`'end_date' not in task_dict`), so the default was always applied,
        # and the third row column repeated submit_date instead of using the
        # end_date it had just computed.
        if 'end_date' not in src:
            src['end_date'] = u'至今'
        result.append([src['task_name'], src['submit_date'], src['end_date'],
                       src['count'], src['state'], src['status']])
    return result
# Example #5
# 0
def ajax_get_hot_keywords():
    """Return the 20 most frequent sensitive words as a JSON string.

    Each element of the returned JSON array is [word, doc_count], taken from
    a terms aggregation over the 'sensitive_words_string' field.
    """
    aggregation_query = {
        "query": {"match_all": {}},
        "aggs": {
            "hot_words": {
                "terms": {"field": "sensitive_words_string", "size": 20}
            }
        }
    }
    response = es_sensitive_user_portrait.search(
        index=portrait_index_name,
        doc_type=portrait_index_type,
        body=aggregation_query)
    buckets = response['aggregations']["hot_words"]['buckets']
    sensitive_words = [[bucket['key'], bucket["doc_count"]]
                       for bucket in buckets]
    return json.dumps(sensitive_words)
def search_current_es(domain, date, number):
    """Top `number` users of `domain` from the per-date index, by influence desc.

    Returns rows of [uid, uname, photo_url, influence, sensitive, importance,
    activeness]; users without a uname get 'unknown' for both uname and
    photo_url.
    """
    body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {"must": [{"term": {"topic_string": domain}}]}
                }
            }
        },
        'sort': {"influence": {"order": "desc"}},
        'size': number,
    }
    hits = es.search(index=date, doc_type='user', body=body)['hits']['hits']
    result_list = []
    for hit in hits:
        source = hit['_source']
        if not source['uname']:
            source['uname'] = 'unknown'
            source['photo_url'] = 'unknown'
        result_list.append([hit['_id'], source['uname'], source['photo_url'],
                            source['influence'], source['sensitive'],
                            source['importance'], source['activeness']])
    return result_list
# Example #7
# 0
def get_attribute_name():
    """Fetch all attribute documents from the attribute index.

    NOTE(review): this function looks truncated by the scrape -- it fetches
    the hits but never populates or returns attribute_name_list; confirm
    against the original source.
    """
    attribute_name_list = []
    try:
        attribute_result = es.search(index=attribute_index_name,
                                     doc_type=attribute_index_type,
                                     body={'query': {'match_all': {}}})['hits']['hits']
    except Exception as e:  # fixed py2-only `except Exception, e` for consistency
        raise e
def get_sentiment_weibo(task_name, sentiment, timestamp):
    """Fetch weibo posts by a task's users with a given sentiment in a 15-min window.

    step1: get task user; step2: search weibo filtered by those uids, the
    sentiment term, and the range [timestamp, timestamp + 900].

    NOTE(review): looks truncated by the scrape -- weibo_result is fetched
    but `result` is never populated or returned; confirm against the
    original source.
    """
    result = []
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task is not exist'
    task_user = task_exist['uid_list']
    query_body = []
    # multi-search: any of the task's uids may match
    nest_body_list = []
    for uid in task_user:
        nest_body_list.append({'term': {'uid': uid}})
    query_body.append({'bool': {'should': nest_body_list}})
    # range search: 15-minute window starting at `timestamp`
    query_body.append({'range': {'timestamp': {'from': timestamp, 'to': timestamp + 900}}})
    # match search: sentiment
    query_body.append({'term': {'sentiment': sentiment}})
    try:
        weibo_result = es.search(index=text_index_name, doc_type=text_index_type,
                                 body={'query': {'bool': {'must': query_body}},
                                       'sort': [{'timestamp': {'order': 'asc'}}],
                                       'size': 10000})['hits']['hits']
    except Exception as e:  # fixed py2-only `except Exception, e` for consistency
        raise e
def get_geo_weibo(task_name, geo, timestamp):
    """Fetch weibo posts by a task's users from a given city within 24 hours.

    step1: verify the task exists; step2: search monitor texts filtered by
    the task's uids, a wildcard on the city (the last tab-separated component
    of `geo`), and the range [timestamp, timestamp + 24h].

    NOTE(review): looks truncated by the scrape -- weibo_dict_list is
    fetched but `result` is never populated or returned; confirm against
    the original source.
    """
    result = []
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task is not exist'
    task_user = task_exist['uid_list']
    query_body = []
    # multi-search: any of the task's uids may match
    nest_body_list = []
    for uid in task_user:
        nest_body_list.append({'term': {'uid': uid}})
    query_body.append({'bool': {'should': nest_body_list}})
    # range search: 24-hour window starting at `timestamp`
    query_body.append({'range': {'timestamp': {'from': timestamp, 'to': timestamp + 24 * 3600}}})
    # term search: city is the last component of the tab-separated geo string
    geo_list = geo.split('\t')
    city = geo_list[-1]
    query_body.append({'wildcard': {'geo': '*' + city + '*'}})
    try:
        weibo_dict_list = es.search(index=text_index_name, doc_type=text_index_type,
                                    body={'query': {'bool': {'must': query_body}},
                                          'sort': [{'timestamp': {'order': 'asc'}}],
                                          'size': 10000})['hits']['hits']
    except Exception as e:  # fixed py2-only `except Exception, e` for consistency
        raise e
def get_attribute_name():
    """Fetch all attribute documents from the attribute index.

    NOTE(review): this function looks truncated by the scrape -- it fetches
    the hits but never populates or returns attribute_name_list; confirm
    against the original source.
    """
    attribute_name_list = []
    try:
        attribute_result = es.search(index=attribute_index_name,
                                     doc_type=attribute_index_type,
                                     body={'query': {'match_all': {}}})['hits']['hits']
    except Exception as e:  # fixed py2-only `except Exception, e` for consistency
        raise e
def search_important(category, detail):
    """Top-20 users matching {category: detail}, ordered by sensitivity desc.

    Returns a list of [uid, uname] pairs.
    """
    filter_query = {
        "query": {"filtered": {"filter": {"term": {category: detail}}}},
        "sort": {"sensitive": {"order": "desc"}},
        "size": 20,
    }
    hits = es.search(index="sensitive_user_portrait", doc_type="user",
                     body=filter_query)['hits']['hits']
    return [[hit['_source']['uid'], hit['_source']['uname']] for hit in hits]
def search_domain(domain, date, order, number=100):
    """Top-influence users of `domain` on `date` (normalized to yyyymmdd).

    Pages through the per-date 'bci' index in descending `order`, looks each
    uid up in sensitive_user_portrait, and keeps users whose '&'-separated
    topic_string contains `domain`.  Stops after `number` matches or when
    the index is exhausted.

    Returns rows of [uid, uname, photo_url, influence, sensitive,
    importance, activeness].
    """
    result_list = []
    count_n = 0
    date = str(date).replace('-', '')
    order = str(order)

    # BUG FIX: the original built the query once with a fixed size of 1 and
    # re-ran it unchanged inside `while 1`, re-processing the same single hit
    # every pass -- yielding duplicate rows or an infinite loop.  Paginate
    # with from/size and stop when the index runs out.
    page_size = 100
    offset = 0
    while True:
        query_body = {
            "query": {"match_all": {}},
            "sort": {search_order[order]: {"order": "desc"}},
            "from": offset,
            "size": page_size,
        }
        search_results = es.search(index=date, doc_type='bci',
                                   body=query_body)['hits']['hits']
        if not search_results:
            break  # index exhausted before reaching `number` matches
        offset += len(search_results)
        uid_list = [item['_id'] for item in search_results]
        portrait_results = es.mget(index='sensitive_user_portrait',
                                   doc_type='user',
                                   body={"ids": uid_list})['docs']
        for item in portrait_results:
            # mget returns found=False stubs without _source for missing uids
            if '_source' not in item:
                continue
            src = item['_source']
            domain_list = src['topic_string'].split('&')  # attention
            if domain in set(domain_list):
                result_list.append([item['_id'], src['uname'],
                                    src['photo_url'], src['influence'],
                                    src['sensitive'], src['importance'],
                                    src['activeness']])
                count_n += 1
                if count_n >= int(number):
                    return result_list
    return result_list
def test_influence_rank(domain, date, order):
    # Rank users by the BCI metric named by search_order[order]: pair the
    # top-100 hits of the per-date 'bci' index with portrait docs fetched for
    # the domain's fixed uid list, then sort rows descending on the metric.
    # NOTE(review): search_result and portrait_result are indexed in lockstep
    # by i, but they come from two unrelated queries (top-100 search vs. mget
    # on domain_dict[domain]) -- rows may pair metric values with the wrong
    # user; confirm this is intended.
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query":{
            "match_all": {}
        },
        "sort": {search_order[order]: {"order": "desc"}},
        "size": 100
    }
    search_result = es.search(index=date, doc_type="bci", body=query_body)['hits']['hits']
    portrait_result = es.mget(index='sensitive_user_portrait', doc_type='user',  body={"ids": uid_list})['docs']
    results = []
    for i in range(len(search_result)):
        detail = []
        try:
            # metric value for row i; falls back to 0 when the hit lacks it
            detail.append(search_result[i]['_source'][search_order[order]])
        except:
            print uid_list[i]
            detail.append(0)
        try:
            # mget miss (found=False) has no _source: skip the row entirely
            temp = portrait_result[i]['_source']
        except:
            continue
        detail.extend([temp['uid'], temp['uname'], temp['photo_url'], temp['activeness'], temp['importance'], temp['sensitive']])
        results.append(detail)
    # descending by the metric value stored at detail[0]
    sorted_list = sorted(results, key=lambda x:x[0], reverse=True)
    return sorted_list
def search_sensitive_text(uid, stype=0, sort_type="timestamp"):
    """Sensitive weibo texts of `uid`, sorted descending on `sort_type`.

    `stype` selects the message type: 0 = all messages, 1 or 2 adds a
    matching message_type filter.  Returns the raw ES hit list (possibly
    empty).
    """
    must_terms = [{"term": {"uid": uid}}, {"term": {"sensitive": 1}}]
    # BUG FIX: the original only ran the search for stype == 0; for stype
    # 1/2 it appended the message_type filter and then read the unbound
    # `search_results` variable, raising NameError.
    if int(stype) in (1, 2):
        must_terms.append({"term": {"message_type": int(stype)}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_terms}}}},
        "sort": {sort_type: {'order': 'desc'}},
    }
    search_results = es.search(index='sensitive_user_text', doc_type='user',
                               body=query_body)['hits']['hits']
    return search_results or []
def get_evaluate_max():
    """Return the maximum 'importance' and 'influence' over user_portrait.

    For each metric, fetches the single top document sorted descending and
    records that document's value.

    Returns a dict {'importance': max_value, 'influence': max_value}.
    """
    max_result = {}
    index_name = 'user_portrait'
    index_type = 'user'
    evaluate_index = ['importance', 'influence']
    for evaluate in evaluate_index:
        query_body = {
            'query': {'match_all': {}},
            'size': 1,
            'sort': [{evaluate: {'order': 'desc'}}],
        }
        try:
            result = es.search(index=index_name, doc_type=index_type,
                               body=query_body)['hits']['hits']
        except Exception as e:  # fixed py2-only `except Exception, e`
            raise e
        max_result[evaluate] = result[0]['_source'][evaluate]
    # BUG FIX: the original computed max_result but fell off the end,
    # implicitly returning None.
    return max_result
# Example #16
# 0
def search_attribute(query_body, condition_num):
    """Fetch attribute documents (all of them when condition_num == 0).

    NOTE(review): looks truncated by the scrape -- the condition_num > 0
    branch and the population/return of item_list are missing; confirm
    against the original source.
    """
    item_list = []
    default_size = 100000
    if condition_num == 0:
        try:
            result = es.search(index=attribute_index_name,
                               doc_type=attribute_index_type,
                               body={'query': {'match_all': {}}, 'size': default_size})['hits']['hits']
        except Exception as e:  # fixed py2-only `except Exception, e`
            raise e
def search_attribute(query_body, condition_num):
    """Fetch attribute documents (all of them when condition_num == 0).

    NOTE(review): looks truncated by the scrape -- the condition_num > 0
    branch and the population/return of item_list are missing; confirm
    against the original source.
    """
    item_list = []
    default_size = 100000
    if condition_num == 0:
        try:
            result = es.search(index=attribute_index_name,
                               doc_type=attribute_index_type,
                               body={'query': {'match_all': {}}, 'size': default_size})['hits']['hits']
        except Exception as e:  # fixed py2-only `except Exception, e`
            raise e
def search_in_portrait(category):
    """All portrait users sorted descending on `category`.

    Returns rows of [uid, uname, category-value].
    """
    body = {
        "query": {"match_all": {}},
        "sort": {category: {"order": "desc"}},
    }
    hits = es.search(index="sensitive_user_portrait", doc_type="user",
                     body=body)['hits']['hits']
    return [[hit['_source']['uid'], hit['_source']['uname'],
             hit['_source'][category]] for hit in hits]
# Example #19
# 0
def user_sentiment_trend(uid):
    """Per-day sentiment counts over a user's sensitive texts.

    Sentiment payloads are JSON dicts keyed by code: '126' = positive,
    '127'/'128'/'129' = negative flavors (assumed from usage -- confirm
    against the sentiment pipeline).  An empty payload counts the post as
    neutral; otherwise the post is classified by whichever side's summed
    counts dominate.

    Returns [[total_positive, total_neutral, total_negetive],
             {yyyymmdd: {'neutral': n, 'positive': p, 'negetive': g}}].
    """
    query_body = {"query": {"filtered": {"filter": {"term": {"uid": uid}}}}}
    search_results = es.search(index='sensitive_user_text', doc_type='user',
                               body=query_body)['hits']['hits']
    # group the parsed sentiment payloads by day (yyyymmdd)
    sentiment_dict = dict()
    for item in search_results:
        datetime = ts2datetime(float(item['_source']['timestamp'])).replace('-', '')
        # FIX: use setdefault instead of the bare try/except append, which
        # silently swallowed unrelated errors
        sentiment_dict.setdefault(datetime, []).append(
            json.loads(item['_source']['sentiment']))
    sentiment_results = dict()
    total_positive = 0
    total_negetive = 0
    total_neutral = 0
    for datetime, sentiment_detail in sentiment_dict.items():
        positive_count = 0
        negetive_count = 0
        neutral_count = 0
        for item in sentiment_detail:
            if not item:
                # FIX: removed the dead try/except around this increment --
                # neutral_count is always bound at this point
                neutral_count += 1
                total_neutral += 1
                continue
            positive = sum(item.get('126', {}).values())
            positive_count += positive
            negetive = (sum(item.get('127', {}).values())
                        + sum(item.get('128', {}).values())
                        + sum(item.get('129', {}).values()))
            negetive_count += negetive
            # classify the whole post by whichever side dominates
            if positive > negetive:
                total_positive += 1
            elif positive < negetive:
                total_negetive += 1
            else:
                total_neutral += 1
        sentiment_results[datetime] = {
            'neutral': neutral_count,
            'positive': positive_count,
            'negetive': negetive_count,
        }
    return [[total_positive, total_neutral, total_negetive], sentiment_results]
def search_portrait(condition_num, query, sort, size):
    """Query the sensitive_user_portrait index and return user rows.

    condition_num > 0 means the caller supplied a bool/must `query` and a
    full `sort` clause; otherwise all docs are returned sorted descending on
    the field named by `sort`.  Missing uname/location become 'unknown'.
    """
    if condition_num > 0:
        request_body = {'query': {'bool': {'must': query}},
                        'sort': sort, 'size': size}
    else:
        request_body = {'query': {'match_all': {}},
                        'sort': [{sort: {"order": "desc"}}], 'size': size}
    hits = es.search(index='sensitive_user_portrait', doc_type='user',
                     body=request_body)['hits']['hits']
    user_result = []
    for hit in hits:
        doc = hit['_source']
        uname = doc['uname'] or 'unknown'
        location = doc['location'] or 'unknown'
        doc['uname'] = uname
        doc['location'] = location
        user_result.append([doc['uid'], uname, location, doc['activeness'],
                            doc['importance'], doc['influence'],
                            hit['_score']])
    return user_result
def user_sentiment_trend(uid):
    """Per-day sentiment counts over a user's sensitive texts.

    Sentiment payloads are JSON dicts keyed by code: '126' = positive,
    '127'/'128'/'129' = negative flavors (assumed from usage -- confirm
    against the sentiment pipeline).  An empty payload counts the post as
    neutral; otherwise the post is classified by whichever side's summed
    counts dominate.

    Returns [[total_positive, total_neutral, total_negetive],
             {yyyymmdd: {'neutral': n, 'positive': p, 'negetive': g}}].
    """
    query_body = {"query": {"filtered": {"filter": {"term": {"uid": uid}}}}}
    search_results = es.search(index='sensitive_user_text', doc_type='user',
                               body=query_body)['hits']['hits']
    # group the parsed sentiment payloads by day (yyyymmdd)
    sentiment_dict = dict()
    for item in search_results:
        datetime = ts2datetime(float(item['_source']['timestamp'])).replace('-', '')
        # FIX: use setdefault instead of the bare try/except append, which
        # silently swallowed unrelated errors
        sentiment_dict.setdefault(datetime, []).append(
            json.loads(item['_source']['sentiment']))
    sentiment_results = dict()
    total_positive = 0
    total_negetive = 0
    total_neutral = 0
    for datetime, sentiment_detail in sentiment_dict.items():
        positive_count = 0
        negetive_count = 0
        neutral_count = 0
        for item in sentiment_detail:
            if not item:
                # FIX: removed the dead try/except around this increment --
                # neutral_count is always bound at this point
                neutral_count += 1
                total_neutral += 1
                continue
            positive = sum(item.get('126', {}).values())
            positive_count += positive
            negetive = (sum(item.get('127', {}).values())
                        + sum(item.get('128', {}).values())
                        + sum(item.get('129', {}).values()))
            negetive_count += negetive
            # classify the whole post by whichever side dominates
            if positive > negetive:
                total_positive += 1
            elif positive < negetive:
                total_negetive += 1
            else:
                total_neutral += 1
        sentiment_results[datetime] = {
            'neutral': neutral_count,
            'positive': positive_count,
            'negetive': negetive_count,
        }
    return [[total_positive, total_neutral, total_negetive], sentiment_results]
def search_portrait(condition_num, query, sort, size):
    """Query the sensitive_user_portrait index and return user rows.

    When condition_num > 0 the caller-built bool/must query and sort clause
    are used as-is; otherwise all docs are returned sorted descending on the
    field named by `sort`.  Empty uname/location fields become "unknown".
    """
    if condition_num > 0:
        search_body = {"query": {"bool": {"must": query}},
                       "sort": sort,
                       "size": size}
    else:
        search_body = {"query": {"match_all": {}},
                       "sort": [{sort: {"order": "desc"}}],
                       "size": size}
    hits = es.search(index="sensitive_user_portrait",
                     doc_type="user",
                     body=search_body)["hits"]["hits"]
    user_result = []
    for hit in hits:
        doc = hit["_source"]
        if not doc["uname"]:
            doc["uname"] = "unknown"
        if not doc["location"]:
            doc["location"] = "unknown"
        row = [doc["uid"], doc["uname"], doc["location"], doc["activeness"],
               doc["importance"], doc["influence"], hit["_score"]]
        user_result.append(row)
    return user_result
# Example #23
# 0
def search_sensitive_text(uid, stype=0, sort_type="timestamp"):
    """Sensitive weibo texts of `uid`, sorted descending on `sort_type`.

    `stype` selects the message type: 0 = all messages, 1 or 2 adds a
    matching message_type filter.  Returns the raw ES hit list (possibly
    empty).
    """
    must_terms = [{"term": {"uid": uid}}, {"term": {"sensitive": 1}}]
    # BUG FIX: the original only ran the search for stype == 0; for stype
    # 1/2 it appended the message_type filter and then read the unbound
    # `search_results` variable, raising NameError.
    if int(stype) in (1, 2):
        must_terms.append({"term": {"message_type": int(stype)}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_terms}}}},
        "sort": {sort_type: {'order': 'desc'}},
    }
    search_results = es.search(index='sensitive_user_text', doc_type='user',
                               body=query_body)['hits']['hits']
    return search_results or []
def search_domain(domain, date, order, number=100):
    """Top-influence users of `domain` on `date` (normalized to yyyymmdd).

    Pages through the per-date 'bci' index in descending `order`, looks each
    uid up in sensitive_user_portrait, and keeps users whose '&'-separated
    topic_string contains `domain`.  Stops after `number` matches or when
    the index is exhausted.

    Returns rows of [uid, uname, photo_url, influence, sensitive,
    importance, activeness].
    """
    result_list = []
    count_n = 0
    date = str(date).replace('-', '')
    order = str(order)

    # BUG FIX: the original built the query once with a fixed size of 1 and
    # re-ran it unchanged inside `while 1`, re-processing the same single hit
    # every pass -- yielding duplicate rows or an infinite loop.  Paginate
    # with from/size and stop when the index runs out.
    page_size = 100
    offset = 0
    while True:
        query_body = {
            "query": {"match_all": {}},
            "sort": {search_order[order]: {"order": "desc"}},
            "from": offset,
            "size": page_size,
        }
        search_results = es.search(index=date, doc_type='bci',
                                   body=query_body)['hits']['hits']
        if not search_results:
            break  # index exhausted before reaching `number` matches
        offset += len(search_results)
        uid_list = [item['_id'] for item in search_results]
        portrait_results = es.mget(index='sensitive_user_portrait',
                                   doc_type='user',
                                   body={"ids": uid_list})['docs']
        for item in portrait_results:
            # mget returns found=False stubs without _source for missing uids
            if '_source' not in item:
                continue
            src = item['_source']
            domain_list = src['topic_string'].split('&')  # attention
            if domain in set(domain_list):
                result_list.append([item['_id'], src['uname'],
                                    src['photo_url'], src['influence'],
                                    src['sensitive'], src['importance'],
                                    src['activeness']])
                count_n += 1
                if count_n >= int(number):
                    return result_list
    return result_list
def search_important(category, detail):
    """Top-20 users matching {category: detail}, ordered by sensitivity desc.

    Returns a list of [uid, uname] pairs.
    """
    body = {
        "query": {"filtered": {"filter": {"term": {category: detail}}}},
        "sort": {"sensitive": {"order": "desc"}},
        "size": 20,
    }
    hits = es.search(index="sensitive_user_portrait", doc_type="user",
                     body=body)['hits']['hits']
    uid_list = []
    for hit in hits:
        doc = hit['_source']
        uid_list.append([doc['uid'], doc['uname']])
    return uid_list
def user_sentiment_trend(uid):
    """Per-day sentiment counts over a user's sensitive texts.

    Sentiment payloads are JSON dicts keyed by code: '126' = positive,
    '127'/'128'/'129' = negative flavors (assumed from usage -- confirm
    against the sentiment pipeline).  An empty payload counts the post as
    neutral; otherwise the post is classified by whichever side's summed
    counts dominate.

    Returns [[total_positive, total_neutral, total_negetive],
             {yyyymmdd: {'neutral': n, 'positive': p, 'negetive': g}}].
    """
    query_body = {"query": {"filtered": {"filter": {"term": {"uid": uid}}}}}
    search_results = es.search(index="sensitive_user_text", doc_type="user",
                               body=query_body)["hits"]["hits"]
    # group the parsed sentiment payloads by day (yyyymmdd)
    sentiment_dict = dict()
    for item in search_results:
        datetime = ts2datetime(float(item["_source"]["timestamp"])).replace("-", "")
        # FIX: use setdefault instead of the bare try/except append, which
        # silently swallowed unrelated errors
        sentiment_dict.setdefault(datetime, []).append(
            json.loads(item["_source"]["sentiment"]))
    sentiment_results = dict()
    total_positive = 0
    total_negetive = 0
    total_neutral = 0
    for datetime, sentiment_detail in sentiment_dict.items():
        positive_count = 0
        negetive_count = 0
        neutral_count = 0
        for item in sentiment_detail:
            if not item:
                # FIX: removed the dead try/except around this increment --
                # neutral_count is always bound at this point
                neutral_count += 1
                total_neutral += 1
                continue
            positive = sum(item.get("126", {}).values())
            positive_count += positive
            negetive = (sum(item.get("127", {}).values())
                        + sum(item.get("128", {}).values())
                        + sum(item.get("129", {}).values()))
            negetive_count += negetive
            # classify the whole post by whichever side dominates
            if positive > negetive:
                total_positive += 1
            elif positive < negetive:
                total_negetive += 1
            else:
                total_neutral += 1
        sentiment_results[datetime] = {
            "neutral": neutral_count,
            "positive": positive_count,
            "negetive": negetive_count,
        }
    return [[total_positive, total_neutral, total_negetive], sentiment_results]
def ajax_get_hot_keywords():
    """Return the 20 most frequent sensitive words as a JSON string.

    Each element of the returned JSON array is [word, doc_count], from a
    terms aggregation over 'sensitive_words_string'.
    """
    body = {
        "query": {"match_all": {}},
        "aggs": {
            "hot_words": {
                "terms": {"field": "sensitive_words_string", "size": 20}
            }
        }
    }
    buckets = es_sensitive_user_portrait.search(
        index=portrait_index_name,
        doc_type=portrait_index_type,
        body=body)['aggregations']["hot_words"]['buckets']
    sensitive_words = []
    for bucket in buckets:
        sensitive_words.append([bucket['key'], bucket["doc_count"]])
    return json.dumps(sensitive_words)
def get_evaluate_max():
    """Return the maximum 'importance' and 'influence' over user_portrait.

    For each metric, fetches the single top document sorted descending and
    records that document's value.

    Returns a dict {'importance': max_value, 'influence': max_value}.
    """
    max_result = {}
    index_name = 'user_portrait'
    index_type = 'user'
    evaluate_index = ['importance', 'influence']
    for evaluate in evaluate_index:
        query_body = {
            'query': {'match_all': {}},
            'size': 1,
            'sort': [{evaluate: {'order': 'desc'}}],
        }
        try:
            result = es.search(index=index_name, doc_type=index_type,
                               body=query_body)['hits']['hits']
        except Exception as e:  # fixed py2-only `except Exception, e`
            raise e
        max_result[evaluate] = result[0]['_source'][evaluate]
    # BUG FIX: the original computed max_result but fell off the end,
    # implicitly returning None.
    return max_result
def get_network(task_exist):
    """Collect per-day top-5 inner-retweet users (plus the last day's graph)
    for a monitoring task, walking back one day at a time from today until
    the task's submit date, for at most 7 days.

    Returns [date_list, top_list_dict, inner_graph] where top_list_dict maps
    'top1'..'top5' to lists of [uid, uname, count].
    """
    task_name = task_exist['task_name']
    submit_date = task_exist['submit_date']
    submit_ts = date2ts(submit_date)

    time_segment = 24 * 3600
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    iter_date_ts = now_date_ts
    iter_count = 1
    date_list = []
    top_list_dict = {}
    # BUG FIX: inner_graph was only assigned inside the loop, so the final
    # return raised NameError whenever the loop body never ran.
    inner_graph = {}
    while True:
        if iter_count >= 8 or iter_date_ts < submit_ts:
            break
        iter_date = ts2datetime(iter_date_ts)
        date_list.append(iter_date)
        key = 'inner_' + str(iter_date)
        # NOTE(review): es.search() is called with an id= argument here;
        # this looks like it should be es.get() -- confirm.
        try:
            task_date_result = es.search(index=monitor_index_name, doc_type=task_name, id=key)['_source']
        except:
            task_date_result = {}
        for field in ['top1', 'top2', 'top3', 'top4', 'top5']:
            # BUG FIX: guard the lookup -- with an empty task_date_result the
            # original raised KeyError on the first field.
            if field not in task_date_result:
                continue
            user_count_item = json.loads(task_date_result[field])
            uid = user_count_item[0]
            uname = uid2uname(uid)
            count = user_count_item[1]
            top_list_dict.setdefault(field, []).append([uid, uname, count])

        iter_date_ts -= time_segment
        # BUG FIX: iter_count was never incremented, so the 7-day cap above
        # was dead code.
        iter_count += 1
        # get inner-retweet group from es---field: inner_graph
        # (the last processed day's graph is what gets returned)
        try:
            inner_graph = json.loads(task_date_result['inner_graph'])
        except:
            inner_graph = {}

    return [date_list, top_list_dict, inner_graph]
def search_sensitive_text(uid, stype=0, sort_type="timestamp"):
    """Return a user's sensitive weibo texts, sorted by `sort_type` descending.

    stype: 0 = all sensitive texts, 1 = only message_type 1, 2 = only
    message_type 2 (other values behave like 0).
    Returns the raw ES hit list (possibly empty).
    """
    results = []
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": [{"term": {"uid": uid}}, {"term": {"sensitive": 1}}]}}}},
        "sort": {sort_type: {"order": "desc"}},
    }

    # narrow by message type when requested
    if int(stype) == 1:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"message_type": 1}})
    elif int(stype) == 2:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"message_type": 2}})

    # bug fix: the search only ran for stype == 0, leaving `search_results`
    # unbound (NameError) for every other branch
    search_results = es.search(index="sensitive_user_text", doc_type="user", body=query_body)["hits"]["hits"]

    if search_results:
        results = search_results
    return results
def search_in_portrait(category):
    """Rank portrait users by `category` (descending).

    Returns a list of [uid, uname, category_value] triples.
    """
    ranking_query = {
        "query": {"match_all": {}},
        "sort": {category: {"order": "desc"}},
    }
    hits = es.search(index="sensitive_user_portrait",
                     doc_type="user",
                     body=ranking_query)['hits']['hits']
    return [[hit['_source']['uid'],
             hit['_source']['uname'],
             hit['_source'][category]] for hit in hits]
def test_influence_rank(domain, date, order):
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query": {
            "match_all": {}
        },
        "sort": {
            search_order[order]: {
                "order": "desc"
            }
        },
        "size": 100
    }
    search_result = es.search(index=date, doc_type="bci",
                              body=query_body)['hits']['hits']
    portrait_result = es.mget(index='sensitive_user_portrait',
                              doc_type='user',
                              body={"ids": uid_list})['docs']
    results = []
    for i in range(len(search_result)):
        detail = []
        try:
            detail.append(search_result[i]['_source'][search_order[order]])
        except:
            print uid_list[i]
            detail.append(0)
        try:
            temp = portrait_result[i]['_source']
        except:
            continue
        detail.extend([
            temp['uid'], temp['uname'], temp['photo_url'], temp['activeness'],
            temp['importance'], temp['sensitive']
        ])
        results.append(detail)
    sorted_list = sorted(results, key=lambda x: x[0], reverse=True)
    return sorted_list
def search_current_es(domain, date, number):
    """Return the top `number` users of `domain`, ordered by influence.

    Each row: [uid, uname, photo_url, influence, sensitive, importance,
    activeness]; missing profile fields are replaced with 'unknown'.
    """
    query = {
        "query": {
            "filtered": {
                "filter": {"bool": {"must": [{"term": {"topic_string": domain}}]}}
            }
        },
        'sort': {"influence": {"order": "desc"}},
        'size': number,
    }

    hits = es.search(index=date, doc_type='user', body=query)['hits']['hits']
    rows = []
    for hit in hits:
        src = hit['_source']
        if not src['uname']:
            # incomplete profile -- use placeholders
            src['uname'] = 'unknown'
            src['photo_url'] = 'unknown'
        rows.append([
            hit['_id'], src['uname'], src['photo_url'], src['influence'],
            src['sensitive'], src['importance'], src['activeness']
        ])

    return rows
def get_inner_top_weibo(task_name, date, uid):
    """Fetch one monitored user's weibos in the 24h window ending at `date`.

    step1: identify the task exists and the uid belongs to it
    step2: search weibo from monitor_user_text by uid and time range

    Returns the ES hit list, or an error string when the task/user is unknown.
    """
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task is not exist'
    task_user = task_exist['uid_list']
    if uid not in task_user:
        return 'the user is not exist'
    end_ts = datetime2ts(date)
    time_segment = 24*3600
    start_ts = end_ts - time_segment
    query_body = []
    # term search: uid -- bug fix: a term clause must be {field: value},
    # the original {'term': uid} is malformed
    query_body.append({'term': {'uid': uid}})
    # range search: date-24*3600 ~ date
    query_body.append({'range': {'timestamp': {'from': start_ts, 'to': end_ts}}})
    try:
        weibo_result = es.search(index=text_index_name, doc_type=text_index_type, \
                body={'query': {'bool': {'must': query_body}}, 'sort': [{'timestamp': {'order': 'asc'}}], 'size': 10000})['hits']['hits']
    except Exception as e:
        raise e
    # bug fix: the search result used to be discarded (implicit None return)
    return weibo_result
def _bci_detail_count(weibo_bci, field, mid):
    """Decode one JSON-encoded detail field of a bci doc and return the count
    stored under `mid`; 0 when the field is missing or empty."""
    raw = weibo_bci.get(field, {})
    if not raw:
        return 0
    return json.loads(raw).get(mid, 0)


def search_full_text(uid, date):
    """Return one user's flow-text weibos for `date` with engagement counts.

    Each row: [sensitive, message_type, HH:MM:SS, geo, text,
    retweeted_number, comment_number], newest first, at most 200 rows.
    """
    index_flow_text = flow_text_index_name_pre + date
    doctype_flow_text = flow_text_index_type
    result = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 200,
        "sort": {"timestamp": {"order": "desc"}}
    }

    search_results = es.search(index=index_flow_text, doc_type=doctype_flow_text, body=query_body)['hits']['hits']
    for item in search_results:
        detail = []
        source = item['_source']
        detail.append(source.get('sensitive', 0))
        detail.append(source['message_type'])
        ts = source['timestamp']
        detail.append(time.strftime('%H:%M:%S', time.localtime(float(ts))))
        geo_string = source['geo']
        geo_list = geo_string.split('/t')
        # keep only the last two geo components when there are 3+
        if len(geo_list) >= 3:
            geo = '/t'.join(geo_list[-2:])
        else:
            geo = geo_string
        detail.append(geo)
        detail.append(source['text'])
        date = date.replace('-', '')
        mid = source['mid']
        try:
            weibo_bci = es.get(index=date, doc_type='bci', id=uid)['_source']
        except:
            weibo_bci = {}
        retweeted_number = 0
        comment_number = 0
        message_type = int(source['message_type'])
        if message_type in (1, 2) and weibo_bci:
            # bci field name = [s_]{origin|retweeted}_weibo_{retweeted|comment}_detail
            # ('s_' prefix for sensitive weibos)
            prefix = 's_' if source.get('sensitive', 0) else ''
            base = 'origin_weibo' if message_type == 1 else 'retweeted_weibo'
            # bug fix: the original guarded the comment branch with misspelled
            # keys ('s_retweetd_...'/'retweetd_...'), so retweet comment counts
            # were always 0 even when the data existed
            retweeted_number = _bci_detail_count(weibo_bci, prefix + base + '_retweeted_detail', mid)
            comment_number = _bci_detail_count(weibo_bci, prefix + base + '_comment_detail', mid)
        detail.append(retweeted_number)
        detail.append(comment_number)
        result.append(detail)

    return result
        status = True
    return status

# use to search attribute table
def search_attribute(query_body, condition_num):
    item_list = []
    default_size = 100000
    if condition_num==0:
        try:
            result = es.search(index=attribute_index_name, doc_type=attribute_index_type, \
                    body={'query':{'match_all':{}}, 'size':default_size})['hits']['hits']
        except Exception, e:
            raise e
    else:
        try:
            result = es.search(index=attribute_index_name, doc_type=attribute_index_type, \
                    body={'query':{'bool':{'must':query_body}}, 'size':default_size})['hits']['hits']
        except Exception, e:
            raise e
    if result:
        for item in result:
            print 'item:', item
            source = item['_source']
            item_list.append(source)
    return item_list

# use to change attribtue
def change_attribute(attribute_name, value, user, state):
    status = False
    # identify the attribute_name is in ES - custom attribute
    try:
        result =  es.get(index=attribute_index_name, doc_type=attribute_index_type, id=attribute_name)['_source']
def search_task(task_name, submit_date, state, status):
    """Search group_result tasks by any combination of filters.

    task_name/state are whitespace-split into wildcard clauses; submit_date
    and status become match clauses. With no filters, every task matches.
    Returns rows [task_name, submit_date, count, state, status], or None if
    the ES response carried no hits.
    """
    query = []
    if task_name:
        for token in task_name.split(' '):
            query.append({'wildcard': {'task_name': '*' + token + '*'}})
    if submit_date:
        query.append({'match': {'submit_date': submit_date}})
    if state:
        for token in state.split(' '):
            query.append({'wildcard': {'state': '*' + token + '*'}})
    if status:
        query.append({'match': {'status': status}})

    if query:
        try:
            source = es.search(index='group_result',
                               doc_type='group',
                               body={'query': {'bool': {'must': query}},
                                     'size': 10000})
        except Exception as e:
            raise e
    else:
        source = es.search(index='group_result',
                           doc_type='group',
                           body={'query': {'match_all': {}},
                                 'size': 10000})

    try:
        task_dict_list = source['hits']['hits']
    except:
        return None

    result = []
    for task_dict in task_dict_list:
        src = task_dict['_source']
        result.append([src['task_name'], src['submit_date'], src['count'],
                       src['state'], src['status']])
    return result
def compute_mid_result(task_name, task_submit_date):
    """Aggregate a task's monitor snapshots into time series.

    Scans monitor docs from the submit date forward in 4-hour search windows
    and accumulates, per 15-min timestamp: weibo counts, sentiment counts and
    a sensitive-word score; geo and hashtag distributions are aggregated per
    day. Returns the dict of metric-name -> {timestamp_or_date: value}.
    """
    result = {'count_0':{}, 'count_1':{}, 'sentiment_0_126':{}, 'sentiment_0_127':{}, 'sentiment_0_128':{},\
            'sentiment_0_129':{}, 'sentiment_0_130':{}, 'sensitive_score':{}, 'geo_0':{}, 'geo_1':{},\
            'hashtag_0':{}, 'hashtag_1':{}, 'sentiment_1_126':{}, 'sentiment_1_127':{}, \
            'sentiment_1_128':{}, 'sentiment_1_129':{}, 'sentiment_1_130':{}}
    #geo & hashtag: day
    #other: 15min
    search_time_segment = 3600 * 4
    start_ts = date2ts(task_submit_date)
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # test override: pin "now" to a fixed day
    now_ts = datetime2ts('2013-09-08')
    date_ts = datetime2ts(now_date)
    # round the scan end up to the next 15-min boundary
    segment = int((now_ts - date_ts) / 900) + 1
    end_ts = date_ts + segment * 900
    # every search time-range: 4 hours -- bulk action to search
    begin_ts = start_ts

    while begin_ts < end_ts:
        query_body = {'range':{'timestamp':{'from': begin_ts, 'to': begin_ts+search_time_segment}}}
        try:
            mid_result_list = es.search(index=monitor_index_name, doc_type=task_name, body={'query':query_body, 'size':100000, 'sort':[{'timestamp':{'order': 'asc'}}]})['hits']['hits']
        except Exception as e:
            raise e
        for mid_result_item in mid_result_list:
            result_item = mid_result_item['_source']
            timestamp = result_item['timestamp']
            # attr_count
            count_dict = json.loads(result_item['count'])
            for sensitive in count_dict:
                result['count_' + sensitive][str(timestamp)] = count_dict[sensitive]
            # attr_sentiment
            sensitive_sentiment_dict = json.loads(result_item['sentiment'])
            for sensitive in sensitive_sentiment_dict:
                sentiment_dict = sensitive_sentiment_dict[sensitive]
                for sentiment in sentiment_dict:
                    sentiment_key = 'sentiment_'+sensitive+'_'+sentiment
                    result[sentiment_key][str(timestamp)] = sentiment_dict[sentiment]
            # attr_sensitive_score
            if 'sensitive_word' in result_item:
                sensitive_word_dict = json.loads(result_item['sensitive_word'])
            else:
                sensitive_word_dict = {}
            ts_word_score = 0
            for word in sensitive_word_dict:
                search_word = word.encode('utf-8')
                try:
                    word_identify = json.loads(word_r.hget('sensitive_words', search_word))
                except:
                    # unknown word: fall back to a default weight of 2
                    word_identify = [2]
                ts_word_score += sensitive_word_dict[word] * word_identify[0]
            result['sensitive_score'][str(timestamp)] = ts_word_score
            # attr_geo (aggregated per day)
            timestamp_date = ts2datetime(timestamp)
            sensitive_geo_dict = json.loads(result_item['geo'])
            for sensitive in sensitive_geo_dict:
                day_geo = result['geo_'+sensitive].setdefault(timestamp_date, {})
                geo_dict = sensitive_geo_dict[sensitive]
                for geo in geo_dict:
                    day_geo[geo] = day_geo.get(geo, 0) + geo_dict[geo]

            # attr_hashtag (aggregated per day)
            if 'hashtag' in result_item:
                sensitive_hashtag_dict = json.loads(result_item['hashtag'])
            else:
                sensitive_hashtag_dict = {}
                result['hashtag_0'][timestamp_date] = {}
                result['hashtag_1'][timestamp_date] = {}
            # bug fix: the original iterated sensitive_hashtag_dict twice in a
            # nested loop over the same variable, multiplying every hashtag
            # count by the number of sensitive keys
            for sensitive in sensitive_hashtag_dict:
                day_hashtag = result['hashtag_'+sensitive].setdefault(timestamp_date, {})
                hashtag_dict = sensitive_hashtag_dict[sensitive]
                for hashtag in hashtag_dict:
                    day_hashtag[hashtag] = day_hashtag.get(hashtag, 0) + hashtag_dict[hashtag]

        begin_ts += search_time_segment
    # bug fix: the accumulated result was never returned
    # NOTE(review): original appears truncated here -- confirm callers expect this dict
    return result
Example #39
0
    return status


# use to search attribute table
def search_attribute(query_body, condition_num):
    item_list = []
    default_size = 100000
    if condition_num == 0:
        try:
            result = es.search(index=attribute_index_name, doc_type=attribute_index_type, \
                    body={'query':{'match_all':{}}, 'size':default_size})['hits']['hits']
        except Exception, e:
            raise e
    else:
        try:
            result = es.search(index=attribute_index_name, doc_type=attribute_index_type, \
                    body={'query':{'bool':{'must':query_body}}, 'size':default_size})['hits']['hits']
        except Exception, e:
            raise e
    if result:
        for item in result:
            print 'item:', item
            source = item['_source']
            item_list.append(source)
    return item_list


# use to change attribtue
def change_attribute(attribute_name, value, user, state):
    status = False
    # identify the attribute_name is in ES - custom attribute
    try:
def _portrait_agg_pairs(field):
    """Run the shared aggregation module over one portrait field and return
    [key, doc_count] pairs."""
    query_body = query_body_module(field)
    buckets = es.search(index='sensitive_user_portrait', doc_type='user',
                        body=query_body)['aggregations']['all_interests']['buckets']
    pairs = []
    for item in buckets:
        pairs.append([item['key'], item['doc_count']])
    return pairs


def _bci_top_users(date, field):
    """Rank one day's bci docs by `field` and resolve nick names.
    Returns rows [uid, uname, field_value]; uname falls back to 'unknown'."""
    query_body = {
        "query": {"match_all": {}},
        "sort": {field: {"order": "desc"}}
    }
    rows = []
    for item in es.search(index=date, doc_type="bci", body=query_body)['hits']['hits']:
        uid = item['_source']['uid']
        try:
            uname = es_user_profile.get(index='weibo_user', doc_type='user', id=uid)['_source']['nick_name']
        except:
            uname = 'unknown'
        rows.append([uid, uname, item['_source'][field]])
    return rows


def get_attr(date):
    """Build the overview statistics dict for the sensitive-user dashboard.

    Gathers document counts, recommendation counts, top aggregations
    (words/geo/hashtag/psycho), domain/topic ranks, portrait rankings and
    three bci top-lists. NOTE(review): `date` is currently ignored -- the bci
    day is pinned to the test value '20130907', as in the original.
    """
    results = dict()
    number = es.count(index="sensitive_user_portrait", doc_type="user")['count']
    results['total_number'] = number

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "term": {
                        "type": 1
                    }
                }
            }
        }
    }
    sensitive_number = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)['count']
    results['sensitive_number'] = sensitive_number
    results['influence_number'] = number - sensitive_number

    # count users recommended into the sensitive / influence pools
    recommend_in_sensitive = 0
    for v in r.hgetall('recommend_sensitive').values():
        if v:
            recommend_in_sensitive += len(json.loads(v))

    recommend_in_influence = 0
    for v in r.hgetall('recommend_influence').values():
        if v:
            recommend_in_influence += len(json.loads(v))
    results['recommend_in'] = recommend_in_influence + recommend_in_sensitive

    results['monitor_number'] = [4, 83] # test
    results['new_sensitive_words'] = 5  # test

    # top aggregations over the portrait index
    results['sensitive_words'] = _portrait_agg_pairs('sensitive_words_string')
    # bug fix: the original ran the sensitive_geo aggregation twice verbatim
    results['sensitive_geo'] = _portrait_agg_pairs('sensitive_geo_string')
    results['sensitive_hashtag'] = _portrait_agg_pairs('sensitive_hashtag_string')
    results['psycho_status'] = _portrait_agg_pairs('psycho_status_string')

    # placeholder data until the political-tendency aggregation is wired up
    results['political_tendency'] = [['left', 123], ['middle', 768], ['right', 1095]]

    # domain and topic
    results['domain_rank'] = get_top_user()
    results['topic_rank'] = get_topic_user()

    # rank
    results['importance'] = search_in_portrait('importance')
    results['sensitive'] = search_in_portrait('sensitive')
    results['influence'] = search_in_portrait('influence')
    results['activeness'] = search_in_portrait('activeness')

    # bci top lists for one day
    date = ts2datetime(time.time()-24*3600).replace('-','')
    date = '20130907'  # test override
    results['comment_total'] = _bci_top_users(date, 's_origin_weibo_comment_total_number')
    results['retweeted_total'] = _bci_top_users(date, 's_origin_weibo_retweeted_total_number')
    results['top_weibo_number'] = _bci_top_users(date, 's_origin_weibo_number')

    return results
Example #41
0
def imagine(uid, query_fields_dict,index_name="sensitive_user_portrait", doctype='user'):

    """
    Find portrait users similar to `uid`, weighted by the requested fields.

    uid: seed user; similar users are searched relative to this id
    query_fields_dict: field -> weight mapping plus two control entries:
        'field' -- which portrait metric drives function_score boosting
                   (activeness / importance / sensitive / influence)
        'size'  -- maximum number of similar users to return
    searchable fields: domain, topic, keywords, psycho_status,
    psycho_feature, activity_geo, hashtag (for example: "domain": 2)

    Returns [] when the seed user has none of the requested fields;
    otherwise [seed_row, similar_rows..., total_count].
    NOTE(review): this function mutates the caller's query_fields_dict
    (pops 'field', 'size' and empty fields) -- confirm callers don't reuse it.
    """
    personal_info = es.get(index="sensitive_user_portrait", doc_type="user", id=uid, _source=True)['_source']

    # Python 2: .keys() is a list, so in-place remove() works
    keys_list = query_fields_dict.keys()
    keys_list.remove('field')
    keys_list.remove('size')

    search_dict = {}
    iter_list = []

    # keep only fields the seed user actually has; stored values are
    # '&'-separated strings
    for iter_key in keys_list:
        if iter_key not in personal_info or personal_info[iter_key] == '':
            query_fields_dict.pop(iter_key)
        else:
            iter_list.append(iter_key)
            temp = personal_info[iter_key]
            search_dict[iter_key] = temp.split('&')

    if len(iter_list) == 0:
        return []

    query_body = {
        'query':{
            'function_score':{
                'query':{
                    'bool':{
                        'must':[
                        ]
                    }
                },
                "field_value_factor":{
                }
            }
        }
    }

    # choose the boosting field and a factor that normalizes its scale
    score_standard = {}
    score_standard["modifier"] = "log1p"
    if query_fields_dict['field'] == "activeness":
        score_standard['field'] = "activeness"
        score_standard['factor'] = 100
    elif query_fields_dict['field'] == "importance":
        score_standard['field'] = "importance"
        score_standard['factor'] = 0.01
    elif query_fields_dict['field'] == "sensitive":
        score_standard['field'] = "sensitive"
        score_standard['factor'] = 100
    elif query_fields_dict['field'] == 'influence':
        score_standard['field'] = "influence"
        score_standard['factor'] = 0.1
    else:
        # unrecognized field: neutralize the factor and switch to additive scoring
        score_standard['field'] = "influence"
        score_standard['factor'] = 0
        query_body['query']['function_score']['boost_mode'] = "sum"

    query_body['query']['function_score']['field_value_factor'] = score_standard

    query_fields_dict.pop('field')
    # NOTE(review): counted BEFORE the must clauses are appended below, so this
    # is a total over all documents rather than over matches -- confirm intended
    number = es.count(index=index_name, doc_type=doctype, body=query_body)['count']
    query_body['size'] = 100 # default number
    query_number = query_fields_dict['size'] #  required number
    query_fields_dict.pop('size')

    # one should-group per field: any shared value matches, boosted by weight v
    for (k,v) in query_fields_dict.items():

        temp = {}
        temp_list = []
        for iter_key in search_dict[k]:
            temp_list.append({'wildcard':{k:{'wildcard':'*'+iter_key+'*','boost': v}}})

        query_body['query']['function_score']['query']['bool']['must'].append({'bool':{'should':temp_list}})


    result = es.search(index=index_name, doc_type=doctype, body=query_body)['hits']['hits']
    field_list = ['uid','uname', 'activeness','importance', 'influence', 'sensitive']
    return_list = []
    count = 0
    for item in result:
        # skip the seed user itself
        if uid == item['_id']:
            score = item['_score']  # NOTE(review): captured but never used
            continue
        info = []
        if not item['_source']['uname']:
            item['_source']['uname'] = 'unknown'
        for field in field_list:
            info.append(item['_source'][field])
        info.append(item['_score'])
        # values shared between the seed user and this candidate, per field,
        # keyed by the display name from shift_dict
        common_dict = dict()
        for iter_key in iter_list:
            iter_common_list = item['_source'][iter_key].split('&')
            search_common_list = list(set(iter_common_list) & set(search_dict[iter_key]))
            iter_key = shift_dict[iter_key]
            common_dict[iter_key] = search_common_list
        info.append(common_dict)
        return_list.append(info)
        count += 1

        if count == query_number:
            break

    # total count is appended as the final element of the similar-user list
    return_list.append(number)

    # the seed user's own row goes first in the returned list
    temp_list = []
    for field in field_list:
        temp_list.append(personal_info[field])

    results = []
    results.append(temp_list)
    results.extend(return_list)


    return results
def _agg_top_buckets(field):
    """Return the top aggregation buckets of *field* as [key, doc_count] pairs.

    Runs the aggregation query produced by query_body_module(field) against
    the sensitive_user_portrait index.
    """
    buckets = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body_module(field))['aggregations']['all_interests']['buckets']
    return [[item['key'], item['doc_count']] for item in buckets]


def _top_bci_users(date, sort_field):
    """Return [uid, uname, value] rows from the daily bci index *date*,
    sorted descending by *sort_field*.

    uname is looked up in the weibo_user profile index and falls back to
    'unknown' when the profile is missing (best-effort, matches the
    original behavior).
    """
    query_body = {
        "query": {"match_all": {}},
        "sort": {sort_field: {"order": "desc"}}
    }
    hits = es.search(index=date, doc_type="bci",
                     body=query_body)['hits']['hits']
    rows = []
    for item in hits:
        uid = item['_source']['uid']
        try:
            uname = es_user_profile.get(index='weibo_user',
                                        doc_type='user',
                                        id=uid)['_source']['nick_name']
        except Exception:
            uname = 'unknown'
        rows.append([uid, uname, item['_source'][sort_field]])
    return rows


def get_attr(date):
    """Build the overview-statistics dict for the sensitive-user dashboard.

    Collects total/sensitive user counts, recommend-in queue sizes, top
    aggregation buckets (sensitive words / geo / hashtags / psycho status),
    portrait ranking lists and the top users by weibo/comment/retweet
    totals.

    :param date: currently unused -- the bci index date is hard-coded
                 below (marked "test" in the original).
    :return: dict of named numbers and [key, value] / [uid, uname, value]
             lists consumed by the dashboard view.
    """
    results = dict()

    # Total portrait users, and how many are flagged sensitive (type == 1).
    number = es.count(index="sensitive_user_portrait",
                      doc_type="user")['count']
    results['total_number'] = number

    query_body = {"query": {"filtered": {"filter": {"term": {"type": 1}}}}}
    sensitive_number = es.count(index="sensitive_user_portrait",
                                doc_type="user",
                                body=query_body)['count']
    results['sensitive_number'] = sensitive_number
    results['influence_number'] = number - sensitive_number

    # Pending recommend-in users: sum the JSON list lengths stored in both
    # redis hashes.
    recommend_in = 0
    for hash_name in ('recommend_sensitive', 'recommend_influence'):
        for v in r.hgetall(hash_name).values():
            if v:
                recommend_in += len(json.loads(v))
    results['recommend_in'] = recommend_in

    results['monitor_number'] = [4, 83]  # test
    results['new_sensitive_words'] = 5  # test

    # Top aggregation buckets.  NOTE: the original queried
    # sensitive_geo_string twice with identical results; the duplicate
    # ES round-trip is removed here.
    results['sensitive_words'] = _agg_top_buckets('sensitive_words_string')
    results['sensitive_geo'] = _agg_top_buckets('sensitive_geo_string')
    results['sensitive_hashtag'] = _agg_top_buckets('sensitive_hashtag_string')
    results['psycho_status'] = _agg_top_buckets('psycho_status_string')

    results['political_tendency'] = [['left', 123], ['middle', 768],
                                     ['right', 1095]]  # test data

    # domain and topic rankings
    results['domain_rank'] = get_top_user()
    results['topic_rank'] = get_topic_user()

    # portrait rank lists
    results['importance'] = search_in_portrait('importance')
    results['sensitive'] = search_in_portrait('sensitive')
    results['influence'] = search_in_portrait('influence')
    results['activeness'] = search_in_portrait('activeness')

    # Daily bci index: yesterday's date, immediately overridden by a fixed
    # test date (kept from the original).
    date = ts2datetime(time.time() - 24 * 3600).replace('-', '')
    date = '20130907'  # test
    results['comment_total'] = _top_bci_users(
        date, 's_origin_weibo_comment_total_number')
    results['retweeted_total'] = _top_bci_users(
        date, 's_origin_weibo_retweeted_total_number')
    results['top_weibo_number'] = _top_bci_users(
        date, 's_origin_weibo_number')

    return results
def search_full_text(uid, date):
    """Return the day's sensitive-text records for *uid* with engagement counts.

    Queries sensitive_user_text for all documents of *uid* whose timestamp
    falls on *date* (local day window), and for each one looks up the
    per-mid retweet/comment counts in the daily bci document.

    :param uid: user id (also the bci document id).
    :param date: 'YYYY-MM-DD' day to search.
    :return: list of [sensitive, message_type, 'HH:MM:SS', geo, text,
             retweeted_number, comment_number] rows.

    Fixes vs. the original:
    - the per-mid comment count for retweets used a misspelled key
      ('s_retweetd_...'/'retweetd_...') in the existence check while
      loading the correctly spelled field, so the count was always 0;
      both now use the correctly spelled key consistently;
    - removed a debug `print weibo_bci[...]` that raised KeyError when
      the field was absent.
    """
    result = []
    ts = datetime2ts(date)
    next_ts = ts + 24 * 3600
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {"must": [{"term": {"uid": uid}}, {"range": {"timestamp": {"gte": ts, "lt": next_ts}}}]}
                }
            }
        },
        "size": 200,
    }

    search_results = es.search(index="sensitive_user_text", doc_type="user", body=query_body)["hits"]["hits"]
    bci_index = date.replace("-", "")  # hoisted: loop-invariant
    for item in search_results:
        detail = []
        source = item["_source"]
        detail.append(source["sensitive"])
        detail.append(source["message_type"])
        re_time = time.strftime("%H:%M:%S", time.localtime(float(source["timestamp"])))
        detail.append(re_time)
        # Keep only the last two geo components when there are 3+.
        # NOTE(review): the separator "/t" looks like a typo for "\t" --
        # kept byte-identical; confirm against the data producer.
        geo_string = source["geo"]
        geo_list = geo_string.split("/t")
        if len(geo_list) >= 3:
            geo = "/t".join(geo_list[-2:])
        else:
            geo = geo_string
        detail.append(geo)
        detail.append(source["text"])
        mid = source["mid"]
        try:
            weibo_bci = es.get(index=bci_index, doc_type="bci", id=uid)["_source"]
        except Exception:
            weibo_bci = {}
        retweeted_number = 0
        comment_number = 0
        # bci field base: origin vs retweeted weibo, 's_'-prefixed when the
        # text is sensitive; unknown message types keep (0, 0).
        base = {1: "origin_weibo", 2: "retweeted_weibo"}.get(int(source["message_type"]))
        if base is not None and weibo_bci:
            if source["sensitive"]:
                base = "s_" + base
            raw = weibo_bci.get(base + "_retweeted_detail")
            retweeted_detail = json.loads(raw) if raw else {}
            retweeted_number = retweeted_detail.get(mid, 0)
            raw = weibo_bci.get(base + "_comment_detail")
            comment_detail = json.loads(raw) if raw else {}
            comment_number = comment_detail.get(mid, 0)
        detail.append(retweeted_number)
        detail.append(comment_number)
        result.append(detail)

    return result
# --- Example #44 (score: 0) --- extraction artifact from the scraped source;
# the following block is a second, reformatted copy of search_full_text.
def search_full_text(uid, date):
    """Return the day's sensitive-text records for *uid* with engagement counts.

    Queries sensitive_user_text for every document of *uid* on *date*
    (local day window) and enriches each with per-mid retweet/comment
    counts from the daily bci document.

    :param uid: user id (also the bci document id).
    :param date: 'YYYY-MM-DD' day to search.
    :return: list of [sensitive, message_type, 'HH:MM:SS', geo, text,
             retweeted_number, comment_number] rows.

    Fixes vs. the original:
    - the comment-count existence checks used misspelled keys
      ('s_retweetd_...'/'retweetd_...') while loading the correctly
      spelled fields, forcing the count to 0 for retweets; the correctly
      spelled key is now used for both check and load;
    - removed a debug `print weibo_bci[...]` that raised KeyError when
      the field was missing.
    """
    result = []
    day_start = datetime2ts(date)
    day_end = day_start + 24 * 3600
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "term": {
                                "uid": uid
                            }
                        }, {
                            "range": {
                                "timestamp": {
                                    "gte": day_start,
                                    "lt": day_end
                                }
                            }
                        }]
                    }
                }
            }
        },
        "size": 200
    }

    search_results = es.search(index='sensitive_user_text',
                               doc_type="user",
                               body=query_body)['hits']['hits']
    bci_index = date.replace('-', '')  # hoisted: loop-invariant
    for item in search_results:
        source = item['_source']
        detail = [source['sensitive'], source['message_type']]
        detail.append(time.strftime(
            '%H:%M:%S', time.localtime(float(source['timestamp']))))
        # Keep only the last two geo components when there are 3+.
        # NOTE(review): separator '/t' looks like a typo for '\t' -- kept
        # byte-identical; confirm with the data producer.
        geo_string = source['geo']
        geo_list = geo_string.split('/t')
        detail.append('/t'.join(geo_list[-2:])
                      if len(geo_list) >= 3 else geo_string)
        detail.append(source['text'])
        mid = source['mid']
        try:
            weibo_bci = es.get(index=bci_index,
                               doc_type='bci', id=uid)['_source']
        except Exception:
            weibo_bci = {}
        retweeted_number = 0
        comment_number = 0
        # bci field base: origin vs retweeted weibo, 's_'-prefixed when
        # the text is sensitive; unknown message types keep (0, 0).
        base = {1: 'origin_weibo',
                2: 'retweeted_weibo'}.get(int(source['message_type']))
        if base is not None and weibo_bci:
            if source['sensitive']:
                base = 's_' + base
            raw = weibo_bci.get(base + '_retweeted_detail')
            retweeted_number = (json.loads(raw) if raw else {}).get(mid, 0)
            raw = weibo_bci.get(base + '_comment_detail')
            comment_number = (json.loads(raw) if raw else {}).get(mid, 0)
        detail.append(retweeted_number)
        detail.append(comment_number)
        result.append(detail)

    return result