Beispiel #1
0
def get_text(top_list, date, style):

# input: [[mid1, no.1], [mid2, no.2], ['mid3', no.3]]
# output: [[text1, no.1], [text2, no.2], [text3, no.3]]
# mid, retweeted, comment, text, geo, timestamp, sentiment, mid_url
    results = []
    index_flow_text = pre_text_index + date
    #index_list = get_text_index(date)
    if len(top_list) != 0: # no one
        mid_list = []
        for item in top_list:
            mid_list.append(item[0])
	search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type, body={"ids":mid_list})["docs"]
        for i in range(len(top_list)):
            temp = []
            temp.extend(top_list[i])
            if search_result[i]['found']:
                source = search_result[i]['_source']
                temp.append(source["text"])
                temp.append(source["geo"])
                temp.append(ts2date(source["timestamp"]))
                temp.append(source["sentiment"])
                temp.append(weiboinfo2url(source['uid'], source['mid']))
                temp.append(uid_url+source['uid'])
                temp.append(source['uid'])
                try:
                    uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type, id=source['uid'])["_source"]["nick_name"]
                    temp.append(uname)
                except:
                    temp.append("unknown")
            else:
                temp.extend(["", "", "", "", "", "", "", ""])
            results.append(temp)
    return results
Beispiel #2
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({
                'range': {
                    'timestamp': {
                        'gte': iter_date_ts,
                        'lt': iter_next_date_ts
                    }
                }
            })
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp'][
                'gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{
            'range': {
                'timestamp': {
                    'gte': timestamp_from,
                    'lt': timestamp_to
                }
            }
        }]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term': {'uid': uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # get weibo list
    for item in flow_text_exist:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'].split('&'))
        weibo_list.append(weibo)

    return weibo_list
Beispiel #3
0
def get_activity_weibo(task_name, start_ts, submit_user):
    results = []
    task_id = submit_user + '-' + task_name
    #step1: get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type ,\
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: get uid2uname
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body = {'ids':uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found']==True:
            uname = item['fields']['uname'][0]
        uid2uname[uid] = uname
    #step3: search time_segment weibo
    time_segment = FOUR_HOUR
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms':{'uid': uid_list}})
    query.append({'range':{'timestamp':{'gte':start_ts, 'lt':end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                body={'query':{'bool':{'must':query}}, 'sort':'timestamp', 'size':MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        weibo = {}
        source = item['_source']
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'])
        results.append(weibo)

    return results
Beispiel #4
0
def get_activity_weibo(task_name, start_ts, submit_user):
    results = []
    task_id = submit_user + '-' + task_name
    #step1: get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type ,\
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: get uid2uname
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body = {'ids':uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
        uid2uname[uid] = uname
    #step3: search time_segment weibo
    time_segment = FOUR_HOUR
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms': {'uid': uid_list}})
    query.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                body={'query':{'bool':{'must':query}}, 'sort':'timestamp', 'size':MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        weibo = {}
        source = item['_source']
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'])
        results.append(weibo)

    return results
Beispiel #5
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range':{'timestamp':{'gte':iter_date_ts, 'lt':iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range':{'timestamp':{'gte':timestamp_from, 'lt':timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term':{'uid':uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # get weibo list
    for item in flow_text_exist:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'].split('&'))
        weibo_list.append(weibo)
        
    return weibo_list
Beispiel #6
0
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(6,-1,-1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'sort':[{sort_type: {'order': 'desc'}}], 'size':100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type:
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[10], reverse=True)
    return new_weibo_list
Beispiel #7
0
def search_task(task_name, submit_date, state, status, submit_user):
    results = []
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard':{'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({'range':{'submit_date': {'gte': submit_date_start, 'lt': submit_date_end}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard':{'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'match':{'status': status}})
        condition_num += 1
    if submit_user:
        query.append({'term':{'submit_user': submit_user}})
        condition_num += 1
    print es_group_result,group_index_name,group_index_type
    if condition_num > 0:
        query.append({'term':{'task_type': 'analysis'}})
        try:
            source = es_group_result.search(
                    index = group_index_name,
                    doc_type = group_index_type,
                    body = {
                        'query':{
                            'bool':{
                                'must':query
                                }
                            },
                        'sort': [{'count':{'order': 'desc'}}],
                        'size': MAX_VALUE
                        }
                    )
        except Exception as e:
            raise e
    else:
        query.append({'term':{'task_type': 'analysis'}})
        source = es.search(
                index = group_index_name,
                doc_type = group_index_type,
                body = {
                    'query':{'bool':{
                        'must':query
                        }
                        },
                    'sort': [{'count': {'order': 'desc'}}],
                    'size': MAX_VALUE
                    }
                )

    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        #result.append([task_dict['_source']['task_name'], task_dict['_source']['submit_date'], task_dict['_source']['count'], state, status])
        result.append({'task_name':task_dict['_source']['task_name'],'submit_date':ts2date(task_dict['_source']['submit_date']), 'group_count':task_dict['_source']['count'], 'status':status})
    
    return result
Beispiel #8
0
def new_get_user_weibo(uid, sort_type):
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1:get user name
    print '708'
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,\
                id=uid, _source=False, fields=['nick_name'])
    except:
        user_profile_result = {}
    print '714',len(user_profile_result)
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2:get user weibo
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        print '726'
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'term': {'uid': uid}}}}, 'size':MAX_VALUE})['hits']['hits']
            #print weibo_result
        except:
            weibo_result = []
        print '732',len(weibo_result)
        if weibo_result:
            weibo_list.extend(weibo_result)
    
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['ip']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([mid, uid, text, ip, city,timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
            mid_set.add(mid)
    if sort_type == 'timestamp':
        sort_results = sorted(results, key=lambda x:x[5], reverse=True)
    elif sort_type == 'retweet_count':
        sort_results = sorted(results, key=lambda x:x[7], reverse=True)
    elif sort_type == 'comment_count':
        sort_results = sorted(results, key=lambda x:x[8], reverse=True)
    elif sort_type == 'sensitive':
        sort_results = sorted(results, key=lambda x:x[9], reverse=True)
    print '778'
    return sort_results
Beispiel #9
0
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'sort':[{sort_type: {'order': 'desc'}}], 'size':100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type:
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([
            mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
            comment_count, sensitive_score, weibo_url
        ])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    return new_weibo_list
Beispiel #10
0
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    print '37',index_sensing_task,_id
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k,v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]


    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    print len(mid_list)
    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }


    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    print es_text
    exist_es = es_text.indices.exists(index_name)
    print exist_es
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict() # 文本信息
    portrait_dict = dict() # 背景信息
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # _id是mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}


        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)[:10]
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)[:10]
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)[:10]
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list: # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                #jln 提取关键词
                f_key = get_weibo_single(iter_text['text'])
                temp.append(sorted(f_key.iteritems(),key=lambda x:x[1],reverse=True))
                
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                


                results.append(temp)
            count_n += 1


                

        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True) # -4 -2 -3
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        
        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values() # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid_index_dict.get(mid, 0):
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)
        

    return sort_results
Beispiel #11
0
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors) # 前一段时间内的微博mid list
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,  social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms":{"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }



    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"sentiment":sentiment_type}})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms":{"sentiment": ["2", "3"]}}]

    # 判断当前ts和ts-time_interval是否属于同一天,确定查询哪个es
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. 聚合原创微博mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
Beispiel #12
0
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms":{text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. 查询微博
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. 获取微博相关信息
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
Beispiel #13
0
def search_task(task_name, submit_date, state, status, submit_user):
    results = []
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
            condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({
            'range': {
                'submit_date': {
                    'gte': submit_date_start,
                    'lt': submit_date_end
                }
            }
        })
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
            condition_num += 1
    if status:
        query.append({'match': {'status': status}})
        condition_num += 1
    if submit_user:
        query.append({'term': {'submit_user': submit_user}})
        condition_num += 1
    print es_group_result, group_index_name, group_index_type
    if condition_num > 0:
        query.append({'term': {'task_type': 'analysis'}})
        try:
            source = es_group_result.search(index=group_index_name,
                                            doc_type=group_index_type,
                                            body={
                                                'query': {
                                                    'bool': {
                                                        'must': query
                                                    }
                                                },
                                                'sort': [{
                                                    'count': {
                                                        'order': 'desc'
                                                    }
                                                }],
                                                'size':
                                                MAX_VALUE
                                            })
        except Exception as e:
            raise e
    else:
        query.append({'term': {'task_type': 'analysis'}})
        source = es.search(index=group_index_name,
                           doc_type=group_index_type,
                           body={
                               'query': {
                                   'bool': {
                                       'must': query
                                   }
                               },
                               'sort': [{
                                   'count': {
                                       'order': 'desc'
                                   }
                               }],
                               'size': MAX_VALUE
                           })

    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        #result.append([task_dict['_source']['task_name'], task_dict['_source']['submit_date'], task_dict['_source']['count'], state, status])
        result.append({
            'task_name':
            task_dict['_source']['task_name'],
            'submit_date':
            ts2date(task_dict['_source']['submit_date']),
            'group_count':
            task_dict['_source']['count'],
            'status':
            status
        })

    return result
Beispiel #14
0
def get_positive_weibo_detail(ts,
                              social_sensors,
                              keywords_list,
                              size,
                              sentiment_type=1):
    former_mid_list = query_mid_list(ts - time_interval, keywords_list,
                                     time_segment,
                                     social_sensors)  # 前一段时间内的微博mid list
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,
                                      social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }
                            },
                        ],
                        "should": [{
                            "terms": {
                                "root_mid": mid_list
                            }
                        }, {
                            "terms": {
                                "mid": mid_list
                            }
                        }, {
                            "terms": {
                                "keywords_string": keywords_list
                            }
                        }]
                    }
                }
            }
        },
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "size": 100
    }

    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"term": {
                "sentiment": sentiment_type
            }})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{
            "terms": {
                "sentiment": ["2", "3"]
            }
        }]

    # 判断当前ts和ts-time_interval是否属于同一天,确定查询哪个es
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. 聚合原创微博mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
Beispiel #15
0
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=_id,
                                       id=ts)['_source']
    print '37', index_sensing_task, _id
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    print len(mid_list)
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "mid": mid_list
                    }
                }
            }
        },
        "size": 1000,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        }
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    print es_text
    exist_es = es_text.indices.exists(index_name)
    print exist_es
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()  # 文本信息
    portrait_dict = dict()  # 背景信息
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id是mid
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {
                        "nick_name": item["fields"]["nick_name"][0],
                        "photo_url": item["fields"]["photo_url"][0]
                    }
                else:
                    portrait_dict[item['_id']] = {
                        "nick_name": item['_id'],
                        "photo_url": ""
                    }

        if order == "total":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[1],
                                 reverse=True)[:10]
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[2],
                                 reverse=True)[:10]
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list,
                                 key=lambda x: x[3],
                                 reverse=True)[:10]
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list:  # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                #jln 提取关键词
                f_key = get_weibo_single(iter_text['text'])
                temp.append(
                    sorted(f_key.iteritems(), key=lambda x: x[1],
                           reverse=True))

                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)

                results.append(temp)
            count_n += 1

        results = sorted(results,
                         key=operator.itemgetter(-4, -2, -6),
                         reverse=True)  # -4 -2 -3
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values()  # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid_index_dict.get(mid, 0):
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)

    return sort_results
Beispiel #16
0
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=_id,
                                       id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "root_mid": mid_list
                            }
                        }]
                    }
                }
            }
        },
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append(
            {"term": {
                text_type: type_value
            }})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append(
                {"term": {
                    text_type: type_value
                }})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append(
                {"terms": {
                    text_type: type_value
                }})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. 查询微博
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. 获取微博相关信息
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(
                index=profile_index_name,
                doc_type=profile_index_type,
                body={"ids": uid_list},
                fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results