def get_text(top_list, date, style):
    """Expand a ranked list of weibo mids with the stored weibo details.

    input:  [[mid1, no.1], [mid2, no.2], ['mid3', no.3]]
    output: one row per input entry, made of the entry itself followed by
            text, geo, date, sentiment, weibo url, user url, uid, nick name

    Args:
        top_list: ranked entries; the first element of each entry is the
            weibo mid that is looked up.
        date: suffix appended to ``pre_text_index`` to name the index.
        style: accepted for interface compatibility; not used here.

    Returns:
        A list with one row per entry of ``top_list`` (empty when
        ``top_list`` is empty).  Rows whose weibo is not found are padded
        with eight empty strings so every row has the same width.
    """
    results = []
    index_flow_text = pre_text_index + date
    if not top_list:  # no one
        return results

    mid_list = [item[0] for item in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                            body={"ids": mid_list})["docs"]
    for i, item in enumerate(top_list):
        temp = list(item)
        doc = search_result[i]
        # A doc entry without a 'found' flag is treated like a missing weibo.
        if doc.get('found'):
            source = doc['_source']
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source['uid'], source['mid']))
            temp.append(uid_url + source['uid'])
            temp.append(source['uid'])
            try:
                uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                       id=source['uid'])["_source"]["nick_name"]
            except Exception:
                # Best effort: a user without a usable profile is still listed.
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
# Example #2
# 0
def new_get_user_weibo(uid, sort_type):
    """Collect the weibo a user posted over the previous seven days.

    Each daily flow-text index for the seven days before the reference date
    is searched for documents whose ``uid`` matches.  The merged hits are
    ordered by ``sort_type`` (descending) and cut to the first 100.

    Args:
        uid: user id to filter on.
        sort_type: name of the ``_source`` field to order by.  When
            ``RUN_TYPE == 0`` it is replaced by ``'timestamp'`` and the
            reference date is ``RUN_TEST_TIME`` instead of today.

    Returns:
        A list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]`` where ``ip`` is
        the document's ``geo`` field.  Empty when nothing was found.
    """
    now_date = ts2datetime(time.time())
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # step1: gather the user's weibo from every daily index
    base_ts = datetime2ts(now_date)
    query_body = {
        'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
        'sort': sort_type,
        'size': 100,
    }
    weibo_list = []
    for i in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(base_ts - i * DAY)
        try:
            hits = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
        except Exception:
            # A day whose index is missing or unreachable contributes nothing.
            hits = []
        weibo_list.extend(hits)

    # step2: keep the overall top 100 across the seven days
    sort_weibo_list = sorted(weibo_list,
                             key=lambda hit: hit['_source'][sort_type],
                             reverse=True)[:100]

    results = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        ip = source['geo']
        timestamp = source['timestamp']
        # run_type
        if RUN_TYPE == 1:
            # Counters absent from a document count as zero.
            retweet_count = source.get('retweet_count', 0)
            comment_count = source.get('comment_count', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            source['mid'], source['uid'], source['text'], ip, ip2city(ip),
            timestamp, ts2date(timestamp), retweet_count, comment_count,
            sensitive_score,
        ])

    return results
def new_get_user_weibo(uid, sort_type):
    """Return the most relevant weibo of a user from the last seven days.

    The seven daily flow-text indexes preceding the reference date are
    queried for documents with a matching ``uid``; all hits are merged,
    sorted by ``sort_type`` in descending order and limited to 100.

    Args:
        uid: user id to filter on.
        sort_type: ``_source`` field used for ordering.  Overridden with
            ``'timestamp'`` when ``RUN_TYPE == 0``; in that mode the
            reference date is ``RUN_TEST_TIME`` rather than today.

    Returns:
        Rows of ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]`` (``ip`` comes
        from the ``geo`` field).  An empty list when the user has no weibo
        in the window.
    """
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    #step1:get user weibo, one daily index at a time
    window_end_ts = datetime2ts(now_date)
    search_body = {
        'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
        'sort': sort_type,
        'size': 100,
    }
    weibo_list = []
    for day_offset in range(7, 0, -1):
        iter_date = ts2datetime(window_end_ts - day_offset * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=search_body)['hits']['hits']
        except Exception:
            # Skip days that cannot be read instead of failing the request.
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)

    #step2:rank across all days; an empty list simply yields no rows
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]

    results = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        ip = source['geo']
        timestamp = source['timestamp']
        #run_type
        if RUN_TYPE == 1:
            # Missing counters default to zero.
            retweet_count = source.get('retweet_count', 0)
            comment_count = source.get('comment_count', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        date = ts2date(timestamp)
        results.append([source['mid'], source['uid'], source['text'], ip, city,
                        timestamp, date, retweet_count, comment_count, sensitive_score])

    return results
def get_text(top_list, date, user_info, style):
    """Expand a ranked list of weibo mids into display rows.

    input:  [[mid1, no.1], [mid2, no.2], ['mid3', no.3]]
    output: rows of
            mid, retweeted, comment, text, geo, date, sentiment,
            weibo url, user url, uid, nick name

    The ranked value carried by ``top_list`` fills one of the two count
    columns; the other one is read from a JSON detail field of
    ``user_info`` chosen by ``style``:

        0    -> ranked value first, then ``origin_weibo_comment_detail``
        1    -> ``origin_weibo_retweeted_detail`` first, then ranked value
        2    -> ranked value first, then ``retweeted_weibo_comment_detail``
        else -> ``retweeted_weibo_retweeted_detail`` first, then ranked value

    Args:
        top_list: ranked entries ``[mid, value]``.
        date: suffix appended to ``pre_text_index`` to name the index.
        user_info: mapping holding the JSON-encoded detail fields.
        style: value convertible with ``int()`` selecting the layout above.

    Returns:
        One row per entry of ``top_list`` (empty list for an empty input).
        Rows whose weibo is not found carry eight empty strings after the
        three leading columns.
    """
    results = []
    detail_list = [
        "origin_weibo_retweeted_detail",
        "origin_weibo_comment_detail",
        "retweeted_weibo_retweeted_detail",
        "retweeted_weibo_comment_detail",
    ]
    index_flow_text = pre_text_index + date
    if not top_list:  # no one
        return results

    mid_list = [item[0] for item in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type, body={"ids": mid_list})["docs"]

    # The layout depends only on ``style``, so resolve it (and decode the
    # matching detail field) once instead of once per row.
    style_code = int(style)
    if style_code == 0:
        detail_key, ranked_first = detail_list[1], True
    elif style_code == 1:
        detail_key, ranked_first = detail_list[0], False
    elif style_code == 2:
        detail_key, ranked_first = detail_list[3], True
    else:
        detail_key, ranked_first = detail_list[2], False
    detail_counts = json.loads(user_info[detail_key])

    for i, item in enumerate(top_list):
        mid = mid_list[i]
        ranked_value = item[1]
        other_value = detail_counts.get(mid, 0)
        if ranked_first:
            temp = [mid, ranked_value, other_value]
        else:
            temp = [mid, other_value, ranked_value]

        doc = search_result[i]
        # A doc entry without a "found" flag is treated like a missing weibo.
        if doc.get("found"):
            source = doc["_source"]
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source["uid"], source["mid"]))
            temp.append(uid_url + source["uid"])
            temp.append(source["uid"])
            try:
                uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type, id=source["uid"])[
                    "_source"
                ]["nick_name"]
            except Exception:
                # Best effort: a user without a usable profile is still listed.
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
# Example #5
# 0
def get_text(top_list, date, user_info, style):
    """Build display rows for a ranked list of weibo mids.

    input:  [[mid1, no.1], [mid2, no.2], ['mid3', no.3]]
    output: rows of
            mid, retweeted, comment, text, geo, date, sentiment,
            weibo url, user url, uid, nick name

    ``style`` decides which count column receives the ranked value from
    ``top_list`` and which JSON detail field of ``user_info`` supplies the
    other column:

        0    -> [mid, ranked value, origin_weibo_comment_detail[mid]]
        1    -> [mid, origin_weibo_retweeted_detail[mid], ranked value]
        2    -> [mid, ranked value, retweeted_weibo_comment_detail[mid]]
        else -> [mid, retweeted_weibo_retweeted_detail[mid], ranked value]

    A mid absent from the detail field counts as 0.

    Args:
        top_list: ranked entries ``[mid, value]``.
        date: suffix appended to ``pre_text_index`` to name the index.
        user_info: mapping holding the JSON-encoded detail fields.
        style: value convertible with ``int()`` selecting the layout.

    Returns:
        One row per entry of ``top_list``; an empty list for empty input.
        When a weibo is not found its row is padded with eight empty
        strings.
    """
    results = []
    detail_list = ["origin_weibo_retweeted_detail", "origin_weibo_comment_detail", "retweeted_weibo_retweeted_detail", "retweeted_weibo_comment_detail"]
    # style -> (index into detail_list, ranked value goes first?)
    layouts = {0: (1, True), 1: (0, False), 2: (3, True)}
    index_flow_text = pre_text_index + date
    if not top_list: # no one
        return results

    mid_list = [entry[0] for entry in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type, body={"ids":mid_list})["docs"]

    # Decode the single detail field this style needs once, not per row.
    detail_index, ranked_first = layouts.get(int(style), (2, False))
    detail_counts = json.loads(user_info[detail_list[detail_index]])

    for i, entry in enumerate(top_list):
        mid = mid_list[i]
        counts = [entry[1], detail_counts.get(mid, 0)]
        if not ranked_first:
            counts.reverse()
        temp = [mid] + counts

        doc = search_result[i]
        # A doc entry without a "found" flag is treated like a missing weibo.
        if doc.get("found"):
            source = doc["_source"]
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source['uid'], source['mid']))
            temp.append(uid_url+source['uid'])
            temp.append(source['uid'])
            try:
                uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type, id=source['uid'])["_source"]["nick_name"]
            except Exception:
                # Best effort: fall back to a placeholder name.
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
def new_get_user_profile(uid):
    """Return the profile of ``uid`` enriched with its latest counters.

    The profile document is read from the profile index.  When it is
    missing or cannot be read, a placeholder profile with empty fields is
    returned instead.  Counters found in the bci history document then
    override ``fansnum``, ``friendsnum`` and ``statusnum``.

    Args:
        uid: user id, used as document id in both indexes.

    Returns:
        A dict with the profile fields plus ``verified_type_ch``, which is
        the entry of ``verified_num2ch_dict`` for the profile's
        ``verified_type`` or ``''`` when there is none.
    """
    try:
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                      id=uid)['_source']
    except Exception:
        results = {}
    # get new fansnum and statusnum
    try:
        bci_history_result = es_bci_history.get(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            id=uid)['_source']
    except Exception:
        bci_history_result = {}

    if not results:
        results = {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }
    else:
        try:
            verified_ch_type = verified_num2ch_dict[results.get('verified_type')]
        except (KeyError, TypeError):
            # Unknown, missing or unhashable type: no label.
            verified_ch_type = ''
        results['verified_type_ch'] = verified_ch_type

    if bci_history_result:
        # Each counter is applied on its own so that one missing or
        # malformed field does not discard the others.
        overrides = (
            ('fansnum', 'user_fansnum'),
            ('friendsnum', 'user_friendsnum'),
            ('statusnum', 'weibo_month_sum'),
        )
        for profile_key, history_key in overrides:
            try:
                results[profile_key] = int(bci_history_result[history_key])
            except (KeyError, TypeError, ValueError):
                # Keep the value already present in the profile.
                continue

    return results
def new_get_user_profile(uid):
    """Return the stored profile of ``uid`` or an empty placeholder.

    Args:
        uid: user id, used as document id in the profile index.

    Returns:
        The profile ``_source`` dict with an added ``verified_type_ch``
        entry looked up in ``verified_num2ch_dict`` (``''`` when the
        type has no entry).  When the profile is missing or cannot be
        read, a dict with the same keys, ``uid`` filled in and every
        other field set to ``''``.
    """
    try:
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                      id=uid)['_source']
    except Exception:
        results = {}

    if not results:
        return {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }

    try:
        results['verified_type_ch'] = verified_num2ch_dict[results.get('verified_type')]
    except (KeyError, TypeError):
        # A type without a known label must not break the whole lookup.
        results['verified_type_ch'] = ''
    return results
def new_get_user_profile(uid):
    """Fetch the profile document of ``uid``.

    Args:
        uid: user id, used as document id in the profile index.

    Returns:
        The profile ``_source`` dict, extended with ``verified_type_ch``
        (the ``verified_num2ch_dict`` entry for ``verified_type``, or
        ``''`` when that type is unknown or absent).  If the profile
        does not exist or the lookup fails, a placeholder dict whose
        only non-empty field is ``uid``.
    """
    empty_fields = ('photo_url', 'nick_name', 'verified_type', 'verified_type_ch',
                    'fansnum', 'friendsnum', 'statusnum', 'user_location',
                    'description')
    try:
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                      id=uid)['_source']
    except Exception:
        results = {}

    if not results:
        results = {'uid': uid}
        for field in empty_fields:
            results[field] = ''
    else:
        try:
            verified_ch_type = verified_num2ch_dict[results.get('verified_type')]
        except (KeyError, TypeError):
            # No label for this type: fall back to an empty one.
            verified_ch_type = ''
        results['verified_type_ch'] = verified_ch_type

    return results
def new_get_user_profile(uid):
    """Return the profile of ``uid`` with counters from the bci history.

    Args:
        uid: user id, used as document id in the profile index and in
            the bci history index.

    Returns:
        The profile ``_source`` dict (or a placeholder with empty fields
        when the profile is missing or unreadable) with:

        * ``verified_type_ch``: the ``verified_num2ch_dict`` entry for
          ``verified_type``, ``''`` when there is none;
        * ``fansnum`` / ``friendsnum`` / ``statusnum``: replaced by the
          integer values of ``user_fansnum`` / ``user_friendsnum`` /
          ``weibo_month_sum`` from the bci history document, for every
          one of those fields that is present and numeric.
    """
    try:
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                      id=uid)['_source']
    except Exception:
        results = {}
    #get new fansnum and statusnum
    try:
        bci_history_result = es_bci_history.get(index=bci_history_index_name, doc_type=bci_history_index_type, id=uid)['_source']
    except Exception:
        bci_history_result = {}

    if not results:
        results = {'uid': uid}
        for field in ('photo_url', 'nick_name', 'verified_type', 'verified_type_ch',
                      'fansnum', 'friendsnum', 'statusnum', 'user_location',
                      'description'):
            results[field] = ''
    else:
        try:
            verified_ch_type = verified_num2ch_dict[results.get('verified_type')]
        except (KeyError, TypeError):
            verified_ch_type = ''
        results['verified_type_ch'] = verified_ch_type

    if bci_history_result:
        for profile_key, history_key in (('fansnum', 'user_fansnum'),
                                         ('friendsnum', 'user_friendsnum'),
                                         ('statusnum', 'weibo_month_sum')):
            try:
                results[profile_key] = int(bci_history_result[history_key])
            except (KeyError, TypeError, ValueError):
                # A missing or non-numeric counter leaves the profile's
                # own value in place instead of failing the request.
                continue

    return results
def new_get_user_weibo(uid, sort_type):
    """Return every distinct weibo of a user, ordered by ``sort_type``.

    With ``RUN_TYPE == 1`` the seven daily flow-text indexes preceding
    today are searched; otherwise only the fixed ``2013-09-01`` index is
    read.  Hits are de-duplicated by mid (first occurrence wins) and the
    rows are sorted in descending order.

    Args:
        uid: user id to filter on.
        sort_type: one of ``'timestamp'``, ``'retweet_count'``,
            ``'comment_count'`` or ``'sensitive'``.  Any other value is
            treated as ``'timestamp'``, which is also forced when
            ``RUN_TYPE == 0``.

    Returns:
        A list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score, weibo_url]``.
    """
    # Column of the result row used for each supported ordering.
    sort_columns = {
        'timestamp': 5,
        'retweet_count': 7,
        'comment_count': 8,
        'sensitive': 9,
    }
    #run_type
    if RUN_TYPE == 0:
        sort_type = 'timestamp'

    #step1:decide which daily indexes to read
    if RUN_TYPE == 1:
        today_ts = datetime2ts(ts2datetime(time.time()))
        date_list = [ts2datetime(today_ts - i * DAY) for i in range(7, 0, -1)]
    else:
        # The date is fixed in this mode, so one query is enough.
        date_list = ['2013-09-01']

    #step2:get user weibo
    query_body = {'query': {'filtered': {'filter': {'term': {'uid': uid}}}}, 'size': MAX_VALUE}
    weibo_list = []
    for iter_date in date_list:
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
        except Exception:
            # A day whose index is missing or unreachable contributes nothing.
            weibo_result = []
        weibo_list.extend(weibo_result)

    #step3:build one row per distinct mid
    results = []
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        if mid in mid_set:
            continue
        mid_set.add(mid)
        weibo_uid = source['uid']
        ip = source['ip']
        timestamp = source['timestamp']
        #run_type
        if RUN_TYPE == 1:
            # Counters absent from a document count as zero.
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            mid, weibo_uid, source['text'], ip, ip2city(ip), timestamp,
            ts2date(timestamp), retweet_count, comment_count,
            sensitive_score, weiboinfo2url(weibo_uid, mid),
        ])

    #step4:sort, falling back to timestamp for an unknown sort_type
    column = sort_columns.get(sort_type, sort_columns['timestamp'])
    return sorted(results, key=lambda row: row[column], reverse=True)
# Example #11
# 0
def _sorted_json_items(raw_json, limit=None):
    """Decode a JSON object and return its items sorted by value, descending.

    A missing or empty ``raw_json`` gives an empty list; ``limit`` caps the
    number of items returned (``None`` keeps them all).
    """
    if not raw_json:
        return []
    ranked = sorted(json.loads(raw_json).items(), key=lambda x: x[1], reverse=True)
    return ranked[:limit]


def _portrait_count(index_name, index_type, query_body, error_message):
    """Run a count query on the portrait index; print ``error_message`` and
    return 0 when no shard answered successfully."""
    count_result = es_user_portrait.count(index=index_name, doc_type=index_type, body=query_body)
    if count_result['_shards']['successful'] != 0:
        return count_result['count']
    print(error_message)
    return 0


def _portrait_rank(index_name, index_type, field, value, error_message):
    """Count the portrait documents whose ``field`` lies between ``value``
    and 1000000; a falsy ``value`` gives 0 without querying."""
    if not value:
        return 0
    query_body = {
        'query': {
            'range': {
                field: {
                    'from': value,
                    'to': 1000000
                }
            }
        }
    }
    return _portrait_count(index_name, index_type, query_body, error_message)


def search_attribute_portrait(uid):
    """Load the portrait of ``uid`` and reshape it for display.

    The raw portrait document stores several attributes as JSON strings or
    ``_``-joined strings.  They are decoded here, and ranking information
    is added.

    Args:
        uid: user id, used as document id in the ``user_portrait`` index.

    Returns:
        ``None`` when the portrait cannot be read.  Otherwise the portrait
        ``_source`` dict, updated in place:

        * ``keywords``, ``activity_geo`` (from ``activity_geo_dict``): all
          items sorted by value, descending;
        * ``hashtag_dict``, ``topic``, ``emoticon``, ``online_pattern``,
          ``psycho_status``: the five highest-valued items;
        * ``hashtag_description``: result of ``hashtag_description`` on the
          decoded hashtags, ``''`` without hashtags;
        * ``emotion_words``: for each entry of ``emotion_mark_dict``, its
          five highest-valued words; ``emotion_conclusion`` is derived
          from the word types ``'126'`` and ``'127'``;
        * ``domain``, ``psycho_feature``: split on ``_``;
        * ``description``: taken from the ``weibo_user`` profile, ``''``
          when unavailable;
        * ``importance_rank``, ``activeness_rank``, ``influence_rank``:
          number of portraits whose score is at least this user's score,
          0 when the score is empty or the count fails;
        * ``all_count``: number of documents in the portrait index;
        * ``link_conclusion``: result of ``get_link_conclusion`` on
          ``link``.

        Attributes missing from the document are treated like empty ones.
    """
    index_name = 'user_portrait'
    index_type = 'user'
    try:
        results = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
    except Exception:
        return None

    results['keywords'] = _sorted_json_items(results.get('keywords'))
    results['activity_geo'] = _sorted_json_items(results.get('activity_geo_dict'))

    # hashtag: the description is built from the full decoded dict, the
    # list itself only keeps the top five.
    raw_hashtag = results.get('hashtag_dict')
    if raw_hashtag:
        hashtag_dict = json.loads(raw_hashtag)
        sort_hashtag_dict = sorted(hashtag_dict.items(), key=lambda x: x[1], reverse=True)
        results['hashtag_dict'] = sort_hashtag_dict[:5]
        results['hashtag_description'] = hashtag_description(hashtag_dict)
    else:
        results['hashtag_dict'] = []
        results['hashtag_description'] = ''

    # emotion words
    emotion_result = {}
    emotion_conclusion_dict = {}
    raw_emotion_words = results.get('emotion_words')
    if raw_emotion_words:
        emotion_words_dict = json.loads(raw_emotion_words)
        for word_type in emotion_mark_dict:
            try:
                word_dict = emotion_words_dict[word_type]
                if word_type == '126' or word_type == '127':
                    emotion_conclusion_dict[word_type] = word_dict
                word_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)[:5]
            except (KeyError, AttributeError, TypeError):
                # Word type absent from the portrait or stored in an
                # unexpected shape: show no words for it.
                word_list = []
            emotion_result[emotion_mark_dict[word_type]] = word_list
    results['emotion_words'] = emotion_result
    # emotion_conclusion
    results['emotion_conclusion'] = get_emotion_conclusion(emotion_conclusion_dict)

    # topic, emoticon, online_pattern, psycho_status: top five entries
    for field in ('topic', 'emoticon', 'online_pattern', 'psycho_status'):
        results[field] = _sorted_json_items(results.get(field), 5)

    # domain, psycho_feature: '_'-joined strings
    for field in ('domain', 'psycho_feature'):
        joined = results.get(field)
        results[field] = joined.split('_') if joined else []

    # state
    description = ''
    portrait_uid = results.get('uid')
    if portrait_uid:
        try:
            profile_result = es_user_profile.get(index='weibo_user', doc_type='user', id=portrait_uid)
            description = profile_result['_source']['description']
        except Exception:
            # Best effort: the portrait is still usable without it.
            description = ''
    else:
        results['uid'] = ''
    results['description'] = description

    # ranks: every rank key is always present in the result
    rank_fields = (
        ('importance', 'es_importance_rank error'),
        ('activeness', 'es_activess_rank error'),
        ('influence', 'es_influence_rank error'),
    )
    for field, error_message in rank_fields:
        results[field + '_rank'] = _portrait_rank(
            index_name, index_type, field, results.get(field), error_message)

    # total count in user_portrait
    results['all_count'] = _portrait_count(
        index_name, index_type, {'query': {'match_all': {}}}, 'es_user_portrait error')

    # link conclusion
    link_ratio = results['link']
    results['link_conclusion'] = get_link_conclusion(link_ratio)
    return results