Example #1
def counts(start_ts, end_ts, topic, en_name, keywords):
    query_body = {
        'query': {
            'match_all': {}
            # 'term':{'en_name':topic}
        },
        'aggs': {
            'diff_uids': {
                'cardinality': {
                    'field': 'uid'
                }
            }
        },
        'size': 999999999
    }
    result = es_event.search(index=en_name,
                             doc_type=event_text_type,
                             body=query_body)
    #print result
    weibo_counts = result['hits']['total']
    uid_counts = result['aggregations']['diff_uids']['value']
    print weibo_counts, uid_counts
    #task_id = str(start_ts)+'_'+str(end_ts)+'_'+en_name+'_'+submit_user
    #print es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'weibo_counts':weibo_counts,'uid_counts':uid_counts}})
    return weibo_counts, uid_counts
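A standalone sketch of the same distinct-uid count, for trying the query outside this codebase; the host, index name, and doc type below are placeholders, and 'size': 0 is used since the aggregation does not need the hits themselves:

# Hypothetical smoke test; localhost, 'my_event_index' and 'text' are
# assumptions, not values taken from the snippet above.
from elasticsearch import Elasticsearch

es = Elasticsearch(['localhost:9200'])
body = {
    'query': {'match_all': {}},
    'aggs': {'diff_uids': {'cardinality': {'field': 'uid'}}},
    'size': 0
}
res = es.search(index='my_event_index', doc_type='text', body=body)
print res['hits']['total'], res['aggregations']['diff_uids']['value']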
Example #2
def get_topic_weibo(topic,en_name,start_ts,end_ts,keywords,mid):
    query_body = {'query':{'match_all':{}},'sort':'timestamp','size':1}
    try:
        # Probe the index; if it does not exist yet, create its mappings.
        es_event.search(index=en_name,doc_type=event_type,body=query_body)
    except:
        get_mappings(en_name)
    find_flow_texts_scan(start_ts,end_ts,topic,en_name,keywords,mid)
def getEsIndexName(topic_name):
    #body={"query": {"match_all": {}}}
    query_body = {'query': {'match': {'name': topic_name}}}
    try:
        res = es_event.search(index='topics', body=query_body)['hits']['hits']
        return res[0]['_source']['index_name']
    except:
        return -1  # sentinel: the topic is not registered in 'topics'
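A hedged usage sketch for the lookup above; the topic string is made up, and the -1 sentinel is checked before any further querying:

# Hypothetical caller of getEsIndexName(); the topic name is a placeholder.
index_name = getEsIndexName(u'example topic')
if index_name == -1:
    print 'no index registered for this topic'
else:
    print 'resolved index:', index_name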
def get_users(topic,begin_ts,end_ts):
	uid_list = set()
	query_body = {   
	    'query':{
	        'bool':{
	            'must':[
	                {'term':{'en_name':topic}},
	                {'range':{
	                    'timestamp':{'gte': begin_ts, 'lt':end_ts} 
	                }
	            }]
	        }
	    },
	    'size':999999999
	}
	result = es_event.search(index=event_text,doc_type=event_text_type, fields=['uid'],body=query_body)['hits']['hits']
	for i in result:
		uid_list.add(i['fields']['uid'][0])
	print len(uid_list)
	if RUN_TYPE == 0:
		post = datetime2ts(RUN_TEST_TIME) #datetimestr2ts(RUN_TEST_TIME) 
		post = ts2datetimestr(post)
	else:
		post = ts2datetimestr(time.time())
		
	print  bci_day_pre+post,bci_day_type,es_user_portrait
	user_result = es_bci.mget(index=bci_day_pre+post ,doc_type=bci_day_type,body={'ids':list(uid_list)})['docs']
	
	user_influence_dict = {}
	for i in user_result:
		#print i
		if i['found']:
			i = i['_source']
			user_influence_dict[i['user']] = i['user_index']
			#print i,type(i)
			
			#print i['activeness'],i['influence'],i['fansnum']

	user = sorted(user_influence_dict.iteritems(),key=lambda x:x[1],reverse=True)[:100]
	#print user
	user_dict = {}
	for i in user:
		try:
			result = es_user_profile.get(index=profile_index_name,doc_type=profile_index_type,id=i[0])
			u_type = result['_source']['verified_type']
			if u_type in auth_list:
				u_type = auth_type
			else:
				u_type = user_type
			user_dict[i[0]] = {'user_type':u_type,'influ':i[1]}
		except:
			user_dict[i[0]] = {'user_type':user_type,'influ':i[1]}


	try:
		es_event.update(index=event_analysis_name,doc_type=event_type,id=topic,body={'doc':{'user_results':json.dumps(user_dict)}})
	except Exception,e:
		es_event.index(index=event_analysis_name,doc_type=event_type,id=topic,body={'user_results':json.dumps(user_dict)})
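The influence lookup above hinges on a multi-get against the daily BCI index. A minimal sketch of that call with an elasticsearch-py client; the host, index name, doc type, and ids are all assumptions:

# Hypothetical standalone mget; every literal below is a placeholder.
from elasticsearch import Elasticsearch

es_bci = Elasticsearch(['localhost:9200'])
docs = es_bci.mget(index='bci_20170101', doc_type='bci',
                   body={'ids': ['1234567890', '9876543210']})['docs']
for d in docs:
    if d['found']:  # skip ids with no BCI document that day
        print d['_source']['user'], d['_source']['user_index']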
def compute_mtype_count(topic, begin_ts, end_ts, during):
    all_mtype_dict = {}
    #print begin_ts,end_ts
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {
                        'range': {
                            'timestamp': {
                                'gte': begin_ts,
                                'lt': end_ts
                            }
                        }
                    },
                    {
                        'term': {
                            'en_name': topic
                        }
                    }
                ]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'message_type',
                    'size': MTYPE_COUNT
                }
            }
        }
    }

    weibo_mtype_count = es_event.search(index=event_text, doc_type=event_text_type,body=query_body)\
         ['aggregations']['all_interests']['buckets']
    print es_event, event_text, event_text_type
    print 'weibo_mtype_count:::::::::::::::::', weibo_mtype_count
    print begin_ts, end_ts, len(weibo_mtype_count)
    iter_mtype_dict = {}
    for mtype_item in weibo_mtype_count:
        mtype = mtype_item['key']
        mtype_count = mtype_item['doc_count']
        try:
            iter_mtype_dict[mtype] += mtype_count
        except:
            iter_mtype_dict[mtype] = mtype_count

    return iter_mtype_dict
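The try/except accumulation above can be expressed with collections.defaultdict; a minimal sketch of the same bucket-summing step, using fabricated buckets:

from collections import defaultdict

# Same summing logic as compute_mtype_count(); the buckets are made up.
buckets = [{'key': 1, 'doc_count': 40}, {'key': 3, 'doc_count': 7}]
iter_mtype_dict = defaultdict(int)
for item in buckets:
    iter_mtype_dict[item['key']] += item['doc_count']
print dict(iter_mtype_dict)  # {1: 40, 3: 7}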
Example #6
def get_task():
    query_body = {
        'query': {
            'term': {
                'compute_status': 0
            }
        },
        'sort': {
            'submit_ts': {
                'order': 'asc'
            }
        }
    }
    result = es_event.search(index=event_task_name,
                             doc_type=event_task_type,
                             body=query_body)
    return result['hits']['hits']
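A sketch of how such a task queue is typically drained: poll for compute_status == 0 in submit order, process each hit, then flip the flag so it is not picked up again. process_task() and the sleep interval are assumptions, not part of this codebase:

import time

def poll_tasks():
    # Hypothetical consumer loop built on get_task() and the snippet's
    # es_event client; process_task() is a placeholder worker function.
    while True:
        for task in get_task():
            process_task(task['_source'])
            es_event.update(index=event_task_name,
                            doc_type=event_task_type,
                            id=task['_id'],
                            body={'doc': {'compute_status': 1}})
        time.sleep(60)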
Example #7
def test(topic, start_ts, end_ts):
    print start_ts, end_ts
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {
                            'gte': start_ts,
                            'lt': end_ts
                        }
                    }
                }
            }
        }
    }
    weibo = es_event.search(index=topic,
                            doc_type=event_text_type,
                            body=query_body)['hits']['hits']  # dicts
    print weibo
def compute_real_info(topic,begin_ts,end_ts,relation):
	info_dict = {}
	
	query_body = {
		'query':{
			'bool':{
				'must':[
					{'term':{'en_name':topic}},
					{'term':{'message_type':1}},
					{'wildcard':{'text':'【*】*'}},
					{'range':{
						'timestamp':{'gte': begin_ts, 'lt':end_ts}
					}}
				]
			}
		},
		'size':1,
		'sort':{'retweeted':{'order':'desc'}}
	}
	result = es_event.search(index=topic,doc_type=event_text_type,body=query_body)['hits']['hits']
	# extract the event's people, organizations, places, and times
	print result[0]['_source']['text']
	basics = get_news_main(result[0]['_source']['text'])
	print basics
	info_dict['real_auth'] = basics['organization']
	info_dict['real_geo'] = basics['place']
	info_dict['real_time'] = basics['time']
	info_dict['real_person'] = basics['people']
	# store the relations
	if('join' in relation.split('&')):
		rel_list = []
		if info_dict['real_auth'] !='NULL':
			resu = create_person(org_node,org_primary,info_dict['real_auth'],org_index_name)
			if resu != 'Node Wrong':
				rel_list.append([[2,topic],'join',[0,info_dict['real_auth']]])
		if info_dict['real_person'] !='NULL':
			resu = create_person(people_node,people_primary,info_dict['real_person'],node_index_name)
			if resu != 'Node Wrong':
				rel_list.append([[2,topic],'join',[1,info_dict['real_person']]])
		try:
			nodes_rels(rel_list)
		except:
			pass

	query_body = {
		'query':{
			'bool':{
				'must':[
					{'term':{'en_name':topic}},
					{'range':{
						'timestamp':{'gte': begin_ts, 'lt':end_ts}
					}}
				]
			}
		},
		'size':10000
	}
	result = es_event.search(index=topic,doc_type=event_text_type,fields=['text'],body=query_body)['hits']['hits']
	text_list = []
	for i in result:
		text_list.append(i['fields']['text'][0])
	# print text_list

	# event type
	try:
		event = es_event.get(index=event_task_name,doc_type=event_task_type,id=topic)['_source']
		info_dict['event_type'] = event['event_type']
	except:
		info_dict['event_type'] = cut_weibo(text_list)
	info_dict['topics'] = json.dumps(get_topic_word(text_list,10))
	
	keywords = get_keyword(''.join(text_list),2)
	info_dict['keywords'] = '&'.join([i[0] for i in keywords])
	info_dict['keywords_list'] = json.dumps(keywords)

	hashtag = get_hashtag(''.join(text_list))
	info_dict['hashtag_dict'] = json.dumps(hashtag)
	info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
	

	try:
		es_event.update(index=event_analysis_name,doc_type=event_type,id=topic,body={'doc':info_dict})
	except Exception,e:
		es_event.index(index=event_analysis_name,doc_type=event_type,id=topic,body=info_dict)
def get_users(topic,begin_ts,end_ts,relation):
	uid_list = set()
	query_body = {
		'query':{
			'bool':{
				'must':[
					{'term':{'en_name':topic}},
					# {'wildcard':{'text':'【*】*'}},
					{'range':{
						'timestamp':{'gte': begin_ts, 'lt':end_ts}
					}}
				]
			}
		},
		'size':999999999
	}
	result = es_event.search(index=topic,doc_type=event_text_type, fields=['uid'],body=query_body)['hits']['hits']
	for i in result:
		uid_list.add(i['fields']['uid'][0])
	print len(uid_list)
	if RUN_TYPE == 0:
		post = datetime2ts(RUN_TEST_TIME) #datetimestr2ts(RUN_TEST_TIME) 
		post = ts2datetimestr(post)
	else:
		post = ts2datetimestr(time.time())
		
	print  bci_day_pre+post,bci_day_type,es_user_portrait
	user_result = es_bci.mget(index=bci_day_pre+post ,doc_type=bci_day_type,body={'ids':list(uid_list)})['docs']
	
	user_influence_dict = {}
	for i in user_result:
		# print i
		if i['found']:
			i = i['_source']
			user_influence_dict[i['user']] = i['user_index']
			#print i,type(i)
			
			#print i['activeness'],i['influence'],i['fansnum']

	user = sorted(user_influence_dict.iteritems(),key=lambda x:x[1],reverse=True)[:100]
	#print user
	not_in_user_list = event_user_portrait([i[0] for i in user])
	user_dict = {}
	p_list = []
	a_list = []
	for i in user:
		# if i[0] not in not_in_user_list:
		# print es_user_profile.get(index=profile_index_name,doc_type=profile_index_type,id=i[0])

		try:
			result = es_user_profile.get(index=profile_index_name,doc_type=profile_index_type,id=i[0])
			print result
			u_type = result['_source']['verified_type']
			print u_type
			if u_type in org_list:
				u_type = auth_type
				a_list.append(i[0])
			else:
				u_type = user_type
				p_list.append(i[0])
			user_dict[i[0]] = {'user_type':u_type,'influ':i[1]}

		except:
			user_dict[i[0]] = {'user_type':user_type,'influ':i[1]}
			p_list.append(i[0])
	print len(a_list),len(p_list)
	if('discuss' in relation.split('&')):
		rel_list = []
		for i in p_list:
			resu = create_person(people_node,people_primary,i,node_index_name)
			if resu != 'Node Wrong':
				rel_list.append([[2,topic],'discuss',[1,i]])
		for i in a_list:
			resu = create_person(org_node,org_primary,i,org_index_name)
			if resu != 'Node Wrong':
				rel_list.append([[2,topic],'discuss',[0,i]])
		try:
			nodes_rels(rel_list)
		except:
			pass


	try:
		es_event.update(index=event_analysis_name,doc_type=event_type,id=topic,body={'doc':{'user_results':json.dumps(user_dict)}})
	except Exception,e:
		es_event.index(index=event_analysis_name,doc_type=event_type,id=topic,body={'user_results':json.dumps(user_dict)})
Example #10
def continue_compute():
    ts = time.time()
    # A term query cannot wrap a range clause; query end_ts with a plain range.
    query_body = {'query':{'range':{'end_ts':{'gte':ts}}},'sort':{'submit_ts':{'order':'asc'}},'size':100000}
    result = es_event.search(index=event_task_name,doc_type=event_task_type,body=query_body)
    return result['hits']['hits']
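For reference, the corrected query shape on its own: a bare range query rather than a term query wrapping a range (which Elasticsearch rejects as malformed). The timestamp literal is only an example:

# Minimal illustration of the fixed query body; 1500000000 is a made-up ts.
query_body = {
    'query': {'range': {'end_ts': {'gte': 1500000000}}},
    'sort': {'submit_ts': {'order': 'asc'}},
    'size': 100000
}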
Example #11
def get_real(info_dict, topic, begin_ts, end_ts, relation):
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }, {
                    'term': {
                        'message_type': 1
                    }
                }, {
                    'wildcard': {
                        'text': '【*】*'
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }]
            }
        },
        'size': 1,
        'sort': {
            'retweeted': {
                'order': 'desc'
            }
        }
    }
    result = es_event.search(index=topic,
                             doc_type=event_text_type,
                             body=query_body)['hits']['hits']
    if len(result) == 0:
        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'en_name': topic
                        }
                    }, {
                        'wildcard': {
                            'text': '【*】*'
                        }
                    }, {
                        'range': {
                            'timestamp': {
                                'gte': begin_ts,
                                'lt': end_ts
                            }
                        }
                    }]
                }
            },
            'size': 1,
            'sort': {
                'retweeted': {
                    'order': 'desc'
                }
            }
        }
        result = es_event.search(index=topic,
                                 doc_type=event_text_type,
                                 body=query_body)['hits']['hits']

    # extract the event's people, organizations, places, and times
    if len(result) != 0:
        print result[0]['_source']['text']
        basics = get_news_main(result[0]['_source']['text'])
        print basics
        info_dict['real_auth'] = basics['organization']
        info_dict['real_geo'] = basics['place']
        info_dict['real_time'] = basics['time']
        info_dict['real_person'] = basics['people']
    else:
        info_dict['real_auth'] = info_dict['real_geo'] = info_dict[
            'real_time'] = info_dict['real_person'] = 'NULL'
    # store the relations
    if ('join' in relation.split('&')):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary, info_dict['real_auth'],
                                 org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join',
                                 [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary,
                                 info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join',
                                 [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    return info_dict
Example #12
def compute_sentiment_count(topic, begin_ts, end_ts, during):
    all_sentiment_dict = {}
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {
                        'range': {
                            'timestamp': {
                                'gte': begin_ts,
                                'lt': end_ts
                            }
                        }
                    },
                    {
                        'term': {
                            'en_name': topic
                        }
                    }
                ]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'sentiment',
                    'size': SENTIMENT_TYPE_COUNT
                }  #,
                # 'aggs':{
                #     'geo':{
                #         'terms':{
                #         'field':'geo'
                #         }
                #     }
                # }
            }
        }
    }
    weibo_sentiment_count = es_event.search(index=topic,doc_type=event_text_type,body=query_body)\
                                ['aggregations']['all_interests']['buckets']
    #print weibo_sentiment_count
    iter_sentiment_dict = {}
    for sentiment_item in weibo_sentiment_count:
        sentiment = sentiment_item['key']
        sentiment_count = sentiment_item['doc_count']
        try:
            iter_sentiment_dict[sentiment] += sentiment_count  # e.g. {'1': 4}
        except:
            iter_sentiment_dict[sentiment] = sentiment_count
    #print '============================'
    #all_sentiment_dict[end_ts] = iter_sentiment_dict   #按时间段存各个情绪的数量值
    #results = sorted(all_sentiment_dict.items(), key=lambda x:x[0])  #按时间段对情绪数量排序
    #results = all_sentiment_dict
    #print type(results)
    #trend_results = {}
    #for sentiment in SENTIMENT_FIRST:
    #    trend_results[sentiment] = [[item[0], item[1][sentiment]] for item in sort_sentiment_dict]
    #results = trend_results

    #print results
    #save_rt_results('count', topic, results, during)
    #save_rt_results_es('count', topic, results, during)
    #return results
    return iter_sentiment_dict
def cityTopic(topic,start_ts,over_ts,during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during


        item_exist = es_event.get(index=event_analysis_name,doc_type=event_type,id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}


        #topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type; ccount maps {city: count}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            # print begin_ts,end_ts,topic
            weibos = []
            first_item = {}
            
            for k,v in mtype_kv.iteritems(): # v denotes retweet / comment / original post

                #geo_result['geo_cityCount'][end_ts][v] = []

                #geo_result = {}
                #city_dict = {}
                query_body = {   # fetch this message_type's weibos
                    'query':{
                        'bool':{
                            'must':[
                                {'term':{'message_type':v}},
                                # {'term':{'en_name':topic}},
                                {'range':{
                                    'timestamp':{'gte': begin_ts, 'lt':end_ts}
                                }}
                            ]
                        }
                    },
                    'sort':{SORT_FIELD:{"order":"desc"}},
                    'size':n_limit
                }
                # print topic,event_text_type,query_body
                mtype_weibo = es_event.search(index=topic,doc_type=event_text_type,body=query_body)['hits']['hits']
                # print len(mtype_weibo)
                #save_ws_results(topic, end_ts, during, n_limit, mtype_weibo)    
                # save the weibos directly
                # print '160',es_event,event_text,event_text_type,query_body,len(mtype_weibo)
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # count, per location, the number of weibos of each type
                for weibo in mtype_weibo:  # for each weibo
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue
                    #print geo,type(geo)
                    province,city = split_city(geo)
                    #print province,city

                    
                    if province != 'unknown':
                        # accumulate geo_result[v][province][city] plus a
                        # running per-province 'total'; setdefault replaces the
                        # original cascade of nested try/excepts, whose deepest
                        # fallback could clobber a province's existing counts
                        province_dict = geo_result.setdefault(v, {}).setdefault(province, {})
                        province_dict[city] = province_dict.get(city, 0) + 1
                        province_dict['total'] = province_dict.get('total', 0) + 1

                                
                #geo_result[end_ts][v] = geo_result
                #print mtype_ccount   v:message type
                #save_rt_results(topic, mtype_ccount, during, first_item)

        save_rt_results_es(topic, geo_result)

        return geo_result
Example #14
def compute_real_info(topic, begin_ts, end_ts, relation, submit_user,
                      submit_ts):
    info_dict = {}
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic,
                             doc_type=event_text_type,
                             fields=['text'],
                             body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])

    # event type
    try:
        event = es_event.get(index=event_task_name,
                             doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)

    try:
        event = es_event.get(index=event_task_name,
                             doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['real_auth'] = event['real_auth']
        info_dict['real_geo'] = event['real_geo']
        info_dict['real_time'] = event['real_time']
        info_dict['real_person'] = event['real_person']
    except:
        info_dict = get_real(info_dict, topic, begin_ts, end_ts, relation)

    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))

    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)

    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))

    try:
        es_event.update(index=event_analysis_name,
                        doc_type=event_type,
                        id=topic,
                        body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name,
                       doc_type=event_type,
                       id=topic,
                       body=info_dict)
def excel_read():
    data = xlrd.open_workbook('events.xlsx')
    table = data.sheets()[0]  # open the first sheet
    nrows = table.nrows  # number of rows in the sheet

    for i in range(nrows):

        if i == 0:  # skip the header row
            continue

        now_ts = int(time.time())
        keywords_list = table.row_values(i)[1].split(' ')
        keywords = '&'.join(keywords_list)
        event_type = table.row_values(i)[2]
        print event_type
        condition = []
        for w in keywords_list:
            condition.append({'term': {'keywords': w}})
            print w

        condition.append({'term': {'compute_status': 1}})
        es_query = {'query': {'bool': {'must': condition}}}

        res = es_event.search(index=event_task_name, doc_type=event_task_type, \
            body=es_query, request_timeout=999999,params={"search_type":"query_and_fetch"})
        print res['hits']['hits']

        if len(res['hits']['hits']) == 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name,
                            doc_type=event_task_type,
                            id=en_id,
                            body={'doc': {
                                'event_type': event_type
                            }})
            es_event.update(index=event_analysis_name,
                            doc_type='text',
                            id=en_id,
                            body={'doc': {
                                'event_type': event_type
                            }})
        elif len(res['hits']['hits']) > 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id,  \
                body={'doc':{'event_type':event_type}})
            try:
                task_exist = es_event.get(index=event_analysis_name,
                                          doc_type='text',
                                          id=en_id)['_source']
            except:
                task_exist = {}
            if task_exist:
                es_event.update(index=event_analysis_name,
                                doc_type='text',
                                id=en_id,
                                body={'doc': {
                                    'event_type': event_type
                                }})
            else:
                print 'event_result not exist' + en_id
            print "查询到多个结果!", i

    print 'END'
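A self-contained xlrd sketch of the reading pattern used above; 'events.xlsx' and the column layout (keywords in column 1, event type in column 2) mirror the snippet, while the printout is illustrative:

import xlrd

# Standalone illustration of the excel_read() access pattern.
data = xlrd.open_workbook('events.xlsx')
table = data.sheets()[0]
for i in range(1, table.nrows):  # start at 1 to skip the header row
    row = table.row_values(i)
    keywords = '&'.join(row[1].split(' '))
    print i, keywords, row[2]  # column 2 holds the event type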
Example #16
def compute_sentiment_weibo(topic, begin_ts, end_ts, k_limit, w_limit, during):
    #print topic
    sentiments = SENTIMENT_FIRST + SENTIMENT_SECOND
    all_sen_weibo = {}
    results = {}
    #results_geo_count = {}
    geo_count = {}
    for sentiment in sentiments:
        province_dict = {}
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {
                            'term': {
                                'sentiment': sentiment
                            }
                        },  # one topic: per-sentiment selection within the given time window
                        {
                            'range': {
                                'timestamp': {
                                    'gte': begin_ts,
                                    'lt': end_ts
                                }
                            }
                        }
                    ]
                }
            },
            'sort': {
                "retweeted": {
                    "order": "desc"
                }
            },
            'size': w_limit
        }
        sentiment_weibo = es_event.search(index=topic,
                                          doc_type=event_text_type,
                                          body=query_body)['hits']['hits']  #字典
        if len(sentiment_weibo) > 0:
            '''
            all_sen_weibo[sentiment] = []
            for i in range(0,len(sentiment_weibo)):
                #print sentiment_weibo[i]['_source']['retweeted']
                all_sen_weibo[sentiment].append(sentiment_weibo[i]['_source'])
            '''
            for weibo in sentiment_weibo:  # for each weibo
                if not weibo['_source']['geo']:
                    continue
                geo = weibo['_source']['geo'].encode('utf8')
                province, city = split_city(geo)
                if province != 'unknown':
                    #print province,city
                    try:
                        province_dict[province]['total'] += 1
                    except:
                        province_dict[province] = {'total': 1}
                    try:
                        province_dict[province][city] += 1
                    except:
                        province_dict[province][city] = 1
            geo_count[sentiment] = [end_ts, province_dict]

        else:
            continue


    # original code that also saved the weibos:
    #results[end_ts] = all_sen_weibo
    #results_geo_count[end_ts] = geo_count
    #print len(results)
    #save_rt_results('weibos', topic, results, during, k_limit, w_limit)
    #save_rt_results_es('weibos', topic, results, during, k_limit, w_limit)
    #print len(geo_count)
    #save_rt_results('geo_count',topic,geo_count,during)
    #save_rt_results_es('geo_count',topic,geo_count,during)
    #print geo_count
    #return results,geo_count   #{'timestamp':{'sentiment1':[{weibo fields},{weibo fields}],'sentiment2':[]}}
    return geo_count  #{'timestamp':{'sentiment1':[{weibo fields},{weibo fields}],'sentiment2':[]}}
Example #17
def compute_sentiment_keywords(topic, begin_ts, end_ts, k_limit, w_limit,
                               during):
    all_keyword_dict = {}
    #print 'kkkkkkkkkkkkkkkkkkkk'
    sen_with_keyword = {}
    sentiments = SENTIMENT_FIRST + SENTIMENT_SECOND
    for sentiment in sentiments:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {
                            'term': {
                                'sentiment': sentiment
                            }
                        },  # one topic: aggregate keywords per sentiment within the given time window
                        {
                            'range': {
                                'timestamp': {
                                    'gte': begin_ts,
                                    'lt': end_ts
                                }
                            }
                        }
                    ]
                }
            },
            'aggs': {
                'all_interests': {
                    'terms': {
                        'field': 'keywords_string',
                        'size': k_limit  #SENTIMENT_MAX_KEYWORDS
                    }
                }
            }
        }

        show_keywords_dict = es_event.search(index=topic,doc_type=event_text_type,body=query_body)\
                        ['aggregations']['all_interests']['buckets']
        #print show_keywords_dict
        #keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]
        #print '======================='
        #print keywords_list

        keyword_dict = {}
        for keyword in show_keywords_dict:
            key = keyword['key']
            count = keyword['doc_count']
            try:
                keyword_dict[key] += count
            except:
                keyword_dict[key] = count

        sen_with_keyword[sentiment] = sorted(keyword_dict.items(),
                                             key=lambda x: x[1],
                                             reverse=True)[:k_limit]
        #print sen_with_keyword
        #print sorted(sen_with_keyword.items(), key=lambda x:x[0], reverse=True)[:k_limit]
    all_keyword_dict[end_ts] = sen_with_keyword
    # TODO: also slice into 15-minute windows, then save

    #results = sorted(all_keyword_dict.items(), key=lambda x:x[1][3], reverse=True)[:k_limit]
    results = all_keyword_dict  #{timestamp:{'sentiment1':{'word1':1,'word2':2}}}
    save_rt_results('kcount', topic, results, during, k_limit, w_limit)