def compute_real_info(topic, begin_ts, end_ts, relation):
    """Collect descriptive attributes of an event and store them in Elasticsearch.

    Two searches are run against the index named ``topic``:

    1. The single post that ranks first by ``retweeted`` (descending) among
       posts with ``message_type == 1`` whose text matches the wildcard
       ``【*】*``. Its text is handed to ``get_news_main`` to obtain the
       organization, place, time and people of the event. When no post
       matches, those four attributes are set to ``'NULL'``.
    2. Up to 10000 posts of the event in the time window; their texts feed the
       topic-word, keyword and hashtag extraction.

    The resulting dictionary is written to the analysis document whose id is
    ``topic`` (update first, index as a fallback).

    Args:
        topic: Event identifier. Used as the ``en_name`` term, as the name of
            the searched index and as the id of the analysis document.
        begin_ts: Inclusive lower bound of the ``timestamp`` range.
        end_ts: Exclusive upper bound of the ``timestamp`` range.
        relation: ``'&'``-separated relation names. When ``'join'`` is present,
            organization/person nodes and ``join`` relations are created.

    Returns:
        None. Results are persisted through ``es_event``.
    """
    info_dict = {}
    window = {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}

    top_post_query = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'term': {'message_type': 1}},
                    {'wildcard': {'text': '【*】*'}},
                    window,
                ]
            }
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}},
    }
    hits = es_event.search(index=topic, doc_type=event_text_type,
                           body=top_post_query)['hits']['hits']

    # Extract the people, organizations, places and time of the event.
    if hits:
        lead_text = hits[0]['_source']['text']
        print(lead_text)
        basics = get_news_main(lead_text)
        print(basics)
        info_dict['real_auth'] = basics['organization']
        info_dict['real_geo'] = basics['place']
        info_dict['real_time'] = basics['time']
        info_dict['real_person'] = basics['people']
    else:
        # No matching post: use the same 'NULL' sentinel that the relation
        # code below tests for, instead of failing on hits[0].
        for field in ('real_auth', 'real_geo', 'real_time', 'real_person'):
            info_dict[field] = 'NULL'

    # Store relations.
    if 'join' in relation.split('&'):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary,
                                 info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary,
                                 info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except Exception as err:
            # Relation storage stays best-effort, but the failure is reported.
            print('nodes_rels failed: %r' % (err,))

    all_posts_query = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    window,
                ]
            }
        },
        'size': 10000,
    }
    hits = es_event.search(index=topic, doc_type=event_text_type,
                           fields=['text'], body=all_posts_query)['hits']['hits']
    # Hits without a 'text' field are skipped rather than raising KeyError.
    text_list = []
    for hit in hits:
        values = hit.get('fields', {}).get('text')
        if values:
            text_list.append(values[0])

    # Event type: prefer the value stored with the task, otherwise derive it
    # from the collected texts.
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except Exception:
        info_dict['event_type'] = cut_weibo(text_list)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))

    joined_text = ''.join(text_list)

    keywords = get_keyword(joined_text, 2)
    info_dict['keywords'] = '&'.join([item[0] for item in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)

    hashtag = get_hashtag(joined_text)
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(hashtag.keys())

    # Update the analysis document; fall back to indexing it if the update fails.
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=topic, body={'doc': info_dict})
    except Exception:
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=topic, body=info_dict)
# Example #2
# 0
def compute_task(task):
    """Run the analysis pipeline for one event task.

    The task's posts are collected with ``get_topic_weibo``. If ``exist``
    reported the task id as present beforehand, the event node is created,
    post/user counts are merged into the analysis document, and the geo,
    real-info, time and sentiment analyses run in that order. After each stage
    ``compute_status`` in the analysis document is set to the next value
    (-1, -2, -3, -4), and to 1 when everything has finished.

    Args:
        task: A hit-like mapping with ``'_id'`` and ``'_source'``. The source
            must provide ``name``, ``start_ts``, ``end_ts``, ``submit_ts``,
            ``submit_user``, ``relation_compute`` and ``keywords`` (a
            ``'&'``-separated string of keywords or a mid); ``compute_ts`` is
            optional.

    Returns:
        ``'Node Wrong'`` when the event node cannot be created, otherwise ``1``.
    """
    print(task)
    task_id = task['_id']
    task = task['_source']
    topic = task['name']

    # Local switch that shadows any module-level RUN_TYPE: 0 selects the fixed
    # test window below, any other value uses the window stored in the task.
    RUN_TYPE = 1
    if RUN_TYPE == 0:
        start_ts = 1480003200
        begin_ts = 1480003200
        end_ts = 1480176000
    else:
        start_ts = task['start_ts']
        begin_ts = task['start_ts']
        end_ts = task['end_ts']

    # A stored compute_ts replaces the start of the window; the stamp is then
    # refreshed to the current time.
    if 'compute_ts' in task:
        start_ts = task['compute_ts']
    task['compute_ts'] = time.time()

    # Never analyse beyond the present moment.
    if end_ts > time.time():
        end_ts = time.time()
    submit_ts = task['submit_ts']
    # Relations to compute: a string of relation names joined by '&'.
    relation = task['relation_compute']

    keywords = task['keywords'].split('&')  # keywords or a mid
    en_name = task_id
    t1 = time.time()

    # A task id made of exactly 16 digits is also passed on as `mid`.
    try:
        mid = re.match(r'^\d{16}$', task_id).group()
    except (AttributeError, TypeError):
        # No match (None has no .group()) or a non-string id.
        mid = ''

    exist_flag = exist(task_id)
    get_topic_weibo(topic, task_id, start_ts, end_ts, keywords, mid)
    print(exist_flag)
    if exist_flag:
        # Start computing.
        resu = create_person(event_node, event_primary, en_name, event_index_name)
        if resu == 'Node Wrong':
            return 'Node Wrong'
        weibo_counts, uid_counts = counts(start_ts, end_ts, topic, en_name, keywords)

        es_event.update(index=event_task_name, doc_type=event_task_type,
                        id=task_id, body={'doc': {'compute_status': -1}})

        task['compute_status'] = -1
        task['weibo_counts'] = weibo_counts
        task['uid_counts'] = uid_counts
        try:
            # The analysis document already exists: add the new counts to it.
            flag = es_event.get(index=event_analysis_name, doc_type=event_type,
                                id=task_id)['_source']
            w_counts = flag['weibo_counts'] + weibo_counts
            u_counts = flag['uid_counts'] + uid_counts
            es_event.update(index=event_analysis_name, doc_type=event_type,
                            id=task_id,
                            body={'doc': {'compute_status': -1,
                                          'weibo_counts': w_counts,
                                          'uid_counts': u_counts}})
        except Exception:
            # Otherwise create it from the task and blank the result fields.
            es_event.index(index=event_analysis_name, doc_type=event_type,
                           id=task_id, body=task)
            es_event.update(index=event_analysis_name, doc_type=event_type,
                            id=task_id,
                            body={'doc': {'hashtag_dict': '',
                                          'topics': '',
                                          'geo_results': '',
                                          'real_geo': '',
                                          'real_auth': '',
                                          'sentiment_results': '',
                                          'time_results': '',
                                          'hashtag': '',
                                          'real_time': '',
                                          'user_results': '',
                                          'real_person': '',
                                          'keywords_list': ''}})

        print('finish change status')

        # Nothing to analyse when the event has no posts at all.
        stored = es_event.get(index=event_analysis_name, doc_type=event_type,
                              id=task_id)['_source']
        if stored['weibo_counts'] == 0:
            return 1

        # geo
        cityTopic(en_name, start_ts, end_ts)
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=task_id, body={'doc': {'compute_status': -2}})
        print('finish geo analyze')

        # language
        # NOTE(review): called with six arguments; confirm that the
        # compute_real_info in scope accepts submit_user and submit_ts.
        compute_real_info(en_name, begin_ts, end_ts, relation,
                          task['submit_user'], submit_ts)
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=task_id, body={'doc': {'compute_status': -3}})
        print('finish language analyze')

        # time
        propagateCronTopic(en_name, start_ts, end_ts)
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=task_id, body={'doc': {'compute_status': -4}})
        print('finish time analyze')

        # sentiment
        sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
        print('finish sentiment analyze')

        # Finish computing.
        print(es_event.update(index=event_analysis_name, doc_type=event_type,
                              id=task_id,
                              body={'doc': {'compute_status': 1,
                                            'finish_ts': int(time.time())}}))
        print('finish change status done')
        print(time.time())

        if 'contain' in relation.split('&'):
            # Compute relations to related events.
            related_event_ids = event_input(keywords, en_name)
            rel_list = []
            for related_id in related_event_ids:
                create_person(event_node, event_primary, related_id, event_index_name)
                rel_list.append([[2, en_name], 'contain', [2, related_id]])
            nodes_rels(rel_list)

        es_event.update(index=event_task_name, doc_type=event_task_type,
                        id=task_id, body={'doc': {'compute_status': 1}})

    t2 = time.time() - t1
    print('%s %s' % (task_id, t2))
    return 1
def get_users(topic, begin_ts, end_ts, relation):
    """Rank the users of an event by influence and store the top 100.

    User ids are collected from the event's posts in the time window, their
    ``user_index`` values are read from the daily BCI index, and the 100
    highest-scoring users are classified as organization or person according
    to ``verified_type`` in the profile index. The classification is written
    to ``user_results`` of the analysis document whose id is ``topic``.

    Args:
        topic: Event identifier. Used as the ``en_name`` term, as the name of
            the searched index and as the id of the analysis document.
        begin_ts: Inclusive lower bound of the ``timestamp`` range.
        end_ts: Exclusive upper bound of the ``timestamp`` range.
        relation: ``'&'``-separated relation names. When ``'discuss'`` is
            present, user nodes and ``discuss`` relations are created.

    Returns:
        None. Results are persisted through ``es_event``.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {
                        'timestamp': {'gte': begin_ts, 'lt': end_ts}
                    }},
                ]
            }
        },
        'size': 999999999,
    }
    hits = es_event.search(index=topic, doc_type=event_text_type,
                           fields=['uid'], body=query_body)['hits']['hits']
    # Hits without a 'uid' field are skipped rather than raising KeyError.
    uid_list = set()
    for hit in hits:
        values = hit.get('fields', {}).get('uid')
        if values:
            uid_list.add(values[0])
    print(len(uid_list))

    # Date suffix of the daily BCI index: the fixed test date in test mode,
    # otherwise today.
    if RUN_TYPE == 0:
        post = ts2datetimestr(datetime2ts(RUN_TEST_TIME))
    else:
        post = ts2datetimestr(time.time())

    print('%s %s %s' % (bci_day_pre + post, bci_day_type, es_user_portrait))
    # Skip the multi-get when there is nothing to look up.
    if uid_list:
        user_result = es_bci.mget(index=bci_day_pre + post, doc_type=bci_day_type,
                                  body={'ids': list(uid_list)})['docs']
    else:
        user_result = []

    user_influence_dict = {}
    for doc in user_result:
        # Entries that were not found (or carry no 'found' flag) are ignored.
        if doc.get('found'):
            source = doc['_source']
            user_influence_dict[source['user']] = source['user_index']

    top_users = sorted(user_influence_dict.items(),
                       key=lambda pair: pair[1], reverse=True)[:100]
    # The return value is not used in this function; the call is kept.
    event_user_portrait([uid for uid, _ in top_users])

    user_dict = {}
    p_list = []  # users classified as persons
    a_list = []  # users classified as organizations
    for uid, influence in top_users:
        try:
            profile = es_user_profile.get(index=profile_index_name,
                                          doc_type=profile_index_type, id=uid)
            print(profile)
            verified = profile['_source']['verified_type']
            print(verified)
            is_org = verified in org_list
        except Exception:
            # Any lookup failure means the user is treated as a person.
            is_org = False

        if is_org:
            a_list.append(uid)
            user_dict[uid] = {'user_type': auth_type, 'influ': influence}
        else:
            p_list.append(uid)
            user_dict[uid] = {'user_type': user_type, 'influ': influence}
    print('%s %s' % (len(a_list), len(p_list)))

    if 'discuss' in relation.split('&'):
        rel_list = []
        for uid in p_list:
            resu = create_person(people_node, people_primary, uid, node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [1, uid]])
        for uid in a_list:
            resu = create_person(org_node, org_primary, uid, org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [0, uid]])
        try:
            nodes_rels(rel_list)
        except Exception as err:
            # Relation storage stays best-effort, but the failure is reported.
            print('nodes_rels failed: %r' % (err,))

    # Update the analysis document; fall back to indexing it if the update fails.
    user_results = json.dumps(user_dict)
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=topic, body={'doc': {'user_results': user_results}})
    except Exception:
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=topic, body={'user_results': user_results})
# Example #4
# 0
def get_real(info_dict, topic, begin_ts, end_ts, relation):
    """Fill ``info_dict`` with the organization, place, time and people of an event.

    The post ranked first by ``retweeted`` (descending) whose text matches the
    wildcard ``【*】*`` is searched, first restricted to ``message_type == 1``
    and, if that yields nothing, without that restriction. Its text is passed
    to ``get_news_main``; when no post is found the four ``real_*`` entries
    are set to ``'NULL'``. If ``relation`` (a ``'&'``-separated string)
    contains ``'join'``, the matching nodes and ``join`` relations are created.

    Returns the same ``info_dict`` that was passed in.
    """

    def build_query(extra_clauses):
        # Same query for both attempts; only the optional extra clauses differ.
        clauses = [{'term': {'en_name': topic}}]
        clauses.extend(extra_clauses)
        clauses.append({'wildcard': {'text': '【*】*'}})
        clauses.append({'range': {'timestamp': {'gte': begin_ts,
                                                'lt': end_ts}}})
        return {
            'query': {'bool': {'must': clauses}},
            'size': 1,
            'sort': {'retweeted': {'order': 'desc'}},
        }

    hits = []
    for extra_clauses in ([{'term': {'message_type': 1}}], []):
        response = es_event.search(index=topic,
                                   doc_type=event_text_type,
                                   body=build_query(extra_clauses))
        hits = response['hits']['hits']
        if hits:
            break

    # Extract the people, organizations, places and time of the event.
    if not hits:
        for field in ('real_auth', 'real_geo', 'real_time', 'real_person'):
            info_dict[field] = 'NULL'
    else:
        lead_text = hits[0]['_source']['text']
        print(lead_text)
        basics = get_news_main(lead_text)
        print(basics)
        info_dict['real_auth'] = basics['organization']
        info_dict['real_geo'] = basics['place']
        info_dict['real_time'] = basics['time']
        info_dict['real_person'] = basics['people']

    # Store relations.
    if 'join' in relation.split('&'):
        rel_list = []
        targets = (
            ('real_auth', org_node, org_primary, org_index_name, 0),
            ('real_person', people_node, people_primary, node_index_name, 1),
        )
        for field, node, primary, index_name, node_kind in targets:
            value = info_dict[field]
            if value == 'NULL':
                continue
            resu = create_person(node, primary, value, index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [node_kind, value]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    return info_dict