def compute_real_info(topic, begin_ts, end_ts, relation,
                      submit_user=None, submit_ts=None):
    """Extract descriptive facts for an event and store them in the analysis index.

    The function
      1. delegates to ``get_real`` to extract the organization / place /
         time / person fields (``real_auth``, ``real_geo``, ``real_time``,
         ``real_person``) and to record the optional 'join' relations;
         ``get_real`` falls back to 'NULL' values when no headline-style
         text is found, instead of indexing into an empty result list;
      2. loads up to 10000 texts of the event inside the time window;
      3. derives event type, topic words, keywords and hashtags from them;
      4. writes everything to ``event_analysis_name`` under id ``topic``
         (partial update first, full index if the update fails).

    Args:
        topic: event identifier; used as the ES index name, as the
            ``en_name`` term filter and as the analysis document id.
        begin_ts: inclusive lower bound of the ``timestamp`` range.
        end_ts: exclusive upper bound of the ``timestamp`` range.
        relation: '&'-separated relation names; 'join' enables relation
            creation inside ``get_real``.
        submit_user: optional and currently unused; accepted so that
            callers passing six positional arguments keep working.
        submit_ts: optional and currently unused; see ``submit_user``.

    Returns:
        None. Results are persisted through ``es_event``.
    """
    info_dict = get_real({}, topic, begin_ts, end_ts, relation)

    # Fetch only the `text` field of every document in the window.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}},
                ]
            }
        },
        'size': 10000,
    }
    hits = es_event.search(index=topic, doc_type=event_text_type,
                           fields=['text'], body=query_body)['hits']['hits']
    text_list = [hit['fields']['text'][0] for hit in hits]

    # Event type: prefer the value stored on the task document; if the task
    # (or its `event_type` field) is unavailable, classify from the texts.
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except Exception:
        info_dict['event_type'] = cut_weibo(text_list)

    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))

    joined_text = ''.join(text_list)
    keywords = get_keyword(joined_text, 2)
    info_dict['keywords'] = '&'.join([item[0] for item in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)

    hashtag = get_hashtag(joined_text)
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))

    # Partial update when the analysis document exists, otherwise create it.
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=topic, body={'doc': info_dict})
    except Exception:
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=topic, body=info_dict)
def compute_task(task):
    """Run the full analysis pipeline for one event task.

    Args:
        task: an ES hit; ``task['_id']`` is the task/event id and
            ``task['_source']`` holds at least ``name``, ``start_ts``,
            ``end_ts``, ``submit_ts``, ``submit_user``, ``relation_compute``
            ('&'-separated relation names) and ``keywords``
            ('&'-separated), optionally ``compute_ts``.

    Returns:
        'Node Wrong' when ``create_person`` reports that for the event node,
        otherwise 1 (also when ``exist`` is falsy or no weibo was counted).

    Progress is written to the analysis document as ``compute_status``
    -1 (started), -2 (geo done), -3 (language done), -4 (time done) and
    1 (finished).
    """
    print(task)
    task_id = task['_id']
    task = task['_source']
    topic = task['name']

    # Local debugging switch: 0 pins the window to fixed test timestamps.
    RUN_TYPE = 1
    if RUN_TYPE == 0:
        start_ts = 1480003200
        begin_ts = 1480003200
        end_ts = 1480176000
    else:
        start_ts = task['start_ts']
        begin_ts = task['start_ts']
        end_ts = task['end_ts']

    # If the task carries a `compute_ts`, the analysis window starts there
    # instead of at `start_ts`; the stamp is then refreshed to "now".
    if 'compute_ts' in task:
        start_ts = task['compute_ts']
    task['compute_ts'] = time.time()

    # Never analyse past the current time.
    now = time.time()
    if end_ts > now:
        end_ts = now

    submit_ts = task['submit_ts']
    # '&'-joined names of the optional relations to compute.
    relation = task['relation_compute']
    keywords = task['keywords'].split('&')
    en_name = task_id
    t1 = time.time()

    # A task id made of exactly 16 digits is also passed on as `mid`.
    mid_match = re.match(r'^\d{16}$', task_id)
    mid = mid_match.group() if mid_match else ''

    exist_flag = exist(task_id)
    get_topic_weibo(topic, task_id, start_ts, end_ts, keywords, mid)
    print(exist_flag)
    if not exist_flag:
        return 1

    def update_analysis(fields):
        # Partial update of this task's document in the analysis index.
        return es_event.update(index=event_analysis_name, doc_type=event_type,
                               id=task_id, body={'doc': fields})

    resu = create_person(event_node, event_primary, en_name, event_index_name)
    if resu == 'Node Wrong':
        return 'Node Wrong'

    weibo_counts, uid_counts = counts(start_ts, end_ts, topic, en_name, keywords)
    es_event.update(index=event_task_name, doc_type=event_task_type,
                    id=task_id, body={'doc': {'compute_status': -1}})
    task['compute_status'] = -1
    task['weibo_counts'] = weibo_counts
    task['uid_counts'] = uid_counts

    try:
        # Existing analysis document: add the new counts to the stored ones.
        previous = es_event.get(index=event_analysis_name, doc_type=event_type,
                                id=task_id)['_source']
        update_analysis({
            'compute_status': -1,
            'weibo_counts': previous['weibo_counts'] + weibo_counts,
            'uid_counts': previous['uid_counts'] + uid_counts,
        })
    except Exception:
        # No usable analysis document yet: create it from the task and give
        # every result field an empty placeholder.
        # NOTE(review): the original indentation was lost; the placeholder
        # update is assumed to belong to this branch only -- confirm.
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=task_id, body=task)
        update_analysis(dict.fromkeys((
            'hashtag_dict', 'topics', 'geo_results', 'real_geo', 'real_auth',
            'sentiment_results', 'time_results', 'hashtag', 'real_time',
            'user_results', 'real_person', 'keywords_list',
        ), ''))
    print('finish change status')

    stored = es_event.get(index=event_analysis_name, doc_type=event_type,
                          id=task_id)['_source']
    if stored['weibo_counts'] == 0:
        return 1

    # geo
    cityTopic(en_name, start_ts, end_ts)
    update_analysis({'compute_status': -2})
    print('finish geo analyze')

    # language
    # NOTE(review): six positional arguments are passed here; the
    # compute_real_info in use must accept submit_user and submit_ts.
    compute_real_info(en_name, begin_ts, end_ts, relation,
                      task['submit_user'], submit_ts)
    update_analysis({'compute_status': -3})
    print('finish language analyze')

    # time
    propagateCronTopic(en_name, start_ts, end_ts)
    update_analysis({'compute_status': -4})
    print('finish time analyze')

    # sentiment
    sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
    print('finish sentiment analyze')

    # finish compute
    print(update_analysis({'compute_status': 1, 'finish_ts': int(time.time())}))
    print('finish change status done')
    print(time.time())

    if 'contain' in relation.split('&'):
        # Link this event to every related event returned by event_input.
        related_event_ids = event_input(keywords, en_name)
        rel_list = []
        for related_id in related_event_ids:
            create_person(event_node, event_primary, related_id, event_index_name)
            rel_list.append([[2, en_name], 'contain', [2, related_id]])
        nodes_rels(rel_list)

    es_event.update(index=event_task_name, doc_type=event_task_type,
                    id=task_id, body={'doc': {'compute_status': 1}})
    t2 = time.time() - t1
    print('%s %s' % (task_id, t2))
    return 1
def get_users(topic, begin_ts, end_ts, relation):
    """Rank the users of an event by influence and store the top 100.

    Collects the distinct ``uid`` values of the event's documents in
    ``[begin_ts, end_ts)``, reads each user's ``user_index`` from the daily
    BCI index, keeps the 100 highest, classifies each as organization or
    person from the profile's ``verified_type`` (person when the profile
    cannot be read), optionally records 'discuss' relations, and writes the
    JSON-encoded result to ``user_results`` of the analysis document.

    Args:
        topic: event identifier; used as the ES index name, as the
            ``en_name`` term filter and as the analysis document id.
        begin_ts: inclusive lower bound of the ``timestamp`` range.
        end_ts: exclusive upper bound of the ``timestamp`` range.
        relation: '&'-separated relation names; 'discuss' enables relation
            creation between the event and the ranked users.

    Returns:
        None. Results are persisted through ``es_event``.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}},
                ]
            }
        },
        'size': 999999999,
    }
    hits = es_event.search(index=topic, doc_type=event_text_type,
                           fields=['uid'], body=query_body)['hits']['hits']
    uid_list = set()
    for hit in hits:
        uid_list.add(hit['fields']['uid'][0])
    print(len(uid_list))

    # The daily BCI index name is the prefix plus a date string.
    if RUN_TYPE == 0:
        post = ts2datetimestr(datetime2ts(RUN_TEST_TIME))
    else:
        post = ts2datetimestr(time.time())
    bci_index = bci_day_pre + post
    print('%s %s %s' % (bci_index, bci_day_type, es_user_portrait))

    # Skip the multi-get when there is nothing to fetch: a request with an
    # empty id list has no documents to get.
    if uid_list:
        user_result = es_bci.mget(index=bci_index, doc_type=bci_day_type,
                                  body={'ids': list(uid_list)})['docs']
    else:
        user_result = []

    user_influence_dict = {}
    for doc in user_result:
        # Entries without a truthy `found` (missing or failed lookups) are ignored.
        if doc.get('found'):
            source = doc['_source']
            user_influence_dict[source['user']] = source['user_index']

    # (uid, influence) pairs, most influential first, at most 100.
    user = sorted(user_influence_dict.items(),
                  key=lambda pair: pair[1], reverse=True)[:100]
    # The return value was never used; the call itself is kept.
    event_user_portrait([pair[0] for pair in user])

    user_dict = {}
    p_list = []  # uids classified as persons
    a_list = []  # uids classified as organizations
    for uid, influence in user:
        try:
            profile = es_user_profile.get(index=profile_index_name,
                                          doc_type=profile_index_type, id=uid)
            print(profile)
            verified_type = profile['_source']['verified_type']
            print(verified_type)
            is_org = verified_type in org_list
        except Exception:
            # Best effort: an unreadable profile is treated as a person.
            is_org = False
        if is_org:
            a_list.append(uid)
            user_dict[uid] = {'user_type': auth_type, 'influ': influence}
        else:
            p_list.append(uid)
            user_dict[uid] = {'user_type': user_type, 'influ': influence}
    print('%s %s' % (len(a_list), len(p_list)))

    if 'discuss' in relation.split('&'):
        rel_list = []
        for uid in p_list:
            resu = create_person(people_node, people_primary, uid, node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [1, uid]])
        for uid in a_list:
            resu = create_person(org_node, org_primary, uid, org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [0, uid]])
        try:
            nodes_rels(rel_list)
        except Exception as e:
            # Relation storage stays best-effort, but is no longer silent.
            print('nodes_rels failed for discuss relations: %r' % (e,))

    # Partial update when the analysis document exists, otherwise create it.
    user_results = json.dumps(user_dict)
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=topic, body={'doc': {'user_results': user_results}})
    except Exception:
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=topic, body={'user_results': user_results})
def get_real(info_dict, topic, begin_ts, end_ts, relation):
    """Fill ``info_dict`` with organization / place / time / person of an event.

    Searches the event's documents in ``[begin_ts, end_ts)`` for the most
    retweeted text matching the wildcard ``【*】*``, first restricted to
    ``message_type == 1`` and, if nothing matches, without that restriction.
    The text is passed to ``get_news_main`` and its ``organization``,
    ``place``, ``time`` and ``people`` entries are stored as ``real_auth``,
    ``real_geo``, ``real_time`` and ``real_person``. When no text is found
    all four are set to the string 'NULL'.

    Args:
        info_dict: dict to populate; it is modified in place.
        topic: event identifier; used as the ES index name and as the
            ``en_name`` term filter.
        begin_ts: inclusive lower bound of the ``timestamp`` range.
        end_ts: exclusive upper bound of the ``timestamp`` range.
        relation: '&'-separated relation names; 'join' links the event to
            the extracted organization and person.

    Returns:
        The same ``info_dict`` object.
    """
    def search_top_text(with_message_type):
        # One query builder for both attempts; only the message_type term differs.
        must = [{'term': {'en_name': topic}}]
        if with_message_type:
            must.append({'term': {'message_type': 1}})
        must.append({'wildcard': {'text': '【*】*'}})
        must.append({'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}})
        query_body = {
            'query': {'bool': {'must': must}},
            'size': 1,
            'sort': {'retweeted': {'order': 'desc'}},
        }
        return es_event.search(index=topic, doc_type=event_text_type,
                               body=query_body)['hits']['hits']

    result = search_top_text(True)
    if not result:
        result = search_top_text(False)

    # Extract the event's person, organization, place and time.
    if result:
        text = result[0]['_source']['text']
        print(text)
        basics = get_news_main(text)
        print(basics)
        info_dict['real_auth'] = basics['organization']
        info_dict['real_geo'] = basics['place']
        info_dict['real_time'] = basics['time']
        info_dict['real_person'] = basics['people']
    else:
        for field in ('real_auth', 'real_geo', 'real_time', 'real_person'):
            info_dict[field] = 'NULL'

    # Store relations.
    if 'join' in relation.split('&'):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary,
                                 info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary,
                                 info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except Exception as e:
            # Relation storage stays best-effort, but is no longer silent.
            print('nodes_rels failed for join relations: %r' % (e,))
    return info_dict