def save_results_es(calc, topic, results, during, klimit=TOP_KEYWORDS_LIMIT, wlimit=TOP_WEIBOS_LIMIT): if calc == 'time_results': id = topic #results = json.dumps(results) try: item_exist = es_event.get(index=event_analysis_name, doc_type=event_type, id=id)['_source'] try: time_results = json.loads(item_exist['time_results']) except: time_results = [] time_results.append(results) es_event.update( index=event_analysis_name, doc_type=event_type, id=id, body={'doc': { 'time_results': json.dumps(time_results) }}) except Exception, e: es_event.index(index=event_analysis_name, doc_type=event_type, id=id, body={'time_results': json.dumps(results)})
def save_ws_results_es(topic, ts, during, n_limit, province,city,weibos): #mappings_event_geo_province_weibos() #index_name = index_event_geo_province_weibos #index_type = type_event_geo_province_weibos #mappings_event_analysis_results(topic) index_name = index_event_analysis_results index_type = type_event_analysis_results item = {} item['en_name'] = topic item['end_ts'] = ts item['range'] = during item['limit'] = n_limit item['province'] = province item['city'] = city item['weibo'] = json.dumps(weibos) id = topic + '_' + ts try: item_exist = es_event.get(index=index_name,doc_type=index_type,id=id)['_source'] es_event.update(index=index_name,doc_type=index_type,id=id,body={'doc':item}) except Exception,e: es_event.index(index=index_name,doc_type=index_type,id=id,body=item)
def save_rt_results_es(topic, results): id = topic index_name = event_analysis_name #index_event_analysis_results index_type = event_type #try: print es_event.update(index=index_name,doc_type=index_type,id=id,body={'doc':{'geo_results':json.dumps(results)}})
def get_users(topic, begin_ts, end_ts):
    """Rank the 100 most influential users posting in an event window.

    Collects the uid of every weibo for *topic* between begin_ts and end_ts,
    looks up each uid's daily BCI influence score, keeps the 100 highest,
    labels each one as authority or ordinary user via the profile index, and
    upserts the ranking (JSON) onto the event-analysis document.
    """
    uid_list = set()
    # all weibo uids for this event inside the time window
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 999999999  # effectively "no limit"
    }
    result = es_event.search(index=event_text, doc_type=event_text_type,
                             fields=['uid'], body=query_body)['hits']['hits']
    for i in result:
        uid_list.add(i['fields']['uid'][0])
    print len(uid_list)
    # BCI indices are named per day: fixed test day in test mode, else today
    if RUN_TYPE == 0:
        post = datetime2ts(RUN_TEST_TIME)
        post = ts2datetimestr(post)
    else:
        post = ts2datetimestr(time.time())
    print bci_day_pre + post, bci_day_type, es_user_portrait
    user_result = es_bci.mget(index=bci_day_pre + post, doc_type=bci_day_type,
                              body={'ids': list(uid_list)})['docs']
    user_influence_dict = {}
    for i in user_result:
        if i['found']:
            i = i['_source']
            # 'user_index' is the per-user influence score
            user_influence_dict[i['user']] = i['user_index']
    # top 100 by influence, highest first
    user = sorted(user_influence_dict.iteritems(), key=lambda x: x[1], reverse=True)[:100]
    user_dict = {}
    for i in user:
        try:
            result = es_user_profile.get(index=profile_index_name,
                                         doc_type=profile_index_type, id=i[0])
            u_type = result['_source']['verified_type']
            # verified types listed in auth_list count as authority accounts
            if u_type in auth_list:
                u_type = auth_type
            else:
                u_type = user_type
            user_dict[i[0]] = {'user_type': u_type, 'influ': i[1]}
        except:
            # no profile found: default to ordinary user
            user_dict[i[0]] = {'user_type': user_type, 'influ': i[1]}
    # upsert the ranking onto the event-analysis doc
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic,
                        body={'doc': {'user_results': json.dumps(user_dict)}})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic,
                       body={'user_results': json.dumps(user_dict)})
def input_one(en_name, event_type):
    """Stamp event_type onto both the task document and the analysis document."""
    patch = {'doc': {'event_type': event_type}}
    es_event.update(index=event_task_name, doc_type=event_task_type,
                    id=en_name, body=patch)
    es_event.update(index=event_analysis_name, doc_type='text',
                    id=en_name, body=patch)
def excel_read_v2(): data = xlrd.open_workbook('event1.xlsx') table = data.sheets()[0] # 打开第一张表 nrows = table.nrows # 获取表的行数 for i in range(nrows): if i == 0: continue en_name = table.row_values(i)[0] event_type = table.row_values(i)[1] print en_name, event_type print es_event.update(index=event_task_name, doc_type=event_task_type, id=en_name, body={'doc': { 'event_type': event_type }}) # es_event.update(index=event_analysis_name, doc_type='text', id=en_name, body={'doc':{'event_type':event_type}}) print 'END'
def save_rt_results_es(calc, topic, results, during, klimit=TOP_KEYWORDS_LIMIT, wlimit=TOP_WEIBOS_LIMIT): #mappings_event_analysis_results(topic) index_name = event_analysis_name #index_event_analysis_results index_type = event_type #type_event_analysis_results if calc == 'sentiment_results': id = topic try: item_exist = es_event.get(index=index_name, doc_type=index_type, id=id)['_source'] try: sentiment_results = json.loads(item_exist['sentiment_results']) except: sentiment_results = [] sentiment_results.append(results) es_event.update(index=index_name, doc_type=index_type, id=id, body={ 'doc': { 'sentiment_results': json.dumps(sentiment_results) } }) except Exception, e: es_event.index(index=index_name, doc_type=index_type, id=id, body={'sentiment_results': json.dumps(results)})
def compute_real_info(topic, begin_ts, end_ts, relation):
    """Extract the event's "real" facts (who/where/when) plus derived metadata.

    Takes the most-retweeted headline-style weibo (original post whose text
    matches the 【title】 pattern), runs entity extraction on it, optionally
    writes 'join' relations to the graph store, then derives event type,
    topic words, keywords and hashtags from all weibo text in the window and
    upserts everything onto the event-analysis document.
    """
    info_dict = {}
    # most-retweeted headline weibo: original posts only, 【...】 title pattern
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'term': {'message_type': 1}},
                    {'wildcard': {'text': '【*】*'}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    result = es_event.search(index=topic, doc_type=event_text_type,
                             body=query_body)['hits']['hits']
    # extract the event's people, organization, place and time
    print result[0]['_source']['text']
    basics = get_news_main(result[0]['_source']['text'])
    print basics
    info_dict['real_auth'] = basics['organization']
    info_dict['real_geo'] = basics['place']
    info_dict['real_time'] = basics['time']
    info_dict['real_person'] = basics['people']
    # optionally persist 'join' relations (event—organization, event—person)
    if ('join' in relation.split('&')):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary, info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary, info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    # all weibo text in the window, for keyword/topic extraction
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic, doc_type=event_text_type,
                             fields=['text'], body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])
    # event type: prefer the label stored on the task doc, else classify text
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))
    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)
    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
    # upsert everything onto the event-analysis doc
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic,
                        body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic,
                       body=info_dict)
def get_users(topic, begin_ts, end_ts, relation):
    """Rank the event's top-100 influential users, label and optionally link them.

    Same ranking scheme as the three-argument get_users defined earlier
    (which this definition shadows), but reads weibo from the per-event
    index, splits the top users into person/organization lists, and
    optionally writes 'discuss' relations into the graph store.
    """
    uid_list = set()
    # all weibo uids for this event inside the time window
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 999999999  # effectively "no limit"
    }
    result = es_event.search(index=topic, doc_type=event_text_type,
                             fields=['uid'], body=query_body)['hits']['hits']
    for i in result:
        uid_list.add(i['fields']['uid'][0])
    print len(uid_list)
    # BCI indices are named per day: fixed test day in test mode, else today
    if RUN_TYPE == 0:
        post = datetime2ts(RUN_TEST_TIME)
        post = ts2datetimestr(post)
    else:
        post = ts2datetimestr(time.time())
    print bci_day_pre + post, bci_day_type, es_user_portrait
    user_result = es_bci.mget(index=bci_day_pre + post, doc_type=bci_day_type,
                              body={'ids': list(uid_list)})['docs']
    user_influence_dict = {}
    for i in user_result:
        if i['found']:
            i = i['_source']
            # 'user_index' is the per-user influence score
            user_influence_dict[i['user']] = i['user_index']
    # top 100 by influence, highest first
    user = sorted(user_influence_dict.iteritems(), key=lambda x: x[1], reverse=True)[:100]
    # NOTE(review): the returned list is never read afterwards — confirm
    # whether event_user_portrait is called only for its side effects
    not_in_user_list = event_user_portrait([i[0] for i in user])
    user_dict = {}
    p_list = []  # uids classified as ordinary people
    a_list = []  # uids classified as organizations/authorities
    for i in user:
        try:
            result = es_user_profile.get(index=profile_index_name,
                                         doc_type=profile_index_type, id=i[0])
            print result
            u_type = result['_source']['verified_type']
            print u_type
            # verified types listed in org_list count as organizations
            if u_type in org_list:
                u_type = auth_type
                a_list.append(i[0])
            else:
                u_type = user_type
                p_list.append(i[0])
            user_dict[i[0]] = {'user_type': u_type, 'influ': i[1]}
        except:
            # no profile found: default to ordinary user
            user_dict[i[0]] = {'user_type': user_type, 'influ': i[1]}
            p_list.append(i[0])
    print len(a_list), len(p_list)
    # optionally persist event—user 'discuss' relations in the graph store
    if ('discuss' in relation.split('&')):
        rel_list = []
        for i in p_list:
            resu = create_person(people_node, people_primary, i, node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [1, i]])
        for i in a_list:
            resu = create_person(org_node, org_primary, i, org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [0, i]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    # upsert the ranking onto the event-analysis doc
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic,
                        body={'doc': {'user_results': json.dumps(user_dict)}})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic,
                       body={'user_results': json.dumps(user_dict)})
def compute_task(task): print task # task=['雾霾','type','1480003100','1480176000','1483500427743'] task_id = task['_id'] task = task['_source'] topic = task['name']#task[0]#['name'] #en_name = task['en_name'] RUN_TYPE = 1 if RUN_TYPE == 0: start_ts = 1480003200#task['start_ts'] begin_ts = 1480003200 end_ts = 1480176000#task['end_ts'] else: start_ts = task['start_ts'] begin_ts = task['start_ts'] end_ts = task['end_ts'] try: start_ts = task['compute_ts'] task['compute_ts'] = time.time() except: task['compute_ts'] = time.time() if end_ts > time.time(): end_ts = time.time() submit_ts = task['submit_ts']#int(task[4]) #可选的计算关系realtion 用&连接的字符串 relation = task['relation_compute']#task[5] keywords = task['keywords'].split('&') #关键词或者mid #compute_status = task['status'] # mid = task['mid'] # task_id = 'event-'+str(start_ts)+'-'+str(end_ts)+'-'+str(submit_ts) en_name = task_id t1=time.time() re_mid = re.compile('^\d{16}$') try: mid = re.match(re_mid,task_id).group() except: mid = '' exist_flag = exist(task_id) get_topic_weibo(topic,task_id,start_ts,end_ts,keywords,mid) print exist_flag if exist_flag: #start compute #try: resu = create_person(event_node,event_primary,en_name,event_index_name) if resu == 'Node Wrong': return 'Node Wrong' weibo_counts,uid_counts=counts(start_ts,end_ts,topic,en_name,keywords) es_event.update(index=event_task_name,doc_type=event_task_type,id=task_id,body={'doc':{'compute_status':-1}}) # es_event.index(index=event_analysis_name,doc_type=event_type,id=task_id,body={'name':topic,'start_ts':start_ts,'end_ts':end_ts,'submit_ts':submit_ts,'compute_status':0,'en_name':task_id,'relation_compute':relation}) task['compute_status']=-1 task['weibo_counts']=weibo_counts task['uid_counts']=uid_counts try: flag = es_event.get(index=event_analysis_name,doc_type=event_type,id=task_id)['_source'] w_counts = flag['weibo_counts']+weibo_counts u_counts = flag['uid_counts']+uid_counts 
es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-1,'weibo_counts':w_counts,'uid_counts':u_counts}}) except: es_event.index(index=event_analysis_name,doc_type=event_type,id=task_id,body=task) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'hashtag_dict':'','topics':'','geo_results':'','real_geo':'','real_auth':'','sentiment_results':'','time_results':'','hashtag':'','real_time':'','user_results':'','real_person':'','keywords_list:''}}) print 'finish change status' if es_event.get(index=event_analysis_name,doc_type=event_type,id=task_id)['_source']['weibo_counts'] == 0: return 1 #geo cityTopic(en_name, start_ts, end_ts) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-2}}) print 'finish geo analyze' #language compute_real_info(en_name, begin_ts, end_ts,relation,task['submit_user'],task['submit_ts']) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-3}}) print 'finish language analyze' #time propagateCronTopic(en_name, start_ts, end_ts) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-4}}) print 'finish time analyze' #sentiment sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts) print 'finish sentiment analyze' #finish compute print es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':1,'finish_ts':int(time.time())}}) print 'finish change status done' print time.time() if('contain' in relation.split('&')): #计算关系 related_event_ids = event_input(keywords,en_name) rel_list = [] for i in related_event_ids: create_person(event_node,event_primary,i,event_index_name) rel_list.append([[2,en_name],'contain',[2,i]]) nodes_rels(rel_list) es_event.update(index=event_task_name,doc_type=event_task_type,id=task_id,body={'doc':{'compute_status':1}}) t2=time.time()-t1 print 
task_id,t2 # except: # raise # break #get_attr(en_name, start_ts, end_ts) # else: # pass return 1
def excel_read(): data = xlrd.open_workbook('events.xlsx') table = data.sheets()[0] # 打开第一张表 nrows = table.nrows # 获取表的行数 for i in range(nrows): if i == 0: # 跳过第一行 continue now_ts = int(time.time()) keywords_list = table.row_values(i)[1].split(' ') keywords = '&'.join(keywords_list) event_type = table.row_values(i)[2] print event_type condition = [] for w in keywords_list: condition.append({'term': {'keywords': w}}) print w condition.append({'term': {'compute_status': 1}}) es_query = {'query': {'bool': {'must': condition}}} res = es_event.search(index=event_task_name, doc_type=event_task_type, \ body=es_query, request_timeout=999999,params={"search_type":"query_and_fetch"}) print res['hits']['hits'] if len(res['hits']['hits']) == 1: en_id = res['hits']['hits'][0]['_id'] es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id, body={'doc': { 'event_type': event_type }}) es_event.update(index=event_analysis_name, doc_type='text', id=en_id, body={'doc': { 'event_type': event_type }}) elif len(res['hits']['hits']) >= 1: en_id = res['hits']['hits'][0]['_id'] es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id, \ body={'doc':{'event_type':event_type}}) try: task_exist = es_event.get(index=event_analysis_name, doc_type='text', id=task_id)['_source'] except: task_exist = {} if task_exist: es_event.update(index=event_analysis_name, doc_type='text', id=en_id, body={'doc': { 'event_type': event_type }}) else: print 'event_result not exist' + en_id print "查询到多个结果!", i print 'END'
def compute_real_info(topic, begin_ts, end_ts, relation, submit_user, submit_ts):
    """Compute and store the event's descriptive metadata.

    Gathers all weibo text for the event in [begin_ts, end_ts), derives the
    event type, real-world facts (organization/place/time/person), topic
    words, keywords and hashtags, and upserts them onto the event-analysis
    document.  Shadows the earlier four-argument compute_real_info.
    submit_user and submit_ts are accepted but unused in this body.
    """
    info_dict = {}
    # all weibo text for the event inside the time window
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic, doc_type=event_text_type,
                             fields=['text'], body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])
    # event type: prefer the label stored on the task doc, else classify text
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    # real facts: reuse values already on the task doc, else extract anew
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['real_auth'] = event['real_auth']
        info_dict['real_geo'] = event['real_geo']
        info_dict['real_time'] = event['real_time']
        info_dict['real_person'] = event['real_person']
    except:
        info_dict = get_real(info_dict, topic, begin_ts, end_ts, relation)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))
    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)
    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
    # upsert onto the event-analysis doc
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type,
                        id=topic, body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type,
                       id=topic, body=info_dict)