def get_user_in_event(event_id):
    result = es_event.get(index=event_analysis_name,
                          doc_type=event_type,
                          id=event_id)["_source"]
    # trend_pusher
    trend_pusher = json.loads(result["trend_pusher"])

    # trend_maker
    trend_maker = json.loads(result["trend_maker"])

    # pagerank
    pagerank = json.loads(result["pagerank"])
    print len(trend_pusher), len(trend_maker), len(pagerank)

    # append every uid involved in the event to a flat text file
    with open("event_user_list.txt", "a") as f:
        for item in trend_pusher:
            f.write(str(item["uid"]) + "\n")

        for item in trend_maker:
            f.write(str(item["uid"]) + "\n")

        for item in pagerank:
            f.write(str(item) + "\n")
def save_ws_results_es(topic, ts, during, n_limit, province, city, weibos):

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    item = {}
    item['en_name'] = topic
    item['end_ts'] = ts
    item['range'] = during
    item['limit'] = n_limit
    item['province'] = province
    item['city'] = city
    item['weibo'] = json.dumps(weibos)

    id = topic + '_' + ts

    # update the document if it already exists, otherwise create it
    try:
        es_event.get(index=index_name, doc_type=index_type, id=id)
        es_event.update(index=index_name, doc_type=index_type, id=id,
                        body={'doc': item})
    except Exception:
        es_event.index(index=index_name, doc_type=index_type, id=id, body=item)
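
# The get-then-update-or-index pattern above recurs throughout this module.
# A minimal sketch of a shared helper, assuming the same global es_event
# client; elasticsearch-py's update API accepts doc_as_upsert, which
# collapses the try/except into a single call:
def upsert_doc(index_name, index_type, doc_id, doc):
    # doc_as_upsert indexes 'doc' as a new document when doc_id is absent,
    # and applies it as a partial update when it exists
    es_event.update(index=index_name,
                    doc_type=index_type,
                    id=doc_id,
                    body={'doc': doc, 'doc_as_upsert': True})
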
def save_results_es(calc,
                    topic,
                    results,
                    during,
                    klimit=TOP_KEYWORDS_LIMIT,
                    wlimit=TOP_WEIBOS_LIMIT):

    if calc == 'time_results':

        id = topic

        try:
            item_exist = es_event.get(index=event_analysis_name,
                                      doc_type=event_type,
                                      id=id)['_source']
            try:
                time_results = json.loads(item_exist['time_results'])
            except:
                time_results = []
            if not isinstance(time_results, list):
                time_results = [time_results]
            time_results.append(results)
            es_event.update(
                index=event_analysis_name,
                doc_type=event_type,
                id=id,
                body={'doc': {
                    'time_results': json.dumps(time_results)
                }})
        except Exception, e:
            # store the first result as a one-element list so later calls
            # can keep appending to it
            es_event.index(index=event_analysis_name,
                           doc_type=event_type,
                           id=id,
                           body={'time_results': json.dumps([results])})
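
# time_results and sentiment_results are both maintained as JSON-encoded
# lists that grow by one entry per computation. A sketch of a shared helper
# for that pattern; the name append_json_list is hypothetical:
def append_json_list(index_name, index_type, doc_id, field, value):
    try:
        source = es_event.get(index=index_name, doc_type=index_type,
                              id=doc_id)['_source']
        items = json.loads(source[field])
        if not isinstance(items, list):
            items = [items]
    except:
        items = []
    items.append(value)
    es_event.update(index=index_name, doc_type=index_type, id=doc_id,
                    body={'doc': {field: json.dumps(items)},
                          'doc_as_upsert': True})
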
def get_user_in_event(event_id):
    result = es_event.get(index=event_analysis_name,
                          doc_type=event_type,
                          id=event_id)["_source"]
    # trend_pusher
    # trend_pusher
    trend_pusher = json.loads(result["trend_pusher"])
    trend_list = [item["uid"] for item in trend_pusher]

    # trend_maker
    trend_maker = json.loads(result["trend_maker"])
    maker_list = [item["uid"] for item in trend_maker]

    # pagerank
    pagerank = json.loads(result["pagerank"])
    print len(trend_pusher), len(trend_maker), len(pagerank)

    create_rel_uid2event(trend_list, event_id, 'ipusher')

    create_rel_uid2event(maker_list, event_id, 'maker')

    create_rel_uid2event(pagerank, event_id, 'join')
Example #5
def immediate_compute(task_id):
    try:
        task = es_event.get(index=event_task_name,
                            doc_type=event_task_type,
                            id=task_id)
        compute_task(task)
    except:
        return None
Example #6
def exist(task_id):
    try:
        task_exist = es_event.get(index=event_task_name,
                                  doc_type=event_task_type,
                                  id=task_id)['_source']
    except:
        task_exist = {}
    return bool(task_exist)
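
# A sketch of the same check via the client's exists API, assuming the
# installed elasticsearch-py exposes it for this cluster version:
def exist_alt(task_id):
    return es_event.exists(index=event_task_name,
                           doc_type=event_task_type,
                           id=task_id)
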
Example #7
def uid_diff():
    # collect every uid in the user_portrait index via a scroll scan
    s_re = scan(es_event, index='user_portrait_0312', doc_type='user')
    uid_list = set()
    for scan_re in s_re:
        uid_list.add(scan_re['_id'])
    print len(uid_list)
    result = es_event.get(index='event_result',
                          doc_type='text',
                          id='bei-jing-fang-jia-zheng-ce-1480176000')['_source']
    event_uid = set(json.loads(result['user_results']).keys())
    print len(event_uid)
    print uid_list - event_uid
    print event_uid - uid_list
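
# uid_diff assumes that scan is elasticsearch.helpers.scan; under that
# assumption, the uid collection reduces to a set comprehension:
from elasticsearch.helpers import scan

def portrait_uid_set():
    # hypothetical helper; same index and doc_type as uid_diff above
    return set(hit['_id'] for hit in
               scan(es_event, index='user_portrait_0312', doc_type='user'))
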
Example #8
def save_rt_results_es(calc,
                       topic,
                       results,
                       during,
                       klimit=TOP_KEYWORDS_LIMIT,
                       wlimit=TOP_WEIBOS_LIMIT):

    #mappings_event_analysis_results(topic)
    index_name = event_analysis_name  #index_event_analysis_results
    index_type = event_type  #type_event_analysis_results

    if calc == 'sentiment_results':

        id = topic

        try:
            item_exist = es_event.get(index=index_name,
                                      doc_type=index_type,
                                      id=id)['_source']
            try:
                sentiment_results = json.loads(item_exist['sentiment_results'])
            except:
                sentiment_results = []
            if not isinstance(sentiment_results, list):
                sentiment_results = [sentiment_results]
            sentiment_results.append(results)
            es_event.update(index=index_name,
                            doc_type=index_type,
                            id=id,
                            body={
                                'doc': {
                                    'sentiment_results':
                                    json.dumps(sentiment_results)
                                }
                            })
        except Exception, e:
            # store the first result as a one-element list so later calls
            # can keep appending to it
            es_event.index(index=index_name,
                           doc_type=index_type,
                           id=id,
                           body={'sentiment_results': json.dumps([results])})
def compute_real_info(topic, begin_ts, end_ts, relation):
    info_dict = {}

    # take the most-retweeted original post whose text starts with a
    # 【headline】 bracket, i.e. a news-style weibo
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'term': {'message_type': 1}},
                    {'wildcard': {'text': '【*】*'}},
                    {'range': {
                        'timestamp': {'gte': begin_ts, 'lt': end_ts}
                    }}
                ]
            }
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    result = es_event.search(index=topic,
                             doc_type=event_text_type,
                             body=query_body)['hits']['hits']
    # extract the people, organizations, places and times of the event
    print result[0]['_source']['text']
    basics = get_news_main(result[0]['_source']['text'])
    print basics
    info_dict['real_auth'] = basics['organization']
    info_dict['real_geo'] = basics['place']
    info_dict['real_time'] = basics['time']
    info_dict['real_person'] = basics['people']

    # store the relations
    if 'join' in relation.split('&'):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary,
                                 info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary,
                                 info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            pass

    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {
                        'timestamp': {'gte': begin_ts, 'lt': end_ts}
                    }}
                ]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic,
                             doc_type=event_text_type,
                             fields=['text'],
                             body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])

    # event type
    try:
        event = es_event.get(index=event_task_name,
                             doc_type=event_task_type,
                             id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))

    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)

    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))

    try:
        es_event.update(index=event_analysis_name,
                        doc_type=event_type,
                        id=topic,
                        body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name,
                       doc_type=event_type,
                       id=topic,
                       body=info_dict)
Example #10
def compute_task(task):
    print task
    # task example: ['雾霾', 'type', '1480003100', '1480176000', '1483500427743']
    task_id = task['_id']
    task = task['_source']
    topic = task['name']
    RUN_TYPE = 1
    if RUN_TYPE == 0:
        start_ts = 1480003200
        begin_ts = 1480003200
        end_ts = 1480176000
    else:
        start_ts = task['start_ts']
        begin_ts = task['start_ts']
        end_ts = task['end_ts']

    # resume from the previous compute timestamp when one was recorded
    try:
        start_ts = task['compute_ts']
        task['compute_ts'] = time.time()
    except:
        task['compute_ts'] = time.time()

    if end_ts > time.time():
        end_ts = time.time()
    submit_ts = task['submit_ts']
    # optional relation computations, encoded as an '&'-joined string
    relation = task['relation_compute']

    keywords = task['keywords'].split('&')  # keywords, or a mid
    en_name = task_id
    t1 = time.time()
    # a 16-digit task id is taken to be a weibo mid
    re_mid = re.compile(r'^\d{16}$')
    try:
        mid = re.match(re_mid, task_id).group()
    except:
        mid = ''
    exist_flag = exist(task_id)
    get_topic_weibo(topic, task_id, start_ts, end_ts, keywords, mid)
    print exist_flag
    if exist_flag:
        # start compute
        resu = create_person(event_node, event_primary, en_name, event_index_name)
        if resu == 'Node Wrong':
            return 'Node Wrong'
        weibo_counts, uid_counts = counts(start_ts, end_ts, topic, en_name, keywords)

        es_event.update(index=event_task_name, doc_type=event_task_type,
                        id=task_id, body={'doc': {'compute_status': -1}})

        task['compute_status'] = -1
        task['weibo_counts'] = weibo_counts
        task['uid_counts'] = uid_counts
        try:
            flag = es_event.get(index=event_analysis_name, doc_type=event_type,
                                id=task_id)['_source']
            w_counts = flag['weibo_counts'] + weibo_counts
            u_counts = flag['uid_counts'] + uid_counts
            es_event.update(index=event_analysis_name, doc_type=event_type,
                            id=task_id,
                            body={'doc': {'compute_status': -1,
                                          'weibo_counts': w_counts,
                                          'uid_counts': u_counts}})
        except:
            es_event.index(index=event_analysis_name, doc_type=event_type,
                           id=task_id, body=task)
            es_event.update(index=event_analysis_name, doc_type=event_type,
                            id=task_id,
                            body={'doc': {'hashtag_dict': '', 'topics': '',
                                          'geo_results': '', 'real_geo': '',
                                          'real_auth': '', 'sentiment_results': '',
                                          'time_results': '', 'hashtag': '',
                                          'real_time': '', 'user_results': '',
                                          'real_person': '', 'keywords_list': ''}})

        print 'finish change status'
        
        if es_event.get(index=event_analysis_name,doc_type=event_type,id=task_id)['_source']['weibo_counts'] == 0:
            return 1

        #geo
        
        cityTopic(en_name, start_ts, end_ts)
        es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-2}})
        print 'finish geo analyze'
        #language
        compute_real_info(en_name, begin_ts, end_ts,relation,task['submit_user'],task['submit_ts'])
        es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-3}})
        print 'finish language analyze'
        #time
        propagateCronTopic(en_name, start_ts, end_ts)
        es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-4}})
        print 'finish time analyze'

        
        #sentiment
        sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
        print 'finish sentiment analyze'
        #finish compute

        print es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':1,'finish_ts':int(time.time())}})
        print 'finish change status done'
        print time.time()
        
        if 'contain' in relation.split('&'):
            # compute containment relations to related events
            related_event_ids = event_input(keywords, en_name)
            rel_list = []
            for i in related_event_ids:
                create_person(event_node, event_primary, i, event_index_name)
                rel_list.append([[2, en_name], 'contain', [2, i]])
            nodes_rels(rel_list)

        es_event.update(index=event_task_name,doc_type=event_task_type,id=task_id,body={'doc':{'compute_status':1}})


    
    t2 = time.time() - t1
    print task_id, t2
    return 1
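
# A sketch of how tasks might be drained from the task index, assuming a
# hypothetical convention that compute_status == 0 marks a pending task;
# the real scheduler is not shown in this file:
def run_pending_tasks():
    query = {'query': {'term': {'compute_status': 0}}, 'size': 10}
    hits = es_event.search(index=event_task_name,
                           doc_type=event_task_type,
                           body=query)['hits']['hits']
    for hit in hits:
        # compute_task expects the raw hit, including '_id' and '_source'
        compute_task(hit)
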
Example #11
def immediate_compute(task_id):
    task = es_event.get(index=event_task_name,
                        doc_type=event_task_type,
                        id=task_id)
    compute_task(task)
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic:
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        # align the end timestamp to a slice boundary and count the slices
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        item_exist = es_event.get(index=event_analysis_name,
                                  doc_type=event_type,
                                  id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}


        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type, ccount maps {city: count}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []
            first_item = {}

            for k, v in mtype_kv.iteritems():  # v denotes retweet / comment / original

                # fetch this slice's weibos of message type v
                query_body = {
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'message_type': v}},
                                {'range': {
                                    'timestamp': {'gte': begin_ts, 'lt': end_ts}
                                }}
                            ]
                        }
                    },
                    'sort': {SORT_FIELD: {"order": "desc"}},
                    'size': n_limit
                }
                mtype_weibo = es_event.search(index=topic,
                                              doc_type=event_text_type,
                                              body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # tally per-province and per-city counts for this message type
                for weibo in mtype_weibo:
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue
                    province, city = split_city(geo)

                    if province != 'unknown':
                        prov_count = geo_result.setdefault(v, {}).setdefault(province, {})
                        prov_count[city] = prov_count.get(city, 0) + 1
                        prov_count['total'] = prov_count.get('total', 0) + 1

        save_rt_results_es(topic, geo_result)

        return geo_result
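
# A minimal, self-contained illustration of the nested setdefault tally used
# in cityTopic, with made-up message types and places rather than data from
# the event index:
def _demo_geo_tally():
    geo_result = {}
    samples = [(1, 'beijing', 'haidian'),
               (1, 'beijing', 'chaoyang'),
               (3, 'hubei', 'wuhan')]
    for mtype, province, city in samples:
        prov_count = geo_result.setdefault(mtype, {}).setdefault(province, {})
        prov_count[city] = prov_count.get(city, 0) + 1
        prov_count['total'] = prov_count.get('total', 0) + 1
    # -> {1: {'beijing': {'haidian': 1, 'chaoyang': 1, 'total': 2}},
    #     3: {'hubei': {'wuhan': 1, 'total': 1}}}
    print geo_result
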
def excel_read():
    data = xlrd.open_workbook('events.xlsx')
    table = data.sheets()[0]  # open the first sheet
    nrows = table.nrows  # number of rows in the sheet

    for i in range(nrows):

        if i == 0:  # skip the header row
            continue

        now_ts = int(time.time())
        keywords_list = table.row_values(i)[1].split(' ')
        keywords = '&'.join(keywords_list)
        event_type = table.row_values(i)[2]
        print event_type
        condition = []
        for w in keywords_list:
            condition.append({'term': {'keywords': w}})
            print w

        condition.append({'term': {'compute_status': 1}})
        es_query = {'query': {'bool': {'must': condition}}}

        res = es_event.search(index=event_task_name, doc_type=event_task_type, \
            body=es_query, request_timeout=999999,params={"search_type":"query_and_fetch"})
        print res['hits']['hits']

        if len(res['hits']['hits']) == 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name,
                            doc_type=event_task_type,
                            id=en_id,
                            body={'doc': {
                                'event_type': event_type
                            }})
            es_event.update(index=event_analysis_name,
                            doc_type='text',
                            id=en_id,
                            body={'doc': {
                                'event_type': event_type
                            }})
        elif len(res['hits']['hits']) > 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id,  \
                body={'doc':{'event_type':event_type}})
            try:
                task_exist = es_event.get(index=event_analysis_name,
                                          doc_type='text',
                                          id=en_id)['_source']
            except:
                task_exist = {}
            if task_exist:
                es_event.update(index=event_analysis_name,
                                doc_type='text',
                                id=en_id,
                                body={'doc': {
                                    'event_type': event_type
                                }})
            else:
                print 'event_result not exist ' + en_id
            print 'multiple results matched!', i

    print 'END'
Example #14
def compute_real_info(topic, begin_ts, end_ts, relation, submit_user,
                      submit_ts):
    info_dict = {}
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic,
                             doc_type=event_text_type,
                             fields=['text'],
                             body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])

    # event type and previously computed basics, fetched once from the task doc
    try:
        event = es_event.get(index=event_task_name,
                             doc_type=event_task_type,
                             id=topic)['_source']
    except:
        event = {}

    try:
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)

    try:
        info_dict['real_auth'] = event['real_auth']
        info_dict['real_geo'] = event['real_geo']
        info_dict['real_time'] = event['real_time']
        info_dict['real_person'] = event['real_person']
    except:
        info_dict = get_real(info_dict, topic, begin_ts, end_ts, relation)

    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))

    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)

    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))

    try:
        es_event.update(index=event_analysis_name,
                        doc_type=event_type,
                        id=topic,
                        body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name,
                       doc_type=event_type,
                       id=topic,
                       body=info_dict)
Example #15
                    },
                    "first_compute": {
                        "type": "long"
                    },
                    "immediate_compute": {
                        "type": "long"
                    }
                }
            }
        }
    }

    if not es.indices.exists(index=event_analysis_name):

        print es.indices.create(index=event_analysis_name,
                                body=index_info,
                                ignore=400)

    return '1'


if __name__ == "__main__":

    #mappings_event_analysis_results()
    a = es.get(index='event_result',
               doc_type='text',
               id='xiang-gang-qian-zong-du-qian-ze-liang-you-er-ren-1482126431'
               )['_source']['time_results']
    print json.loads(a)
    print type(json.loads(a))