def geo_list(r, topic):
    """Build the location record for a single weibo document.

    The record has the shape
    ``{original, mid, topic, ts, origin_location, repost_location}``.
    Locations are stored as ``str(split_city(geo))``, i.e. the string form
    of whatever ``split_city`` returns.

    Args:
        r: weibo document; the keys ``message_type``, ``geo``, ``mid``,
            ``timestamp`` and (for reposts) ``root_mid`` are read.
        topic: topic name; also used as the index searched for the root
            weibo of a repost.

    Returns:
        The record dict, or ``None`` when a location cannot be resolved,
        is unknown, or (for a repost) the root weibo cannot be found.

    NOTE(review): the original line structure of this function was lost;
    the branch nesting below is reconstructed from the statement order.
    """
    if r['message_type'] != 3:
        # Anything that is not a repost only carries its own location.
        raw_geo = r['geo'].encode('utf8')
        try:
            own_location = str(split_city(raw_geo))
        except:
            return None
        # str(('unknown', 'unknown')) has 'un' at positions 2-3.
        if own_location[2:4] == 'un':
            return None
        return {
            'original': 1,
            'mid': r['mid'],
            'topic': topic,
            'ts': r['timestamp'],
            'origin_location': own_location,
            'repost_location': None,
        }

    # Repost (message_type == 3): resolve the repost location first.
    raw_geo = r['geo'].encode('utf8')
    try:
        repost_location = str(split_city(raw_geo))  # stringified result
    except:
        return None

    root_mid = r['root_mid']
    if not root_mid:
        return None

    # Look up the root weibo by mid to obtain the origin location.
    root_query = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'mid': root_mid
                    }
                }
            }
        }
    }
    root_hits = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=root_query)['hits']['hits']
    if root_hits == []:
        return None

    try:
        root_geo = root_hits[0]['_source']['geo'].encode('utf8')
        origin_location = str(split_city(root_geo))
    except:
        return None

    # Both ends must be known; 'un' at [2:4] marks an 'unknown' location.
    if repost_location[2:4] == 'un' or origin_location[2:4] == 'un':
        return None
    return {
        'original': 0,
        'mid': r['mid'],
        'topic': topic,
        'ts': r['timestamp'],
        'origin_location': origin_location,
        'repost_location': repost_location,
    }
def cityTopic(uids_list, flow_text_index_name, n_limit=TOP_WEIBOS_LIMIT):
    """Count weibos per province and city for a list of users.

    Searches ``flow_text_index_name`` for documents whose ``uid`` is in
    ``uids_list`` (sorted by ``SORT_FIELD`` descending, at most ``n_limit``
    hits) and tallies the result of ``split_city`` on each hit's ``geo``.

    Args:
        uids_list: uid values used in the ``terms`` filter.
        flow_text_index_name: index to search; a falsy value skips the work.
        n_limit: maximum number of hits requested from the search.

    Returns:
        ``{province: {city: count, ..., 'total': count}}``; hits whose
        province is ``'unknown'`` or whose geo cannot be read are skipped.
        ``None`` when ``flow_text_index_name`` is falsy.

    NOTE(review): the original line structure was lost; the placement of
    the final assignment/return is reconstructed -- confirm.
    """
    if not flow_text_index_name:
        return None

    search_body = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        'uid': uids_list
                    }
                }
            }
        },
        'sort': {SORT_FIELD: {"order": "desc"}},
        'size': n_limit
    }
    hits = es_flow_text.search(
        index=flow_text_index_name,
        doc_type=flow_text_index_type,
        body=search_body)['hits']['hits']

    counts_by_province = {}
    for hit in hits:  # one iteration per weibo
        try:
            geo = hit['_source']['geo'].encode('utf8')
            print('geo')
            print(geo)
        except:
            continue

        province, city = split_city(geo)
        if province == 'unknown':
            continue

        # Per-province bucket holds one counter per city plus 'total'.
        city_counts = counts_by_province.setdefault(province, {})
        city_counts[city] = city_counts.get(city, 0) + 1
        city_counts['total'] = city_counts.get('total', 0) + 1

    return counts_by_province
def cityTopic(flow_text_index_name, n_limit=TOP_WEIBOS_LIMIT):
    """Count weibos per province and city over a whole index.

    Runs a ``match_all`` search on ``flow_text_index_name`` (sorted by
    ``SORT_FIELD`` descending, at most ``n_limit`` hits) and tallies the
    result of ``split_city`` on each hit's ``geo``.

    Args:
        flow_text_index_name: index to search; a falsy value skips the work.
        n_limit: maximum number of hits requested from the search.

    Returns:
        ``{province: {city: count, ..., 'total': count}}``; an empty dict
        when the search returns no hits. Hits whose province is
        ``'unknown'`` or whose geo cannot be read are skipped. ``None``
        when ``flow_text_index_name`` is falsy.

    NOTE(review): the original line structure was lost; the placement of
    the final assignment/return is reconstructed -- confirm.
    """
    if not flow_text_index_name:
        return None

    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {SORT_FIELD: {"order": "desc"}},
        'size': n_limit
    }
    hits = weibo_es.search(
        index=flow_text_index_name,
        doc_type=flow_text_index_type,
        body=query_body)['hits']['hits']

    # The previous version read hits[0] into an unused variable, which
    # raised IndexError on an empty result; an empty result now simply
    # yields an empty count dict.
    province_dict = {}
    for weibo in hits:  # one iteration per weibo
        try:
            geo = weibo['_source']['geo'].encode('utf8')
        except (KeyError, AttributeError, TypeError, UnicodeError):
            # Missing or unusable geo: skip this weibo.
            continue

        province, city = split_city(geo)
        if province == 'unknown':
            continue

        # Per-province bucket holds one counter per city plus 'total'.
        city_counts = province_dict.setdefault(province, {})
        city_counts[city] = city_counts.get(city, 0) + 1
        city_counts['total'] = city_counts.get('total', 0) + 1

    return province_dict
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes,
              n_limit=TOP_WEIBOS_LIMIT):
    """Collect geo-tagged weibos and province/city counts per message type.

    For every message type in ``mtype_kv`` the ``topic`` index is searched
    for weibos with ``start_ts <= timestamp < over_ts`` (sorted by
    ``SORT_FIELD`` descending).

    Args:
        topic: index name to search; a falsy value skips the work.
        start_ts: inclusive lower bound of the timestamp range.
        over_ts: exclusive upper bound of the timestamp range.
        during: not used by this implementation; kept for compatibility.
        n_limit: only hits among the first ``n_limit`` of each type are
            kept in ``geo_weibos``.

    Returns:
        ``{'geo_weibos': {mtype: [[province, city, hit], ...]},
        'geo_cityCount': {mtype: {province: {city: n, 'total': n}}}}``.
        Message types without hits have no entry. ``None`` when ``topic``
        is falsy.

    NOTE(review): the original line structure was lost; loop nesting is
    reconstructed from statement order -- confirm.
    """
    if not topic:
        return None

    geo_cityTopic_results = {'geo_weibos': {}, 'geo_cityCount': {}}

    for message_type in mtype_kv.values():  # repost / comment / original
        query_body = {  # weibos of this message type inside the window
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'message_type': message_type
                        }
                    }, {
                        'range': {
                            'timestamp': {
                                'gte': start_ts,
                                'lt': over_ts
                            }
                        }
                    }]
                }
            },
            'sort': {
                SORT_FIELD: {
                    "order": "desc"
                }
            },
            'size': 10000000
        }
        hits = weibo_es.search(
            index=topic, doc_type=weibo_index_type,
            body=query_body)['hits']['hits']
        if not hits:
            continue

        # A fresh dict per message type. Previously a single dict was
        # shared by every type, so each geo_cityCount entry aliased the
        # same object and held counts summed over all types.
        province_dict = {}
        for position, weibo in enumerate(hits, 1):
            try:
                geo = weibo['_source']['geo'].encode('utf8')
            except (KeyError, AttributeError, TypeError, UnicodeError):
                # Missing or unusable geo: skip (position still advances).
                continue

            province, city = split_city(geo)

            if position <= n_limit:
                geo_cityTopic_results['geo_weibos'].setdefault(
                    message_type, []).append([province, city, weibo])

            if province != 'unknown':
                city_counts = province_dict.setdefault(province, {})
                city_counts[city] = city_counts.get(city, 0) + 1
                city_counts['total'] = city_counts.get('total', 0) + 1

        geo_cityTopic_results['geo_cityCount'][message_type] = province_dict

    return geo_cityTopic_results
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes,
              n_limit=TOP_WEIBOS_LIMIT):
    """Compute and persist province/city counts per time slice.

    The range up to ``ts2HourlyTime(over_ts, during)`` is cut into slices
    of ``during`` seconds. For every slice and every message type in
    ``mtype_kv`` the ``topic`` index is searched (at most ``n_limit`` hits,
    sorted by ``SORT_FIELD`` descending); each geo-tagged hit is passed to
    ``save_ws_results`` and the per-slice counts to ``save_rt_results``.

    Args:
        topic: index name to search; a falsy value skips the work.
        start_ts: start of the range (converted with ``int``).
        over_ts: end of the range (converted with ``int`` and then
            ``ts2HourlyTime``).
        during: slice length.
        n_limit: maximum number of hits requested per search.

    Returns:
        None. Results are handed to ``save_ws_results`` and
        ``save_rt_results``.

    NOTE(review): the original line structure was lost; loop nesting is
    reconstructed from statement order -- confirm.
    """
    if not topic:
        return

    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) // during

    for i in range(interval, 0, -1):
        mtype_ccount = {}  # {message_type: [end_ts, {province: {city: n}}]}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print('%s %s %s' % (begin_ts, end_ts, topic))
        first_item = {}

        for message_type in mtype_kv.values():  # repost/comment/original
            query_body = {  # weibos of this message type in the slice
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'message_type': message_type
                            }
                        }, {
                            'range': {
                                'timestamp': {
                                    'gte': begin_ts,
                                    'lt': end_ts
                                }
                            }
                        }]
                    }
                },
                'sort': {
                    SORT_FIELD: {
                        "order": "desc"
                    }
                },
                'size': n_limit
            }
            hits = weibo_es.search(
                index=topic, doc_type=weibo_index_type,
                body=query_body)['hits']['hits']
            if not hits:
                continue
            first_item = hits[0]['_source']

            province_dict = {}
            for weibo in hits:
                try:
                    geo = weibo['_source']['geo'].encode('utf8')
                except (KeyError, AttributeError, TypeError, UnicodeError):
                    # Missing or unusable geo: skip this weibo.
                    continue

                province, city = split_city(geo)
                if province != 'unknown':
                    # Count without discarding earlier data. The previous
                    # try/except replaced the whole province entry with a
                    # new dict whenever an unseen city showed up, wiping
                    # the counts already gathered for that province.
                    city_counts = province_dict.setdefault(province, {})
                    city_counts[city] = city_counts.get(city, 0) + 1
                    city_counts['total'] = city_counts.get('total', 0) + 1

                # NOTE(review): assumed to run for every geo-tagged weibo,
                # including 'unknown' provinces -- confirm against the
                # original indentation.
                save_ws_results(topic, end_ts, during, n_limit,
                                province, city, weibo)

            mtype_ccount[message_type] = [end_ts, province_dict]

        save_rt_results(topic, mtype_ccount, during, first_item)
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes,
              n_limit=TOP_WEIBOS_LIMIT):
    """Accumulate province/city counts for an event and store them.

    Loads the existing ``geo_results`` JSON of the ``topic`` document from
    ``event_analysis_name`` (falling back to an empty dict), then, for each
    ``during``-sized slice up to ``ts2HourlyTime(over_ts, during)`` and
    each message type in ``mtype_kv``, searches the ``topic`` index (at
    most ``n_limit`` hits, sorted by ``SORT_FIELD`` descending) and adds
    every geo-tagged hit to the counts.

    Args:
        topic: event id and index name; a falsy value skips the work.
        start_ts: start of the range (converted with ``int``).
        over_ts: end of the range (converted with ``int`` and then
            ``ts2HourlyTime``).
        during: slice length.
        n_limit: maximum number of hits requested per search.

    Returns:
        ``{message_type: {province: {city: n, ..., 'total': n}}}`` after it
        has been passed to ``save_rt_results_es``; ``None`` when ``topic``
        is falsy.

    NOTE(review): the original line structure was lost; the save/return
    are assumed to run once after all slices -- confirm.
    NOTE(review): keys loaded from JSON are strings; if the values of
    ``mtype_kv`` are ints, stored counts will not be matched -- verify.
    """
    if not topic:
        return None

    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) // during

    item_exist = es_event.get(
        index=event_analysis_name, doc_type=event_type,
        id=topic)['_source']
    try:
        geo_result = json.loads(item_exist['geo_results'])
    except (KeyError, TypeError, ValueError):
        # No stored result yet, or it is not valid JSON text.
        geo_result = {}
    if not isinstance(geo_result, dict):
        geo_result = {}

    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during

        for message_type in mtype_kv.values():  # repost/comment/original
            query_body = {  # weibos of this message type in the slice
                'query': {
                    'bool': {
                        'must': [
                            {'term': {'message_type': message_type}},
                            {'range': {
                                'timestamp': {'gte': begin_ts, 'lt': end_ts}
                            }}
                        ]
                    }
                },
                'sort': {SORT_FIELD: {"order": "desc"}},
                'size': n_limit
            }
            hits = es_event.search(
                index=topic, doc_type=event_text_type,
                body=query_body)['hits']['hits']
            if not hits:
                continue

            for weibo in hits:
                try:
                    geo = weibo['_source']['geo'].encode('utf8')
                except (KeyError, AttributeError, TypeError, UnicodeError):
                    # Missing or unusable geo: skip this weibo.
                    continue

                province, city = split_city(geo)
                if province == 'unknown':
                    continue

                # Explicit lookups replace the former cascade of nested
                # bare excepts; malformed (non-dict) levels are replaced,
                # a missing 'total' no longer resets existing counters.
                type_counts = geo_result.get(message_type)
                if not isinstance(type_counts, dict):
                    type_counts = geo_result[message_type] = {}
                city_counts = type_counts.get(province)
                if not isinstance(city_counts, dict):
                    city_counts = type_counts[province] = {}
                city_counts[city] = city_counts.get(city, 0) + 1
                city_counts['total'] = city_counts.get('total', 0) + 1

    save_rt_results_es(topic, geo_result)
    return geo_result