def propagateCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, \ save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode( 'utf-8') mtype_count = {} mtype_kcount = {} # mtype_kcount={mtype:[terms]} mtype_weibo = {} # mtype_weibo={mtype:weibo} query_dict = {'timestamp': {'$gt': begin_ts, '$lt': end_ts}} for k, v in mtype_kv.iteritems(): query_dict['message_type'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=fields_list) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) mtype_count[v] = [end_ts, count] mtype_kcount[v] = [end_ts, kcount] mtype_weibo[v] = [end_ts, top_ws] save_pc_results(topic, mtype_count, during) save_kc_results(topic, mtype_kcount, during, k_limit) save_ws_results(topic, mtype_weibo, during, w_limit)
def propagateCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, \ save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8') mtype_count = {} mtype_kcount = {} # mtype_kcount={mtype:[terms]} mtype_weibo = {} # mtype_weibo={mtype:weibo} query_dict = { 'timestamp': {'$gt': begin_ts, '$lt': end_ts} } for k, v in mtype_kv.iteritems(): query_dict['message_type'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=fields_list) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) mtype_count[v] = [end_ts, count] mtype_kcount[v] = [end_ts, kcount] mtype_weibo[v] = [end_ts, top_ws] save_pc_results(topic, mtype_count, during) save_kc_results(topic, mtype_kcount, during, k_limit) save_ws_results(topic, mtype_weibo, during, w_limit)
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    """Aggregate the geographic distribution of a topic's weibos per message type.

    Returns a dict with:
      'geo_weibos'    : mtype -> list of [province, city, weibo] (first n_limit geo-tagged hits)
      'geo_cityCount' : mtype -> {province: {city: count, ..., 'total': count}}

    FIX: `province_dict` was previously created once before the message-type
    loop, so counts from different message types accumulated into one shared
    dict and every `geo_cityCount[v]` aliased that same object. It is now
    reset per message type.
    """
    if topic and topic != '':
        geo_cityTopic_results = {'geo_weibos': {}, 'geo_cityCount': {}}
        for k, v in mtype_kv.iteritems():  # v: message type (retweet / comment / original)
            province_dict = {}  # per-type counts: {province: {city: n, 'total': n}}
            query_body = {  # fetch this type's weibos in [start_ts, over_ts)
                'query': {
                    'bool': {
                        'must': [
                            {'term': {'message_type': v}},
                            {'range': {'timestamp': {'gte': start_ts, 'lt': over_ts}}}
                        ]
                    }
                },
                'sort': {SORT_FIELD: {"order": "desc"}},
                'size': 10000000
            }
            mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type,
                                          body=query_body)['hits']['hits']
            if len(mtype_weibo) == 0:
                continue
            count_i = 0
            for weibo in mtype_weibo:  # for each weibo of this type
                count_i += 1
                try:
                    geo = weibo['_source']['geo'].encode('utf8')
                except:
                    continue  # best-effort: skip weibos without a usable geo field
                province, city = split_city(geo)
                if count_i <= n_limit:
                    # Keep the first n_limit weibos (ES hits are sorted desc).
                    geo_cityTopic_results['geo_weibos'].setdefault(v, []).append(
                        [province, city, weibo])
                if province != 'unknown':
                    counts = province_dict.setdefault(province, {})
                    counts[city] = counts.get(city, 0) + 1
                    counts['total'] = counts.get('total', 0) + 1
            geo_cityTopic_results['geo_cityCount'][v] = province_dict
        return geo_cityTopic_results
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during #topics = topic.strip().split(',') for i in range(interval, 0, -1): mtype_ccount = {} # mtype为message_type,ccount为{city:count} begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, topic weibos = [] first_item = {} for k, v in mtype_kv.iteritems(): #v代表转发、评论、原创 province_dict = {} city_dict = {} query_body = { #按message_type得到微博 'query': { 'bool': { 'must': [{ 'term': { 'message_type': v } }, { 'range': { 'timestamp': { 'gte': begin_ts, 'lt': end_ts } } }] } }, 'sort': { SORT_FIELD: { "order": "desc" } }, 'size': n_limit } mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits'] #save_ws_results(topic, end_ts, during, n_limit, mtype_weibo) #微博直接保存下来 if len(mtype_weibo) == 0: continue first_item = mtype_weibo[0]['_source'] #数每个地方的不同类型的数量 for weibo in mtype_weibo: #对于每条微博 try: geo = weibo['_source']['geo'].encode('utf8') except: continue #print geo,type(geo) province, city = split_city(geo) #print province,city if province != 'unknown': try: province_dict[province][city] += 1 province_dict[province]['total'] += 1 except: province_dict[province] = {} province_dict[province][city] = 1 province_dict[province]['total'] = 1 save_ws_results(topic, end_ts, during, n_limit, province, city, weibo) # try: # city_dict[city] += 1 # except: # city_dict[city] = 1 # try: # province_dict[province].append(city_dict) # except: # province_dict[province] = [] # province_dict[province].append(city_dict) # try: # province_dict[province] += 1 # except: # province_dict[province] = 1 # sorted_province_dict = sorted(province_dict.items(), key=lambda x: x[0], reverse=False)[:n_limit] #就是x[0] # sorted_city_dict = sorted(city_dict.items(), key=lambda x: x[0], 
reverse=False)[:n_limit] # print sorted_province_dict # print sorted_city_dict ccount = province_dict # ccount['province'] = sorted_province_dict # ccount['city'] = sorted_city_dict mtype_ccount[v] = [ end_ts, ccount ] #{'message_type':[shijian,{['province':('provice':cishu),()],'city':[(city:cishu)}]} #print mtype_ccount save_rt_results(topic, mtype_ccount, during, first_item)
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    """Accumulate a topic's per-province/city weibo counts by message type on
    top of the counts already stored in the event document, persist them with
    save_rt_results_es, and return the merged mapping
    {mtype: {province: {city: n, ..., 'total': n}}}.

    FIX: the old five-level bare-except cascade had two defects: its innermost
    branch rebound `geo_result = {v: ...}`, silently discarding every count
    accumulated so far (including those loaded from the stored JSON), and one
    fallback branch could itself raise on a missing 'total' key. Replaced with
    setdefault/get, which never discards data.

    NOTE(review): if the stored 'geo_results' JSON round-trips integer mtype
    keys, json.loads returns them as strings while `v` may be an int, so old
    and new counts could land under different keys — verify mtype_kv's value
    types against the stored document.
    """
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = ts2HourlyTime(int(over_ts), during)  # align to window boundary
        interval = (over_ts - start_ts) / during
        item_exist = es_event.get(index=event_analysis_name, doc_type=event_type,
                                  id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}  # best-effort: start fresh if no stored results
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            for k, v in mtype_kv.iteritems():  # v: message type (retweet / comment / original)
                query_body = {  # fetch this type's weibos inside the window
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'message_type': v}},
                                {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                            ]
                        }
                    },
                    'sort': {SORT_FIELD: {"order": "desc"}},
                    'size': n_limit
                }
                mtype_weibo = es_event.search(index=topic, doc_type=event_text_type,
                                              body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                for weibo in mtype_weibo:  # count per-location totals per type
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue  # best-effort: skip weibos without a usable geo field
                    province, city = split_city(geo)
                    if province != 'unknown':
                        by_province = geo_result.setdefault(v, {}).setdefault(province, {})
                        by_province[city] = by_province.get(city, 0) + 1
                        by_province['total'] = by_province.get('total', 0) + 1
            # Persist cumulative results after each window (idempotent overwrite).
            save_rt_results_es(topic, geo_result)
        return geo_result