def results_gen(r, topic):
    # Builds {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}
    location_dict = {}
    message_type = r['message_type']
    if message_type == 3:  # repost
        try:
            # a four-part dotted value is treated as an IP address, otherwise as a geo string
            if len(r['geo'].split('.')) == 4:
                repost_location = IP2city(r['geo'])
            else:
                repost_location = geo2city(r['geo'])
        except:
            return None
        if check_location([repost_location]):  # skip items whose location cannot be resolved
            if r['retweeted_mid']:  # skip items with an incomplete retweeted_mid
                item = xapian_search.search_by_id(r['retweeted_mid'], fields=['geo', '_id'])
                if item:
                    try:
                        if len(item['geo'].split('.')) == 4:
                            origin_location = IP2city(item['geo'])
                        else:
                            origin_location = geo2city(item['geo'])
                    except:
                        return None
                    if check_location([origin_location]):
                        location_dict['original'] = 0
                        location_dict['mid'] = r['_id']
                        location_dict['topic'] = topic
                        location_dict['ts'] = r['timestamp']
                        location_dict['origin_location'] = origin_location.split('\t')[1]
                        location_dict['repost_location'] = repost_location.split('\t')[1]
                        return location_dict
    elif message_type == 1:  # original post
        try:
            if len(r['geo'].split('.')) == 4:
                origin_location = IP2city(r['geo'])
            else:
                origin_location = geo2city(r['geo'])
        except:
            return None
        if check_location([origin_location]):
            location_dict['original'] = 1
            location_dict['mid'] = r['_id']
            location_dict['topic'] = topic
            location_dict['ts'] = r['timestamp']
            location_dict['origin_location'] = origin_location.split('\t')[1]
            location_dict['repost_location'] = None
            return location_dict
    return None
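# --- Hypothetical usage sketch (not part of the original module) ---
# Shows how results_gen might be driven over a Xapian result set: each raw
# record is reduced to one location row, and unresolvable items (for which
# results_gen returns None) are dropped. collect_location_rows and the
# fields list are illustrative assumptions; the (count, results) return
# shape of .search() matches its use in cityCronTopic below.
def collect_location_rows(xapian_search_weibo, query_dict, topic):
    rows = []
    count, weibo_results = xapian_search_weibo.search(
        query=query_dict,
        fields=['_id', 'message_type', 'retweeted_mid', 'geo', 'timestamp'])
    for r in weibo_results():
        row = results_gen(r, topic)
        if row:
            rows.append(row)
    return rows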
def readAcum(stylenum, topic, start_ts, end_ts, during):
    pointnum = (end_ts - start_ts) / during  # number of time points
    spatial_dict = {}
    spatial_info_list = []
    global_max_count = 0
    global_first_timestamp = end_ts
    global_first_city = ''
    for i in range(pointnum + 1):
        end_ts = start_ts + during * i
        max_count = 0
        first_item, city_count = Pcount(end_ts, during, stylenum, topic)
        # accumulate: add each city's most recent previously accumulated count
        for city in city_count:
            j = i
            while j > 0:
                previous_data = spatial_info_list[j - 1]
                if city in previous_data:
                    city_count[city] += previous_data[city]
                    break
                j -= 1
        # carry forward cities that appeared before but not in this window
        if i > 0:
            previous_data = spatial_info_list[i - 1]
            for city in previous_data:
                if city not in city_count:
                    city_count[city] = previous_data[city]
        if city_count.values():
            max_count = max(city_count.values())
        if global_max_count < max_count:
            global_max_count = max_count
        spatial_info_list.append(city_count)
        topic_spatial_info = province_color_map(city_count)
        spatial_dict[str(end_ts)] = topic_spatial_info  # spatial_dict = {end_ts: map_data}
        try:
            # track the earliest weibo seen across all windows
            if first_item['timestamp'] <= global_first_timestamp:
                global_first_timestamp = first_item['timestamp']
                global_first_city = geo2city(first_item['geo'])
        except KeyError:
            pass
    return global_max_count, spatial_dict, global_first_city
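# --- Illustrative sketch of readAcum's accumulation rule (assumed semantics,
# not project code) --- at each time point a city's cumulative count is its
# count in the current window plus its most recent accumulated value, and
# cities missing from the current window are carried forward unchanged.
def accumulate(snapshots):
    acc = []
    for i, window in enumerate(snapshots):
        current = dict(window)
        for city in current:
            j = i
            while j > 0:  # walk back to the latest snapshot containing this city
                if city in acc[j - 1]:
                    current[city] += acc[j - 1][city]
                    break
                j -= 1
        if i > 0:  # carry forward cities absent from this window
            for city in acc[i - 1]:
                current.setdefault(city, acc[i - 1][city])
        acc.append(current)
    return acc

# accumulate([{'北京': 2}, {'上海': 1}, {'北京': 3}])
# -> [{'北京': 2}, {'上海': 1, '北京': 2}, {'北京': 5, '上海': 1}]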
def get_city_weibo(topic, start_ts, end_ts, unit=MinInterval, limit=TOP_WEIBOS_LIMIT):
    weibos = []
    if end_ts - start_ts < unit:
        # the range fits in one bucket: read the single record whose bucket
        # boundary covers end_ts
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        item = db.session.query(CityWeibos).filter(
            CityWeibos.end == upbound,
            CityWeibos.topic == topic,
            CityWeibos.range == unit,
            CityWeibos.limit == limit).first()
        if item:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                weibos.append((weibo_item['reposts_count'], weibo_item))
    else:
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        lowbound = (start_ts / unit) * unit
        items = db.session.query(CityWeibos).filter(
            CityWeibos.end > lowbound,
            CityWeibos.end <= upbound,
            CityWeibos.topic == topic,
            CityWeibos.range == unit,
            CityWeibos.limit == limit).all()
        for item in items:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                try:
                    weibos.append((weibo_item['reposts_count'], weibo_item))
                except KeyError:
                    continue
    sorted_weibos = sorted(weibos, key=lambda k: k[0], reverse=True)
    city_dict = {}
    k = 0
    for reposts_count, result in sorted_weibos:
        k += 1
        if k > 1000:  # cap the number of weibos processed
            break
        uid = result['user']
        user_info = acquire_user_by_id(uid)
        if user_info:
            result['username'] = user_info['name']
        else:
            result['username'] = '******'
        result['time'] = ts2date(result['timestamp'])
        try:
            # a four-part dotted value is treated as an IP address
            if len(result['geo'].split('.')) == 4:
                full_area = IP2city(result['geo'])
                result['geo'] = full_area
                city = full_area.split('\t')[1]
            else:
                city = geo2city(result['geo']).split('\t')[1]
        except:
            city = ''
        result['weibo_link'] = weiboinfo2url(result['user'], result['_id'])
        if city in province_list:
            city_dict.setdefault(city, []).append(result)
    return city_dict
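# --- Hypothetical caller sketch (illustrative only) --- fetches per-province
# top weibos for one topic over the last hour; the one-hour window and the
# summary format are assumptions, get_city_weibo comes from this module.
import time

def print_city_tops(topic):
    end_ts = int(time.time())
    start_ts = end_ts - 3600
    city_dict = get_city_weibo(topic, start_ts, end_ts)
    for city, posts in city_dict.items():
        # posts within a province keep the global reposts_count ordering
        print('%s: %d weibos, top reposts_count %d' %
              (city, len(posts), posts[0]['reposts_count']))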
def cityCronTopic(topic, xapian_search_weibo, start_ts, over_ts,
                  during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic:
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is message_type; ccount is {city: count}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            }
            for k, v in mtype_kv.iteritems():
                ccount = {}
                first_timestamp = end_ts
                first_item = {}
                query_dict['message_type'] = v
                # weibo_results matches weibos within the window for this
                # topic and message_type
                count, weibo_results = xapian_search_weibo.search(query=query_dict,
                                                                  fields=fields_list)
                for weibo_result in weibo_results():
                    if weibo_result['timestamp'] <= first_timestamp:
                        first_timestamp = weibo_result['timestamp']
                        first_item = weibo_result
                    try:
                        # a four-part dotted value is treated as an IP address
                        if len(weibo_result['geo'].split('.')) == 4:
                            city = IP2city(weibo_result['geo'])
                        else:
                            city = geo2city(weibo_result['geo'])
                    except:
                        continue
                    if not city:
                        continue
                    try:
                        ccount[city] += 1
                    except KeyError:
                        ccount[city] = 1
                    if v == 1 or v == 3:  # store only originals and reposts
                        weibos.append(weibo_result)
                mtype_ccount[v] = [end_ts, ccount]
            save_rt_results(topic, mtype_ccount, during, first_item)
            sorted_weibos = sorted(weibos, key=lambda k: k[SORT_FIELD], reverse=True)
            sorted_weibos = sorted_weibos[:n_limit]
            save_ws_results(topic, end_ts, during, n_limit, sorted_weibos)
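# --- Hypothetical cron-driver sketch (illustrative only) --- runs
# cityCronTopic for each tracked topic over the last 24 hours in
# Fifteenminutes buckets; the topic list and the window are assumptions, and
# the Xapian searcher is built elsewhere in this project and passed in.
import time

def run_city_cron(xapian_search_weibo, tracked_topics):
    over_ts = int(time.time())
    start_ts = over_ts - 24 * 3600
    for topic in tracked_topics:
        cityCronTopic(topic, xapian_search_weibo, start_ts, over_ts,
                      during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT)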