Example #1
def results_gen(r, topic):
    # {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}
    location_dict = {}
    message_type = r['message_type']
    if message_type == 3:  # repost
        # print 'retweeted_mid', r['retweeted_mid']
        try:
            if (len(r['geo'].split('.')) == 4):
                repost_location = IP2city(r['geo'])
            else:
                repost_location = geo2city(r['geo'])
        except:
            return None
        if check_location([repost_location]):  # filter out items whose location cannot be resolved
            if r['retweeted_mid']:  # filter out items with an incomplete retweeted_mid
                item = xapian_search.search_by_id(r['retweeted_mid'],
                                                  fields=['geo', '_id'])
                if item:
                    try:
                        if (len(item['geo'].split('.')) == 4):
                            origin_location = IP2city(item['geo'])
                        else:
                            origin_location = geo2city(item['geo'])
                    except:
                        return None
                    if check_location([origin_location]):
                        location_dict['original'] = 0
                        location_dict['mid'] = r['_id']
                        location_dict['topic'] = topic
                        location_dict['ts'] = r['timestamp']
                        location_dict[
                            'origin_location'] = origin_location.split('\t')[1]
                        location_dict[
                            'repost_location'] = repost_location.split('\t')[1]
                        return location_dict

    elif message_type == 1:  # original post
        try:
            if (len(r['geo'].split('.')) == 4):
                origin_location = IP2city(r['geo'])
            else:
                origin_location = geo2city(r['geo'])
        except:
            return None
        if check_location([origin_location]):
            location_dict['original'] = 1
            location_dict['mid'] = r['_id']
            location_dict['topic'] = topic
            location_dict['ts'] = r['timestamp']
            location_dict['origin_location'] = origin_location.split('\t')[1]
            location_dict['repost_location'] = None
            return location_dict

    return None
Example #2
def results_gen(r, topic):
    # {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}
    location_dict = {}
    message_type = r["message_type"]
    if message_type == 3:  # repost
        # print 'retweeted_mid', r['retweeted_mid']
        try:
            if len(r["geo"].split(".")) == 4:
                repost_location = IP2city(r["geo"])
            else:
                repost_location = geo2city(r["geo"])
        except:
            return None
        if check_location([repost_location]):  # filter out items whose location cannot be resolved
            if r["retweeted_mid"]:  # filter out items with an incomplete retweeted_mid
                item = xapian_search.search_by_id(r["retweeted_mid"], fields=["geo", "_id"])
                if item:
                    try:
                        if len(item["geo"].split(".")) == 4:
                            origin_location = IP2city(item["geo"])
                        else:
                            origin_location = geo2city(item["geo"])
                    except:
                        return None
                    if check_location([origin_location]):
                        location_dict["original"] = 0
                        location_dict["mid"] = r["_id"]
                        location_dict["topic"] = topic
                        location_dict["ts"] = r["timestamp"]
                        location_dict["origin_location"] = origin_location.split("\t")[1]
                        location_dict["repost_location"] = repost_location.split("\t")[1]
                        return location_dict

    elif message_type == 1:  # original post
        try:
            if len(r["geo"].split(".")) == 4:
                origin_location = IP2city(r["geo"])
            else:
                origin_location = geo2city(r["geo"])
        except:
            return None
        if check_location([origin_location]):
            location_dict["original"] = 1
            location_dict["mid"] = r["_id"]
            location_dict["topic"] = topic
            location_dict["ts"] = r["timestamp"]
            location_dict["origin_location"] = origin_location.split("\t")[1]
            location_dict["repost_location"] = None
            return location_dict

    return None
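
The two variants above are the same function, differing only in quoting and line wrapping. Below is a minimal usage sketch, assuming the xapian_search handle used by results_gen exposes the same search(query=..., fields=...) interface as xapian_search_weibo in the cityCronTopic examples further down; collect_results and the timestamp query are placeholders, not part of the original code.

def collect_results(topic, begin_ts, end_ts):
    # Sketch only: xapian_search.search is assumed to return (count, result_getter),
    # where result_getter() yields dicts with the fields results_gen reads.
    query_dict = {'timestamp': {'$gt': begin_ts, '$lt': end_ts}}
    fields = ['_id', 'message_type', 'retweeted_mid', 'geo', 'timestamp']
    count, get_results = xapian_search.search(query=query_dict, fields=fields)
    location_dicts = []
    for r in get_results():
        location_dict = results_gen(r, topic)
        if location_dict is not None:  # None means the geo field could not be resolved
            location_dicts.append(location_dict)
    return location_dicts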
Example #3
def readAcum(stylenum, topic, start_ts, end_ts, during):
    pointnum = (end_ts - start_ts) / during  # number of time points
    spatial_dict = {}
    spatial_info_list = []
    global_max_count = 0
    global_first_timestamp = end_ts
    global_first_city = ""

    for i in range(pointnum + 1):
        end_ts = start_ts + during * i
        max_count = 0
        first_item = {}
        city_count = {}
        first_item, city_count = Pcount(end_ts, during, stylenum, topic)

        for city in city_count:
            j = i
            while j > 0:
                previous_data = spatial_info_list[j - 1]
                if city in previous_data:
                    city_count[city] += previous_data[city]
                    break
                else:
                    j -= 1
        if i > 0:
            previous_data = spatial_info_list[i - 1]
            for city in previous_data:
                try:
                    city_count[city]
                except KeyError:
                    city_count[city] = previous_data[city]
                    continue

        if city_count.values():
            max_count = max(city_count.values())

        if global_max_count < max_count:
            global_max_count = max_count
        spatial_info_list.append(city_count)
        topic_spatial_info = province_color_map(city_count)
        spatial_dict[str(end_ts)] = topic_spatial_info  # spatial_dict = {end_ts:map_data}
        try:
            if first_item['timestamp'] <= global_first_timestamp:
                global_first_timestamp = first_item['timestamp']
                global_first_city = geo2city(first_item['geo'])
        except KeyError:
            pass
    return global_max_count, spatial_dict, global_first_city
Example #4
def readAcum(stylenum, topic, start_ts, end_ts, during):
    pointnum = (end_ts - start_ts) / during  # number of time points
    spatial_dict = {}
    spatial_info_list = []
    global_max_count = 0
    global_first_timestamp = end_ts
    global_first_city = ""

    for i in range(pointnum + 1):
        end_ts = start_ts + during * i
        max_count = 0
        first_item = {}
        city_count = {}
        first_item, city_count = Pcount(end_ts, during, stylenum, topic)

        for city in city_count:
            j = i
            while j > 0:
                previous_data = spatial_info_list[j - 1]
                if city in previous_data:
                    city_count[city] += previous_data[city]
                    break
                else:
                    j -= 1
        if i > 0:
            previous_data = spatial_info_list[i - 1]
            for city in previous_data:
                try:
                    city_count[city]
                except KeyError:
                    city_count[city] = previous_data[city]
                    continue

        if city_count.values():
            max_count = max(city_count.values())

        if global_max_count < max_count:
            global_max_count = max_count
        spatial_info_list.append(city_count)
        topic_spatial_info = province_color_map(city_count)
        spatial_dict[str(end_ts)] = topic_spatial_info  # spatial_dict = {end_ts:map_data}
        try:
            if first_item["timestamp"] <= global_first_timestamp:
                global_first_timestamp = first_item["timestamp"]
                global_first_city = geo2city(first_item["geo"])
        except KeyError:
            pass
    return global_max_count, spatial_dict, global_first_city
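
Both readAcum variants behave identically. A short usage sketch follows, assuming a 15-minute step (the Fifteenminutes value used by the later examples) and placeholder topic and timestamps; the code is Python 2, so the timestamp division yields an integer.

during = 900                   # 15-minute step, matching the Fifteenminutes default below
start_ts = 1377950400          # placeholder start timestamp
end_ts = start_ts + 6 * 3600   # six hours of 15-minute points
max_count, spatial_dict, first_city = readAcum(1, u'test_topic', start_ts, end_ts, during)
# spatial_dict maps str(end_ts) of each time point to the map data from province_color_map;
# max_count is the largest accumulated per-city count (useful for scaling a heat map),
# and first_city is the resolved city of the earliest matching item.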
Example #5
def get_city_weibo(topic,
                   start_ts,
                   end_ts,
                   unit=MinInterval,
                   limit=TOP_WEIBOS_LIMIT):
    weibos = []
    if (end_ts - start_ts < unit):
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        item = db.session.query(CityWeibos).filter(CityWeibos.end==upbound, \
                                                       CityWeibos.topic==topic, \
                                                       CityWeibos.range==unit, \
                                                       CityWeibos.limit==limit).first()
        if item:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                weibos.append((weibo_item['reposts_count'], weibo_item))
    else:
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        lowbound = (start_ts / unit) * unit
        items = db.session.query(CityWeibos).filter(CityWeibos.end>lowbound, \
                                                         CityWeibos.end<=upbound, \
                                                         CityWeibos.topic==topic, \
                                                         CityWeibos.range==unit, \
                                                         CityWeibos.limit==limit).all()
        for item in items:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                try:
                    weibos.append((weibo_item['reposts_count'], weibo_item))
                except:
                    continue
    sorted_weibos = sorted(weibos, key=lambda k: k[0], reverse=True)

    city_dict = {}
    k = 0
    for reposts_count, result in sorted_weibos:
        k += 1
        if k > 1000:
            break

        uid = result['user']
        user_info = acquire_user_by_id(uid)
        if user_info:
            result['username'] = user_info['name']
        else:
            result['username'] = '******'
        time = ts2date(result['timestamp'])
        result['time'] = time
        try:
            if (len(result['geo'].split('.')) == 4):
                full_area = IP2city(result['geo'])
                result['geo'] = full_area
                city = full_area.split('\t')[1]
            else:
                city = geo2city(result['geo']).split('\t')[1]
        except:
            city = ''
        result['weibo_link'] = weiboinfo2url(result['user'], result['_id'])
        if city in province_list:
            try:
                city_dict[city].append(result)
            except:
                city_dict[city] = [result]
    return city_dict
Example #6
def get_city_weibo(topic, start_ts, end_ts, unit=MinInterval, limit=TOP_WEIBOS_LIMIT):
    weibos = []
    if end_ts - start_ts < unit:
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        item = (
            db.session.query(CityWeibos)
            .filter(
                CityWeibos.end == upbound,
                CityWeibos.topic == topic,
                CityWeibos.range == unit,
                CityWeibos.limit == limit,
            )
            .first()
        )
        if item:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                weibos.append((weibo_item["reposts_count"], weibo_item))
    else:
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        lowbound = (start_ts / unit) * unit
        items = (
            db.session.query(CityWeibos)
            .filter(
                CityWeibos.end > lowbound,
                CityWeibos.end <= upbound,
                CityWeibos.topic == topic,
                CityWeibos.range == unit,
                CityWeibos.limit == limit,
            )
            .all()
        )
        for item in items:
            news = _json_loads(item.weibos)
            for weibo_item in news:
                try:
                    weibos.append((weibo_item["reposts_count"], weibo_item))
                except:
                    continue
    sorted_weibos = sorted(weibos, key=lambda k: k[0], reverse=True)

    city_dict = {}
    k = 0
    for reposts_count, result in sorted_weibos:
        k += 1
        if k > 1000:
            break

        uid = result["user"]
        user_info = acquire_user_by_id(uid)
        if user_info:
            result["username"] = user_info["name"]
        else:
            result["username"] = "******"
        time = ts2date(result["timestamp"])
        result["time"] = time
        try:
            if len(result["geo"].split(".")) == 4:
                full_area = IP2city(result["geo"])
                result["geo"] = full_area
                city = full_area.split("\t")[1]
            else:
                city = geo2city(result["geo"]).split("\t")[1]
        except:
            city = ""
        result["weibo_link"] = weiboinfo2url(result["user"], result["_id"])
        if city in province_list:
            try:
                city_dict[city].append(result)
            except:
                city_dict[city] = [result]
    return city_dict
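
A short usage sketch for get_city_weibo, keeping the MinInterval and TOP_WEIBOS_LIMIT defaults from the signature; the topic and timestamps are placeholders.

city_dict = get_city_weibo(u'test_topic', start_ts, end_ts)
for city, city_weibos in city_dict.iteritems():  # Python 2, as in the code above
    # one key per province in province_list; each weibo dict has been enriched
    # with 'username', 'time', a resolved 'geo' and 'weibo_link'
    print city, len(city_weibos)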
Example #7
def cityCronTopic(topic,
                  xapian_search_weibo,
                  start_ts,
                  over_ts,
                  during=Fifteenminutes,
                  n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type, ccount is {city: count}

            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []

            query_dict = {
                'timestamp': {
                    '$gt': begin_ts,
                    '$lt': end_ts
                },
            }

            for k, v in mtype_kv.iteritems():
                ccount = {}
                first_timestamp = end_ts
                first_item = {}
                query_dict['message_type'] = v
                count, weibo_results = xapian_search_weibo.search(
                    query=query_dict, fields=fields_list
                )  # weibo_results: the weibos matching the given time range, topic and message_type
                for weibo_result in weibo_results():
                    if (weibo_result['timestamp'] <= first_timestamp):
                        first_timestamp = weibo_result['timestamp']
                        first_item = weibo_result
                    try:
                        if (len(weibo_result['geo'].split('.')) == 4):
                            city = IP2city(weibo_result['geo'])
                            if city:
                                try:
                                    ccount[city] += 1
                                except KeyError:
                                    ccount[city] = 1
                            else:
                                continue
                        else:
                            city = geo2city(weibo_result['geo'])
                            if city:
                                try:
                                    ccount[city] += 1
                                except KeyError:
                                    ccount[city] = 1
                            else:
                                continue
                    except:
                        continue

                    if (v == 1) or (v == 3):  # only store original posts and reposts
                        weibos.append(weibo_result)

                mtype_ccount[v] = [end_ts, ccount]
            save_rt_results(topic, mtype_ccount, during, first_item)

            sorted_weibos = sorted(weibos,
                                   key=lambda k: k[SORT_FIELD],
                                   reverse=True)
            sorted_weibos = sorted_weibos[:n_limit]
            save_ws_results(topic, end_ts, during, n_limit, sorted_weibos)
Example #8
def cityCronTopic(topic, xapian_search_weibo, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type, ccount is {city: count}

            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            }

            for k, v in mtype_kv.iteritems():
                ccount = {}
                first_timestamp = end_ts
                first_item = {}
                query_dict['message_type'] = v
                count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=fields_list)  # weibo_results: the weibos matching the given time range, topic and message_type
                for weibo_result in weibo_results():
                    if (weibo_result['timestamp'] <= first_timestamp):
                        first_timestamp = weibo_result['timestamp']
                        first_item = weibo_result
                    try:
                        if (len(weibo_result['geo'].split('.')) == 4):
                            city = IP2city(weibo_result['geo'])
                            if city:
                                try:
                                    ccount[city] += 1   
                                except KeyError:
                                    ccount[city] = 1    
                            else:
                                continue
                        else:
                            city = geo2city(weibo_result['geo'])
                            if city:
                                try:
                                    ccount[city] += 1   
                                except KeyError:
                                    ccount[city] = 1    
                            else:
                                continue
                    except:
                        continue

                    if (v == 1) or (v == 3):  # only store original posts and reposts
                        weibos.append(weibo_result)

                mtype_ccount[v] = [end_ts, ccount]
            save_rt_results(topic, mtype_ccount, during, first_item)

            sorted_weibos = sorted(weibos, key=lambda k: k[SORT_FIELD], reverse=True)
            sorted_weibos = sorted_weibos[:n_limit]
            save_ws_results(topic, end_ts, during, n_limit, sorted_weibos)
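
A short usage sketch for the cron entry point. open_topic_index is a hypothetical helper standing in for however the project obtains the per-topic xapian handle; the Fifteenminutes and TOP_WEIBOS_LIMIT defaults come from the signature.

topic = u'test_topic'
xapian_search_weibo = open_topic_index(topic)  # hypothetical index loader, not from the original code
cityCronTopic(topic, xapian_search_weibo, start_ts, over_ts)
# For every 15-minute window, the per-city counts are persisted via save_rt_results
# and the top n_limit weibos (sorted by SORT_FIELD) via save_ws_results.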