def readPropagateSpatial(keyid):
    """Return map_data (province -> color) for one topic's spatial spread.

    Looks up PropagateSpatial rows for ``keyid``; when none exist, falls
    back to a zero-count map over every province parsed from the canonical
    province <select> list (skipping 海外/其他, i.e. overseas/other).
    """
    propagateSpatials = db.session.query(PropagateSpatial).filter(
        PropagateSpatial.topic_id == keyid).all()
    if propagateSpatials:
        city_count = {}
        for p in propagateSpatials:
            city_count[p.city] = p.count
        return province_color_map(city_count)
    # No stored data: build a zero-filled province map.  The previous
    # version also built a province_name dict here that was never used;
    # that dead work has been removed.
    html = '''<select name="province" id="province" defvalue="11"><option value="34">安徽</option><option value="11">北京</option><option value="50">重庆</option><option value="35">福建</option><option value="62">甘肃</option> <option value="44">广东</option><option value="45">广西</option><option value="52">贵州</option><option value="46">海南</option><option value="13">河北</option> <option value="23">黑龙江</option><option value="41">河南</option><option value="42">湖北</option><option value="43">湖南</option><option value="15">内蒙古</option><option value="32">江苏</option> <option value="36">江西</option><option value="22">吉林</option><option value="21">辽宁</option><option value="64">宁夏</option><option value="63">青海</option><option value="14">山西</option><option value="37">山东</option> <option value="31">上海</option><option value="51">四川</option><option value="12">天津</option><option value="54">西藏</option><option value="65">新疆</option><option value="53">云南</option><option value="33">浙江</option> <option value="61">陕西</option><option value="71">台湾</option><option value="81">香港</option><option value="82">澳门</option><option value="400">海外</option><option value="100">其他</option></select>'''
    city_count = {}
    province_soup = BeautifulSoup(html)
    for province in province_soup.findAll('option'):
        pp = province.string
        if pp == u'海外' or pp == u'其他':
            continue
        city_count[pp] = 0
    return province_color_map(city_count)
def readNews(stylenum, topic, end_ts, during):
    """Convert news counts read from the db into map_data.

    PcountNews computes per-province totals from the db; returns
    (max_count, map_data, first_item).
    """
    max_count = 0
    # The previous version pre-initialized first_item/city_count to empty
    # dicts that were immediately overwritten; those dead assignments are gone.
    first_item, city_count = PcountNews(end_ts, during, stylenum, topic)
    if city_count:  # non-empty dict <=> non-empty .values()
        max_count = max(city_count.values())
    map_data = province_color_map(city_count)
    return max_count, map_data, first_item
def readPropagateSpatial(stylenum, topic, end_ts, during):
    """Convert spatial counts read from the db into map_data.

    Pcount computes per-province totals from the db; returns
    (max_count, map_data, first_item).

    NOTE(review): this redefines ``readPropagateSpatial`` with a different
    signature than the one-argument version defined earlier in this file,
    which it therefore shadows — confirm that is intended.
    """
    max_count = 0
    # Dead pre-initializations of first_item/city_count removed; Pcount
    # unconditionally supplies both.
    first_item, city_count = Pcount(end_ts, during, stylenum, topic)
    if city_count:  # non-empty dict <=> non-empty .values()
        max_count = max(city_count.values())
    map_data = province_color_map(city_count)
    return max_count, map_data, first_item
def profile_group_location(fieldEnName):
    """Return JSON-encoded map_data of province counts for one domain group.

    ``fieldEnName`` is looked up in DOMAIN_LIST to get the domain id;
    the date is pinned to the '20130901' snapshot.
    """
    domainid = DOMAIN_LIST.index(fieldEnName)
    datestr = '20130901'
    # Only the province breakdown is used here; the verified/unverified
    # counts returned by getDomainBasic were previously converted to int
    # and then discarded — that dead work has been removed.
    _verified_count, _unverified_count, province_dict = getDomainBasic(domainid, datestr)
    results = province_color_map(province_dict)
    return json.dumps(results)
def readAcum(stylenum, topic, start_ts, end_ts, during): pointnum = (end_ts - start_ts) / during # 时间点数 spatial_dict = {} spatial_info_list = [] global_max_count = 0 global_first_timestamp = end_ts global_first_city = "" for i in range(pointnum + 1): end_ts = start_ts + during * i max_count = 0 first_item = {} city_count = {} first_item, city_count = Pcount(end_ts, during, stylenum, topic) for city in city_count: j = i while j > 0: previous_data = spatial_info_list[j - 1] if city in previous_data: city_count[city] += previous_data[city] break else: j -= 1 if i > 0: previous_data = spatial_info_list[i - 1] for city in previous_data: try: city_count[city] except KeyError: city_count[city] = previous_data[city] continue if city_count.values(): max_count = max(city_count.values()) if global_max_count < max_count: global_max_count = max_count spatial_info_list.append(city_count) topic_spatial_info = province_color_map(city_count) spatial_dict[str( end_ts)] = topic_spatial_info # spatial_dict = {end_ts:map_data} try: if first_item['timestamp'] <= global_first_timestamp: global_first_timestamp = first_item['timestamp'] global_first_city = geo2city(first_item['geo']) except KeyError: pass return global_max_count, spatial_dict, global_first_city
def readAcumNews(stylenum, topic, start_ts, end_ts, during): pointnum = (end_ts - start_ts) / during # 时间点数 spatial_dict = {} spatial_info_list = [] global_max_count = 0 global_first_timestamp = end_ts global_first_city = "" for i in range(pointnum + 1): end_ts = start_ts + during * i max_count = 0 first_item = {} city_count = {} first_item, city_count = PcountNews(end_ts, during, stylenum, topic) for city in city_count: j = i while j > 0: previous_data = spatial_info_list[j-1] if city in previous_data: city_count[city] += previous_data[city] break else: j -= 1 if i > 0: previous_data = spatial_info_list[i-1] for city in previous_data: try: city_count[city] except KeyError: city_count[city] = previous_data[city] continue if city_count.values(): max_count = max(city_count.values()) if global_max_count < max_count: global_max_count = max_count spatial_info_list.append(city_count) topic_spatial_info = province_color_map(city_count) spatial_dict[str(end_ts)] = topic_spatial_info # spatial_dict = {end_ts:map_data} try: if first_item['timestamp'] <= global_first_timestamp: global_first_timestamp = first_item['timestamp'] global_first_city = '' except KeyError: pass return global_max_count, spatial_dict, global_first_city
def calculate_topic(kw):
    """Build the full topic report for keyword ``kw``.

    Searches the xapian weibo index over a hard-coded window
    (2012-01-01 .. 2012-01-10), then derives per-day counts, participants,
    leaders, media/persistence/sudden/coverage indexes and a province
    heat map.  Returns everything packed into the ``topic_info`` dict.
    """
    # Initialization
    topic_info = {}
    topic_index = {}
    date_list = []            # sorted distinct dates seen
    perday_count_list = []    # per-day weibo counts, parallel to date_list
    topic_rel_blog = []       # high-repost statuses (>1000 reposts)
    topic_url = []            # bmiddle_pic URLs
    topic_participents = []   # distinct users seen
    topic_leader = []         # users whose status got reposts+comments > 1500
    topic_date = []
    blogs_sum = 0
    comments_sum = 0
    topic_ori_blog = []       # original (non-retweet) statuses
    city_count = {}
    html = '''<select name="province" id="province" defvalue="11"><option value="34">安徽</option><option value="11">北京</option><option value="50">重庆</option><option value="35">福建</option><option value="62">甘肃</option> <option value="44">广东</option><option value="45">广西</option><option value="52">贵州</option><option value="46">海南</option><option value="13">河北</option> <option value="23">黑龙江</option><option value="41">河南</option><option value="42">湖北</option><option value="43">湖南</option><option value="15">内蒙古</option><option value="32">江苏</option> <option value="36">江西</option><option value="22">吉林</option><option value="21">辽宁</option><option value="64">宁夏</option><option value="63">青海</option><option value="14">山西</option><option value="37">山东</option> <option value="31">上海</option><option value="51">四川</option><option value="12">天津</option><option value="54">西藏</option><option value="65">新疆</option><option value="53">云南</option><option value="33">浙江</option> <option value="61">陕西</option><option value="71">台湾</option><option value="81">香港</option><option value="82">澳门</option><option value="400">海外</option><option value="100">其他</option></select>'''
    # Seed every province (except overseas/other) with a zero count.
    province_soup = BeautifulSoup(html)
    for province in province_soup.findAll('option'):
        pp = province.string
        if pp == u'海外' or pp == u'其他':
            continue
        city_count[pp] = 0
    # Hard-coded search window.
    gt = calendar.timegm(datetime(2012, 1, 1).timetuple())
    lt = calendar.timegm(datetime(2012, 1, 10).timetuple())
    s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    count, get_results = s.search(query={'text': [u'%s' % kw], 'timestamp': {'$gt': gt, '$lt': lt}},
                                  sort_by=['timestamp'],
                                  fields=['text', 'timestamp', 'reposts_count', 'comments_count', 'user', 'terms',
                                          '_id', 'retweeted_status', 'bmiddle_pic', 'geo', 'source', 'attitudes_count'])
    for r in get_results():
        # Track dates and per-day weibo counts.
        temp_date = date.fromtimestamp(r['timestamp'])
        if len(date_list) == 0:
            date_list.append(temp_date)
            perday_count_list.append(1)
        else:
            if temp_date < date_list[-1]:
                if temp_date in date_list:
                    temp_index = date_list.index(temp_date)
                    perday_count_list[temp_index] += 1
                else:
                    # Insert the new date at its sorted position.
                    i = 0
                    while i < len(date_list):
                        if temp_date < date_list[0]:
                            date_list.insert(0, temp_date)
                            perday_count_list.insert(0, 1)
                            break
                        else:
                            if temp_date > date_list[i] and temp_date < date_list[i+1]:
                                date_list.insert(i+1, temp_date)
                                perday_count_list.insert(i+1, 1)
                                break
                            else:
                                i += 1
            if temp_date == date_list[-1]:
                perday_count_list[-1] += 1
            if temp_date > date_list[-1]:
                # Pad missing days with zeros, then count this one.
                timedelta = date(2000, 1, 2) - date(2000, 1, 1)
                while date_list[-1] != temp_date:
                    temp_date1 = date_list[-1] + timedelta
                    date_list.append(temp_date1)
                    perday_count_list.append(0)
                perday_count_list[-1] = 1
        if r['user']:
            uid = int(r['user'])
            user = get_user(uid)
            if user != None:
                if user not in topic_participents:
                    topic_participents.append(user)
                # Original (non-retweet) status.
                if r['retweeted_status'] == None:
                    temp_ori = {}
                    temp_ori['status'] = r
                    temp_ori['user'] = user
                    topic_ori_blog.append(temp_ori)
                if r['reposts_count'] != None and r['comments_count'] != None:
                    rc = r['reposts_count'] + r['comments_count']
                    if rc > 1500:
                        topic_leader.append(user)
                if r['reposts_count'] > 1000:
                    # NOTE(review): these assignments mutate r in place,
                    # since temp['status'] is the same dict object — confirm intended.
                    temp = {}
                    temp['status'] = r
                    temp['status']['created_at'] = datetime.fromtimestamp(r['timestamp'])
                    temp['status']['text'] = r['text'].decode("utf-8")
                    temp['status']['source'] = re.match('<.*?>(.*)<.*?>', r['source']).group(1).decode("utf-8")
                    temp['user'] = user
                    topic_rel_blog.append(temp)
                if r['bmiddle_pic']:
                    topic_url.append(r['bmiddle_pic'])
                # Province attribution: prefer the status geo, else the
                # user's profile location; overseas/other are ignored.
                if r['geo'] != None and r['geo'].has_key('province_name'):
                    p = r['geo']['province_name'].split('省')[0]
                    if p == u'海外' or p == u'其他':
                        pass
                    else:
                        city_count[p] += 1
                elif user['location']:
                    p = user['location'].split(' ')[0]
                    if p == u'海外' or p == u'其他':
                        pass
                    else:
                        city_count[p] += 1
                else:
                    pass
            else:
                pass
        comments_sum = comments_sum + r['comments_count']
        blogs_sum += 1
    # Indexes: persistence (days above average), suddenness, coverage.
    timedelta = len(date_list)  # NOTE: shadows the timedelta name used above
    avg = blogs_sum / float(timedelta)
    i = 0
    persistent_index = 0
    temp_sudden = 0
    while i < int(timedelta):
        if perday_count_list[i] > avg:
            persistent_index += 1
            temp_sudden = perday_count_list[i] - avg + temp_sudden
            i += 1
        else:
            i += 1
    sudden_index = '%10.2f' % (temp_sudden / float(blogs_sum))
    coverage_index = '%10.2f' % ((blogs_sum + comments_sum) / (24 * float(timedelta)))
    # Media index: how many of the top-20 most-engaged original posters
    # are known media accounts.
    media_index = 0
    top_medias = []
    medias = db.session.query(Media)
    for media in medias:
        media_name = media.mediaName
        top_medias.append(media_name)
    media_list = []
    for r in topic_ori_blog:
        tmedia = []
        tmedia.append(r['user']['name'])
        x = r['status']['comments_count'] + r['status']['reposts_count']
        tmedia.append(x)
        media_list.append(tmedia)
    # NOTE(review): sorted() returns a new list and the result is discarded,
    # so media_list is NOT actually sorted here — likely intended
    # media_list.sort(...); confirm before changing behavior.
    sorted(media_list, key=lambda tmedia: tmedia[1], reverse=True)
    if len(media_list) >= 20:
        m = 0
        while m < 20:
            if media_list[m][0] in top_medias:
                media_index += 1
                m += 1
            else:
                m += 1
    else:
        m = 0
        while m < len(media_list):
            if media_list[m][0] in top_medias:
                media_index += 1
                m += 1
            else:
                m += 1
    leader_index = len(topic_leader)
    # Occupation/field breakdown of participants.
    work_list = []
    work_count = []
    fields = db.session.query(Field)
    for field in fields:
        field_name = field.fieldName
        work_list.append(field_name)
        work_count.append(0)
    for r in topic_participents:
        k = 0
        while k < len(work_list):
            if r['userField'] == work_list[k]:
                work_count[k] += 1
                break
            else:
                k += 1
    topic_index['persistent_index'] = persistent_index
    topic_index['sudden_index'] = sudden_index
    topic_index['coverage_index'] = coverage_index
    topic_index['media_index'] = media_index
    topic_index['leader_index'] = leader_index
    map_data = province_color_map(city_count)
    topic_info['topic_poster'] = topic_participents[0]['name']
    topic_info['topic_post_date'] = date_list[0]
    topic_info['topic_leader_count'] = len(topic_leader)
    topic_info['topic_participents'] = len(topic_participents)
    topic_info['blogs_sum'] = blogs_sum
    topic_info['topic_ori_blog_count'] = len(topic_ori_blog)
    topic_info['topic_url'] = topic_url
    topic_info['perday_count_list'] = perday_count_list
    topic_info['date_list'] = date_list
    topic_info['topic_rel_blog'] = topic_rel_blog
    topic_info['geo'] = map_data
    topic_info['topic_leader'] = topic_leader
    topic_info['topic_working_list'] = work_list
    topic_info['topic_working_count'] = work_count
    topic_info['topic_index'] = topic_index
    topic_info['gt'] = gt
    topic_info['lt'] = lt
    return topic_info
def calculate_single(_id, test):
    """Build the repost report for a single weibo ``_id``.

    ``test`` selects the index: 'None' uses xapian_search_weibo, anything
    else uses xapian_search_weibo_test.  Returns ``blog_info`` with the
    original status, reposters, per-day repost counts, indexes and a
    province heat map.
    """
    # Initialization
    blog_info = {}
    city_count = {}
    html = '''<select name="province" id="province" defvalue="11"><option value="34">安徽</option><option value="11">北京</option><option value="50">重庆</option><option value="35">福建</option><option value="62">甘肃</option> <option value="44">广东</option><option value="45">广西</option><option value="52">贵州</option><option value="46">海南</option><option value="13">河北</option> <option value="23">黑龙江</option><option value="41">河南</option><option value="42">湖北</option><option value="43">湖南</option><option value="15">内蒙古</option><option value="32">江苏</option> <option value="36">江西</option><option value="22">吉林</option><option value="21">辽宁</option><option value="64">宁夏</option><option value="63">青海</option><option value="14">山西</option><option value="37">山东</option> <option value="31">上海</option><option value="51">四川</option><option value="12">天津</option><option value="54">西藏</option><option value="65">新疆</option><option value="53">云南</option><option value="33">浙江</option> <option value="61">陕西</option><option value="71">台湾</option><option value="81">香港</option><option value="82">澳门</option><option value="400">海外</option><option value="100">其他</option></select>'''
    # Seed every province (except overseas/other) with a zero count.
    province_soup = BeautifulSoup(html)
    for province in province_soup.findAll('option'):
        pp = province.string
        if pp == u'海外' or pp == u'其他':
            continue
        city_count[pp] = 0
    # Search window: 2011-01-01 .. today (local time).
    begin_ts1 = time.mktime(datetime(2011, 1, 1).timetuple())
    now = date.today()
    now_year = int(now.year)
    now_month = int(now.month)
    now_day = int(now.day)
    end_ts1 = time.mktime(datetime(now_year, now_month, now_day).timetuple())
    # Fetch the original weibo.
    status_ori = get_ori_status(_id, test)
    # Fetch its reposts.  Note the sentinel is the STRING 'None', not None.
    if test == 'None':
        count, get_results = xapian_search_weibo.search(query={'retweeted_status': _id, 'timestamp': {'$gt': begin_ts1, '$lt': end_ts1} },
                                                        sort_by=['timestamp'])
    else:
        count, get_results = xapian_search_weibo_test.search(query={'retweeted_status': _id, 'timestamp': {'$gt': begin_ts1, '$lt': end_ts1} },
                                                             sort_by=['timestamp'])
    print count  # NOTE(review): leftover debug print (Python 2)
    reposter = []      # distinct reposting users
    date_list = []     # dates, starting with the original status' date
    date_list.append(date.fromtimestamp(status_ori['timestamp']))
    perday_repost_count = []
    perday_repost_count.append(1)
    per = date(2000, 01, 02) - date(2000, 01, 01)  # one-day delta (unused below — the loop recomputes it)
    reposts_sum = 0
    comments_sum = 0
    key_reposter = []  # reposters whose own status got >1000 reposts
    for r in get_results():
        if r['user']:
            user = get_user(r['user'], test)
            # Province attribution from the user profile location.
            if user['location'] != None:
                p = user['location']
                tp = p.split(' ')
                ppp = tp[0]
                if ppp == u'海外' or ppp == u'其他':
                    pass
                else:
                    city_count[ppp] += 1
            if user not in reposter:
                reposter.append(user)
            if r['reposts_count'] > 1000:
                key_reposter.append(user)
        else:
            pass
        # Track per-day repost counts (same scheme as calculate_topic).
        tempdate = date.fromtimestamp(r['timestamp'])
        if tempdate < date_list[-1]:
            if tempdate in date_list:
                temp_index = date_list.index(tempdate)
                perday_repost_count[temp_index] += 1
            else:
                i = 0
                while i < len(date_list):
                    if tempdate > date_list[i] and tempdate < date_list[i+1]:
                        date_list.insert(i+1, tempdate)
                        perday_repost_count.insert(i+1, 1)
                        break
                    else:
                        i += 1
        if tempdate == date_list[-1]:
            perday_repost_count[-1] += 1
        if tempdate > date_list[-1]:
            # Pad missing days with zeros, then count this one.
            timedelta = date(2000, 1, 2) - date(2000, 1, 1)
            while date_list[-1] != tempdate:
                tempdate1 = date_list[-1] + timedelta
                date_list.append(tempdate1)
                perday_repost_count.append(0)
            perday_repost_count[-1] = 1
        reposts_sum += r['reposts_count']
        # NOTE(review): bare except silently swallows any error here,
        # presumably a missing/None comments_count — confirm.
        try:
            comments_sum += r['comments_count']
        except:
            pass
    # Indexes: persistence (days above average), suddenness, coverage.
    totalRepost = reposts_sum + 1
    avg = (float(totalRepost)) / len(date_list)
    persistent_index = 0
    sudden_count = 0
    j = 0
    while j < len(date_list):
        if perday_repost_count[j] > avg:
            persistent_index += 1
            sudden_count = perday_repost_count[j] - avg + sudden_count
            j += 1
        else:
            j += 1
    sudden_index = '%10.2f' % ((float(sudden_count)) / totalRepost)
    coverage_index = '%10.2f' % ((totalRepost + comments_sum) / float(24 * len(date_list)))
    # Media index: reposters that are known media accounts.
    # NOTE(review): `key in medias` tests membership against a SQLAlchemy
    # query of Media rows while `key` is a user object — likely never
    # matches; confirm intended semantics.
    media_index = 0
    medias = db.session.query(Media)
    for key in reposter:
        if key in medias:
            media_index += 1
        else:
            pass
    print city_count  # NOTE(review): leftover debug print
    map_data = province_color_map(city_count)
    print map_data  # NOTE(review): leftover debug print
    leader_index = len(key_reposter)
    blog_info['status'] = status_ori
    blog_info['user'] = status_ori['user']
    blog_info['repost_users'] = reposter
    blog_info['datelist'] = date_list
    blog_info['perday_count'] = perday_repost_count
    blog_info['persistent_index'] = persistent_index
    blog_info['sudden_index'] = sudden_index
    blog_info['coverage_index'] = coverage_index
    blog_info['media_index'] = media_index
    blog_info['leader_index'] = leader_index
    blog_info['geo'] = map_data
    blog_info['key_reposter'] = key_reposter
    return blog_info