def standard_water():
    """Collect every "water" (low-engagement) post.

    A post is treated as a water post when its ``review_num`` value is less
    than or equal to 5.

    :return: a list holding the ``post_home`` value of every water post.
    """
    # Read both columns in a single query so that each review count is paired
    # with the title of the same row. Two independent SELECTs are not
    # guaranteed to return rows in the same order.
    rows = database.select_all('postn', 'review_num', 'post_home')
    return [row[1] for row in rows if int(row[0]) <= 5]
def statistic_review():
    """Build a histogram of reply counts over all rows of ``postn``.

    :return: dict mapping a bucket label to the number of posts whose
        ``review_num`` falls in that half-open range ``[low, high)``.
        Any value that matches no range is counted under ``"1K+"``.
    """
    # (inclusive lower bound, exclusive upper bound, label)
    ranges = (
        (0, 5, "0-5"),
        (5, 50, "5-50"),
        (50, 100, "50-100"),
        (100, 500, "0.1K-0.5K"),
        (500, 1000, "0.5K-1K"),
    )
    quality = {label: 0 for _, _, label in ranges}
    quality["1K+"] = 0

    for row in database.select_all("postn", "review_num"):
        count = int(row[0])
        for low, high, label in ranges:
            if low <= count < high:
                quality[label] += 1
                break
        else:
            # No range matched: catch-all bucket.
            quality["1K+"] += 1
    return quality
def statistic_barage():
    """Build a histogram of the ``barage`` column of ``pure_users``.

    Values are bucketed by whole number: ``[0, 1)`` -> ``"onedown"``,
    ``[1, 2)`` -> ``"one"``, ... ``[12, 13)`` -> ``"twelve"``. Every other
    numeric value is counted under ``"twelveup"``. Values that ``float()``
    rejects with ``ValueError`` are skipped.

    :return: dict mapping each bucket name to its count.
    """
    rows = database.select_all('pure_users', 'barage')

    # Position in this tuple == integer part of the value.
    names = (
        "onedown", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
    )
    age = dict.fromkeys(names + ("twelveup",), 0)

    for row in rows:
        try:
            years = float(row[0])
        except ValueError:
            continue
        if 0 <= years < 13:
            age[names[int(years)]] += 1
        else:
            age["twelveup"] += 1
    return age
def get_detail_member():
    """Fetch detailed member information and store it in ``member_detail``.

    For every ``homepage`` in ``sortmember`` the page is fetched and parsed
    through ``index``; the extracted data is inserted unless the parser
    returned ``0``.

    :return: None
    """
    for row in database.select_all('sortmember', 'homepage'):
        print(row)
        page = index.get_bs(index.get_response(row[0]))
        detail = index.author_post_info(page)
        # 0 is the parser's "nothing to store" result (presumably no post
        # info on the page - confirm against index.author_post_info).
        if detail != 0:
            database.insert_sql_member_detail('member_detail', detail)
def statistic_client():
    """Count posts by the ``client_type`` column of ``postn``.

    :return: tuple ``(android, apple, unknown)`` where ``unknown`` is every
        row whose value is neither ``'android'`` nor ``'apple'``.
    """
    clients = [row[0] for row in database.select_all('postn', 'client_type')]
    android = sum(1 for client in clients if client == 'android')
    apple = sum(1 for client in clients if client == 'apple')
    unknown = len(clients) - android - apple
    return android, apple, unknown
def zombie_member():
    """Count "zombie" members.

    Each ``homepage`` in ``sortmember`` is fetched and parsed through
    ``index``; a member is counted when ``index.author_post_info`` returns
    ``0`` for the page.

    :return: number of zombie members.
    """
    homepages = database.select_all('sortmember', 'homepage')
    zombies = 0
    for row in homepages:
        print(row)
        page = index.get_bs(index.get_response(row[0]))
        if index.author_post_info(page) == 0:
            zombies += 1
    return zombies
def statistic_sex():
    """Count users by the ``sex`` column of ``pure_users``.

    Only the string ``'female'`` is recognised as a girl; every other value
    (including any unknown or missing one) is counted as a boy.

    :return: tuple ``(boys, girls)``.
    """
    rows = database.select_all('pure_users', 'sex')
    # Flatten: every field of every returned row is inspected.
    values = [str(field) for row in rows for field in row]
    girls = values.count('female')
    boys = len(values) - girls
    return boys, girls
def barage_wrong():
    """Find users whose recorded ``barage`` cannot be valid.

    A value is invalid when it is larger than the number of years elapsed
    since 2003-11-25, or when it cannot be parsed as a number at all.

    :return: list of ``id`` values of the users with an invalid ``barage``.
    """
    rows = database.select_all("pure_users", 'id', 'barage')

    # Reference date hard-coded by the original author
    # (presumably the launch date of the forum - confirm).
    publishing = datetime(2003, 11, 25)
    # Use the timedelta's day count directly. Slicing the first four
    # characters of str(timedelta) is only correct while the number of
    # elapsed days has exactly four digits.
    maxyear = (datetime.now() - publishing).days / 365
    print(maxyear)

    wrongage = []
    for row in rows:
        try:
            years = float(row[1])
        except (ValueError, TypeError):
            # Non-numeric text or a NULL column: not a usable age.
            wrongage.append(row[0])
            continue
        if years > maxyear:
            wrongage.append(row[0])
    return wrongage
def save_detail_post():
    """Parse every post homepage and store the extracted data in ``postn``.

    For each ``homepage`` row of ``source1`` the page is fetched, parsed with
    ``index.post_info``, prefixed with its URL and inserted. Pages whose
    parsing or insertion raises ``AttributeError`` or ``KeyError`` are
    skipped.

    :return: None
    """
    for row in database.select_all('source1', 'homepage'):
        home_url = ''.join(row)
        print(home_url)
        response = index.get_response(home_url)
        try:
            post = index.post_info(index.get_bs(response))
            # The URL becomes the first field of the stored record.
            post.insert(0, home_url)
            print(post)
            database.insert_sql_post('postn', post)
        except (AttributeError, KeyError):
            continue
def statistic_post():
    """Classify users into seven activity levels by their post count.

    ``post_number`` values of ``pure_users`` are either plain integers or a
    number followed by ``'万'`` (ten thousand), e.g. ``'1.2万'`` -> 12000.
    Values that cannot be parsed (empty, NULL, other text) are skipped.

    :return: dict mapping each level label to the number of users whose post
        count falls in that half-open range ``[low, high)``. Any value that
        matches no range is counted under ``"10K+"``.
    """
    posts = database.select_all('pure_users', 'post_number')

    # (inclusive lower bound, exclusive upper bound, label)
    levels = (
        (0, 20, "0-20"),
        (20, 100, "20-100"),
        (100, 500, "100-500"),
        (500, 1000, "0.5K-1K"),
        (1000, 5000, "1K-5K"),
        (5000, 10000, "5K-10K"),
    )
    active = {label: 0 for _, _, label in levels}
    active["10K+"] = 0

    for post in posts:
        # str() + endswith() instead of indexing [-1]: an empty string or a
        # NULL value must end up in the ValueError path below rather than
        # raise IndexError / TypeError out of the function.
        raw = str(post[0]).strip()
        try:
            if raw.endswith('万'):
                post_int = float(raw[:-1]) * 10000
            else:
                post_int = int(raw)
        except ValueError:
            continue

        for low, high, label in levels:
            if low <= post_int < high:
                active[label] += 1
                break
        else:
            active["10K+"] += 1
    return active
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
# Export time-based post statistics as JSON files under ../../show/data/.
from biye.analysis.simple import an_time
from biye.spider.operate import dict_to_json as tr
from biye.spider.operate import database

# Number of posts in each hour.
abc = database.select_all('postn', 'str_date', 'str_time')
hour = an_time.count_post_hour(abc)
tr.dict_to_json(hour, name='../../show/data/hour.json')

# Number of posts in each month, one result per year.
month_2017, month_2016, month_2015 = an_time.count_post_month(abc)
tr.dict_to_json(month_2017, name='../../show/data/month_2017.json')
# Was '../../data/show/...': directory names transposed relative to every
# other output file of this script.
tr.dict_to_json(month_2016, name='../../show/data/month_2016.json')
tr.dict_to_json(month_2015, name='../../show/data/month_2015.json')
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copy every row of `users` whose au_id is not yet present in `pure_users`.
import pymysql

from biye.spider.operate import database

# One query for both columns so each au_id is paired with the id of the same
# row (two separate SELECTs are not guaranteed to share an ordering).
users = database.select_all('users', 'au_id', 'id')
allin = database.select_all('pure_users', 'au_id')

# Set membership instead of scanning the whole list for every user.
# NOTE(review): as before, this snapshot is not updated inside the loop, so
# an au_id occurring twice in `users` is still inserted twice - confirm
# whether that is intended.
known_au_ids = {row[0] for row in allin}

# A single connection for the whole run, always closed, instead of a
# connect/close pair per row that leaked the connection on a failed query.
conn = pymysql.connect(host='localhost', user='******', db='tieba', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        for au_id, data_id in users:
            print(data_id)
            if au_id in known_au_ids:
                print('exist')
                continue
            print('not exist!')
            # Parameterized query; the row is only fetched when it is
            # actually going to be inserted.
            cursor.execute(
                'select au_id, nickname, sex, barage, post_number, member '
                'from users where id=%s',
                (data_id,),
            )
            data = cursor.fetchone()
            database.insert_sql_users('pure_users', data)
finally:
    conn.close()