    def __init__(self, name, thread=5, interval=1):
        super().__init__()
        self.name = name
        self.db = db
        sc = SimpyderConfig()
        sc.USER_AGENT = FAKE_UA
        sc.PARSE_THREAD_NUMER = thread
        sc.LOG_LEVEL = "INFO"
        sc.DOWNLOAD_INTERVAL = interval
        self.set_config(sc)
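This `__init__` belongs to a Spider subclass whose class header is not shown in this fragment. Below is a self-contained sketch of the same configure-in-the-constructor pattern; the class name, URL, and the gen_url/parse/save bodies are illustrative assumptions borrowed from the other spiders in this repo, not part of the original file.

from simpyder import Spider, FAKE_UA, SimpyderConfig


class ExampleConfiguredSpider(Spider):
    # Hypothetical subclass: same constructor signature as the fragment above.
    def __init__(self, name, thread=5, interval=1):
        super().__init__()
        self.name = name
        sc = SimpyderConfig()
        sc.USER_AGENT = FAKE_UA
        sc.PARSE_THREAD_NUMER = thread      # number of parse workers
        sc.LOG_LEVEL = "INFO"
        sc.DOWNLOAD_INTERVAL = interval     # pause between downloads (seconds, presumably)
        self.set_config(sc)

    def gen_url(self):
        yield 'https://api.bilibili.com/x/web-interface/online'

    def parse(self, res):
        return res.json()['data']

    def save(self, item):
        return item


if __name__ == "__main__":
    ExampleConfiguredSpider("example-spider", thread=8, interval=0.5).run()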
                    '$position': 0
                }
            }
        }, True)
        item['data']['mid'] = item['mid']
        db.author_data.replace_one(
            {
                'mid': item['data']['mid'],
                'datetime': item['data']['datetime']
            },
            item['data'],
            upsert=True)
        if 'object_id' in item:
            self.sentCallBack(item['object_id'], db['user_record'])
        return item


s = BiliobAuthorSpider("旧作者爬虫")
sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 1
sc.LOG_LEVEL = "INFO"
sc.USER_AGENT = FAKE_UA
sc.DOWNLOAD_INTERVAL = 0.15
s.set_config(sc)
coll = db['author']

if __name__ == "__main__":
    s.config.LOG_LEVEL = 'DEBUG'  # was `==`, which compared and discarded the result
    s.run()
        item['tag_list'] = [None]
        if db.video.find_one({'bvid': item['bvid']},
                             {'bvid': item['bvid']}) is not None:
            db.video.update_one(
                {'bvid': item['bvid']},
                {'$set': {
                    'aid': item['aid'],
                    'tag': item['tag_list']
                }},
                upsert=True)
        else:
            db.video.update_one(
                {'aid': item['aid']},
                {'$set': {
                    'bvid': item['bvid'],
                    'tag': item['tag_list']
                }},
                upsert=True)
        return item


s = BiliobTagSpider("标签爬虫")
sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 8
sc.LOG_LEVEL = "INFO"
sc.USER_AGENT = FAKE_UA
s.set_config(sc)

if __name__ == "__main__":
    s.run()
        date = datetime.utcnow()
        items = []
        for d in data:
            try:
                title = d.xpath('div[@class="HotList-itemTitle"]/text()')[0]
                # Values are rendered as "NN万热度"; strip the suffix, keep the number.
                value = int(d.xpath(
                    'div[@class="HotList-itemMetrics"]/text()')[0].rstrip('万热度'))
                items.append([title, value, date])
            except Exception as e:
                self.logger.exception(e)
        return items

    def save(self, item):
        for e in item:
            db.zhihu.hot.insert_one({
                'title': e[0],
                'value': e[1],
                'date': e[2]
            })
        return item


if __name__ == "__main__":
    s = HotSearchSpider("知乎热搜")
    sc = SimpyderConfig()
    sc.USER_AGENT = FAKE_UA
    sc.DOWNLOAD_INTERVAL = 600
    sc.PARSE_THREAD_NUMER = 1
    s.set_config(sc)
    s.run()
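The save() above stores one document per hot-search entry in the zhihu.hot collection. A small read-back sketch, not part of the original spiders; it assumes only the shared `db` handle from db.py and the title/value/date fields written above.

from db import db

# Print the ten most recently stored Zhihu hot-search entries, newest first.
for doc in db.zhihu.hot.find().sort('date', -1).limit(10):
    print(doc['date'], doc['title'], doc['value'])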
                'cReply': item['current_reply'],
                'cJannchie': item['current_jannchie'],
                'cDatetime': item['current_datetime'],
                'author': item['author'],
                'subChannel': item['subChannel'],
                'channel': item['channel'],
                'mid': item['mid'],
                'pic': item['pic'],
                'title': item['title'],
                'datetime': datetime.datetime.fromtimestamp(
                    item['datetime'])
            },
            # Prepend the new data point so data[0] is always the latest sample.
            '$push': {
                'data': {
                    '$each': [item['data']],
                    '$position': 0
                }
            }
        }, True)
        if 'object_id' in item:
            self.sentCallBack(item['object_id'], db['user_record'])
        return item


if __name__ == "__main__":
    s = BiliobVideoSpider("biliob-video-spider")
    sc = SimpyderConfig()
    sc.USER_AGENT = FAKE_UA
    s.set_config(sc)
    s.run()
            data.append({
                'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
                'title': title,
                'value': int(point)
            })
        except Exception as e:
            print('[ERROR] {}'.format(title))
    return data


f = csv.DictWriter(open('./zhihu.csv', 'w', encoding='utf-8-sig'),
                   fieldnames=['date', 'title', 'value'])
f.writeheader()  # emit the header row; DictWriter does not write it automatically


def save(items):
    for item in items:
        f.writerow(item)


s = Spider()
s.assemble(get_url, parse, save)
sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 1
sc.COOKIE = cookie
sc.USER_AGENT = FAKE_UA
s.set_config(sc)
s.run()
import datetime

from db import db
from simpyder import Spider, FAKE_UA, SimpyderConfig


class SiteInfoSpider(Spider):
    def gen_url(self):
        yield 'https://api.bilibili.com/x/web-interface/online'

    def parse(self, res):
        return res.json()['data']

    def save(self, item):
        item['datetime'] = datetime.datetime.utcnow() + datetime.timedelta(
            hours=8)
        db.site_info.insert_one(item)
        return item


if __name__ == "__main__":
    s = SiteInfoSpider("site-info")
    sc = SimpyderConfig()
    sc.PARSE_THREAD_NUMER = 1
    sc.DOWNLOAD_INTERVAL = 10
    sc.LOG_LEVEL = "DEBUG"
    sc.USER_AGENT = FAKE_UA
    sc.COOKIE = ''
    s.set_config(sc)
    s.run()
'''
This is a demo. It crawls the page titles of Bilibili videos whose AV number
is below 100.
'''
from simpyder import Spider
from simpyder import SimpyderConfig


def gen_url():
    for each_id in range(100):
        yield "https://www.bilibili.com/video/av{}".format(each_id)


def parse(response):
    return response.xpath('//meta[@name="title"]/@content')[0]


def save(item):
    print(item)


if __name__ == "__main__":
    s1 = Spider("BILIBILI TITLE SPIDER", gen_url, parse, save)
    sc = SimpyderConfig()
    sc.COOKIE = "example:value;"
    sc.USER_AGENT = "my user agent"
    s1.assemble(gen_url=gen_url, parse=parse, save=save, config=sc)
    s1.run()
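The demo above wires its callbacks through the Spider constructor and assemble(); the Zhihu CSV spider instead calls assemble() and then set_config() separately. Below is a throttled variant of the demo in that second style, sketched as an assumption rather than taken from the project; the interval value is illustrative, and DOWNLOAD_INTERVAL is presumably measured in seconds, judging from the other files.

from simpyder import Spider, FAKE_UA, SimpyderConfig


def gen_url():
    for each_id in range(100):
        yield "https://www.bilibili.com/video/av{}".format(each_id)


def parse(response):
    return response.xpath('//meta[@name="title"]/@content')[0]


def save(item):
    print(item)


if __name__ == "__main__":
    s = Spider()
    s.assemble(gen_url=gen_url, parse=parse, save=save)
    sc = SimpyderConfig()
    sc.USER_AGENT = FAKE_UA
    sc.DOWNLOAD_INTERVAL = 1     # assumed seconds: wait about 1 s between downloads
    sc.PARSE_THREAD_NUMER = 1
    s.set_config(sc)
    s.run()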