def set_ua_from_fua():
    """
    Read one user-agent string per browser family from fake_useragent and
    store the resulting mapping through ``RedisClient.put`` under the
    ``useragents`` key.
    """
    browser_names = ('chrome', 'ie', 'opera', 'firefox', 'safari')
    ua_source = fake_useragent.UserAgent()
    # Attribute access on the UserAgent object yields the UA string for
    # that browser family; the names are visited in the listed order.
    agents_by_browser = {
        browser: getattr(ua_source, browser) for browser in browser_names
    }
    RedisClient().put('useragents', agents_by_browser)
def __init__(self):
    """
    Set the default request headers and load the stored user agents.

    The user-agent mapping is read with ``get_all('useragents')``; as long
    as that read comes back empty, the mapping is regenerated through
    ``Headers.set_ua_from_fua`` and read again.
    """
    self.headers = dict([
        ('Accept-Encoding', 'gzip, deflate, sdch'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Connection', 'keep-alive'),
    ])

    store = RedisClient()
    while True:
        self.ua_dict = store.get_all('useragents')
        if self.ua_dict:
            break
        # Nothing stored yet: generate a fresh set, then re-read it.
        Headers.set_ua_from_fua()
def __init__(self, *args, **kwargs):
    """
    Initialise the base spider with the given arguments, then keep the
    object returned by ``RedisClient().getInstance()`` on the spider.
    """
    super(AnchorSpider, self).__init__(*args, **kwargs)
    print("__init__")
    client_wrapper = RedisClient()
    self.redis_client = client_wrapper.getInstance()
class AnchorSpider(scrapy.Spider):
    """
    Spider that walks the Douyu anchor list API and collects anchor data.

    1. Every anchor on a list page becomes an item. When
       ``redis_client.exists`` reports a stored record for the room, the
       list-level item is emitted directly; otherwise the room-info API is
       requested and handled by ``parse_anchor_info``.
    2. The next list page is requested with ``offset`` advanced by the
       number of anchors just received, until a page yields no anchors.
    """

    name = "douyu_anchor"
    allowed_domains = []
    start_urls = [apiconstants.get_api_douyu_list_url(0)]

    # Number of list entries consumed so far; used as the paging offset.
    offset = 0

    def __init__(self, *args, **kwargs):
        super(AnchorSpider, self).__init__(*args, **kwargs)
        print("__init__")
        self.redis_client = RedisClient().getInstance()

    @staticmethod
    def _extract_data(response):
        """Return the ``data`` payload of a successful API response.

        Returns None when the body is empty, the decoded JSON is falsy, or
        the ``error`` field is non-zero.
        """
        if not response.body:
            return None
        result = json.loads(response.body)
        if result and int(result['error']) == 0:
            return result['data']
        return None

    @staticmethod
    def _build_list_item(anchor_item):
        """Build an item from one entry of the anchor list payload."""
        anchor = AncharItem()
        anchor['room_id'] = anchor_item['room_id']
        anchor['room_href'] = anchor_item['url']
        anchor['room_name'] = anchor_item['room_name']
        anchor['room_status'] = anchor_item['show_status']
        anchor['room_thumb'] = anchor_item['room_src']
        anchor['nickname'] = anchor_item['nickname']
        anchor['avatar'] = anchor_item['avatar']
        anchor['sex'] = 0
        anchor['weight'] = 0  # owner_weight
        anchor['cate_id'] = anchor_item['cate_id']
        anchor['start_time'] = anchor_item['show_time']
        anchor['fans_num'] = anchor_item['fans']
        anchor['online_num'] = anchor_item['online']
        # The list API carries no gift data. Set an empty list so that
        # consumers reading 'gift_list' do not hit a KeyError on an unset
        # field when this item is emitted directly.
        anchor['gift_list'] = []
        return anchor

    def parse(self, response):
        """Parse one anchor list page.

        Yields, per anchor, either the list-level item or a room-info
        ``Request``; then yields a ``Request`` for the next list page
        unless the page was empty or the response was unusable.
        """
        result_anchor_list = self._extract_data(response)
        if result_anchor_list is None:
            # Empty body or API error: the offset cannot advance, so asking
            # for the same page again would make no progress. Stop here.
            print("爬取结束")
            return

        result_count = len(result_anchor_list)
        print("result_count : " + str(result_count))

        anchor_uids = []
        for anchor_item in result_anchor_list:
            # Entries with a "jumpUrl" redirect to an external site (e.g.
            # Penguin Live); they are not treated specially.
            anchor = self._build_list_item(anchor_item)
            anchor_uids.append(anchor['room_id'])

            # NOTE(review): the "1" is presumably the Douyu platform id;
            # confirm it matches the id used when these keys are written.
            anchor_redis_name = 'anchor:1' + ":" + str(anchor['room_id'])
            if self.redis_client.exists(anchor_redis_name):
                # A record already exists: skip the room-info request.
                yield anchor
            else:
                # Hand the room over to the per-anchor detail parser.
                roominfo_url = apiconstants.get_douyu_roominfo_url(
                    anchor['room_id'])
                yield Request(url=roominfo_url,
                              callback=self.parse_anchor_info)

        self.offset = self.offset + result_count
        print(anchor_uids)

        if result_count == 0:
            print("爬取结束")
            return

        # Request the next list page and parse it with this same method.
        url = apiconstants.get_api_douyu_list_url(self.offset)
        yield Request(url=url, callback=self.parse)

    def parse_anchor_info(self, response):
        """Parse a room-info response into a single anchor item.

        Yields nothing when the response is empty or reports an error.
        """
        result_anchor_info = self._extract_data(response)
        if result_anchor_info is None:
            return

        anchor_info = AncharItem()
        anchor_info['room_id'] = result_anchor_info['room_id']
        # The room link is not taken from this payload; it is left empty.
        anchor_info['room_href'] = ""
        anchor_info['room_name'] = result_anchor_info['room_name']
        anchor_info['room_status'] = result_anchor_info['room_status']
        anchor_info['room_thumb'] = result_anchor_info['room_thumb']
        anchor_info['nickname'] = result_anchor_info['owner_name']
        anchor_info['avatar'] = result_anchor_info['avatar']
        anchor_info['sex'] = 0
        anchor_info['weight'] = 0  # owner_weight
        anchor_info['cate_id'] = result_anchor_info['cate_id']
        anchor_info['cate_name'] = result_anchor_info['cate_name']
        anchor_info['start_time'] = result_anchor_info['start_time']
        anchor_info['fans_num'] = result_anchor_info['fans_num']
        anchor_info['online_num'] = result_anchor_info['online']
        anchor_info['gift_list'] = result_anchor_info['gift']
        yield anchor_info
def __init__(self):
    """
    Print a construction marker and keep the object returned by
    ``RedisClient().getInstance()`` as ``self.redis_client``.
    """
    print("__init__")
    client_wrapper = RedisClient()
    self.redis_client = client_wrapper.getInstance()
def __init__(self, dbpool):
    """
    Keep the given database pool and create a ``RedisClient`` wrapper.

    :param dbpool: pool object stored as ``self.dbpool``
    """
    self.dbpool = dbpool
    redis_wrapper = RedisClient()
    self.redis_client = redis_wrapper
class MysqlTwistedPipeline(object):
    """
    Item pipeline that inserts anchors into MySQL through a Twisted adbapi
    pool and mirrors anchor and gift data into Redis.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.redis_client = RedisClient()

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline with a connection pool from the project config."""
        # Copy first so the shared config mapping is not modified in place.
        dbparms = dict(config.DB_config.get("mysql"))
        dbparms['db'] = config.database
        dbparms['cursorclass'] = MySQLdb.cursors.DictCursor
        dbparms['use_unicode'] = True
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Persist one anchor item and return it for later pipeline stages.

        The MySQL insert is scheduled on the pool; the Redis writes happen
        inline. Items without a ``gift_list`` are accepted.
        """
        # Run the MySQL insert asynchronously via Twisted; failures are
        # reported through the errback instead of being raised here.
        query = self.dbpool.runInteraction(self.do_insert_anthor, item)
        query.addErrback(self.handle_error)

        anthor_id = int(item['room_id'])
        platform = str(apiconstants.PLATFORM_DOUYU)
        redis_conn = self.redis_client.getInstance()

        # Store gift data in Redis, keyed by platform plus gift id. The id
        # is stringified because the payload may carry it as a number.
        for gift in item.get('gift_list') or []:
            gift_redis_name = 'gift:' + platform + ":" + str(gift['id'])
            redis_conn.hmset(gift_redis_name, dict(gift))

        # Store the anchor fields in Redis without the gift list. A copy is
        # used so the item handed to the MySQL interaction and to later
        # pipeline stages stays intact.
        anchor_fields = dict(item)
        anchor_fields.pop('gift_list', None)
        anchor_redis_name = 'anchor:' + platform + ":" + str(anthor_id)
        redis_conn.hmset(anchor_redis_name, anchor_fields)

        # Record the room id in the per-platform id set.
        anchor_id_list_redis_name = 'anchor_id_list:' + platform
        redis_conn.sadd(anchor_id_list_redis_name, anthor_id)

        return item

    def do_insert_anthor(self, cursor, item):
        """Insert the anchor row unless one exists for this room.

        :param cursor: DB-API cursor supplied by ``runInteraction``
        :param item: mapping holding the anchor fields
        """
        # Values are passed as parameters so the driver escapes them.
        exist_sql = "select * from anthor where platform=%s and room_id=%s"
        cursor.execute(exist_sql, (1, item['room_id']))
        if cursor.fetchall():
            print("存在主播数据")
            return

        print("不存在主播数据,入库")
        insert_sql = """
            insert into anthor(nickname,avatar,sex,weight,platform,room_id,room_href,room_name,room_thumb,cate_id,fans_num)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item['nickname'], item['avatar'], item['sex'], item['weight'], 1,
            item['room_id'], item['room_href'], item['room_name'],
            item['room_thumb'], item['cate_id'], item['fans_num']))

    def do_insert_gift(self, cursor, item):
        """Insert the gift row unless one exists for this gift id.

        :param cursor: DB-API cursor supplied by ``runInteraction``
        :param item: mapping holding the gift fields
        """
        # The lookup uses the gift id, the same key the insert writes.
        exist_sql = "select * from gift where platform=%s and gid=%s"
        cursor.execute(exist_sql, (1, item['gid']))
        if cursor.fetchall():
            print("存在主播数据")
            return

        print("不存在主播数据,入库")
        # `desc` is a reserved word in MySQL and must be back-quoted.
        insert_sql = """
            insert into gift(gid,name,`desc`,intro,platform,cost,contribution)
            VALUES (%s,%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item['gid'], item['name'], item['desc'], item['intro'],
            item['platform'], item['cost'], item['contribution']))

    def handle_error(self, failure):
        """Report a failure raised by an asynchronous database interaction."""
        print(failure)