# SPRedisSpider and the log() helper are provided by the SP project
# (their imports are omitted in this excerpt).


class ImoocSpider(SPRedisSpider):
    name = 'imooc'
    redis_key = f'{name}:start_urls'
    allowed_domains = []
    start_urls = ['http://www.imooc.com/']
    custom_settings = {
        'LOG_LEVEL': "INFO",
        'LOG_FILE': log(name),
        'CONCURRENT_REQUESTS': 5,  # concurrency limit (default 16)
        'DOWNLOAD_DELAY': 3,  # download delay in seconds (default 0)
        'DOWNLOADER_MIDDLEWARES': {
            'SP.middlewares.UserAgentMiddleWare.UserAgentMiddleWare': 100,
            # 'SP.middlewares.HeadersMiddleWare.MiddleWare': 101,  # add headers via meta
            # 'SP.middlewares.CookiesMiddleWare.MiddleWare': 102,  # add cookies via meta
            # 'SP.middlewares.PayloadMiddleWare.MiddleWare': 103,  # add a payload via meta
            # 'SP.middlewares.ProxyMiddleWare.ProxyMiddleWare': 104,  # use proxy IPs
            # 'SP.middlewares.RequestsMiddleWare.RequestMiddleWare': 105,  # fetch with the requests library
            # 'scrapy_splash.SplashCookiesMiddleware': 723,  # Splash via meta; enable all three Splash middlewares together
            # 'scrapy_splash.SplashMiddleware': 725,  # Splash via meta; enable all three Splash middlewares together
            # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,  # Splash via meta; enable all three Splash middlewares together
            # 'SP.middlewares.SizeRetryMiddleware.MiddleWare': 900,  # retry middleware: set MINSIZE (int) to auto-retry when len(response.body) falls below it
        },
    }

    def parse(self, response):
        pass
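# The UserAgentMiddleWare enabled at priority 100 above belongs to the SP
# project and its source is not shown in this excerpt. Below is a minimal
# sketch of what a middleware in that role typically looks like, assuming it
# does nothing beyond rotating the User-Agent header (the class name and the
# UA strings are placeholders, not the project's real implementation):

import random


class UserAgentMiddleWareSketch:
    """Downloader middleware sketch: pick a random User-Agent per request."""

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None  # returning None lets Scrapy continue down the middleware chain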
import scrapy

# SPRedisSpider, log(), FirstItemLoader and ProxyItem are provided by the SP
# project (their imports are omitted in this excerpt).


class ProxySpider(SPRedisSpider):
    name = 'proxy'
    redis_key = f'{name}:start_urls'
    allowed_domains = []
    start_urls = ['https://www.kuaidaili.com/free/inha/']
    custom_settings = {
        'LOG_LEVEL': "INFO",
        'LOG_FILE': log(name),
        'CONCURRENT_REQUESTS': 10,  # concurrency limit (default 16)
        'DOWNLOAD_DELAY': 1,  # download delay in seconds (default 0)
        'ITEM_PIPELINES': {
            # 'SP.pipelines.ipproxy_pipelines.MysqlTwistedPipline': 1,
            'SP.pipelines.ipproxy_pipelines.RedisPipeline': 2,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'SP.middlewares.UserAgentMiddleWare.UserAgentMiddleWare': 10,
        },
    }

    def parse(self, response):
        for i in range(1, 50):
            request_url = "{0}{1}/".format(self.start_urls[0], i)
            yield scrapy.Request(url=request_url, callback=self.parse_detail)

    def parse_detail(self, response):
        all_trs = response.xpath("//*[@id='list']//tr")
        for tr in all_trs[1:]:
            # Build a fresh item (and loader) per row so asynchronous database
            # inserts never share one item instance across the main thread.
            item_loader = FirstItemLoader(item=ProxyItem(), response=response)
            texts = tr.css("td::text").extract()
            item_loader.add_value("ip", texts[0])
            item_loader.add_value("port", texts[1])
            item_loader.add_value("type", texts[3])
            item_loader.add_value("updated_time", texts[-1])
            item = item_loader.load_item()
            yield item
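# The RedisPipeline enabled above lives in SP.pipelines.ipproxy_pipelines and
# is not shown in this excerpt. A minimal sketch of a pipeline in that role,
# assuming it merely collects "ip:port" strings into a Redis set (the key
# name, connection parameters, and scalar item fields are all assumptions):

import redis


class RedisPipelineSketch:
    """Item pipeline sketch: store scraped proxies in a Redis set."""

    def open_spider(self, spider):
        self.client = redis.Redis(host='localhost', port=6379, db=0)

    def process_item(self, item, spider):
        # The 'ip' and 'port' fields are filled by parse_detail above.
        self.client.sadd('proxy:pool', f"{item['ip']}:{item['port']}")
        return item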
from bs4 import BeautifulSoup

# SPRedisSpider, log(), md5(), get_file_type(), ScheduledRequest, RedisCtrl and
# the zhifang_* item classes are provided by the SP project (their imports are
# omitted in this excerpt).


class zhifang_Spider(SPRedisSpider):
    name = 'zhifang'
    redis_key = f'{name}:start_urls'
    allowed_domains = []
    custom_settings = {
        'LOG_LEVEL': "INFO",
        'LOG_FILE': log(name),
        # 'CONCURRENT_REQUESTS': 5,  # concurrency limit (default 16)
        # 'DOWNLOAD_DELAY': 3,  # download delay in seconds (default 0)
        'ITEM_PIPELINES': {
            # 'SP.pipelines.pipelines_file.FilePipeline': 100,  # attachment download
            # 'SP.pipelines.pipelines_clean.CleanPipeline': 101,  # field cleaning
            # 'SP.pipelines.zhifang_pipelines.RdbmPipeline': 200,  # relational database
            # 'SP.pipelines.zhifang_pipelines.HbasePipeline': 201,  # HBase
            # 'SP.pipelines.zhifang_pipelines.MongodbPipeline': 202,  # MongoDB
            # 'SP.pipelines.zhifang_pipelines.KafkaPipeline': 203,  # Kafka
            'SP.pipelines.zhifang_pipelines.ElasticSearchPipeline': 204,  # Elasticsearch
        },
        'DOWNLOADER_MIDDLEWARES': {
            'SP.middlewares.UserAgentMiddleWare.UserAgentMiddleWare': 100,
            # 'SP.middlewares.HeadersMiddleWare.MiddleWare': 101,  # add headers via meta
            # 'SP.middlewares.CookiesMiddleWare.MiddleWare': 102,  # add cookies via meta
            # 'SP.middlewares.PayloadMiddleWare.MiddleWare': 103,  # add a payload via meta
            # 'SP.middlewares.ProxyMiddleWare.ProxyMiddleWare': 104,  # use proxy IPs
            # 'SP.middlewares.RequestsMiddleWare.RequestMiddleWare': 105,  # fetch with the requests library
            # 'scrapy_splash.SplashCookiesMiddleware': 723,  # Splash via meta; enable all three Splash middlewares together
            # 'scrapy_splash.SplashMiddleware': 725,  # Splash via meta; enable all three Splash middlewares together
            # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,  # Splash via meta; enable all three Splash middlewares together
            'SP.middlewares.SizeRetryMiddleware.MiddleWare': 900,  # retry middleware: set MINSIZE (int) to auto-retry when len(response.body) falls below it
        },
    }

    def get_callback(self, callback):
        # URL dedup flag per callback: True = skip dedup, False = dedup
        callback_dt = {
            'list': (self.list_parse, True),
            'detail': (self.detail_parse, True),
        }
        return callback_dt.get(callback)

    def list_parse(self, response):
        rows = BeautifulSoup(response.text, 'lxml').find_all('div', class_="fangyuan_list-con")
        reqs = []
        for row in rows:
            detail_url = row.find('a').get('href')
            list_item = zhifang_list_Item()
            # save a value for your item here, e.g.:
            # list_item['title'] = row.find('a').text
            list_item['tit'] = row.find('p', class_="tit").text
            list_item['txt'] = row.find('p', class_="txt").text
            list_item['tit2'] = row.find_all('p', class_="tit")[-1].text
            list_item['price'] = row.find('p', class_="price").text
            list_item['agent'] = row.find('p', class_="name").text
            # default columns
            list_item['detail_full_url'] = response.urljoin(detail_url)
            list_item['pkey'] = md5(list_item['detail_full_url'])
            list_item['pagenum'] = response.meta.get('pagenum')
            yield list_item
            req = ScheduledRequest(
                url=list_item['detail_full_url'],
                method='GET',
                callback='detail',
                body={},  # for POST requests, put the post dict here
                meta={
                    'fkey': list_item['pkey'],
                    'pagenum': list_item['pagenum'],
                    # anti-crawling meta dicts also go here; enable the matching middleware in the spider
                    # 'cookies': {},  # basic anti-crawling
                    # 'splash': {'wait': 2},  # JS-loaded / async-rendered pages
                })
            reqs.append(req)
        # push the detail links to Redis as new tasks
        RedisCtrl().reqs_push(self.redis_key, reqs)

    def detail_parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        prs = soup.find('div', class_="price clearfix").find_all('li')
        describe = soup.find('dl', class_="describe")
        cols = describe.find_all('dd')
        detail_item = zhifang_detail_Item()
        # save a value for your item here, e.g.:
        # detail_item['title'] = soup.find('h1').text
        detail_item['type1'] = prs[0].text
        detail_item['type2'] = prs[1].text
        detail_item['type3'] = prs[2].text
        detail_item['plot_name'] = cols[0].text
        detail_item['area'] = cols[1].text
        detail_item['look_time'] = cols[2].text
        detail_item['source_id'] = cols[3].text
        # default columns
        detail_item['fkey'] = response.meta.get('fkey')
        detail_item['pagenum'] = response.meta.get('pagenum')
        yield detail_item
        imgs = soup.find('ul', class_="bigImg").find_all('li')
        for px, img in enumerate(imgs, 1):
            file_url = img.find('a').get('href')
            file_item = zhifang_file_Item()
            # save a value for your item here, e.g.:
            # file_item['title'] = soup.find('h1').text
            file_item['file_url'] = response.urljoin(file_url)
            file_item['file_name'] = f"{detail_item['plot_name']}/{px}"
            file_item['file_type'] = get_file_type(file_url, 'jpg')
            # default columns
            file_item['fkey'] = response.meta.get('fkey')
            file_item['pagenum'] = response.meta.get('pagenum')
            yield file_item
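# The SizeRetryMiddleware enabled at priority 900 above is part of the SP
# project; its source is not shown in this excerpt. Below is a minimal sketch
# of the behaviour its comment describes -- retry whenever len(response.body)
# falls below the MINSIZE setting -- built on Scrapy's stock RetryMiddleware
# (the class name and setting handling are assumptions):

from scrapy.downloadermiddlewares.retry import RetryMiddleware


class SizeRetryMiddlewareSketch(RetryMiddleware):
    """Retry responses whose body is shorter than the MINSIZE setting."""

    def __init__(self, settings):
        super().__init__(settings)
        self.min_size = settings.getint('MINSIZE', 0)

    def process_response(self, request, response, spider):
        # A suspiciously small body usually means a ban page or an empty
        # shell; hand the request back to RetryMiddleware's retry logic.
        if self.min_size and len(response.body) < self.min_size:
            reason = f'body below MINSIZE ({len(response.body)} < {self.min_size})'
            return self._retry(request, reason, spider) or response
        return response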