def __init__(self, *args, **kwargs):
    super(PansosoSpider3, self).__init__(*args, **kwargs)
    self.level3_file = settings.get('LEVEL3_FILE')
    self.level4_file = settings.get('LEVEL4_FILE')
    # Start with a fresh level-4 output, then seed start_urls from the level-3 file
    rm_file(self.level4_file)
    self.start_urls = read_file(self.level3_file)
def _prase_prepare(self):
    """Prepare files before parsing."""
    if self.mode == 'override':
        rm_file(self.level2_file)
    write_file(self.level1_file, self.start_urls, mode=self.mode)
    print("Wrote file [%s] successfully" % self.level1_file)
def __init__(self, mode='append', *args, **kwargs):
    super(DashengpanSpider2, self).__init__(*args, **kwargs)
    self.mode = mode
    self.level2_file = settings.get('LEVEL2_FILE')
    self.level3_file = settings.get('LEVEL3_FILE')
    if self.mode == 'override':
        rm_file(self.level3_file)
    self.start_urls = read_file(self.level2_file)
def _prase_prepare(self):
    """Prepare files before parsing."""
    self.level1_file = settings.get('LEVEL1_FILE')
    self.level2_file = settings.get('LEVEL2_FILE')
    rm_file(self.level1_file)
    rm_file(self.level2_file)
    write_file(self.level1_file, self.start_urls, mode='override')
    print("Wrote file [%s] successfully" % self.level1_file)
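These spiders depend on three project helpers, rm_file, read_file, and write_file, whose definitions are not shown here. Below is a minimal sketch of what they are assumed to do, inferred only from how they are called (read_file returns a list of URLs, one per line; write_file takes a list and an 'append'/'override' mode):

import os


def rm_file(path):
    # Remove the file if it exists; missing files are ignored.
    if path and os.path.exists(path):
        os.remove(path)


def read_file(path):
    # Return the non-empty, stripped lines of the file as a list.
    if not path or not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def write_file(path, lines, mode='append'):
    # mode='override' truncates the file, mode='append' adds to it.
    open_mode = 'w' if mode == 'override' else 'a'
    with open(path, open_mode, encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')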
def __init__(self, *args, **kwargs):
    super(PansosoSpider2, self).__init__(*args, **kwargs)
    self.level2_file = settings.get('LEVEL2_FILE')
    self.level3_file = settings.get('LEVEL3_FILE')
    # Reset the level-3 output, then seed start_urls from the level-2 file
    rm_file(self.level3_file)
    self.start_urls = read_file(self.level2_file)
def __init__(self, mode='append', *args, **kwargs):
    super(DashengpanSpider2, self).__init__(*args, **kwargs)
    self.mode = mode
    self.level2_file = settings.get('LEVEL2_FILE')
    self.result_file = settings.get('RESULT_FILE')
    if self.mode == 'override':
        rm_file(self.result_file)
    self.start_urls = read_file(self.level2_file)
    # Keep only well-formed URLs
    self.start_urls = [i for i in self.start_urls if i.startswith('http')]
    if settings.get("IS_USE_DELAY_LOAD_URL", False):
        # Delayed (lazy) loading: render pages with a real browser
        self.browser = webdriver.Chrome()
        self.browser.set_page_load_timeout(30)
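This fragment only creates the shared Chrome driver; the code that uses it is not shown. A minimal sketch, assuming the delayed-load path fetches each URL with the shared Selenium browser and returns the rendered HTML for the usual XPath extraction (the method name fetch_rendered is hypothetical):

from selenium.common.exceptions import TimeoutException


def fetch_rendered(self, url):
    # Hypothetical helper: render a lazily loaded page in the shared Chrome
    # instance and return its HTML, or None if loading times out.
    try:
        self.browser.get(url)
    except TimeoutException:
        return None
    return self.browser.page_source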
class Pansoso2Spider(scrapy.Spider):
    u"""Pansoso spider 02 - extract download info from the input pages."""
    name = "pansoso2"
    allowed_domains = ["www.pansoso.com"]
    settings = get_project_settings()
    level2_file = settings.get('LEVEL2_FILE')
    level3_file = settings.get('LEVEL3_FILE')
    rm_file(level3_file)
    start_urls = read_file(level2_file)

    def parse(self, response):
        if response.status == 200:
            selector = scrapy.Selector(response)
            # <div class="down"> blocks hold the download links
            infos = selector.xpath('//div[@class="down"]')
            level3_urls = []
            for info in infos:
                hrefs = info.xpath('a/@href').extract()
                hrefs = [i for i in hrefs if '.html' not in i]
                if not hrefs:
                    continue
                level3_urls.append(hrefs[0])
            write_file(self.level3_file, level3_urls, mode='append')
class PansosoSpiderThread(scrapy.Spider):
    u"""Pansoso spider 03 - extract Baidu Cloud links."""
    name = "pansoso3"
    allowed_domains = ["www.pansoso.com"]
    settings = get_project_settings()
    level3_file = settings.get('LEVEL3_FILE')
    level4_file = settings.get('LEVEL4_FILE')
    rm_file(level4_file)
    start_urls = read_file(level3_file)

    def parse(self, response):
        time.sleep(0.5)  # throttle a little between pages
        if response.status == 200:
            selector = scrapy.Selector(response)
            infos = selector.xpath('//div[@class="file"]')
            level4_urls = []
            for info in infos:
                href = info.xpath('p/a/@href').extract()[0]
                print(href)
                level4_urls.append(href)
            write_file(self.level4_file, level4_urls, mode='append')
def __init__(self, search_text='excel', page=1, mode='append', *args, **kwargs):
    u"""Spider arguments.

    exp: scrapy crawl myspider -a http_user=myuser -a http_pass=mypassword -a user_agent=mybot

    :param search_text: text to search for
    :param page: number of result pages to crawl
    :param mode: override / append - overwrite or append to the output files
    """
    super(DashengpanSpider1, self).__init__(*args, **kwargs)
    page = int(page)
    self.mode = mode
    self.base_url = "https://www.dashengpan.com"
    self.level1_file = settings.get('LEVEL1_FILE')
    self.level2_file = settings.get('LEVEL2_FILE')
    print('>>>>>>>> @Spider_name: %s @search_text: %s @page: %s' % (self.name, search_text, page))
    start_url = 'https://www.dashengpan.com/search?keyword=%s' % (search_text)
    # Escape characters that are not safe in a query string
    http_dic = {
        ' ': '%20',
    }
    for key, val in list(http_dic.items()):
        start_url = start_url.replace(key, val)
    self.start_urls = []
    for i in range(1, page + 1):
        self.start_urls.append(start_url + '&page=%s' % i)
    if self.mode == 'override':
        rm_file(self.level1_file)
    self._prase_prepare()
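For reference, a minimal sketch of driving this spider from Python instead of the scrapy crawl command shown in the docstring; the keyword arguments map one-to-one to the -a options (the class name DashengpanSpider1 is taken from the super() call above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Same effect as: scrapy crawl <spider name> -a search_text=excel -a page=2 -a mode=override
process = CrawlerProcess(get_project_settings())
process.crawl(DashengpanSpider1, search_text='excel', page=2, mode='override')
process.start()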