# Module-level imports this code relies on (defined above this section):
# import copy, random, re, time
# import scrapy
# from urllib.parse import urljoin
# sudaMainItem is the project's item class (import path depends on the project layout).

def parse(self, response):
    # self.count = self.count + 1
    # print('This is page', self.count)
    print('Currently crawling page: ' + response.request.url.strip('*/'))
    print('Current pool size:', len(self.url_pool))
    titles = response.xpath('//a/@href').extract()
    basic_url = response.request.url.strip('*/')
    # Normalize each extracted href into an absolute URL
    for url in titles:
        # print(url)
        item = sudaMainItem()
        matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url, re.M | re.I)
        matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
        matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)
        if url:
            if matchFullUrl:
                true_url = url  # already absolute
                # print('original url', true_url)
            elif matchRelateUrl:
                true_url = basic_url + url  # root-relative: append directly
                # print('joined url 1', true_url)
            elif matchRelateUrl2:
                true_url = basic_url + '/' + url  # relative: insert a separator
                # print('joined url 2', true_url)
            else:
                true_url = url
                # print('unhandled and unmatched', true_url)
            if self.judge_suda(true_url):
                item['father'] = basic_url
                item['url'] = true_url
                self.url_pool.add(true_url)
                yield item
            # if true_url not in self.url_pool:
            #     item['distinct_url'] = true_url
            # else:
            #     item['distinct_url'] = 'duplicate'
            # yield item
    # url_list = self.getDistinctUrls()
    # print(url_list)
    # Iterate over a snapshot so the pool can keep growing while requests are scheduled
    url_pool_copy = copy.deepcopy(self.url_pool)
    # url_pool_copy = list(self.url_pool)
    for next_url in url_pool_copy:
        if 'http://' in next_url or 'https://' in next_url:
            yield scrapy.Request(next_url, self.parse, dont_filter=False)
        else:
            yield scrapy.Request('http://' + next_url, self.parse, dont_filter=False)
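# Note (illustrative): the three regex branches in parse() approximate what
# urllib.parse.urljoin does; parsePage() below switches to urljoin directly.
# The example values here are assumptions for illustration only:
#
#   urljoin('http://www.suda.edu.cn/news', 'http://example.com/a')
#       -> 'http://example.com/a'                 (absolute link kept as-is)
#   urljoin('http://www.suda.edu.cn/news', '/about')
#       -> 'http://www.suda.edu.cn/about'         (root-relative: joined to the host)
#   urljoin('http://www.suda.edu.cn/news', 'about.html')
#       -> 'http://www.suda.edu.cn/about.html'    (relative: replaces the last path segment)
#
# Unlike the manual string concatenation above, urljoin resolves relative paths
# against the base URL's path, so the two approaches can produce different results.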
def parsePage(self, response):
    # self.count = self.count + 1
    # print('This is page', self.count)
    print('Currently crawling page: ' + response.request.url.strip('*/'))
    # Crude politeness throttle; Scrapy's DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY
    # settings are the more idiomatic way to achieve the same effect.
    randomdelay = random.randint(0, 4)
    time.sleep(randomdelay)
    print("### random delay: %s s ###" % randomdelay)
    # print('Current pool size:', len(self.url_pool))
    titles = response.xpath('//a/@href').extract()
    basic_url = response.request.url.strip('*/')
    # Normalize each extracted href into an absolute URL
    for url in titles:
        # print(url)
        item = sudaMainItem()
        matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url, re.M | re.I)
        # matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
        # matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)
        matchUselessUrl = re.match(r'^#([\w.]?/?)\S*', url, re.M | re.I)
        # matchParams = re.match(r'^\?([\w.]?/?)\S*', url, re.M | re.I)
        if url:
            if matchFullUrl:
                true_url = url  # already absolute
            elif matchUselessUrl:
                true_url = basic_url  # in-page anchor (#...): keep the page itself
            else:
                true_url = urljoin(basic_url, url)  # let urljoin resolve everything else
                # print('unhandled and unmatched', true_url)
            if self.judge_suda(true_url):
                item['father'] = basic_url
                item['url'] = true_url
                self.url_pool.add(true_url)
                yield item
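# judge_suda() is defined elsewhere in the spider and filters which URLs are
# kept. A minimal sketch of what it might look like, assuming it simply keeps
# links under the university's domain (the real implementation may differ):
#
#     from urllib.parse import urlparse
#
#     def judge_suda(self, url):
#         # Hypothetical: accept only hosts under suda.edu.cn.
#         return urlparse(url).netloc.endswith('suda.edu.cn')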