def is_crawled(self, url_content):
    # No Redis connection: cannot deduplicate, signal the caller with -1.
    if not self.conn:
        return -1
    content = md5_encode(url_content)
    # SADD returns 1 if the digest was newly added (content not seen before)
    # and 0 if it was already in the 'keys' set (content already crawled).
    ex = self.conn.sadd('keys', content)
    return ex
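# Neither md5_encode nor self.conn is defined in this section. A minimal
# sketch of what they might look like, assuming the redis-py client and a
# hex MD5 digest (both assumptions, not the project's confirmed setup):
import hashlib

import redis

def md5_encode(text):
    # Hex MD5 digest of a UTF-8 string, as is_crawled expects for dedup keys.
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# Hypothetical connection; host, port and db are placeholders.
conn = redis.StrictRedis(host='localhost', port=6379, db=0)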
def _real_parse_item(self, response):
    item = GovItem(domain_collection=None, html=None,
                   pdf=[], xls=[], images=[], others=[])
    # 1. Save the HTML page itself.
    filename = make_file_name(response.url, 'html')
    item['html'] = filename
    domain = response.url.split('/')[2]
    item['domain_collection'] = md5_encode(domain)
    abpath = DATA_DIR + item['domain_collection']
    if not os.path.exists(abpath):
        # Create the per-domain directory on the first page from this domain.
        os.makedirs(abpath)
    with open(os.path.join(abpath, filename), 'wb') as f:
        f.write(response.body)
    # 2. Save the other resources linked from the page.
    images = response.xpath('//img/@src').extract()
    pdf = response.xpath('//a/@href[contains(., ".pdf")]').extract()
    xls = response.xpath('//a/@href[contains(., ".xls")]').extract()
    for url in images + pdf + xls:
        # An earlier, commented-out variant routed each URL through a local
        # Splash instance (http://localhost:8050/render.html?url=...) instead.
        yield response.follow(url, callback=self.save_files,
                              cb_kwargs=dict(item=item))
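# GovItem and make_file_name come from elsewhere in the project. Plausible
# definitions, inferred only from how _real_parse_item uses them (a sketch,
# not the project's actual items module):
import hashlib

import scrapy

class GovItem(scrapy.Item):
    domain_collection = scrapy.Field()  # MD5 of the source domain
    html = scrapy.Field()               # filename of the saved page
    pdf = scrapy.Field()                # filenames of downloaded PDFs
    xls = scrapy.Field()                # filenames of downloaded spreadsheets
    images = scrapy.Field()             # filenames of downloaded images
    others = scrapy.Field()             # files that matched no known type

def make_file_name(url, ext):
    # Assumed helper: a stable, filesystem-safe name derived from the URL.
    return hashlib.md5(url.encode('utf-8')).hexdigest() + '.' + ext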
def save_files(self, response, item):
    abpath = DATA_DIR + item['domain_collection']
    # Name the file after the URL's digest, keeping the URL's last extension.
    # Note: split('.')[-1] misbehaves on URLs with query strings (see below).
    filename = md5_encode(response.url) + '.' + response.url.split('.')[-1]
    with open(os.path.join(abpath, filename), 'wb') as f:
        f.write(response.body)
    self.logger.info('Downloaded file: %s', filename)
    if filename.endswith('.pdf'):
        item['pdf'].append(filename)
    elif filename.endswith('.xls'):
        item['xls'].append(filename)
    elif filename.endswith('.png') or filename.endswith('.jpg'):
        item['images'].append(filename)
    else:
        item['others'].append(filename)
    # The same item instance is shared by all resource requests for a page,
    # so the pipeline receives it once per downloaded file.
    return item
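# save_files derives the extension with response.url.split('.')[-1], which
# breaks on URLs carrying query strings (e.g. report.pdf?v=2 yields
# 'pdf?v=2'). A more robust variant, offered as a sketch rather than a
# drop-in fix:
import os
from urllib.parse import urlparse

def url_extension(url, default='bin'):
    # Parse the path first so query strings and fragments are ignored.
    ext = os.path.splitext(urlparse(url).path)[1].lstrip('.').lower()
    return ext or default

# e.g. url_extension('http://example.gov.cn/files/report.PDF?dl=1') -> 'pdf'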