from xml.dom.minidom import parse


def create_spider(self):
    spider = Spider()
    xml = parse(self._filename)
    # getElementsByTagName returns a (possibly empty) NodeList, never None,
    # so truthiness is the correct emptiness check here
    params = xml.getElementsByTagName(self._parameters)
    if params:
        params = params[0]
        pages = params.getElementsByTagName(self._page)
        for page in pages:
            print(page.firstChild.data)
            spider.add_url(page.firstChild.data)
        domains = params.getElementsByTagName(self._domain)
        for domain in domains:
            print(domain.firstChild.data)
            spider.add_domain(domain.firstChild.data)
        depth = params.getElementsByTagName(self._depth)
        if depth:
            depth = depth[0]
            print(depth.firstChild.data)
            # node text is a string; the depth limit should be an int
            spider.set_max_depth(int(depth.firstChild.data))
    return spider
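# For orientation: create_spider reads its tag names from fields on the
# enclosing factory (self._parameters, self._page, self._domain, self._depth).
# The sketch below parses an in-memory config of the shape the method expects;
# the literal tag names and URLs are illustrative assumptions, not taken from
# the project.
from xml.dom.minidom import parseString

_EXAMPLE_CONFIG = '''<parameters>
    <page>http://example.com/start</page>
    <page>http://example.com/news</page>
    <domain>example.com</domain>
    <depth>2</depth>
</parameters>'''

_params = parseString(_EXAMPLE_CONFIG).getElementsByTagName('parameters')[0]
print([p.firstChild.data for p in _params.getElementsByTagName('page')])
# -> ['http://example.com/start', 'http://example.com/news']
print(int(_params.getElementsByTagName('depth')[0].firstChild.data))  # -> 2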
        # print('image expired')
        spider.add_url(options['thumbnail'], 'parse_binary', options)

from bs4 import BeautifulSoup  # needed by parse_tieba (normally at module top)


def parse_binary(response, spider, options):
    # response arrives as raw bytes for 'binary' callbacks; write it to disk
    with open(folder + '/' + options['member'] + '-' + options['date'] + '-' +
              str(options['image_index']) + '.jpg', 'wb') as f:
        f.write(response)
    print('binary write with',
          options['date'] + '-' + str(options['image_index']) + '.jpg')


def parse_tieba(response, spider, options):
    # Tieba hides parts of the page inside HTML comments; strip the comment
    # markers so BeautifulSoup parses that content as well
    bs = BeautifulSoup(
        response.replace('<!--', '').replace('-->', ''), 'html.parser')
    titles = bs.find_all(class_='j_th_tit')  # thread titles (handling elided)


if __name__ == '__main__':
    blog_spider = Spider()
    blog_spider.register_callback('parse_date', 'text', parse_date)
    # send queue to spider
    blog_spider.register_callback('parse_blog', 'text', parse_blog, True)
    blog_spider.register_callback('parse_image', 'text', parse_image)
    blog_spider.register_callback('parse_binary', 'binary', parse_binary)
    blog_spider.add_url(blog_prefix + 'manatsu.akimoto', 'parse_date',
                        {'member': 'manatsu.akimoto'})
    blog_spider.run()
    # print(get_members_urlprefix())
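# The main block above assumes this Spider contract: register_callback(name,
# kind, fn, ...) maps a name to a handler, add_url(url, callback_name, options)
# enqueues work, and run() fetches each URL and hands the body to the named
# callback, decoded for 'text' and left as raw bytes for 'binary'. Below is a
# minimal single-threaded sketch of that contract; every name in it is an
# assumption, and the real class's fourth register_callback argument
# (presumably threading/queueing, as hinted by the comment above) is accepted
# but ignored here.
from urllib.request import urlopen


class MiniSpider:
    def __init__(self):
        self._callbacks = {}   # name -> (kind, fn)
        self._queue = []       # (url, callback_name, options)

    def register_callback(self, name, kind, fn, threaded=False):
        self._callbacks[name] = (kind, fn)

    def add_url(self, url, callback_name, options=None):
        self._queue.append((url, callback_name, options or {}))

    def run(self):
        while self._queue:
            url, name, options = self._queue.pop(0)
            kind, fn = self._callbacks[name]
            body = urlopen(url).read()
            if kind == 'text':
                body = body.decode('utf-8', errors='replace')
            fn(body, self, options)   # callbacks may enqueue more URLs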