from xml.dom.minidom import parse


def create_spider(self):
    """Build a Spider from the XML config file held in self._filename."""
    spider = Spider()

    xml = parse(self._filename)
    # getElementsByTagName returns a (possibly empty) NodeList, never None,
    # so test for emptiness rather than identity.
    params = xml.getElementsByTagName(self._parameters)
    if params:
        params = params[0]

        # Seed URLs to crawl.
        pages = params.getElementsByTagName(self._page)
        for page in pages:
            print(page.firstChild.data)
            spider.add_url(page.firstChild.data)

        # Domains the spider is allowed to visit.
        domains = params.getElementsByTagName(self._domain)
        for domain in domains:
            print(domain.firstChild.data)
            spider.add_domain(domain.firstChild.data)

        # Maximum crawl depth; the text node holds a string, so convert it.
        depth = params.getElementsByTagName(self._depth)
        if depth:
            depth = depth[0]
            print(depth.firstChild.data)
            spider.set_max_depth(int(depth.firstChild.data))

    return spider
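
For reference, a minimal sketch of the kind of configuration file this method parses. The tag names here are hypothetical, since the real values of self._parameters, self._page, self._domain, and self._depth are set elsewhere in the class and are not shown in this snippet:

from xml.dom.minidom import parseString

sample_config = """
<parameters>
    <page>http://example.com/start</page>
    <domain>example.com</domain>
    <depth>3</depth>
</parameters>
"""

doc = parseString(sample_config)
params = doc.getElementsByTagName('parameters')[0]
print(params.getElementsByTagName('depth')[0].firstChild.data)  # -> 3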
Example 2
from bs4 import BeautifulSoup  # needed by parse_tieba below


def parse_image(response, spider, options):
    # Reconstructed signature: the earlier body of this callback is not
    # included in the excerpt. It re-queues an expired thumbnail image
    # for a fresh binary download via parse_binary.
    # print('image expired')
    spider.add_url(options['thumbnail'], 'parse_binary', options)


folder = 'images'  # hypothetical placeholder: 'folder' is defined earlier in the original script


def parse_binary(response, spider, options):
    # Write the raw image bytes to <folder>/<member>-<date>-<index>.jpg.
    with open(
            folder + '/' + options['member'] + '-' + options['date'] + '-' +
            str(options['image_index']) + '.jpg', 'wb') as f:
        f.write(response)
    print('binary write with',
          options['date'] + '-' + str(options['image_index']) + '.jpg')
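
Since parse_binary opens files under folder, that directory must already exist. A one-line safeguard from the standard library, run once before the spider starts, would avoid a FileNotFoundError:

import os

os.makedirs(folder, exist_ok=True)  # create the output directory if missing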


def parse_tieba(response, spider, options):
    # Tieba pages wrap part of the thread list in HTML comments, so the
    # comment markers are stripped before parsing.
    bs = BeautifulSoup(
        response.replace('<!--', '').replace('-->', ''), 'html.parser')
    titles = bs.find_all(class_='j_th_tit')
    # Print the extracted thread titles (the original snippet left them unused).
    for title in titles:
        print(title.get_text(strip=True))
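
parse_tieba is never registered in the main block below. A minimal sketch of how it could be wired up, following the same register_callback/add_url pattern used there (the Tieba URL is a placeholder):

tieba_spider = Spider()
tieba_spider.register_callback('parse_tieba', 'text', parse_tieba)
tieba_spider.add_url('https://tieba.baidu.com/f?kw=example', 'parse_tieba', {})
tieba_spider.run()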


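The main block also references blog_prefix, parse_date, and parse_blog, which are defined earlier in the original script and not included in this excerpt. Hypothetical stubs with the same callback signature, so the block below is self-contained:

blog_prefix = 'http://blog.example.com/'  # placeholder for the real blog URL prefix


def parse_date(response, spider, options):
    # Stub: the real callback walks the blog's date-archive pages.
    pass


def parse_blog(response, spider, options):
    # Stub: the real callback extracts posts and queues their images.
    pass
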
if __name__ == '__main__':
    blog_spider = Spider()
    blog_spider.register_callback('parse_date', 'text', parse_date)
    # send queue to spider
    blog_spider.register_callback('parse_blog', 'text', parse_blog, True)
    blog_spider.register_callback('parse_image', 'text', parse_image)
    blog_spider.register_callback('parse_binary', 'binary', parse_binary)
    blog_spider.add_url(blog_prefix + 'manatsu.akimoto', 'parse_date',
                        {'member': 'manatsu.akimoto'})
    blog_spider.run()

#print(get_members_urlprefix())