class MySpider(Spider):
    """Spider for the Scrapinghub blog.

    Follows paginated listing pages and hands individual post permalinks
    to the ``Post`` item parser.
    """

    start_url = 'https://blog.scrapinghub.com/'
    # Presumably the crawl interval/rate setting — confirm against Spider docs.
    frequency = 5
    parsers = [
        # Pagination links such as /page/2/ (raw strings avoid the
        # invalid-escape SyntaxWarning that plain '\d' triggers).
        Parser(r'https://blog.scrapinghub.com/page/\d+/'),
        # Post permalinks look like /YYYY/MM/DD/slug/ — parse as Post items.
        Parser(r'https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/',
               Post),
    ]
class MySpider(Spider):
    """Spider for a ScienceNet blog home page.

    Crawls the author's paginated post list and parses individual
    ``blog-<uid>-<id>.html`` entries as ``Post`` items.
    """

    start_url = ('http://blog.sciencenet.cn/home.php?mod=space&uid=40109'
                 '&do=blog&view=me&from=space')
    # One request at a time — be polite to this site.
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        # Pagination links; raw strings keep '\?' and '\d' literal without
        # triggering invalid-escape SyntaxWarnings.
        Parser(
            r'http://blog.sciencenet.cn/home.php\?mod=space&uid=\d+'
            r'&do=blog&view=me&from=space&page=\d+'
        ),
        # Individual post pages, e.g. blog-40109-123456.html.
        Parser(r'blog\-\d+\-\d+\.html', Post),
    ]
def test_parse_urls():
    """Both matching links should be queued for parsing exactly once each."""
    html = ('<a href="item?id=14447885">64comments</a>'
            '<a href="item?id=14447886">64comments</a>')

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    # Raw string: '\?' and '\d' are regex escapes, not Python ones.
    parser = Parser(r'item\?id=\d+', User)
    parser.parse_urls(html, 'https://blog.scrapinghub.com')
    # len(x) is the idiomatic spelling of x.__len__().
    assert len(parser.pre_parse_urls) == 2
def test_parse():
    """parse_item should produce the expected results mapping."""
    html = '<title class="username">tom</title><div class="karma">15</div>'

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    # NOTE(review): the raw html is passed where other tests pass a URL
    # rule — presumably parse_item ignores the rule; confirm.
    parser = Parser(html, User)
    user = parser.parse_item(html)
    expected = {'username': '******', 'karma': '15'}
    assert user.results == expected
def test_parse():
    """parse() should populate both item fields from the HTML."""
    html = '<title class="username">tom</title><div class="karma">15</div>'

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    parser = Parser('http://github.com', User)
    user = parser.parse(html)
    for field in ('username', 'karma'):
        assert field in user.results
    assert user.username == 'tom'
    assert user.karma == '15'
def test_parse_urls():
    """Extracted links should appear in both parsing and parsed trackers."""
    html = ('<a href="item?id=14447885">64comments</a>'
            '<a href="item?id=14447886">64comments</a>')

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    # Raw string: '\?' and '\d' are regex escapes, not Python ones.
    parser = Parser(r'item\?id=\d+', User)
    parser.parse_urls(html)
    # len(x) is the idiomatic spelling of x.__len__().
    assert len(parser.parsing_urls) == 2
    assert 'item?id=14447886' in parser.parsing_urls
    assert 'item?id=14447885' in parser.parsing_urls
    assert 'item?id=14447886' in parser.parsed_urls
    assert 'item?id=14447885' in parser.parsed_urls
class MySpider(Spider):
    """Spider that scrapes a single, fixed Jobbole blog post."""

    start_url = 'http://blog.jobbole.com/'
    concurrency = 5
    # Realistic desktop Chrome UA so the site serves the normal page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
    }
    # One literal URL — no pattern matching needed for this demo.
    parsers = [
        Parser('http://blog.jobbole.com/114503/', Post),
    ]
class MySpider(Spider):
    """Spider for mydramatime's US/Europe drama listings.

    Mixes a regex parser for pagination with XPath parsers for category
    and item links; item pages are parsed as ``Post``.
    """

    start_url = 'https://mydramatime.com/europe-and-us-drama/'
    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        # Pagination pages (raw string avoids the invalid-escape
        # SyntaxWarning that plain '\d' triggers).
        Parser(
            r'https://mydramatime.com/europe-and-us-drama/game-of-thrones/page/\d+?/'
        ),
        # Category links discovered via XPath rather than a URL regex.
        XPathParser('//span[@class="category-name"]/a/@href'),
        # Item links inside each listing card — parsed as Post items.
        XPathParser(
            '//div[@class="mini-left"]//div[contains(@class, "mini-title")]/a/@href',
            Post),
    ]
class MySpider(Spider):
    """Spider for the Scrapinghub blog (concurrent variant).

    Crawls pagination pages and parses dated post permalinks as ``Post``.
    """

    start_url = 'https://blog.scrapinghub.com/'
    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        # Raw strings keep '\d' literal and silence the invalid-escape
        # SyntaxWarning raised by modern Python.
        Parser(r'https://blog.scrapinghub.com/page/\d+/'),
        Parser(r'https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/',
               Post),
    ]
class MySpider(Spider):
    """Spider for a v2ex node listing.

    Follows the node's paginated listing and parses topic/reply pages
    as ``Post`` items.
    """

    # change here, if you want to scrape this site.
    # NOTE(review): the repeated slashes look deliberate (URL
    # normalization?) and are preserved — confirm with the author.
    start_url = 'https://///www.v2ex.com/go//////create'
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        # BUG FIX: the original rule '/go/create?p=\d+' left '?' unescaped,
        # so the regex meant 'creat' + optional 'e' and could never match a
        # real '?p=N' query string. Escape it so pagination links match.
        Parser(r'/go/create\?p=\d+'),
        Parser(r'/t/\d+#reply\d+', Post),
    ]
class MySpider(Spider):
    """Demo spider for quotes.toscrape.com.

    Both parsers target the same fixed first page: one follows it as a
    link source, the other parses it as a ``Post`` item.
    """

    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    start_url = 'http://quotes.toscrape.com/'
    parsers = [
        Parser('/page/1/'),
        Parser('/page/1/', Post),
    ]
class GoogleSpider(Spider):
    """Minimal demo spider pointed at the Google front page.

    The root path '/' serves both as the link rule and as the item rule
    parsed via ``Post``.
    """

    start_url = 'https://google.com/'
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        Parser('/'),
        Parser('/', Post),
    ]