Example #1
from gain import Parser, Spider


class MySpider(Spider):
    # Post is an Item subclass defined elsewhere (see the sketch below)
    start_url = 'https://blog.scrapinghub.com/'
    frequency = 5
    parsers = [
        Parser(r'https://blog.scrapinghub.com/page/\d+/'),
        Parser(r'https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/',
               Post)
    ]
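The Post handed to the second Parser above (and to the Parser/XPathParser calls in the later spider snippets) is an Item subclass that this listing never shows. A minimal sketch of what it might look like, with hypothetical CSS selectors and output file modeled on gain's README example:

from gain import Css, Item


class Post(Item):
    # hypothetical selectors; adjust them to the markup of the page being scraped
    title = Css('.entry-title')
    content = Css('.entry-content')

    async def save(self):
        # called once per scraped item; here it simply appends the title to a file
        with open('posts.txt', 'a+') as f:
            f.write(self.results['title'] + '\n')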
Example #2
from gain import Parser, Spider


class MySpider(Spider):
    start_url = 'http://blog.sciencenet.cn/home.php?mod=space&uid=40109&do=blog&view=me&from=space'
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        Parser(
            r'http://blog.sciencenet.cn/home.php\?mod=space&uid=\d+&do=blog&view=me&from=space&page=\d+'
        ),
        Parser(r'blog\-\d+\-\d+\.html', Post)
    ]
Example #3
from gain import Css, Item, Parser, Xpath


def test_parse_urls():
    html = ('<a href="item?id=14447885">64comments</a>'
            '<a href="item?id=14447886">64comments</a>')

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    # both hrefs match the rule, so two URLs should be queued for parsing
    parser = Parser(r'item\?id=\d+', User)
    parser.parse_urls(html, 'https://blog.scrapinghub.com')
    assert len(parser.pre_parse_urls) == 2
Example #4
from gain import Css, Item, Parser, Xpath


def test_parse():
    html = '<title class="username">tom</title><div class="karma">15</div>'

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    parser = Parser(html, User)

    user = parser.parse_item(html)
    assert user.results == {'username': 'tom', 'karma': '15'}
Example #5
from gain import Css, Item, Parser, Xpath


def test_parse():
    html = '<title class="username">tom</title><div class="karma">15</div>'

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    parser = Parser('http://github.com', User)

    user = parser.parse(html)
    assert 'username' in user.results
    assert 'karma' in user.results
    assert user.username == 'tom'
    assert user.karma == '15'
Example #6
from gain import Css, Item, Parser, Xpath


def test_parse_urls():
    html = ('<a href="item?id=14447885">64comments</a>'
            '<a href="item?id=14447886">64comments</a>')

    class User(Item):
        username = Xpath('//title')
        karma = Css('.karma')

    parser = Parser(r'item\?id=\d+', User)
    parser.parse_urls(html)
    assert len(parser.parsing_urls) == 2
    assert 'item?id=14447886' in parser.parsing_urls
    assert 'item?id=14447885' in parser.parsing_urls

    assert 'item?id=14447886' in parser.parsed_urls
    assert 'item?id=14447885' in parser.parsed_urls
Example #7
from gain import Parser, Spider


class MySpider(Spider):
    start_url = 'http://blog.jobbole.com/'
    concurrency = 5
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
    }
    parsers = [
        Parser('http://blog.jobbole.com/114503/', Post),
    ]
Example #8
from gain import Parser, Spider
from gain import XPathParser  # assumed import; XPathParser's location may vary by gain version


class MySpider(Spider):
    start_url = 'https://mydramatime.com/europe-and-us-drama/'
    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    parsers = [
        Parser(
            r'https://mydramatime.com/europe-and-us-drama/game-of-thrones/page/\d+?/'
        ),
        XPathParser('//span[@class="category-name"]/a/@href'),
        XPathParser(
            '//div[@class="mini-left"]//div[contains(@class, "mini-title")]/a/@href',
            Post)
    ]
Example #9
from gain import Parser, Spider


class MySpider(Spider):
    start_url = 'https://blog.scrapinghub.com/'
    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    parsers = [Parser(r'https://blog.scrapinghub.com/page/\d+/'),
               Parser(r'https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]
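None of the spider snippets in this listing show how a crawl is actually started. A minimal launcher sketch, assuming gain's class-level run() entry point as documented in its README:

# assumes one of the MySpider classes above, plus its Post item, is defined in this module
if __name__ == '__main__':
    MySpider.run()  # fetches start_url and dispatches matching URLs to the parsers list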
Example #10
from gain import Parser, Spider


class MySpider(Spider):
    start_url = 'https://///www.v2ex.com/go//////create'  # change here, if you want to scrape this site.
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [Parser(r'/go/create\?p=\d+'),  # '?' escaped so it matches literally in the URL
               Parser(r'/t/\d+#reply\d+', Post)]
Example #11
from gain import Parser, Spider


class MySpider(Spider):
    concurrency = 5
    headers = {'User-Agent': 'Google Spider'}
    start_url = 'http://quotes.toscrape.com/'
    parsers = [Parser('/page/1/'),
               Parser('/page/1/', Post)]
class GoogleSpider(Spider):
    start_url = 'https://google.com/'
    concurrency = 1
    headers = {'User-Agent': 'Google Spider'}
    parsers = [Parser('/'), Parser('/', Post)]