Ejemplo n.º 1
0
async def main() -> None:
    """Run a StaticSpider against quotes.toscrape.com and print the outcome.

    The spider is configured with ``date_processor`` as its only item
    processor and with a ``backup.mp`` file (placed next to this script) as
    its backup filename. Once the run completes, the spider statistics are
    printed, followed by every entry that ``read_mp`` yields from that file.
    """
    backup_path = Path(__file__).parent.joinpath('backup.mp')
    spider_config = Configuration(
        backup_filename=f'{backup_path}',
        item_processors=[date_processor],
    )
    spider = StaticSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=spider_config,
    )

    await spider.run()
    print(spider.statistics())

    # Read the backup back with the decoder exposed by the spider's own config.
    saved_items = read_mp(backup_path, decoder=spider.config.msgpack_decoder)
    async for saved_item in saved_items:
        print(saved_item)
Ejemplo n.º 2
0
async def main() -> None:
    """Run a SeleniumSpider against quotes.toscrape.com and print the outcome.

    The configuration passes ``selenium_driver_log_file=None``, uses
    ``date_processor`` as the only item processor and points the backup
    filename at a ``backup.mp`` file next to this script. After the run, the
    spider statistics are printed, followed by every entry that ``read_mp``
    yields from the backup file when decoding with ``datetime_decoder``.
    """
    backup_path = Path(__file__).parent.joinpath('backup.mp')
    spider_config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup_path}',
        item_processors=[date_processor],
    )
    selenium_spider = SeleniumSpider(
        urls=['http://quotes.toscrape.com'], parse=parse, config=spider_config
    )

    await selenium_spider.run()
    print(selenium_spider.statistics())

    # Walk through the backup file and print each stored entry.
    saved_quotes = read_mp(filename=backup_path, decoder=datetime_decoder)
    async for saved_quote in saved_quotes:
        print(saved_quote)
Ejemplo n.º 3
0
async def main() -> None:
    """Run a SeleniumSpider against httpbin.org and pretty-print the saved items.

    The spider writes its backup to a ``backup.mp`` file next to this script.
    After the run, the spider statistics are printed; then each entry read
    back from the backup is printed as a title, a description and a list of
    operations (method, path and description for each one).
    """
    backup_path = Path(__file__).parent.joinpath('backup.mp')
    spider_config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup_path}',
        item_processors=[date_processor],
    )
    spider = SeleniumSpider(
        urls=['http://httpbin.org/'],
        parse=parse,
        config=spider_config,
    )

    await spider.run()
    print(spider.statistics())

    # Each stored entry is indexed by 'title', 'description' and 'operations'.
    stored_entries = read_mp(filename=backup_path, decoder=datetime_decoder)
    async for entry in stored_entries:
        print('****', entry['title'], '****')
        print(entry['description'])
        print('== operations ==')
        for op in entry['operations']:
            print('\tmethod:', op['method'])
            print('\tpath:', op['path'])
            # The blank line after the description separates consecutive operations.
            print('\tdescription:', op['description'], end='\n\n')
Ejemplo n.º 4
0
    next_link = None
    try:
        element = response.driver.find_element_by_xpath(
            '//nav/ul/li[@class="next"]/a')
        next_link = element.get_attribute('href')
    except NoSuchElementException:
        pass

    if next_link is not None:
        response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Store the value of ``datetime.now()`` under ``item['date']``.

    The dictionary is modified in place (any existing ``'date'`` entry is
    overwritten) and the very same object is returned.
    """
    timestamp = datetime.now()
    item['date'] = timestamp
    return item


if __name__ == '__main__':
    # The backup file sits in the same directory as this script.
    backup = Path(__file__).parent.joinpath('backup.mp')
    config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup}',
        item_processors=[date_processor],
    )
    sel_spider = SeleniumSpider(
        urls=['http://quotes.toscrape.com'], parse=parse, config=config
    )

    sel_spider.run()
    print(sel_spider.statistics())

    # Print every entry that read_mp yields from the backup file.
    for quote_data in read_mp(filename=backup, decoder=datetime_decoder):
        print(quote_data)
Ejemplo n.º 5
0
            'message': quote.xpath('./span[@class="text"]/text()').get(),
            'author': quote.xpath('./span/small/text()').get(),
            'tags': quote.xpath('./div/a/text()').getall(),
        }
        spider.save_item(data)

    next_link = response.xpath('//nav/ul/li[@class="next"]/a').xpath(
        '@href').get()
    if next_link is not None:
        response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Set ``item['date']`` to the value of ``datetime.now()`` and return ``item``.

    The mapping is updated in place; the returned object is the one passed in.
    """
    current_moment = datetime.now()
    item['date'] = current_moment
    return item


if __name__ == '__main__':
    # The backup file sits in the same directory as this script.
    backup = Path(__file__).parent.joinpath('backup.mp')
    config = Configuration(
        backup_filename=f'{backup}',
        item_processors=[date_processor],
    )
    spider = StaticSpider(
        urls=['http://quotes.toscrape.com'], parse=parse, config=config
    )

    spider.run()
    print(spider.statistics())

    # Read the backup back with the decoder exposed by the spider's own config
    # and print every entry it yields.
    for quote_data in read_mp(filename=backup, decoder=spider.config.msgpack_decoder):
        print(quote_data)