Example #1
def delete_job(job_id):
    """
    An ndb helper that deletes a job's scrape list from the
    ndb.root._spiders object.
    """
    try:
        del ndb.root._spiders.lists[job_id]
        ndb.commit()
    except KeyError:
        pass
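For context, delete_job assumes that ndb.root._spiders is a persistent container exposing a `lists` mapping keyed by job name. A minimal sketch of such a container, assuming ZODB-style persistence via the persistent package (illustrative only, not the library's actual SpiderLists class):

import persistent

class SpiderLists(persistent.Persistent):
    """Illustrative container mapping a job name to its list of scrape results."""

    def __init__(self):
        self.lists = {}

    def add(self, job_name, spider_list):
        # mirror the KeyError behaviour relied on in pre_process_exports below
        if job_name in self.lists:
            raise KeyError(f'a list named {job_name} already exists')
        self.lists[job_name] = spider_list
        self._p_changed = True  # flag the persistent object as modified

    def __getitem__(self, job_name):
        return self.lists[job_name]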
Example #2
def bts_book_manager(_BooksWorker):
    """
    A BooksToScrape Manager test fixture for live network call.
    Here, we spin up two workers while we have three tasks. It is
    important to test with fewer workers than total tasks, because
    there are plenty of ways to break this test when refactoring. One
    likely source would be the BaseWorker class method `load_items`.
    It took me half a day to track down a bug in that method which
    caused this test to pass only when the number of workers equaled
    the number of tasks. That was the previous default way to run this
    test, so the bug went unnoticed.
    """
    # first, setup newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # make sure to open this file in binary mode
    book_data_file = open('c:/temp/book_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(fields_to_export=['book_title', 'stock', 'price'],
                        file=book_data_file,
                        encoding='utf_8_sig')
    ]

    file = get_file_path('book_titles.xlsx')
    trackers = ['books.toscrape.com']
    tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            # this creates 2 scrapers and assigns each a book as a task
            workers=2,
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_scrape',
                                    tasks,
                                    workgroups=groups,
                                    pool=5)

    yield manager

    # teardown
    delete_job('books_scrape')
    del ndb.root._spiders
    ndb.commit()
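The fixture yields the fully configured manager to a test. A minimal sketch of a test that could consume it, assuming the manager exposes a blocking main() entry point that runs the job (the method name and the result check are assumptions for illustration):

def test_books_scrape(bts_book_manager):
    # start the scrape and block until both workers have drained the three tasks
    bts_book_manager.main()
    # the workers' pre_process_exports hook (see Example #4) saves items under
    # the job name passed to BooksWorkGroupManager above
    assert 'books_scrape' in ndb.root._spiders.lists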
Example #3
def bts_broker_manager(_BooksToScrapeGroup, _BooksWorker, broker_tasks,
                       broker_conn):
    """
    A BooksToScrape Manager test fixture for live network call.
    Here, we use a broker (RabbitMQ) to test.
    """
    # setup newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # make sure to open this file in binary mode
    book_data_file = open('c:/temp/broker_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(fields_to_export=['book_title', 'stock', 'price'],
                        file=book_data_file,
                        encoding='utf_8_sig')
    ]

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            # this creates 2 scrapers and assigns each a book as a task
            workers=2,
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_broker_scrape',
                                    broker_tasks,
                                    workgroups=groups,
                                    pool=5,
                                    connection=broker_conn)

    yield manager

    # teardown newt.db
    delete_job('books_broker_scrape')
    del ndb.root._spiders
    ndb.commit()
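The broker_conn and broker_tasks fixtures referenced above are not shown. A minimal sketch of what a broker_conn fixture could look like, assuming the RabbitMQ broker is reached through kombu (the URL, fixture scope, and naming are assumptions for illustration):

import pytest
from kombu import Connection

@pytest.fixture(scope='module')
def broker_conn():
    # point the URL at whatever RabbitMQ instance the test environment provides
    conn = Connection('amqp://guest:guest@localhost:5672//')
    yield conn
    conn.release()  # close the connection during teardown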
Example #4
def pre_process_exports(self, spider, task):
    if self.job_id != 'NONE':
        try:
            # create the list with the job name if it doesn't already exist
            ndb.root._spiders.add(self.job_id, SpiderList())
            print(
                f'Worker {self.name}-{self.number} created a new scrape_list '
                f'for {self.job_id}')
        except KeyError:
            # will be raised if there is already a list with the same job name
            pass
        # export the scraper data to the items object
        items = self.load_items(spider)
        # save the items object to newt.db
        ndb.root._spiders[self.job_id].add(items)
        ndb.commit()
        print(
            f'Worker {self.name}-{self.number} saved {items!r} to '
            f'scrape_list "{self.job_id}" for task {task}.')
    else:
        # if job_id is NONE then skip saving the items
        print(
            f'Worker {self.name}-{self.number} said job_id is {self.job_id} '
            f'so it will not save the items.')
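After a scrape run, the saved items can be inspected directly in newt.db. A small sketch of reading them back, assuming the stored list is iterable and each saved items object exposes the exported fields as attributes (both are assumptions, not guaranteed API):

# hypothetical read-back; field names come from fields_to_export in the fixtures above
for items in ndb.root._spiders.lists['books_scrape']:
    print(items.book_title, items.stock, items.price)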