def spider():
    tasks_execution_log = []

    class LoggingTaskRunner(BaseTaskRunner):
        """Base runner that records every call and reacts to task flags."""

        def __call__(self, task):
            tasks_execution_log.append((self, task.id))
            if task.get('abortme', False):
                raise AbortTask()
            if task.get('skipme', False):
                raise SkipRunner()
            if task.get('retryme', False):
                raise RetryTask()
            if task.get('raise_sth', False):
                raise Exception("This is an exception!")
            return iter([])

    class MyTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

    class MyTaskRunner2(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

        def __call__(self, task):
            super(MyTaskRunner2, self).__call__(task)
            yield MyOtherTask(task_id='was:' + task.id, previous=task)

    class MyOtherTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyOtherTask)

    runners = {
        0: MyTaskRunner(),
        1: MyTaskRunner2(),
        2: MyOtherTaskRunner(),
    }

    spider = Spider()
    # Register runners in ascending key order.
    spider.add_runners(x[1] for x in sorted(runners.items()))

    # spider.execution_log = tasks_execution_log

    ## We need to pass some extra stuff..
    spider._testing = {
        'execution_log': tasks_execution_log,
        'runners': runners,
    }
    return spider
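# A minimal sketch of how the fixture above might be exercised in a test.
# Assumptions (not confirmed by the framework's API): MyTask accepts a
# `task_id` keyword like MyOtherTask does, and tasks expose the resulting
# id as an `.id` attribute; the test name and values are illustrative only.
def test_runner_chaining_sketch():
    s = spider()
    log = s._testing['execution_log']
    runner2 = s._testing['runners'][1]  # the MyTaskRunner2 instance

    task = MyTask(task_id='task-0')  # assumed constructor signature
    # MyTaskRunner2.__call__ is a generator: consuming it runs the logging
    # in LoggingTaskRunner and yields the follow-up MyOtherTask.
    follow_ups = list(runner2(task))

    assert log[-1] == (runner2, 'task-0')
    assert follow_ups[0].id == 'was:task-0'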
        return True

    def __call__(self, task):
        assert self.match(task)
        response = task['response']
        # cgi.parse_header() returns the bare value string plus a dict of
        # parameters, e.g. ('text/html', {'charset': 'UTF-8'}), so compare
        # the string directly rather than indexing into it.
        content_type, params = cgi.parse_header(
            response['headers'].get('content-type') or 'text/html')
        # content_type = task['response'].headers['Content-type'].split(';')
        if content_type != 'text/html':
            return  # Nothing to do here..
        tree = lxml.html.fromstring(response.content)
        el = tree.xpath('//h1[@id="firstHeading"]')[0]
        yield WikipediaPage(url=task['url'], title=el.text_content())


spider = Spider()
spider.add_runners([
    WikipediaDownloader(),
    WikipediaScraper(),
    LinkExtractor(max_depth=3),
])


if __name__ == '__main__':
    try:
        ## Prepare the storage
        if len(sys.argv) > 1:
            storage = AnydbmStorage(path=sys.argv[1])
        else:
            storage = DictStorage()
        spider.add_runners([storage])