Example No. 1
class PrintService(object):
    """
    A logging service that writes messages to the injected logging_output (typically the standard output)
    """
    config = Inject("config_service", HasAttributes("loglevel"))
    output = Inject("logging_output", HasMethods("write"))

    def __init__(self):
        self.loglevel = LOGLEVELS[self.config.loglevel]

    def debug(self, *messages):
        """
        Logs messages with debug log level.
        """
        self.log(LOGLEVELS["debug"], "\n[Debug]", *messages)

    def info(self, *messages):
        """
        Logs messages with info log level.
        """
        self.log(LOGLEVELS["info"], "\n[Info]", *messages)

    def important(self, *messages):
        """
        Logs messages with important log level.
        """
        self.log(LOGLEVELS["important"], "\n[Important]", *messages)

    def error(self, *messages):
        """
        Logs messages with error log level.
        """
        self.log(LOGLEVELS["error"], "\n[Error]", *messages)

    def warn(self, *messages):
        """
        Logs messages with warn log level.
        """
        self.log(LOGLEVELS["warn"], "\n[Warn]", *messages)

    def log(self, level, prefix, *messages):
        """
        Logs the messages only if the requested loglevel is at least the configured loglevel.
        In that case the message is prefixed with the current thread's name and passed to write.
        """
        if self.loglevel > level:
            return
        msg = "%sThread=[%s]\t" % (prefix, threading.current_thread().name)
        for message in messages:
            msg += " %s" % message
        self.write(msg)

    def write(self, msg):
        """
        Writes the message to the output of the instance.
        """
        try:
            self.output.write(msg)
        except UnicodeEncodeError as error:
            self.output.write(error)
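
Both injected features are resolved on attribute access, so they must be registered before a PrintService is created. Below is a minimal usage sketch; it assumes Inject is importable from the project's dependency_injection module (the module name appears in the register_services docstring further down) and that LOGLEVELS maps the usual level names to integers. The _Config class is purely illustrative.

import sys
from dependency_injection import Inject  # assumed import path

class _Config(object):
    loglevel = "debug"  # hypothetical config object exposing the required loglevel attribute

Inject.register_feature("config_service", _Config())
Inject.register_feature("logging_output", sys.stdout)  # anything with a write method works

log = PrintService()
log.info("crawl started")
log.error("could not reach host")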
Example No. 2
 def test_no_assertion(self):
     class Test(object):
         kanga = Inject("Kanga", cache=False)
     Inject.register_feature("Kanga", "They steal your youth!")
     result = Test().kanga
     self.assertTrue(result)
     self.assertEqual(result, "They steal your youth!")
Example No. 3
 def tearDown(self):
     Inject.reset()
     parser.parse.reset()
     log.write.reset()
     log.info.reset()
     log.error.reset()
     log.debug.reset()
Example No. 4
class Parser(object):
    """
    Base class for parsers
    The parser performs a GET request on the url and calls feed with the response.text
    Subclasses should implement the feed method
    """
    _requests = Inject("requests", HasMethods("get"))
    log_service = Inject("log_service",
                         HasMethods("debug", "info", "error", "warn"))

    def parse(self, url):
        """
        Send a GET request to the given url and parse the response.
        :returns: ParseResult containing the link, status_code and page_size of the result
        """
        assert url is not None
        response = self._requests.get(url, verify=False)
        if response.status_code == 200:
            self.feed(response.text)
        return ParseResult(link=url,
                           status_code=response.status_code,
                           page_size=len(response.content))

    def feed(self, data):
        """
        Parser.parse feeds the response data to the method.
        Override to customize parsing.
        """
        pass
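
Because feed is an empty hook, the smallest useful parser only overrides it; the requests and log_service features must be registered before parse is called. The sketch below is illustrative only: RawHtmlParser and _StubLog are made-up names, and the dependency_injection import path is assumed.

import requests
from dependency_injection import Inject  # assumed import path

class _StubLog(object):
    # hypothetical stand-in satisfying HasMethods("debug", "info", "error", "warn")
    def debug(self, *messages): pass
    def info(self, *messages): pass
    def error(self, *messages): pass
    def warn(self, *messages): pass

Inject.register_feature("requests", requests)
Inject.register_feature("log_service", _StubLog())

class RawHtmlParser(Parser):
    # illustrative subclass: keep the raw HTML instead of really parsing it
    def feed(self, data):
        self.html = data

result = RawHtmlParser().parse("https://example.com")
print(result.status_code, result.page_size)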
Example No. 5
 def setUp(self):
     Inject.reset()
     self.startMock = CallableMock()
     threading_mock.Thread.reset(returns=Any(daemon=False, start=self.startMock))
     Inject.register_feature("threading", threading_mock)
     Inject.register_feature("config_service", config_mock)
     Inject.register_feature("queue", lambda: queue_mock)
     Inject.register_feature("log_service", log_service_mock)
Example No. 6
def register_services(services=None):
    """
    Register the services in dependency_injection.Inject.
    If services is None the Services class is used.
    """
    assert isinstance(services, dict) or services is None
    if not services:
        services = Services.__dict__
    Inject.register_features(**services)
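
A quick usage sketch for register_services; the feature names mirror the ones injected elsewhere in these examples, and the _Config class is a made-up stand-in rather than part of the project.

import sys

class _Config(object):
    # hypothetical config object; the attribute names mirror what the other examples expect
    threads = 4
    loglevel = "info"
    skip = []

register_services({"config_service": _Config(), "logging_output": sys.stdout})
# register_services()  # with no argument, falls back to the project's own Services class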
Example No. 7
 def test_assertionerror_is_raised_if_assertion_fails(self):
     class Test(object):
         kanga = Inject("Kanga", lambda f: isinstance(f, int), cache=False)
     Inject.register_feature("Kanga", "Roo")
     try:
         Test().kanga
         self.fail()
     except AssertionError as error:
         self.assertEqual(str(error), "The value=[%s] of feature=[%s] does not match a criteria"
                          % ("Roo", "Kanga"))
Example No. 8
    def test_factory_method(self):
        class Test(object):
            kanga = Inject("Kanga", lambda f: isinstance(f, str), cache=False)

        class TestKangaFactory(object):
            id = 0

            def __call__(self):
                TestKangaFactory.id += 1
                return "#%s" % TestKangaFactory.id

        Inject.register_feature("Kanga", TestKangaFactory())
        result1 = Test().kanga
        result2 = Test().kanga
        self.assertNotEqual(result1, result2)
Example No. 9
    def test_reset(self):
        Inject.register_features(Kanga=1, Tiggers=2)

        class Test(object):
            kanga = Inject("Kanga")
            tiggers = Inject("Tiggers")

        test = Test()
        self.assertTrue(test.kanga)
        self.assertTrue(test.tiggers)
        Inject.reset()
        with self.assertRaises(KeyError):
            test.kanga
        with self.assertRaises(KeyError):
            test.tiggers
Example No. 10
 def setUp(self):
     self.log_service = Any(debug=CallableMock(), error=CallableMock())
     self.crawl_result = Any(link="",
                             status_code=200,
                             page_size=0,
                             links=[])
     self.parser = Any(parse=CallableMock(returns=self.crawl_result))
     self.crawler = Any(
         status=Any(urls_in_progress=[],
                    urls_to_crawl=[],
                    visited_urls=set()),
         site_parser_factory=CallableMock(returns=self.parser),
         process_links=CallableMock(),
         get_status_message=CallableMock())
     Inject.reset()
     Inject.register_feature('log_service', self.log_service)
Example No. 11
class CrawlJob(object):
    """
    A single crawl job that parses sites until its crawler runs out of urls
    """
    log_service = Inject("log_service", HasMethods("error", "debug"))

    def __init__(self, crawler):
        self.crawler = crawler

    def __call__(self):
        self.run()

    def run(self):
        """
        Parses sites until there are no more urls_to_crawl in the Crawler
        """
        parser = self.crawler.site_parser_factory()
        while self.crawler.status.urls_to_crawl:
            try:
                url = self.crawler.status.urls_to_crawl.pop()
            except IndexError:
                return
            self.crawler.status.urls_in_progress.append(url)
            result = self.crawl_site(url, parser)
            self.crawler.status.visited_urls.add(url)
            if result:
                self.crawler.process_links(url, result.links)
            self.log_service.debug(self.crawler.get_status_message())

    def crawl_site(self, url, parser=None):
        """
        Crawls the given url using the given parser (or one made by site_parser_factory)
        Returns None if a requests.exceptions.ConnectionError
        or requests.exceptions.SSLError is raised
        """
        try:
            parser = self.crawler.site_parser_factory(
            ) if parser is None else parser
            result = parser.parse(url)
            return result
        except (requests.exceptions.ConnectionError,
                requests.exceptions.SSLError) as error:
            self.log_service.error("Error while crawling site=[%s]" % url,
                                   str(error))
            return None
        finally:
            self.__remove_url_from_crawler(url)

    def __remove_url_from_crawler(self, url):
        try:
            self.crawler.status.urls_in_progress.remove(url)
        except ValueError:
            pass
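
CrawlJob only relies on the injected log_service and on whatever crawler-like object it is given, so it can be exercised without a full Crawler. The following sketch uses purely illustrative stand-ins (the _Fake*/_Stub names are not part of the project) and the assumed dependency_injection import path.

from dependency_injection import Inject  # assumed import path

class _Status(object):
    # illustrative stand-in for the crawler's status object
    urls_to_crawl = ["https://example.com"]
    urls_in_progress = []
    visited_urls = set()

class _FakeResult(object):
    links = []  # CrawlJob only reads .links from a truthy parse result

class _FakeParser(object):
    def parse(self, url):
        return _FakeResult()

class _FakeCrawler(object):
    status = _Status()
    site_parser_factory = _FakeParser
    def process_links(self, origin, links): pass
    def get_status_message(self): return "1 url processed"

class _StubLog(object):
    def debug(self, *messages): pass
    def error(self, *messages): pass

Inject.register_feature("log_service", _StubLog())
CrawlJob(_FakeCrawler())()  # __call__ simply delegates to run()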
Example No. 12
 def setUp(self):
     Inject.reset()
     Inject.register_feature('site_parser_factory',
                             lambda: Any(parse=CallableMock()))
     Inject.register_feature(
         'log_service',
         Any(info=CallableMock(),
             error=CallableMock(),
             debug=CallableMock()))
     Inject.register_feature(
         'work_service',
         Any(request_work=CallableMock(callback=lambda cb: cb()),
             terminate_all=CallableMock(),
             active_count=CallableMock()))
Example No. 13
 def setUp(self):
     super(WorkerService_request_work_Tests, self).setUp()
     Inject.register_feature("queue", Queue.Queue)
     Inject.register_feature("threading", threading)
Example No. 14
 def tearDown(self):
     Inject.reset()
Example No. 15
class WorkService(object):
    """
    Service for handling multithreaded jobs.
    """
    __worker_labels = set()
    __instances = 0
    __init_lock = Lock()

    threading = Inject("threading", HasMethods("Thread"))
    config_service = Inject("config_service", HasAttributes("threads"))
    log_service = Inject("log_service", HasMethods("debug", "info"))
    queue_factory = Inject("queue", return_factory=True)

    def __init__(self):
        worker_count = int(self.config_service.threads)
        assert worker_count > 0, "Thread count cannot be less than 1!"
        WorkService.__init_lock.acquire(True)
        try:
            self.__active_count = 0
            self.__worker_labels = set()
            self.queue = self.queue_factory()
            self.__init_workers(worker_count, WorkService.__instances)
            WorkService.__instances += 1
        finally:
            WorkService.__init_lock.release()

    def __init_workers(self, number, instance_number):
        label = max(WorkService.__worker_labels) if WorkService.__worker_labels else 0
        for _ in range(number):
            label += 1
            self.__worker_labels.add(label)
            WorkService.__worker_labels.add(label)
            worker = self.threading.Thread(target=self.__worker,
                                           name="WSc[%s]--%s" % (instance_number, label))
            worker.daemon = True
            worker.start()

    def __del__(self):
        for label in self.__worker_labels:
            WorkService.__worker_labels.remove(label)

    def request_work(self, job, blocking=True):
        """
        Request a job to be processed.
        It is put into the job queue for processing.
        """
        assert job is not None
        assert callable(job)
        return self.queue.put(job, blocking)

    def terminate_all(self, graceful=False):
        """
        Terminates all pending jobs.
        """
        self.log_service.debug("-----------------Terminate request received-----------------")
        if graceful:
            self.queue.join()

    def active_count(self):
        """
        Returns the number of currently busy worker threads.
        """
        return self.__active_count

    def __worker(self):
        while 1:
            task = self.queue.get()
            self.__active_count += 1
            task()
            self.__active_count -= 1
            self.queue.task_done()
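
A sketch of wiring WorkService up outside the tests. The dependency_injection import path and the _Config/_StubLog stand-ins are assumptions; the queue feature is registered as a factory, exactly as in the setUp snippets above.

import threading
try:
    import queue           # Python 3
except ImportError:
    import Queue as queue  # Python 2, as used in the tests above

from dependency_injection import Inject  # assumed import path

class _Config(object):
    threads = 2  # hypothetical config exposing the required threads attribute

class _StubLog(object):
    def debug(self, *messages): pass
    def info(self, *messages): pass

Inject.register_feature("threading", threading)
Inject.register_feature("config_service", _Config())
Inject.register_feature("log_service", _StubLog())
Inject.register_feature("queue", queue.Queue)  # a factory, invoked once per WorkService

work = WorkService()

def greet():
    print("hello from a worker thread")

work.request_work(greet)
work.terminate_all(graceful=True)  # waits for the queued job to finish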
Example No. 16
 def setUp(self):
     Inject.reset()
     Inject.register_feature("site_parser_factory", site_parser_factory)
     Inject.register_feature("config_service", config)
     Inject.register_feature("log_service", log)
     Inject.register_feature("work_service", work_service)
Example No. 17
 def setUp(self):
     Inject.reset()
     Inject.register_feature("requests", requests)
     Inject.register_feature("log_service", log_service)
Example No. 18
 class Test(object):
     kanga = Inject("Kanga", lambda f: isinstance(f, int), lambda f: f % 5 == 0, cache=False)
Example No. 19
 class Test(object):
     kanga = Inject("Kanga")
     tiggers = Inject("Tiggers")
Example No. 20
 class Test(object):
     kanga = Inject("Kanga", lambda f: isinstance(f, str), cache=False)
Example No. 21
class Crawler(object):
    """
    Crawls a resource starting from url
    Uses parsers created by site_parser_factory to crawl each resource,
    then decides the next pages to crawl based on the result

    Uses work_service to execute crawling jobs
    Uses log_service to display results and errors
    """
    site_parser_factory = Inject("site_parser_factory", return_factory=True)
    log_service = Inject("log_service", HasMethods("info", "error", "debug"))
    work_service = Inject("work_service",
                          HasMethods("request_work", "terminate_all",
                                     "active_count"),
                          cache=True)
    config_service = Inject("config_service", HasAttributes("skip"))

    def __init__(self, urls=None):
        self.on_start = Subject()
        self.on_interrupt = Subject()
        self.on_finish = Subject()
        self.protocol = None
        self.status = CrawlerStatus()
        self.reset(urls)

    def reset(self, url):
        """
        Reset the crawler's state
        """
        self.status.reset(set(), set(), [])
        if not url:
            return
        if not isinstance(url, list):
            url = [url]
        self.initialize_status(url)

    def initialize_status(self, urls):
        """
        Initialize the crawler's status field by a collection of urls
        """
        self.protocol = get_protocol(urls[0])
        if not self.protocol:
            self.protocol = "http://"
            urls[0] = "%s%s" % (self.protocol, urls[0])
        self.status.urls_to_crawl.add(urls[0])
        for url in urls[1:]:
            processed = self.process_link(None, url)
            if not processed:
                raise AttributeError(
                    "Url=[%s] is not valid in this collection!" % url)
            self.status.urls_to_crawl.add(processed)

    def crawl(self, url=None):
        """
        Starts the crawl procedure
        Returns when the crawling is complete
        or a KeyboardInterrupt or SystemExit is raised
        Terminates remaining work if interrupted
        """
        if url:
            self.reset(url)
        self.log_service.info(
            "----------Crawl starting from url=[{url}]----------".format(
                url=url))
        self.on_start.next(url)
        self.__request_crawl_work()
        try:
            while self.is_task_left():
                self.__sleep_until_task_is_available()
                self.__request_crawl_work()
        except (KeyboardInterrupt, SystemExit) as error:
            self.handle_interrupt(error)
        finally:
            self.handle_crawl_finish()

    def __sleep_until_task_is_available(self):
        if not self.is_waiting_for_url():
            return
        self.log_service.debug(
            "No urls to crawl, going to sleep.",
            "Work in progress=[%s]" % self.work_service.active_count())
        while self.is_waiting_for_url():
            time.sleep(1)

    def __request_crawl_work(self):
        return self.work_service.request_work(CrawlJob(self))

    def process_links(self, origin, links):
        """
        Process a collection of links found at origin
        """
        result = []
        for link in links:
            processed_link = self.process_link(origin, link)
            if self.is_new_link(processed_link):
                result.append(processed_link)
                self.status.urls_to_crawl.add(processed_link)
        return result

    def is_new_link(self, link):
        """
        Returns True if the link is non-empty and has not been seen by the crawler before
        """
        return link \
            and link not in self.status.visited_urls \
            and link not in self.status.urls_to_crawl

    def process_link(self, origin, link):
        """
        Processes a newly found link
        For relative links the origin link will be used as base
        """
        if not link or any([
                x for x in self.config_service.skip
                if re.search(x, link.lower())
        ]):
            return None
        if is_relative_link(link):
            if link[0] == ".":
                link = link[1::]
            link = "%s%s%s" % (self.protocol, get_domain(origin), link)
        link = strip_beginning_slashes(link)
        if not get_protocol(link):
            link = "%s%s" % (self.protocol, link)
        return link

    def get_status_message(self):
        """
        Returns a message about the current progress
        """
        visited = len(self.status.visited_urls)
        in_progress = len(self.status.urls_in_progress)
        todo = len(self.status.urls_to_crawl)
        message = """--------------------Crawl status--------------------
                                        Urls visited=[%s]
                                        Urls in progress=[%s]
                                        Urls left=[%s]""" %\
            (visited, in_progress, todo)
        return message

    def is_waiting_for_url(self):
        """
        Returns True if urls_to_crawl is empty but there are urls in progress
        """
        return not any(self.status.urls_to_crawl) \
            and any(self.status.urls_in_progress)

    def is_task_left(self):
        """
        Returns if there are any urls that haven't been processed yet
        """
        return any(self.status.urls_in_progress +
                   list(self.status.urls_to_crawl))

    def handle_interrupt(self, error):
        """
        Handles crawling interruptions
        Calls on_interrupt observers
        Terminates pending jobs
        """
        self.log_service.info("----------Crawling was interrupted----------",
                              error)
        self.on_interrupt.next()
        self.work_service.terminate_all()

    def handle_crawl_finish(self):
        """
        Handles crawling finish
        Calls on_finish observers
        Logs the final status
        """
        self.on_finish.next(self.status.visited_urls,
                            self.status.urls_in_progress,
                            self.status.urls_to_crawl)
        self.log_service.info(self.get_status_message())
        self.log_service.info("----------Crawl finished----------\n")
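
Putting the pieces together: Crawler pulls in four features, so a runnable setup needs everything the earlier sketches registered (requests, threading, queue and the rest). The names below reuse those illustrative stand-ins; only Crawler, WorkService and Inject come from the project itself.

from dependency_injection import Inject  # assumed import path

class _Config(object):
    threads = 2
    skip = [r"\.pdf$", r"mailto:"]  # hypothetical patterns for links the crawler should ignore

class _StubLog(object):
    def debug(self, *messages): pass
    def info(self, *messages):
        print(" ".join(str(m) for m in messages))
    def error(self, *messages):
        print(" ".join(str(m) for m in messages))

Inject.register_feature("config_service", _Config())
Inject.register_feature("log_service", _StubLog())
Inject.register_feature("site_parser_factory", RawHtmlParser)  # the illustrative Parser subclass above
Inject.register_feature("work_service", WorkService())         # assumes threading/queue are registered as above

crawler = Crawler("https://example.com")
crawler.crawl()
print(crawler.get_status_message())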
Example No. 22
 def setUp(self):
     Inject.reset()
Example No. 23
 class Test(object):
     kanga = Inject("Kanga", cache=False)
Example No. 24
 def test_init(self):
     result = Inject("some_feature", cache=False)
     self.assertTrue(result)
Example No. 25
 def test_positive_assertion_is_silent(self):
     class Test(object):
         kanga = Inject("Kanga", lambda f: isinstance(f, int), lambda f: f % 5 == 0, cache=False)
     Inject.register_feature("Kanga", 5)
     result = Test().kanga
     self.assertEqual(result, 5)