class PrintService(object):
    """ A logging service that logs to the standard output via print """

    config = Inject("config_service", HasAttributes("loglevel"))
    output = Inject("logging_output", HasMethods("write"))

    def __init__(self):
        self.loglevel = LOGLEVELS[self.config.loglevel]

    def debug(self, *messages):
        """ Logs messages with debug log level. """
        self.log(LOGLEVELS["debug"], "\n[Debug]", *messages)

    def info(self, *messages):
        """ Logs messages with info log level. """
        self.log(LOGLEVELS["info"], "\n[Info]", *messages)

    def important(self, *messages):
        """ Logs messages with important log level. """
        self.log(LOGLEVELS["important"], "\n[Important]", *messages)

    def error(self, *messages):
        """ Logs messages with error log level. """
        self.log(LOGLEVELS["error"], "\n[Error]", *messages)

    def warn(self, *messages):
        """ Logs messages with warn log level. """
        self.log(LOGLEVELS["warn"], "\n[Warn]", *messages)

    def log(self, level, prefix, *messages):
        """ Skips the message if its level is below the configured loglevel;
        otherwise prefixes it with the current thread's name and writes it. """
        if self.loglevel > level:
            return
        msg = "%sThread=[%s]\t" % (prefix, threading.currentThread().name)
        for message in messages:
            msg += " %s" % message
        self.write(msg)

    def write(self, msg):
        """ Writes the message to the output of the instance. """
        try:
            self.output.write(msg)
        except UnicodeEncodeError as error:
            # output.write expects a string, so write the error's text instead
            self.output.write(str(error))
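# Hedged usage sketch (not from the original source): assuming LOGLEVELS maps
# level names to comparable ints and that Inject resolves the two features,
# PrintService can be wired up like this. sys.stdout satisfies
# HasMethods("write"); the Any(...) config object is an illustrative stand-in.
#
#     import sys
#     Inject.register_feature("config_service", Any(loglevel="debug"))
#     Inject.register_feature("logging_output", sys.stdout)
#     log = PrintService()
#     log.info("service", "started")  # -> "[Info]Thread=[MainThread]\t service started"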
def test_no_assertion(self):
    class Test(object):
        kanga = Inject("Kanga", cache=False)

    Inject.register_feature("Kanga", "They steal your youth!")
    result = Test().kanga
    self.assertTrue(result)
    self.assertEqual(result, "They steal your youth!")
def tearDown(self):
    Inject.reset()
    parser.parse.reset()
    log.write.reset()
    log.info.reset()
    log.error.reset()
    log.debug.reset()
class Parser(object):
    """ Base class for parsers

    The parser performs a GET request on the url and calls feed
    with the response.text
    Subclasses should implement the feed method
    """

    _requests = Inject("requests", HasMethods("get"))
    log_service = Inject("log_service",
                         HasMethods("debug", "info", "error", "warn"))

    def parse(self, url):
        """ Send a GET request to the given url and parse the response.

        :returns: ParseResult containing the link, status_code and
                  page_size of the result
        """
        assert url is not None
        response = self._requests.get(url, verify=False)
        if response.status_code == 200:
            self.feed(response.text)
        return ParseResult(link=url,
                           status_code=response.status_code,
                           page_size=len(response.content))

    def feed(self, data):
        """ Parser.parse feeds the response data to this method.

        Override it to customize parsing.
        """
        pass
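# Hedged sketch of a subclass (illustrative; "TitleParser" is a hypothetical
# name): parse() handles the HTTP round trip, so a subclass only overrides
# feed() to inspect the response body.
#
#     class TitleParser(Parser):
#         def feed(self, data):
#             # naive extraction, for illustration only
#             start = data.find("<title>")
#             if start != -1:
#                 self.title = data[start + len("<title>"):data.find("</title>")]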
def setUp(self):
    Inject.reset()
    self.startMock = CallableMock()
    threading_mock.Thread.reset(
        returns=Any(daemon=False, start=self.startMock))
    Inject.register_feature("threading", threading_mock)
    Inject.register_feature("config_service", config_mock)
    Inject.register_feature("queue", lambda: queue_mock)
    Inject.register_feature("log_service", log_service_mock)
def register_services(services=None):
    """ Register the services in dependency_injection.Inject.

    If services is None the Services class is used.
    """
    # "services is dict" would compare against the dict type itself;
    # the intended check is for a dict instance (or None)
    assert services is None or isinstance(services, dict)
    if not services:
        services = Services.__dict__
    Inject.register_features(**services)
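# Hedged usage sketch: call with no argument to register the defaults from
# Services.__dict__, or pass an explicit mapping to override selectively
# ("my_log_service" is a hypothetical object with the expected methods).
#
#     register_services()
#     register_services({"log_service": my_log_service})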
def test_assertionerror_is_raised_if_assertion_fails(self):
    class Test(object):
        kanga = Inject("Kanga", lambda f: isinstance(f, int), cache=False)

    Inject.register_feature("Kanga", "Roo")
    try:
        Test().kanga
        self.fail()
    except AssertionError as error:
        self.assertEqual(
            str(error),
            "The value=[%s] of feature=[%s] does not match a criteria"
            % ("Roo", "Kanga"))
def test_factory_method(self):
    class Test(object):
        kanga = Inject("Kanga", lambda f: isinstance(f, str), cache=False)

    class TestKangaFactory(object):
        id = 0

        def __call__(self):
            TestKangaFactory.id += 1
            return "#%s" % TestKangaFactory.id

    Inject.register_feature("Kanga", TestKangaFactory())
    result1 = Test().kanga
    result2 = Test().kanga
    self.assertNotEqual(result1, result2)
def test_reset(self):
    Inject.register_features(Kanga=1, Tiggers=2)

    class Test(object):
        kanga = Inject("Kanga")
        tiggers = Inject("Tiggers")

    test = Test()
    self.assertTrue(test.kanga)
    self.assertTrue(test.tiggers)
    Inject.reset()
    with self.assertRaises(KeyError):
        test.kanga
    with self.assertRaises(KeyError):
        test.tiggers
def setUp(self):
    self.log_service = Any(debug=CallableMock(), error=CallableMock())
    self.crawl_result = Any(link="", status_code=200, page_size=0, links=[])
    self.parser = Any(parse=CallableMock(returns=self.crawl_result))
    self.crawler = Any(
        status=Any(urls_in_progress=[], urls_to_crawl=[], visited_urls=set()),
        site_parser_factory=CallableMock(returns=self.parser),
        process_links=CallableMock(),
        get_status_message=CallableMock())
    Inject.reset()
    Inject.register_feature('log_service', self.log_service)
class CrawlJob(object):
    """ A single crawl job that parses sites until its crawler runs out of urls """

    log_service = Inject("log_service", HasMethods("error", "debug"))

    def __init__(self, crawler):
        self.crawler = crawler

    def __call__(self):
        self.run()

    def run(self):
        """ Parses sites until there are no more urls_to_crawl in the Crawler """
        parser = self.crawler.site_parser_factory()
        while self.crawler.status.urls_to_crawl:
            try:
                url = self.crawler.status.urls_to_crawl.pop()
            except IndexError:
                return
            self.crawler.status.urls_in_progress.append(url)
            result = self.crawl_site(url, parser)
            self.crawler.status.visited_urls.add(url)
            if result:
                self.crawler.process_links(url, result.links)
            self.log_service.debug(self.crawler.get_status_message())

    def crawl_site(self, url, parser=None):
        """ Crawls the given url using a parser made by site_parser_factory

        Returns None if a requests.exceptions.ConnectionError or
        requests.exceptions.SSLError is raised
        """
        try:
            parser = self.crawler.site_parser_factory() if parser is None else parser
            result = parser.parse(url)
            return result
        except (requests.exceptions.ConnectionError,
                requests.exceptions.SSLError) as error:
            self.log_service.error("Error while crawling site=[%s]" % url,
                                   str(error))
            return None
        finally:
            self.__remove_url_from_crawler(url)

    def __remove_url_from_crawler(self, url):
        try:
            self.crawler.status.urls_in_progress.remove(url)
        except ValueError:
            pass
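# Hedged usage sketch: a CrawlJob is a zero-argument callable, so it can be
# handed straight to the work queue ("crawler" is assumed to be a configured
# Crawler instance whose status holds urls_to_crawl).
#
#     job = CrawlJob(crawler)
#     work_service.request_work(job)  # a worker thread invokes job(), i.e. run()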
def setUp(self):
    Inject.reset()
    Inject.register_feature('site_parser_factory',
                            lambda: Any(parse=CallableMock()))
    Inject.register_feature(
        'log_service',
        Any(info=CallableMock(), error=CallableMock(), debug=CallableMock()))
    Inject.register_feature(
        'work_service',
        Any(request_work=CallableMock(callback=lambda cb: cb()),
            terminate_all=CallableMock(),
            active_count=CallableMock()))
def setUp(self):
    super(WorkerService_request_work_Tests, self).setUp()
    Inject.register_feature("queue", Queue.Queue)
    Inject.register_feature("threading", threading)
def tearDown(self):
    Inject.reset()
class WorkService(object):
    """ Service for handling multithreaded jobs. """

    __worker_labels = set()
    __instances = 0
    __init_lock = Lock()

    threading = Inject("threading", HasMethods("Thread"))
    config_service = Inject("config_service", HasAttributes("threads"))
    log_service = Inject("log_service", HasMethods("debug", "info"))
    queue_factory = Inject("queue", return_factory=True)

    def __init__(self):
        worker_count = int(self.config_service.threads)
        assert worker_count > 0, "Thread count cannot be less than 1!"
        WorkService.__init_lock.acquire(True)
        try:
            self.__active_count = 0
            self.__worker_labels = set()
            self.queue = self.queue_factory()
            self.__init_workers(worker_count, WorkService.__instances)
            WorkService.__instances += 1
        finally:
            WorkService.__init_lock.release()

    def __init_workers(self, number, instance_number):
        # continue labeling from the highest label used by any instance;
        # labels are tracked per instance and on the class
        label = max(WorkService.__worker_labels) if WorkService.__worker_labels else 0
        for _ in range(number):
            label += 1
            self.__worker_labels.add(label)
            WorkService.__worker_labels.add(label)
            worker = self.threading.Thread(
                target=self.__worker,
                name="WSc[%s]--%s" % (instance_number, label))
            worker.daemon = True
            worker.start()

    def __del__(self):
        for label in self.__worker_labels:
            WorkService.__worker_labels.remove(label)

    def request_work(self, job, blocking=True):
        """ Request a job to be processed.

        It is put into the job queue for processing.
        """
        assert job is not None
        assert callable(job)
        return self.queue.put(job, blocking)

    def terminate_all(self, graceful=False):
        """ Terminates all pending jobs. """
        self.log_service.debug(
            "-----------------Terminate request received-----------------")
        if graceful:
            self.queue.join()

    def active_count(self):
        """ Returns the number of currently busy worker threads. """
        return self.__active_count

    def __worker(self):
        while True:
            task = self.queue.get()
            self.__active_count += 1
            task()
            self.__active_count -= 1
            self.queue.task_done()
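# Hedged usage sketch: assuming the "threading", "config_service",
# "log_service" and "queue" features are registered (as the test setUp
# methods above do), a job is any zero-argument callable.
#
#     service = WorkService()
#     service.request_work(lambda: log.info("job ran"))
#     service.terminate_all(graceful=True)  # joins the queue before returning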
def setUp(self):
    Inject.reset()
    Inject.register_feature("site_parser_factory", site_parser_factory)
    Inject.register_feature("config_service", config)
    Inject.register_feature("log_service", log)
    Inject.register_feature("work_service", work_service)
def setUp(self):
    Inject.reset()
    Inject.register_feature("requests", requests)
    Inject.register_feature("log_service", log_service)
class Test(object):
    kanga = Inject("Kanga",
                   lambda f: isinstance(f, int),
                   lambda f: f % 5 == 0,
                   cache=False)
class Test(object):
    kanga = Inject("Kanga")
    tiggers = Inject("Tiggers")
class Test(object):
    kanga = Inject("Kanga", lambda f: isinstance(f, str), cache=False)
class Crawler(object):
    """ Crawls a resource starting from url

    Uses parsers created by site_parser_factory to crawl each resource,
    then decides the next pages to crawl based on the result
    Uses work_service to execute crawling jobs
    Uses log_service to display results and errors
    """

    site_parser_factory = Inject("site_parser_factory", return_factory=True)
    log_service = Inject("log_service", HasMethods("info", "error", "debug"))
    work_service = Inject(
        "work_service",
        HasMethods("request_work", "terminate_all", "active_count"),
        cache=True)
    config_service = Inject("config_service", HasAttributes("skip"))

    def __init__(self, urls=None):
        self.on_start = Subject()
        self.on_interrupt = Subject()
        self.on_finish = Subject()
        self.protocol = None
        self.status = CrawlerStatus()
        self.reset(urls)

    def reset(self, url):
        """ Reset the crawler's state """
        self.status.reset(set(), set(), [])
        if not url:
            return
        if not isinstance(url, list):
            url = [url]
        self.initialize_status(url)

    def initialize_status(self, urls):
        """ Initialize the crawler's status field from a collection of urls """
        self.protocol = get_protocol(urls[0])
        if not self.protocol:
            self.protocol = "http://"
            urls[0] = "%s%s" % (self.protocol, urls[0])
        self.status.urls_to_crawl.add(urls[0])
        for url in urls[1:]:
            processed = self.process_link(None, url)
            if not processed:
                raise AttributeError(
                    "Url=[%s] is not valid in this collection!" % url)
            self.status.urls_to_crawl.add(processed)

    def crawl(self, url=None):
        """ Starts the crawl procedure

        Returns when the crawling is complete or a KeyboardInterrupt
        or SystemExit is raised
        Terminates remaining work if interrupted
        """
        if url:
            self.reset(url)
        self.log_service.info(
            "----------Crawl starting from url=[{url}]----------".format(
                url=url))
        self.on_start.next(url)
        self.__request_crawl_work()
        try:
            while self.is_task_left():
                self.__sleep_until_task_is_available()
                self.__request_crawl_work()
        except (KeyboardInterrupt, SystemExit) as error:
            self.handle_interrupt(error)
        finally:
            self.handle_crawl_finish()

    def __sleep_until_task_is_available(self):
        if not self.is_waiting_for_url():
            return
        self.log_service.debug(
            "No urls to crawl, going to sleep.",
            "Work in progress=[%s]" % self.work_service.active_count())
        while self.is_waiting_for_url():
            time.sleep(1)

    def __request_crawl_work(self):
        return self.work_service.request_work(CrawlJob(self))

    def process_links(self, origin, links):
        """ Process a collection of links found at origin """
        result = []
        for link in links:
            processed_link = self.process_link(origin, link)
            if self.is_new_link(processed_link):
                result.append(processed_link)
                self.status.urls_to_crawl.add(processed_link)
        return result

    def is_new_link(self, link):
        """ Returns whether the link has not been seen by the crawler before """
        return link \
            and link not in self.status.visited_urls \
            and link not in self.status.urls_to_crawl

    def process_link(self, origin, link):
        """ Processes a newly found link

        For relative links the origin link will be used as base
        """
        if not link or any(
                x for x in self.config_service.skip
                if re.search(x, link.lower())):
            return None
        if is_relative_link(link):
            if link[0] == ".":
                link = link[1:]
            link = "%s%s%s" % (self.protocol, get_domain(origin), link)
        link = strip_beginning_slashes(link)
        if not get_protocol(link):
            link = "%s%s" % (self.protocol, link)
        return link

    def get_status_message(self):
        """ Returns a message about the current progress """
        visited = len(self.status.visited_urls)
        in_progress = len(self.status.urls_in_progress)
        todo = len(self.status.urls_to_crawl)
        message = """--------------------Crawl status--------------------
        Urls visited=[%s]
        Urls in progress=[%s]
        Urls left=[%s]""" % (visited, in_progress, todo)
        return message

    def is_waiting_for_url(self):
        """ Returns whether urls_to_crawl is empty while urls are in progress """
        return not any(self.status.urls_to_crawl) \
            and any(self.status.urls_in_progress)

    def is_task_left(self):
        """ Returns whether there are any urls that haven't been processed yet """
        return any(self.status.urls_in_progress +
                   list(self.status.urls_to_crawl))

    def handle_interrupt(self, error):
        """ Handles crawling interruptions

        Calls on_interrupt observers
        Terminates pending jobs
        """
        self.log_service.info(
            "----------Crawling was interrupted----------", error)
        self.on_interrupt.next()
        self.work_service.terminate_all()

    def handle_crawl_finish(self):
        """ Handles crawling finish

        Calls on_finish observers
        Logs the final status
        """
        self.on_finish.next(self.status.visited_urls,
                            self.status.urls_in_progress,
                            self.status.urls_to_crawl)
        self.log_service.info(self.get_status_message())
        self.log_service.info("----------Crawl finished----------\n")
def setUp(self):
    Inject.reset()
class Test(object):
    kanga = Inject("Kanga", cache=False)
def test_init(self):
    result = Inject("some_feature", cache=False)
    self.assertTrue(result)
def test_positive_assertion_is_silent(self):
    class Test(object):
        kanga = Inject("Kanga",
                       lambda f: isinstance(f, int),
                       lambda f: f % 5 == 0,
                       cache=False)

    Inject.register_feature("Kanga", 5)
    result = Test().kanga
    self.assertEqual(result, 5)