class EventletInbox(object):
    def __init__(self, logger=None):
        ''' Create an inbox backed by an eventlet queue; use the supplied
            logger or fall back to a module-level one. '''
        self.__inbox = EventletQueue()
        if logger is None:
            self._logger = getLogger('%s.EventletInbox' % __name__)
        else:
            self._logger = logger

    def get(self):
        ''' Return the next message from the inbox; raise EmptyInboxException
            if the inbox is empty. '''
        try:
            result = self.__inbox.get_nowait()
        except EventletEmpty:
            raise EmptyInboxException
        return result

    def put(self, message):
        ''' Put a message into the inbox. '''
        self.__inbox.put(message)

    def __len__(self):
        ''' Return the number of messages currently in the inbox. '''
        return self.__inbox.qsize()
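
# Hypothetical usage sketch for EventletInbox (the helper name and the messages
# below are illustrative, not part of the original code). It assumes
# EventletQueue/EventletEmpty come from eventlet.queue and that
# EmptyInboxException is defined elsewhere in this module.
def _example_inbox_roundtrip():
    inbox = EventletInbox()
    inbox.put({'event': 'ping'})
    assert len(inbox) == 1
    message = inbox.get()      # returns the queued message
    try:
        inbox.get()            # an empty inbox raises EmptyInboxException
    except EmptyInboxException:
        pass
    return message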
class PipeReader:
    def __init__(self, pipe):
        self._pipe = pipe
        self._queue = Queue()
        _queue = self._queue

        # Drain the pipe in a background greenthread so get() never blocks.
        def _enqueue_output():
            for line in iter(pipe.readline, b''):
                _queue.put(line)
            pipe.close()

        self._green_thread = eventlet.spawn(_enqueue_output)

    def get(self):
        # Return the next buffered line, or None if nothing is available yet.
        try:
            return self._queue.get_nowait()
        except Empty:
            pass
        return None

    def shutdown(self):
        if self._green_thread is not None:
            try:
                self._green_thread.kill()
            except Exception:
                # The reader greenthread may already be dead; ignore.
                pass
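
# Hypothetical usage sketch for PipeReader, assuming the pipe is the binary
# stdout of a subprocess and that eventlet monkey-patching (or the green
# subprocess module) is set up by the caller; the helper name and polling loop
# are illustrative only.
import subprocess

import eventlet


def _example_read_subprocess_output():
    proc = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE)
    reader = PipeReader(proc.stdout)
    lines = []
    for _ in range(100):               # poll briefly; get() returns None when empty
        line = reader.get()
        if line is not None:
            lines.append(line)
        elif proc.poll() is not None:  # process exited and the queue is drained
            break
        else:
            eventlet.sleep(0.01)
    reader.shutdown()
    return lines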
class SubmitMonitor:
    def __init__(self, logger: Logger, interval: float = 10):
        self._logger = logger
        self._q = Queue()
        self._ok_submits = 0
        self._bad_submits = 0
        self._connections = 0
        self._interval = interval

        self._was_ok = self._ok_submits
        self._was_bad = self._bad_submits
        self._was_conn = self._connections

        self._labels = ['attacker_id', 'victim_id', 'task_id', 'submit_ok']

    def add(self, ar: models.AttackResult) -> None:
        self._q.put_nowait(ar)
        if ar.submit_ok:
            self.inc_ok()
        else:
            self.inc_bad()

    def inc_ok(self) -> None:
        self._ok_submits += 1

    def inc_bad(self) -> None:
        self._bad_submits += 1

    def inc_conns(self) -> None:
        self._connections += 1

    def _process_statistics(self) -> None:
        new_ok, new_bad = self._ok_submits, self._bad_submits
        new_conn = self._connections
        self._logger.info(f"OK: {new_ok - self._was_ok:>6}, "
                          f"BAD: {new_bad - self._was_bad:>6}, "
                          f"CONN: {new_conn - self._was_conn:>6}, "
                          f"TOTOK: {new_ok:>6}, TOTBAD: {new_bad:>6}, "
                          f"TOTCONN: {new_conn:>6}")
        self._was_ok = new_ok
        self._was_bad = new_bad
        self._was_conn = new_conn

    def _process_attacks_queue(self) -> None:
        conn = Connection(config.get_broker_url())
        with conn.channel() as channel:
            producer = Producer(channel)
            by_label = defaultdict(list)
            while not self._q.empty():
                try:
                    ar: models.AttackResult = self._q.get_nowait()
                except Empty:
                    continue
                else:
                    by_label[ar.get_label_key()].append(ar)

            for ar_list in by_label.values():
                if not ar_list:
                    continue
                monitor_message = {
                    'type': 'flag_submit',
                    'data': ar_list[0].get_label_values(),
                    'value': len(ar_list),
                }
                producer.publish(
                    monitor_message,
                    exchange='',
                    routing_key='forcad-monitoring',
                )

    def __call__(self) -> None:
        while True:
            try:
                self._process_statistics()
                self._process_attacks_queue()
            except Exception as e:
                self._logger.error("Error in monitoring: %s", str(e))
            eventlet.sleep(self._interval)
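
# Hypothetical wiring sketch for SubmitMonitor: the monitor is meant to run in
# its own greenthread via __call__, while submit workers feed it AttackResult
# objects through add(). The logger name and the spawn call are illustrative.
import logging

import eventlet


def _example_start_monitor():
    monitor = SubmitMonitor(logging.getLogger('submit_monitor'), interval=10)
    eventlet.spawn(monitor)    # runs the statistics/queue loop forever
    return monitor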
class Crawler(object):
    def __init__(self, max_connections, input_is_plain):
        self.max_connections = max_connections
        self.input_is_plain = input_is_plain

        self.queue = Queue(1)
        self.closed = False
        self._handler_pool = GreenPool(self.max_connections)
        self._robots_cache = PoolMap(self.get_robots_checker,
                                     pool_max_size=1,
                                     timeout=600)

        # Start IO worker and die if he does.
        self.io_worker = io.Worker(lambda: self.closed)
        t = spawn(self.io_worker.run_loop)
        t.link(reraise_errors, greenthread.getcurrent())

        log.debug(u"Crawler started. Max connections: %d.", self.max_connections)

    def crawl(self, forever=True):
        # TODO: do something special about signals?

        if forever:
            self.start_queue_updater()

        while not self.closed:
            # `get_nowait` will only work together with sleep(0) here
            # because we need greenlet switch to reraise exception from `do_process`.
            sleep()
            try:
                item = self.queue.get_nowait()
            except Empty:
                if not forever:
                    self.graceful_stop()
                sleep(0.01)
                continue

            t = self._handler_pool.spawn(self.do_process, item)
            t.link(reraise_errors, greenthread.getcurrent())

    def stop(self):
        self.closed = True

    def graceful_stop(self, timeout=None):
        """Stops crawler and waits for all already started crawling requests to finish.

        If `timeout` is supplied, it waits for at most `timeout` time to finish
        and returns True if allocated time was enough.
        Returns False if `timeout` was not enough.
        """
        self.closed = True
        if timeout is not None:
            with eventlet.Timeout(timeout, False):
                if hasattr(self, "_queue_updater_thread"):
                    self._queue_updater_thread.kill()
                self._handler_pool.waitall()
                return True
            return False
        else:
            if hasattr(self, "_queue_updater_thread"):
                self._queue_updater_thread.kill()
            self._handler_pool.waitall()

    def start_queue_updater(self):
        self._queue_updater_thread = spawn(self.queue_updater)
        self._queue_updater_thread.link(reraise_errors, greenthread.getcurrent())

    def queue_updater(self):
        log.debug("Waiting for crawl jobs on stdin.")
        for line in sys.stdin:
            if self.closed:
                break

            line = line.strip()
            if self.input_is_plain:
                job = {"url": line}
            else:
                try:
                    job = json.loads(line)
                except ValueError:
                    log.error(u"Decoding input line: %s", line)
                    continue

            # extend worker queue
            # 1. skip duplicate URLs
            for queue_item in self.queue.queue:
                if queue_item["url"] == job["url"]:  # compare URLs
                    break
            else:
                # 2. extend queue with new items
                # May block here, when queue is full. This is a feature.
                self.queue.put(job)

        # Stdin exhausted -> stop.
        while not self.queue.empty():
            sleep(0.01)

        sleep(2)  # FIXME: Crutch to prevent stopping too early.

        self.graceful_stop()

    def get_robots_checker(self, scheme, authority):
        """PoolMap func :: scheme, authority -> (agent, uri -> bool)."""
        robots_uri = "%s://%s/robots.txt" % (scheme, authority)

        fetch_result = self.io_worker.fetch(robots_uri)
        # Graceful stop thing.
        if fetch_result is None:
            return None

        if fetch_result["success"]:
            # TODO: set expiration time from headers
            # but this must be done after `self._robots_cache.put` or somehow else...
            if 200 <= fetch_result["status_code"] < 300:
                parser = robotparser.RobotFileParser()
                content_lines = fetch_result["content"].splitlines()
                try:
                    parser.parse(content_lines)
                except KeyError:
                    raise RobotsError(u"Known robotparser bug: KeyError at urllib.quote(path).")
                return parser.can_fetch
            # Authorization required and Forbidden are considered Disallow all.
            elif fetch_result["status_code"] in (401, 403):
                return lambda _agent, _uri: False
            # /robots.txt Not Found is considered Allow all.
            elif fetch_result["status_code"] == 404:
                return lambda _agent, _uri: True
            # FIXME: this is an optimistic rule and probably should be detailed with more specific checks
            elif fetch_result["status_code"] >= 400:
                return lambda _agent, _uri: True
            # What other cases left? 100 and redirects. Consider it Disallow all.
            else:
                return lambda _agent, _uri: False
        else:
            raise FetchError(u"/robots.txt fetch problem: %s" % (fetch_result["result"]))

    def ask_robots(self, uri, scheme, authority):
        key = scheme + ":" + authority
        with self._robots_cache.getc(key, scheme, authority) as checker:
            try:
                # Graceful stop thing.
                if checker is None:
                    return None
                return checker(settings.identity["name"], uri)
            except Exception as e:
                log.exception(u"Get rid of this. ask_robots @ %s", uri)
                raise RobotsError(u"Error checking robots.txt permissions for URI '%s': %s"
                                  % (uri, unicode(e)))
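
# Hypothetical entry point sketch for Crawler: crawl jobs arrive on stdin
# (plain URLs or JSON objects with a "url" key) and are fetched with at most
# `max_connections` concurrent handlers. The argument values, the helper name,
# and the KeyboardInterrupt handling are illustrative; the surrounding project
# must still provide io.Worker, PoolMap, settings, and logging.
def _example_run_crawler():
    crawler = Crawler(max_connections=10, input_is_plain=True)
    try:
        crawler.crawl(forever=True)        # blocks until stdin is exhausted
    except KeyboardInterrupt:
        crawler.graceful_stop(timeout=30)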