Example #1
# Assumed imports for this snippet (not shown in the original source):
from logging import getLogger

from eventlet.queue import Queue as EventletQueue
from eventlet.queue import Empty as EventletEmpty

# EmptyInboxException is a project-specific exception and is not defined here.


class EventletInbox(object):
    def __init__(self, logger=None):
        ''' Set up the underlying eventlet queue and the logger. '''

        self.__inbox = EventletQueue()

        if logger is None:
            self._logger = getLogger('%s.EventletInbox' % __name__)
        else:
            self._logger = logger

    def get(self):
        ''' Get a message from the inbox; raise EmptyInboxException if it is empty. '''
        try:
            result = self.__inbox.get_nowait()
        except EventletEmpty:
            raise EmptyInboxException
        return result

    def put(self, message):
        ''' put message to inbox '''

        self.__inbox.put(message)

    def __len__(self):
        ''' return length of inbox '''

        return self.__inbox.qsize()
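
A minimal usage sketch, assuming EmptyInboxException is a simple custom exception (it is project-specific, so a hypothetical stand-in is defined here):

class EmptyInboxException(Exception):
    ''' Hypothetical stand-in for the project-specific exception. '''

inbox = EventletInbox()
inbox.put({'event': 'ping'})
print(len(inbox))    # -> 1
print(inbox.get())   # -> {'event': 'ping'}
try:
    inbox.get()
except EmptyInboxException:
    print('inbox drained')
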
Example #2
# Assumed imports for this snippet (not shown in the original source):
import eventlet
from eventlet.queue import Queue, Empty


class PipeReader:
    def __init__(self, pipe):
        self._pipe = pipe
        self._queue = Queue()

        # Local alias so the closure below does not need to touch `self`.
        _queue = self._queue

        def _enqueue_output():
            # Read the pipe line by line until EOF, pushing each line onto
            # the queue, then close the pipe.
            for line in iter(pipe.readline, b''):
                _queue.put(line)
            pipe.close()

        # The reader runs in its own green thread so `get()` never blocks.
        self._green_thread = eventlet.spawn(_enqueue_output)

    def get(self):
        try:
            return self._queue.get_nowait()
        except Empty:
            pass
        return None

    def shutdown(self):
        if self._green_thread is not None:
            try:
                self._green_thread.kill()
            except Exception:
                # Ignore errors from killing an already-finished green thread.
                pass
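
A minimal sketch of driving PipeReader, assuming the pipe comes from a subprocess (the original calling context is not shown; the command below is only an example):

import eventlet
from eventlet.green import subprocess  # green subprocess so readline() cooperates

# Hypothetical command; any line-producing process works.
proc = subprocess.Popen(['ping', '-c', '3', 'localhost'], stdout=subprocess.PIPE)
reader = PipeReader(proc.stdout)

while True:
    line = reader.get()
    if line is not None:
        print(line.rstrip())
    elif proc.poll() is not None:
        break  # the process has exited and the queue is drained
    else:
        eventlet.sleep(0.1)  # yield so the reader's green thread can run

reader.shutdown()
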
Example #3
# Assumed imports for this snippet (not shown in the original source):
from collections import defaultdict
from logging import Logger
from queue import Empty

import eventlet
from eventlet.queue import Queue  # the standard queue.Queue would also fit this usage
from kombu import Connection, Producer  # the Connection/Producer usage matches kombu's API

# `models` and `config` are project-specific modules and are not defined here.


class SubmitMonitor:
    def __init__(self, logger: Logger, interval: float = 10):
        self._logger = logger
        self._q = Queue()

        self._ok_submits = 0
        self._bad_submits = 0
        self._connections = 0

        self._interval = interval

        self._was_ok = self._ok_submits
        self._was_bad = self._bad_submits
        self._was_conn = self._connections

        self._labels = ['attacker_id', 'victim_id', 'task_id', 'submit_ok']

    def add(self, ar: models.AttackResult) -> None:
        self._q.put_nowait(ar)
        if ar.submit_ok:
            self.inc_ok()
        else:
            self.inc_bad()

    def inc_ok(self) -> None:
        self._ok_submits += 1

    def inc_bad(self) -> None:
        self._bad_submits += 1

    def inc_conns(self) -> None:
        self._connections += 1

    def _process_statistics(self) -> None:
        new_ok, new_bad = self._ok_submits, self._bad_submits
        new_conn = self._connections
        self._logger.info(f"OK: {new_ok - self._was_ok:>6}, "
                          f"BAD: {new_bad - self._was_bad:>6}, "
                          f"CONN: {new_conn - self._was_conn:>6}, "
                          f"TOTOK: {new_ok:>6}, TOTBAD: {new_bad:>6}, "
                          f"TOTCONN: {new_conn:>6}")
        self._was_ok = new_ok
        self._was_bad = new_bad
        self._was_conn = new_conn

    def _process_attacks_queue(self) -> None:
        # Drain the queued attack results, group them by label and publish one
        # aggregated message per label to the monitoring queue.
        conn = Connection(config.get_broker_url())
        with conn.channel() as channel:
            producer = Producer(channel)
            by_label = defaultdict(list)
            while not self._q.empty():
                try:
                    ar: models.AttackResult = self._q.get_nowait()
                except Empty:
                    continue
                else:
                    by_label[ar.get_label_key()].append(ar)

            for ar_list in by_label.values():
                if not ar_list:
                    continue

                monitor_message = {
                    'type': 'flag_submit',
                    'data': ar_list[0].get_label_values(),
                    'value': len(ar_list),
                }

                producer.publish(
                    monitor_message,
                    exchange='',
                    routing_key='forcad-monitoring',
                )

    def __call__(self) -> None:
        # Main loop: every `interval` seconds, log submit statistics and flush
        # queued attack results to the monitoring queue.
        while True:
            try:
                self._process_statistics()
                self._process_attacks_queue()
            except Exception as e:
                self._logger.error("Error in monitoring: %s", str(e))
            eventlet.sleep(self._interval)
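
A minimal sketch of wiring the monitor up, assuming a standard logging.Logger; models.AttackResult objects are produced elsewhere in the project, so feeding them is only indicated in comments:

import logging
import eventlet

logger = logging.getLogger('submit_monitor')
monitor = SubmitMonitor(logger, interval=10)

# The monitor loop runs forever, so it is spawned as a green thread.
eventlet.spawn(monitor)

# Elsewhere, for every processed flag and every accepted connection:
#   monitor.add(attack_result)   # attack_result: models.AttackResult
#   monitor.inc_conns()
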
Example #4
# Assumed imports for this snippet (the original module is not shown). The code
# targets Python 2 (`unicode`, `robotparser`); the project-specific names io,
# PoolMap, settings, log, reraise_errors, RobotsError and FetchError are not
# defined here.
import sys
import json
import robotparser  # urllib.robotparser in Python 3

import eventlet
from eventlet import GreenPool, greenthread, sleep, spawn
from eventlet.queue import Queue, Empty


class Crawler(object):
    def __init__(self, max_connections, input_is_plain):
        self.max_connections = max_connections
        self.input_is_plain = input_is_plain

        self.queue = Queue(1)
        self.closed = False
        self._handler_pool = GreenPool(self.max_connections)
        self._robots_cache = PoolMap(self.get_robots_checker, pool_max_size=1, timeout=600)

        # Start the IO worker and die if it does.
        self.io_worker = io.Worker(lambda: self.closed)
        t = spawn(self.io_worker.run_loop)
        t.link(reraise_errors, greenthread.getcurrent())

        log.debug(u"Crawler started. Max connections: %d.", self.max_connections)

    def crawl(self, forever=True):
        # TODO: do something special about signals?

        if forever:
            self.start_queue_updater()

        while not self.closed:
            # `get_nowait` only works here together with sleep(0), because we
            # need a greenlet switch to re-raise exceptions from `do_process`.
            sleep()
            try:
                item = self.queue.get_nowait()
            except Empty:
                if not forever:
                    self.graceful_stop()
                sleep(0.01)
                continue
            t = self._handler_pool.spawn(self.do_process, item)
            t.link(reraise_errors, greenthread.getcurrent())

    def stop(self):
        self.closed = True

    def graceful_stop(self, timeout=None):
        """Stops crawler and waits for all already started crawling requests to finish.

        If `timeout` is supplied, it waits for at most `timeout` time to finish
            and returns True if allocated time was enough.
            Returns False if `timeout` was not enough.
        """
        self.closed = True
        if timeout is not None:
            with eventlet.Timeout(timeout, False):
                if hasattr(self, "_queue_updater_thread"):
                    self._queue_updater_thread.kill()
                self._handler_pool.waitall()
                return True
            return False
        else:
            if hasattr(self, "_queue_updater_thread"):
                self._queue_updater_thread.kill()
            self._handler_pool.waitall()

    def start_queue_updater(self):
        self._queue_updater_thread = spawn(self.queue_updater)
        self._queue_updater_thread.link(reraise_errors, greenthread.getcurrent())

    def queue_updater(self):
        log.debug("Waiting for crawl jobs on stdin.")
        for line in sys.stdin:
            if self.closed:
                break

            line = line.strip()

            if self.input_is_plain:
                job = {"url": line}
            else:
                try:
                    job = json.loads(line)
                except ValueError:
                    log.error(u"Decoding input line: %s", line)
                    continue

            # extend worker queue
            # 1. skip duplicate URLs
            for queue_item in self.queue.queue:
                if queue_item["url"] == job["url"]:  # compare URLs
                    break
            else:
                # 2. extend queue with new items
                # May block here, when queue is full. This is a feature.
                self.queue.put(job)

        # Stdin exhausted -> stop.
        while not self.queue.empty():
            sleep(0.01)

        sleep(2)  # FIXME: Crutch to prevent stopping too early.

        self.graceful_stop()

    def get_robots_checker(self, scheme, authority):
        """PoolMap func :: scheme, authority -> (agent, uri -> bool)."""
        robots_uri = "%s://%s/robots.txt" % (scheme, authority)

        fetch_result = self.io_worker.fetch(robots_uri)
        # Graceful stop thing.
        if fetch_result is None:
            return None

        if fetch_result["success"]:
            # TODO: set expiration time from headers
            # but this must be done after `self._robots_cache.put` or somehow else...
            if 200 <= fetch_result["status_code"] < 300:
                parser = robotparser.RobotFileParser()
                content_lines = fetch_result["content"].splitlines()
                try:
                    parser.parse(content_lines)
                except KeyError:
                    raise RobotsError(u"Known robotparser bug: KeyError at urllib.quote(path).")
                return parser.can_fetch
            # Authorization required and Forbidden are considered Disallow all.
            elif fetch_result["status_code"] in (401, 403):
                return lambda _agent, _uri: False
            # /robots.txt Not Found is considered Allow all.
            elif fetch_result["status_code"] == 404:
                return lambda _agent, _uri: True
            # FIXME: this is an optimistic rule and probably should be detailed with more specific checks
            elif fetch_result["status_code"] >= 400:
                return lambda _agent, _uri: True
            # What other cases left? 100 and redirects. Consider it Disallow all.
            else:
                return lambda _agent, _uri: False
        else:
            raise FetchError(u"/robots.txt fetch problem: %s" % (fetch_result["result"]))

    def ask_robots(self, uri, scheme, authority):
        key = scheme + ":" + authority
        with self._robots_cache.getc(key, scheme, authority) as checker:
            try:
                # Graceful stop thing.
                if checker is None:
                    return None
                return checker(settings.identity["name"], uri)
            except Exception as e:
                log.exception(u"Get rid of this. ask_robots @ %s", uri)
                raise RobotsError(u"Error checking robots.txt permissions for URI '%s': %s" % (uri, unicode(e)))