Code example #1
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = set()

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode('utf-8'))

                for file in response_data:
                    # be sure we're a valid file type and less than our maximum response size limit
                    extension = file['path'].lower().split('.')[-1]
                    if extension in VALID_FILETYPES and int(
                            file['bytes']) < RESPONSE_SIZE_LIMIT * 1000000:
                        data.add((
                            file['path'].lstrip('/'),
                            file['path'],
                        ))
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = '.'.join([file_type])
            url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(
                file_type)
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(
            data.__len__()))
        return sorted([{
            "title": title,
            "value": path
        } for title, path in data],
                      key=lambda f: f['title'])
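The same fetch pattern recurs in code examples #4, #10, #12 and #16 below: a single worker() loop acquires a BoundedSemaphore slot and launches fetch_url() without yielding on it, while queue.join(timeout=...) is what actually waits for the work, driven by task_done() in each finally block. Below is a minimal, self-contained sketch of that pattern (assuming Tornado 4.x's gen.coroutine, tornado.queues and tornado.locks APIs; the HTTP fetch is simulated with gen.sleep):

    from datetime import timedelta

    from tornado import gen, ioloop
    from tornado.locks import BoundedSemaphore
    from tornado.queues import Queue

    FETCH_CONCURRENCY = 3

    @gen.coroutine
    def main():
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)

        @gen.coroutine
        def fetch_url():
            item = yield queue.get()
            try:
                yield gen.sleep(0.1)  # stand-in for the real AsyncHTTPClient fetch
                print("processed", item)
            finally:
                queue.task_done()     # lets queue.join() make progress
                sem.release()         # frees a concurrency slot for the worker loop

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()           # fire-and-forget; join() tracks completion

        for i in range(10):
            queue.put(i)              # unbounded queue, so no need to yield

        worker()                      # also fire-and-forget
        yield queue.join(timeout=timedelta(seconds=30))

    ioloop.IOLoop.current().run_sync(main)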
Code example #2
def run(args):
    if not args.test:
        ip_iter = _create_ip_iterator()
    else:
        ip_iter = _get_test_ips()
        good_ips = []

    job_queue = Queue(maxsize=200)

    start = time.time()
    counter = Counter()

    @gen.coroutine
    def job_producer():
        for ip in ip_iter:
            yield job_queue.put(ip)
            #print("Put {}".format(ip))

    @gen.coroutine
    def worker(id):
        while True:
            ip = yield job_queue.get()
            try:
                good = yield test_ip(ip)
                counter['all'] += 1
                if args.progress:
                    if counter['all'] % 10000 == 0:
                        print("Tested {} ips.".format(counter['all']))
                if good:
                    print("Found good ip: {}".format(ip))
                    counter['good'] += 1
                    if not args.test:
                        yield record_good_ip(ip)
                    else:
                        good_ips.append(ip)
            finally:
                job_queue.task_done()

    for i in range(CONCURRENCY):
        worker(i)

    _disable_logging()

    try:
        yield job_producer()
        yield job_queue.join()
    finally:
        print("\n\nTested: {} ips\nFound {} good ips\nQps: {}".format(
            counter['all'],
            counter['good'],
            counter['all'] / (time.time() - start)
        ))

    if args.test and args.remove:
        with open(GOOD_IP_FILE + '_removed', 'w') as f:
            f.write('|'.join(good_ips))
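Here the producer yields on job_queue.put(ip), so the bounded queue (maxsize=200) applies backpressure: the producer suspends whenever the workers fall behind, and job_queue.join() marks completion. A small self-contained sketch that makes the backpressure visible (illustrative names, Tornado 4.x API):

    from tornado import gen, ioloop
    from tornado.queues import Queue

    @gen.coroutine
    def main():
        q = Queue(maxsize=2)            # tiny maxsize so the backpressure is visible

        @gen.coroutine
        def producer():
            for i in range(6):
                yield q.put(i)          # suspends while the queue is full
                print("queued", i)

        @gen.coroutine
        def consumer():
            while True:
                item = yield q.get()
                try:
                    yield gen.sleep(0.01)
                    print("handled", item)
                finally:
                    q.task_done()

        consumer()                      # fire-and-forget worker
        yield producer()                # returns once everything has been queued
        yield q.join()                  # returns once everything is task_done()

    ioloop.IOLoop.current().run_sync(main)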
Code example #3
File: libwatcher.py  Project: jianingy/watchgang
class TornadoQuerierBase(object):

    def __init__(self):
        self.tasks = TornadoQueue()

    def gen_task(self):
        raise NotImplementedError()

    def run_task(self, task):
        raise NotImplementedError()

    def prepare(self):
        self.running = True

    def cleanup(self):
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
                task = None
        LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))

    @coroutine
    def start(self, num_workers=1):

        self.prepare()

        # add tasks
        tasks = yield self.gen_task()
        for task in tasks:
            yield self.tasks.put(task)

        # start shoot workers
        for worker_id in range(num_workers):
            LOG.debug('starting worker %d' % worker_id)
            self.run_worker(worker_id, self.run_task)

        yield self.tasks.join()
        self.cleanup()
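run_worker() exits when tasks.qsize() drops to zero, but between the qsize() check and tasks.get() another worker may take the last task, leaving this one blocked on get() even after tasks.join() has resolved. A hedged alternative (not from the watchgang project) is to bound the get() itself:

    from datetime import timedelta

    from tornado import gen

    @gen.coroutine
    def run_worker(tasks, f):
        while True:
            try:
                # Bounding get() lets an idle worker retire instead of blocking
                # forever when another worker grabbed the last task.
                task = yield tasks.get(timeout=timedelta(seconds=1))
            except gen.TimeoutError:
                return
            try:
                yield f(task)
            finally:
                tasks.task_done()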
Code example #4
File: files.py  Project: vizydrop/apps
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = set()

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode('utf-8'))

                for file in response_data:
                    # be sure we're a valid file type and less than our maximum response size limit
                    extension = file['path'].lower().split('.')[-1]
                    if extension in VALID_FILETYPES and int(file['bytes']) < RESPONSE_SIZE_LIMIT * 1000000:
                        data.add((file['path'].lstrip('/'), file['path'], ))
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = '.'.join([file_type])
            url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(file_type)
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(data.__len__()))
        return sorted([{"title": title, "value": path} for title, path in data], key=lambda f: f['title'])
Code example #5
File: libwatcher.py  Project: jianingy/watchgang
class TornadoQuerierBase(object):
    def __init__(self):
        self.tasks = TornadoQueue()

    def gen_task(self):
        raise NotImplementedError()

    def run_task(self, task):
        raise NotImplementedError()

    def prepare(self):
        self.running = True

    def cleanup(self):
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
                task = None
        LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))

    @coroutine
    def start(self, num_workers=1):

        self.prepare()

        # add tasks
        tasks = yield self.gen_task()
        for task in tasks:
            yield self.tasks.put(task)

        # start shoot workers
        for worker_id in range(num_workers):
            LOG.debug('starting worker %d' % worker_id)
            self.run_worker(worker_id, self.run_task)

        yield self.tasks.join()
        self.cleanup()
Code example #6
def main():
    cocurrency = 10

    queue = Queue()
    queue.put("http://www.jianshu.com")

    workers = []
    for _ in range(cocurrency):
        workers.append(Worker(app, queue))

    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
Code example #7
class FirehoseWebSocket(tornado.websocket.WebSocketHandler):
    @tornado.gen.coroutine
    def open(self):
        print "hose open"
        global queues
        self.queue = Queue()
        queues.append(self.queue)
        while True:
            item = yield self.queue.get()
            self.queue.task_done()
            self.write_message(json.dumps(item))

    @tornado.gen.coroutine
    def on_close(self):
        global queues
        yield self.queue.join()
        queues.remove(self.queue)
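Each connection gets its own Queue, and the open() coroutine loops forever forwarding queued items to the socket; on_close() waits for that queue to drain before unregistering it. The producer side is not shown; a hedged sketch of the fan-out it implies (queues is the module-level list used above, broadcast is an illustrative name):

    def broadcast(item):
        # Non-blocking fan-out: each handler drains its own queue in open().
        for q in queues:
            q.put_nowait(item)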
Code example #8
def main():
    cocurrency = 10

    queue = Queue()
    queue.put("http://www.jianshu.com")

    workers = []
    for _ in range(cocurrency):
        workers.append(Worker(app,queue))

    for worker in workers:
        Log4Spider.debugLog("worker begin:",worker)
        worker.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
Code example #9
class TornadoPikaPublisher(BeergardenPublisher, PikaClient):
    def __init__(self, **kwargs):
        self.logger = logging.getLogger(__name__)

        self._shutdown_timeout = timedelta(
            seconds=kwargs.pop("shutdown_timeout", 5))
        self._work_queue = Queue()
        self._connection = None
        self._channel = None

        self.coroutiner = CoroutineMaker({
            "TornadoConnection": "on_open_callback",
            "channel": "on_open_callback"
        })

        # Trying to get super() to work with incompatible signatures is a nightmare
        BeergardenPublisher.__init__(self)
        PikaClient.__init__(self, **kwargs)

        IOLoop.current().spawn_callback(self._process)

    def shutdown(self):
        return self._work_queue.join(timeout=self._shutdown_timeout)

    @coroutine
    def _open_connection(self):
        self._connection = yield self.coroutiner.convert(TornadoConnection)(
            parameters=self._conn_params)

    @coroutine
    def _open_channel(self):
        self._channel = yield self.coroutiner.convert(
            self._connection.channel)()

    @coroutine
    def _process(self):

        while True:
            item = yield self._work_queue.get()

            try:
                if not self._connection or not self._connection.is_open:
                    yield self._open_connection()
                if not self._channel or not self._channel.is_open:
                    yield self._open_channel()

                yield getattr(self._channel, item[0])(**item[1])
            finally:
                self._work_queue.task_done()

    def publish(self, message, **kwargs):
        """Publish a message.

        :param message: The message to publish
        :param kwargs: Additional message properties
        :Keyword Arguments:
            * *routing_key* --
              Routing key to use when publishing
            * *headers* --
              Headers to be included as part of the message properties
            * *expiration* --
              Expiration to be included as part of the message properties
        :return: None
        """
        self._work_queue.put((
            "basic_publish",
            {
                "exchange": self._exchange,
                "routing_key": kwargs["routing_key"],
                "body": message,
                "properties": BasicProperties(
                    app_id="beer-garden",
                    content_type="text/plain",
                    headers=kwargs.pop("headers", None),
                    expiration=kwargs.pop("expiration", None),
                ),
            },
        ))

    def _event_publish_args(self, event, **kwargs):

        # Main thing we need to do here is figure out the appropriate routing key
        args = {}
        if event.metadata and "routing_key" in event.metadata:
            args["routing_key"] = event.metadata["routing_key"]
        elif "request" in kwargs:
            request = kwargs["request"]
            args["routing_key"] = get_routing_key("request", request.system,
                                                  request.system_version,
                                                  request.instance_name)
        else:
            args["routing_key"] = "beergarden"

        return args
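shutdown() returns _work_queue.join(timeout=self._shutdown_timeout), i.e. wait for in-flight publishes, but never longer than the shutdown timeout. A hedged sketch of consuming that future and tolerating the timeout (drain_or_give_up is an illustrative name; the real beer-garden caller may handle it differently):

    from datetime import timedelta

    from tornado import gen

    @gen.coroutine
    def drain_or_give_up(work_queue, seconds=5):
        try:
            yield work_queue.join(timeout=timedelta(seconds=seconds))
        except gen.TimeoutError:
            pass  # whatever is still queued is abandoned at shutdown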
Code example #10
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = set()
        request_params = {
            'type': 'file',
            'limit': 200,
            'size_range': ',{}'.format(RESPONSE_SIZE_LIMIT * 1000000),
            'file_extensions': ','.join(VALID_FILETYPES)
        }
        qs = '&'.join([
            "{}={}".format(key, value)
            for key, value in request_params.items()
        ])

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode('utf-8'))

                for file in response_data.get('entries', []):
                    file_entry = (('/'.join([
                        path['name']
                        for path in file['path_collection']['entries']
                        if path['id'] != '0'
                    ] + [file['name']])).lstrip('/'), file['id'])
                    # be sure we're a valid file type and less than our maximum response size limit
                    extension = file['name'].lower().split('.')[-1]
                    if extension in VALID_FILETYPES:
                        data.add(file_entry)
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = '.'.join([file_type])
            url = "https://api.box.com/2.0/search?query={}&{}".format(
                file_type, qs)
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(
            data.__len__()))
        return sorted([{
            "title": title,
            "value": path
        } for title, path in data],
                      key=lambda f: f['title'])
Code example #11
class AsynSpider(MySpider):
    def __init__(self, out, **kwargs):
        super(AsynSpider, self).__init__(out, **kwargs)
        self.client = httpclient.AsyncHTTPClient()
        self.q = Queue()
        self.fetching, self.fetched = set(), set()

    def assign_jobs(self, jobs):
        for job in jobs:
            self.q.put(job)

    @gen.coroutine
    def run(self):
        if self.q.empty():
            url = LIST_URL + urllib.urlencode(self.list_query)
            self.q.put(url)
        for _ in range(CONCURRENCY):
            self.worker()
        yield self.q.join()
        assert self.fetching == self.fetched
        # print len(self.fetched)
        if isinstance(self._out, Analysis):
            self._out.finish()

    @gen.coroutine
    def worker(self):
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def fetch_url(self):
        current_url = yield self.q.get()
        try:
            if current_url in self.fetching:
                return
            self.fetching.add(current_url)
            request = httpclient.HTTPRequest(current_url, headers=HEADERS)
            resp = yield self.client.fetch(request)
            self.fetched.add(current_url)
            xml = etree.fromstring(resp.body)
            has_total_count = xml.xpath("//totalcount/text()")
            if has_total_count:  # non-empty means a list page; otherwise a detail page
                total_count = int(has_total_count[0])
                if total_count == 0:
                    return  # list page index out of range
                if self.list_query["pageno"] == 1:
                    pageno = 2
                    while pageno < 10:
                        # while pageno <= total_count / PAGE_SIZE:
                        self.list_query["pageno"] = pageno
                        next_list_url = LIST_URL + urllib.urlencode(
                            self.list_query)
                        self.q.put(next_list_url)
                        # logging.info(next_list_url)
                        pageno += 1
                job_ids = xml.xpath("//jobid/text()")
                job_detail_urls = []
                for ID in job_ids:
                    new_detail_query = DETAIL_QUERY.copy()
                    new_detail_query["jobid"] = ID
                    job_detail_urls.append(DETAIL_URL +
                                           urllib.urlencode(new_detail_query))
                for detail_url in job_detail_urls:
                    self.q.put(detail_url)
                    # logging.info(detail_url)

            else:
                self._out.collect(xml)
        finally:
            self.q.task_done()
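Note that q.join() tracks unfinished items (put() increments the counter, task_done() decrements it), not emptiness, so URLs that fetch_url() discovers and re-queues before calling task_done() keep join() from returning early. A tiny runnable demonstration:

    from tornado import gen, ioloop
    from tornado.queues import Queue

    @gen.coroutine
    def demo():
        q = Queue()
        q.put(0)

        @gen.coroutine
        def crawl():
            while True:
                n = yield q.get()
                try:
                    if n < 3:
                        q.put(n + 1)   # new work added before task_done()
                finally:
                    q.task_done()

        crawl()
        yield q.join()                 # resolves only after 0..3 are all done
        print("crawled 4 items")

    ioloop.IOLoop.current().run_sync(demo)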
Code example #12
File: files.py  Project: vizydrop/apps
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = []
        ids = set()

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode("utf-8"))

                url = response_data.get("@odata.nextLink", None)
                if url is not None:
                    queue.put(url)

                for file in response_data.get("value", []):
                    if file["name"][-4:].strip(".").lower() in VALID_FILETYPES:
                        if file["id"] not in ids:
                            ids.add(file["id"])
                            data.append(
                                {
                                    "title": file["parentReference"]["path"].split(":")[1].lstrip("/")
                                    + "/"
                                    + file["name"],
                                    "value": file["id"],
                                }
                            )
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = ".".join([file_type])
            url = "https://api.onedrive.com/v1.0/drive/root/view.search?top=1000&select=parentReference,name,id,size&q={}".format(
                file_type
            )
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(data.__len__()))
        return sorted(data, key=lambda f: f["title"])
Code example #13
File: commits.py  Project: vizydrop/apps
    def get_data(cls, account, source_filter, limit=100, skip=0):
        """
        Gathers commit information from GH
        GET https://api.github.com/repos/:owner/:repo/commits
        Header: Accept: application/vnd.github.v3+json
        """
        if not account or not account.enabled:
            raise ValueError('cannot gather information without a valid account')
        client = AsyncHTTPClient()

        source_filter = GitHubRepositoryDateFilter(source_filter)

        if source_filter.repository is None:
            raise ValueError('required parameter projects missing')

        default_headers = {"Content-Type": "application/json", "Accept": "application/vnd.github.v3+json"}

        # first we grab our list of commits
        uri = "https://api.github.com/repos/{}/commits".format(source_filter.repository)
        qs = source_filter.get_qs()
        if qs != '':
            uri = uri + '?' + qs
        app_log.info("Starting retrieval of commit list for account {}".format(account._id))
        if limit is not None and limit <= 100:
            # we can handle our limit right here
            uri += "?per_page={}".format(limit)
        elif limit is None:
            uri += "?per_page=100"  # maximum number per page for GitHub API
        taken = 0

        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()

        while uri is not None:
            app_log.info(
                "({}) Retrieving next page, received {} commits thus far".format(account._id, taken))
            req = account.get_request(uri, headers=default_headers)
            response = yield client.fetch(req)

            page_data = json.loads(response.body.decode('utf-8'))
            taken += page_data.__len__()
            for item in page_data:
                queue.put(item.get('url', None))

            if limit is None or taken < limit:
                # parse the Link header from GitHub (https://developer.github.com/v3/#pagination)
                links = parse_link_header(response.headers.get('Link', ''))
                uri = links.get('next', None)
            else:
                break

            if queue.qsize() > 500:
                raise HTTPError(413, 'too many commits')
        app_log.info("({}) Commit list retrieved, fetching info for {} commits".format(account._id, taken))

        # open our list
        cls.write('[')

        # our worker to actually fetch the info
        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                response_data = json.loads(response.body.decode('utf-8'))
                obj = {
                    'date': response_data['commit']['author']['date'],
                    'author': response_data['commit']['author']['name'],
                    'added_files': [file for file in response_data['files'] if file['status'] == 'added'].__len__(),
                    'deleted_files': [file for file in response_data['files'] if file['status'] == 'deleted'].__len__(),
                    'modified_files': [file for file in response_data['files'] if file['status'] == 'modified'].__len__(),
                    'additions': response_data['stats']['additions'],
                    'deletions': response_data['stats']['deletions']
                }
                if done.__len__() > 0:
                    cls.write(',')
                cls.write(json.dumps(obj))
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))

            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        # start our concurrency worker
        worker()
        try:
            # wait until we're done
            yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        except gen.TimeoutError:
            app_log.warning("Request exceeds maximum time, cutting response short")
        finally:
            # close our list
            cls.write(']')
        app_log.info("Finished retrieving commits for {}".format(account._id))
Code example #14
class ProjectGroomer(object):
    """ Cleans up expired transactions for a project. """
    def __init__(self, project_id, coordinator, zk_client, db_access,
                 thread_pool):
        """ Creates a new ProjectGroomer.

    Args:
      project_id: A string specifying a project ID.
      coordinator: A GroomingCoordinator.
      zk_client: A KazooClient.
      db_access: A DatastoreProxy.
      thread_pool: A ThreadPoolExecutor.
    """
        self.project_id = project_id

        self._coordinator = coordinator
        self._zk_client = zk_client
        self._tornado_zk = TornadoKazoo(self._zk_client)
        self._db_access = db_access
        self._thread_pool = thread_pool
        self._project_node = '/appscale/apps/{}'.format(self.project_id)
        self._containers = []
        self._inactive_containers = set()
        self._batch_resolver = BatchResolver(self.project_id, self._db_access)

        self._zk_client.ensure_path(self._project_node)
        self._zk_client.ChildrenWatch(self._project_node,
                                      self._update_containers)

        self._txid_manual_offset = 0
        self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
        self._zk_client.DataWatch(self._offset_node, self._update_offset)

        self._stop_event = AsyncEvent()
        self._stopped_event = AsyncEvent()

        # Keeps track of cleanup results for each round of grooming.
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = None

        self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
        for _ in range(MAX_CONCURRENCY):
            IOLoop.current().spawn_callback(self._worker)

        IOLoop.current().spawn_callback(self.start)

    @gen.coroutine
    def start(self):
        """ Starts the grooming process until the stop event is set. """
        logger.info('Grooming {}'.format(self.project_id))
        while True:
            if self._stop_event.is_set():
                break

            try:
                yield self._groom_project()
            except Exception:
                # Prevent the grooming loop from stopping if an error is encountered.
                logger.exception('Unexpected error while grooming {}'.format(
                    self.project_id))
                yield gen.sleep(MAX_TX_DURATION)

        self._stopped_event.set()

    @gen.coroutine
    def stop(self):
        """ Stops the grooming process. """
        logger.info('Stopping grooming process for {}'.format(self.project_id))
        self._stop_event.set()
        yield self._stopped_event.wait()

    @gen.coroutine
    def _worker(self):
        """ Processes items in the worker queue. """
        while True:
            tx_path, composite_indexes = yield self._worker_queue.get()
            try:
                tx_time = yield self._resolve_txid(tx_path, composite_indexes)
                if tx_time is None:
                    self._txids_cleaned += 1

                if tx_time is not None and tx_time < self._oldest_valid_tx_time:
                    self._oldest_valid_tx_time = tx_time
            except Exception:
                logger.exception(
                    'Unexpected error while resolving {}'.format(tx_path))
            finally:
                self._worker_queue.task_done()

    def _update_offset(self, new_offset, _):
        """ Watches for updates to the manual offset node.

    Args:
      new_offset: A string specifying the new manual offset.
    """
        self._txid_manual_offset = int(new_offset or 0)

    def _update_containers(self, nodes):
        """ Updates the list of active txid containers.

    Args:
      nodes: A list of strings specifying ZooKeeper nodes.
    """
        counters = [
            int(node[len(CONTAINER_PREFIX):] or 1) for node in nodes
            if node.startswith(CONTAINER_PREFIX)
            and node not in self._inactive_containers
        ]
        counters.sort()

        containers = [CONTAINER_PREFIX + str(counter) for counter in counters]
        if containers and containers[0] == '{}1'.format(CONTAINER_PREFIX):
            containers[0] = CONTAINER_PREFIX

        self._containers = containers

    @gen.coroutine
    def _groom_project(self):
        """ Runs the grooming process. """
        index = self._coordinator.index
        worker_count = self._coordinator.total_workers

        oldest_valid_tx_time = yield self._fetch_and_clean(index, worker_count)

        # Wait until there's a reasonable chance that some transactions have
        # timed out.
        next_timeout_eta = oldest_valid_tx_time + MAX_TX_DURATION

        # The oldest ignored transaction should still be valid, but ensure that
        # the timeout is not negative.
        next_timeout = max(0, next_timeout_eta - time.time())
        time_to_wait = datetime.timedelta(seconds=next_timeout +
                                          (MAX_TX_DURATION / 2))

        # Allow the wait to be cut short when a project is removed.
        try:
            yield self._stop_event.wait(timeout=time_to_wait)
        except gen.TimeoutError:
            return

    @gen.coroutine
    def _remove_locks(self, txid, tx_path):
        """ Removes entity locks involved with the transaction.

    Args:
      txid: An integer specifying the transaction ID.
      tx_path: A string specifying the location of the transaction node.
    """
        groups_path = '/'.join([tx_path, 'groups'])
        try:
            groups_data = yield self._tornado_zk.get(groups_path)
        except NoNodeError:
            # If the group list does not exist, the locks have not been acquired.
            return

        group_paths = json.loads(groups_data[0])
        for group_path in group_paths:
            try:
                contenders = yield self._tornado_zk.get_children(group_path)
            except NoNodeError:
                # The lock may have been cleaned up or not acquired in the first place.
                continue

            for contender in contenders:
                contender_path = '/'.join([group_path, contender])
                contender_data = yield self._tornado_zk.get(contender_path)
                contender_txid = int(contender_data[0])
                if contender_txid != txid:
                    continue

                yield self._tornado_zk.delete(contender_path)
                break

    @gen.coroutine
    def _remove_path(self, tx_path):
        """ Removes a ZooKeeper node.

    Args:
      tx_path: A string specifying the path to delete.
    """
        try:
            yield self._tornado_zk.delete(tx_path)
        except NoNodeError:
            pass
        except NotEmptyError:
            yield self._thread_pool.submit(self._zk_client.delete,
                                           tx_path,
                                           recursive=True)

    @gen.coroutine
    def _resolve_txid(self, tx_path, composite_indexes):
        """ Cleans up a transaction if it has expired.

    Args:
      tx_path: A string specifying the location of the ZooKeeper node.
      composite_indexes: A list of CompositeIndex objects.
    Returns:
      The transaction start time if still valid, None if invalid because this
      method will also delete it.
    """
        try:
            tx_data = yield self._tornado_zk.get(tx_path)
        except NoNodeError:
            return

        tx_time = float(tx_data[0])

        _, container, tx_node = tx_path.rsplit('/', 2)
        tx_node_id = int(tx_node.lstrip(COUNTER_NODE_PREFIX))
        container_count = int(container[len(CONTAINER_PREFIX):] or 1)
        if tx_node_id < 0:
            yield self._remove_path(tx_path)
            return

        container_size = MAX_SEQUENCE_COUNTER + 1
        automatic_offset = (container_count - 1) * container_size
        txid = self._txid_manual_offset + automatic_offset + tx_node_id

        if txid < 1:
            yield self._remove_path(tx_path)
            return

        # If the transaction is still valid, return the time it was created.
        if tx_time + MAX_TX_DURATION >= time.time():
            raise gen.Return(tx_time)

        yield self._batch_resolver.resolve(txid, composite_indexes)
        yield self._remove_locks(txid, tx_path)
        yield self._remove_path(tx_path)
        yield self._batch_resolver.cleanup(txid)

    @gen.coroutine
    def _fetch_and_clean(self, worker_index, worker_count):
        """ Cleans up expired transactions.

    Args:
      worker_index: An integer specifying this worker's index.
      worker_count: An integer specifying the number of total workers.
    Returns:
      A float specifying the time of the oldest valid transaction as a unix
      timestamp.
    """
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = time.time()

        children = []
        for index, container in enumerate(self._containers):
            container_path = '/'.join([self._project_node, container])
            new_children = yield self._tornado_zk.get_children(container_path)

            if not new_children and index < len(self._containers) - 1:
                self._inactive_containers.add(container)

            children.extend(
                ['/'.join([container_path, node]) for node in new_children])

        logger.debug('Found {} transaction IDs for {}'.format(
            len(children), self.project_id))

        if not children:
            raise gen.Return(self._oldest_valid_tx_time)

        # Refresh these each time so that the indexes are fresh.
        encoded_indexes = yield self._thread_pool.submit(
            self._db_access.get_indices, self.project_id)
        composite_indexes = [
            CompositeIndex(index) for index in encoded_indexes
        ]

        for tx_path in children:
            tx_node_id = int(
                tx_path.split('/')[-1].lstrip(COUNTER_NODE_PREFIX))
            # Only resolve transactions that this worker has been assigned.
            if tx_node_id % worker_count != worker_index:
                continue

            yield self._worker_queue.put((tx_path, composite_indexes))

        yield self._worker_queue.join()

        if self._txids_cleaned > 0:
            logger.info('Cleaned up {} expired txids for {}'.format(
                self._txids_cleaned, self.project_id))

        raise gen.Return(self._oldest_valid_tx_time)
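Unlike the fire-and-forget workers above, ProjectGroomer starts a fixed pool of long-lived workers with spawn_callback and then reuses the same bounded queue across grooming rounds, with _worker_queue.join() marking the end of each round. A condensed sketch of that shape (illustrative names; the real resolution work is replaced by gen.sleep):

    from tornado import gen, ioloop
    from tornado.queues import Queue

    CONCURRENCY = 4

    @gen.coroutine
    def worker(q):
        while True:
            item = yield q.get()
            try:
                yield gen.sleep(0.01)          # stand-in for resolving a transaction
            finally:
                q.task_done()

    @gen.coroutine
    def one_round(q, items):
        for item in items:
            yield q.put(item)                  # maxsize provides backpressure
        yield q.join()                         # the round ends when the queue drains

    @gen.coroutine
    def main():
        q = Queue(maxsize=CONCURRENCY)
        for _ in range(CONCURRENCY):
            ioloop.IOLoop.current().spawn_callback(worker, q)
        yield one_round(q, list(range(20)))

    ioloop.IOLoop.current().run_sync(main)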
Code example #15
class Scraper():
    def __init__(self,
                 request_params=[{}],
                 max_clients=100,
                 maxsize=100,
                 connect_timeout=9999999,
                 request_timeout=9999999,
                 auth_username=None,
                 auth_password=None,
                 method='GET',
                 func=None,
                 sleep=0,
                 endpoint=None):
        self.sleep = sleep
        self.endpoint = endpoint
        """Instantiate a tornado async http client to do multiple concurrent requests"""
        self.max_clients = max_clients
        AsyncHTTPClient.configure(
            "tornado.simple_httpclient.SimpleAsyncHTTPClient",
            max_clients=self.max_clients)
        self.request_params = request_params
        self.method = method

        self.maxsize = maxsize
        self.auth_username = auth_username
        self.auth_password = auth_password
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.to_return = []
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=self.maxsize)
        self.func = func
        self.read(self.request_params)
        self.get(self.connect_timeout, self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, request_params):
        for request_param in request_params:
            yield self.queue.put(request_param)

    @gen.coroutine
    def get(self, connect_timeout, request_timeout, http_client):
        print("Getting Links")
        self.counter = 1
        while True:
            request_param = yield self.queue.get()
            url = request_param.get('url', self.endpoint)
            body = request_param.get('body', None)
            dictKey = request_param['dictKey']
            # request_param['headers']['dictKey'] = dictKey

            request = CustomHTTPRequest(url,
                                        method=self.method,
                                        headers=request_param['headers'],
                                        body=body,
                                        connect_timeout=connect_timeout,
                                        request_timeout=request_timeout,
                                        auth_username=self.auth_username,
                                        auth_password=self.auth_password,
                                        key=dictKey)

            def handle_response(response):
                if not self.func:
                    if response.error:
                        self.to_return.append({
                            'key': response.request.__dict__['key'],
                            'response': str(response.error),
                        })
                    else:
                        self.to_return.append({
                            'key': response.request.__dict__['key'],
                            'response': response.body,
                        })
                else:
                    try:
                        self.func(response.body,
                                  response.request.__dict__['key'])
                    except Exception as e:
                        pass
                # print(self.counter)
                self.counter += 1
                self.queue.task_done()

            future = self.http_client.fetch(request, handle_response)

            time.sleep(self.sleep)

    def return_results(self):
        return self.to_return
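One caveat with this Scraper: time.sleep() inside the get() coroutine blocks the entire IOLoop, so no responses are processed during the pause; the non-blocking equivalent is gen.sleep(). A hedged sketch of the throttled fetch loop with that change (assumes the queue holds prepared requests and the response callback still calls queue.task_done()):

    from tornado import gen

    @gen.coroutine
    def throttled_fetch(queue, http_client, handle_response, delay=0.5):
        while True:
            request = yield queue.get()            # queue holds prepared HTTPRequests
            http_client.fetch(request, handle_response)
            yield gen.sleep(delay)                 # throttles without blocking the loop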
Code example #16
File: files.py  Project: fstfwd/apps
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = []
        ids = set()

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode('utf-8'))

                url = response_data.get('@odata.nextLink', None)
                if url is not None:
                    queue.put(url)

                for file in response_data.get('value', []):
                    if file['name'][-4:].strip('.').lower() in VALID_FILETYPES:
                        if file['id'] not in ids:
                            ids.add(file['id'])
                            data.append({
                                "title": file['parentReference']['path'].split(':')[1].lstrip('/') + '/' + file['name'],
                                "value": file['id'],
                            })
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = '.'.join([file_type])
            url = "https://api.onedrive.com/v1.0/drive/root/view.search?top=1000&select=parentReference,name,id,size&q={}" \
                .format(file_type)
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(
            data.__len__()))
        return sorted(data, key=lambda f: f['title'])
Code example #17
class SQSDrain(object):
    """Implementation of IDrain that writes to an AWS SQS queue.
    """

    def __init__(self, logger, loop, sqs_client,
                 metric_prefix='emitter'):
        self.emitter = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.output_error = Event()
        self.state = RUNNING
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self._send_queue = Queue()
        self._should_flush_queue = Event()
        self._flush_handle = None
        self.loop.spawn_callback(self._onSend)

    @gen.coroutine
    def _flush_send_batch(self, batch_size):
        send_batch = [
            self._send_queue.get_nowait()
            for pos in range(min(batch_size, self.emitter.max_messages))
        ]
        try:
            response = yield self.emitter.send_message_batch(*send_batch)
        except SQSError as err:
            self.logger.exception('Error encountered flushing data to SQS: %s',
                                  err)
            self.output_error.set()
            for msg in send_batch:
                self._send_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.output_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to send: %s', req.Id)
                    self._send_queue.put_nowait(req)

    @gen.coroutine
    def _onSend(self):
        respawn = True
        while respawn:
            qsize = self._send_queue.qsize()
            # This will keep flushing until clear,
            # including items that show up in between flushes
            while qsize > 0:
                try:
                    yield self._flush_send_batch(qsize)
                except Exception as err:
                    self.logger.exception(err)
                    self.output_error.set()
                qsize = self._send_queue.qsize()
            # We've cleared the backlog, remove any possible future flush
            if self._flush_handle:
                self.loop.remove_timeout(self._flush_handle)
                self._flush_handle = None
            self._should_flush_queue.clear()
            yield self._should_flush_queue.wait()

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        yield self._send_queue.join(timeout)

    def emit_nowait(self, msg):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
            raise QueueFull()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        self.logger.debug("Drain emitting")
        self._send_queue.put_nowait(msg)

    @gen.coroutine
    def emit(self, msg, timeout=None):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        yield self._send_queue.put(msg, timeout)
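_flush_send_batch() drains the queue in batches with get_nowait(), sized by qsize() so it never hits QueueEmpty; note that nothing here calls task_done(), so the join() in close() appears to resolve only via its timeout. A hedged, generic version of the batching step (drain_batch and max_batch are illustrative names):

    from tornado.queues import QueueEmpty

    def drain_batch(queue, max_batch):
        """Pull up to max_batch items off the queue without waiting."""
        batch = []
        for _ in range(max_batch):
            try:
                batch.append(queue.get_nowait())
            except QueueEmpty:
                break
        return batch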
Code example #18
class ProjectGroomer(object):
  """ Cleans up expired transactions for a project. """
  def __init__(self, project_id, coordinator, zk_client, db_access,
               thread_pool):
    """ Creates a new ProjectGroomer.

    Args:
      project_id: A string specifying a project ID.
      coordinator: A GroomingCoordinator.
      zk_client: A KazooClient.
      db_access: A DatastoreProxy.
      thread_pool: A ThreadPoolExecutor.
    """
    self.project_id = project_id

    self._coordinator = coordinator
    self._zk_client = zk_client
    self._tornado_zk = TornadoKazoo(self._zk_client)
    self._db_access = db_access
    self._thread_pool = thread_pool
    self._project_node = '/appscale/apps/{}'.format(self.project_id)
    self._containers = []
    self._inactive_containers = set()
    self._batch_resolver = BatchResolver(self.project_id, self._db_access)

    self._zk_client.ensure_path(self._project_node)
    self._zk_client.ChildrenWatch(self._project_node, self._update_containers)

    self._txid_manual_offset = 0
    self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
    self._zk_client.DataWatch(self._offset_node, self._update_offset)

    self._stop_event = AsyncEvent()
    self._stopped_event = AsyncEvent()

    # Keeps track of cleanup results for each round of grooming.
    self._txids_cleaned = 0
    self._oldest_valid_tx_time = None

    self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
    for _ in range(MAX_CONCURRENCY):
      IOLoop.current().spawn_callback(self._worker)

    IOLoop.current().spawn_callback(self.start)

  @gen.coroutine
  def start(self):
    """ Starts the grooming process until the stop event is set. """
    logger.info('Grooming {}'.format(self.project_id))
    while True:
      if self._stop_event.is_set():
        break

      try:
        yield self._groom_project()
      except Exception:
        # Prevent the grooming loop from stopping if an error is encountered.
        logger.exception(
          'Unexpected error while grooming {}'.format(self.project_id))
        yield gen.sleep(MAX_TX_DURATION)

    self._stopped_event.set()

  @gen.coroutine
  def stop(self):
    """ Stops the grooming process. """
    logger.info('Stopping grooming process for {}'.format(self.project_id))
    self._stop_event.set()
    yield self._stopped_event.wait()

  @gen.coroutine
  def _worker(self):
    """ Processes items in the worker queue. """
    while True:
      tx_path, composite_indexes = yield self._worker_queue.get()
      try:
        tx_time = yield self._resolve_txid(tx_path, composite_indexes)
        if tx_time is None:
          self._txids_cleaned += 1

        if tx_time is not None and tx_time < self._oldest_valid_tx_time:
          self._oldest_valid_tx_time = tx_time
      finally:
        self._worker_queue.task_done()

  def _update_offset(self, new_offset, _):
    """ Watches for updates to the manual offset node.

    Args:
      new_offset: A string specifying the new manual offset.
    """
    self._txid_manual_offset = int(new_offset or 0)

  def _update_containers(self, nodes):
    """ Updates the list of active txid containers.

    Args:
      nodes: A list of strings specifying ZooKeeper nodes.
    """
    counters = [int(node[len(CONTAINER_PREFIX):] or 1)
                for node in nodes if node.startswith(CONTAINER_PREFIX)
                and node not in self._inactive_containers]
    counters.sort()

    containers = [CONTAINER_PREFIX + str(counter) for counter in counters]
    if containers and containers[0] == '{}1'.format(CONTAINER_PREFIX):
      containers[0] = CONTAINER_PREFIX

    self._containers = containers

  @gen.coroutine
  def _groom_project(self):
    """ Runs the grooming process. """
    index = self._coordinator.index
    worker_count = self._coordinator.total_workers

    oldest_valid_tx_time = yield self._fetch_and_clean(index, worker_count)

    # Wait until there's a reasonable chance that some transactions have
    # timed out.
    next_timeout_eta = oldest_valid_tx_time + MAX_TX_DURATION

    # The oldest ignored transaction should still be valid, but ensure that
    # the timeout is not negative.
    next_timeout = max(0, next_timeout_eta - time.time())
    time_to_wait = datetime.timedelta(
      seconds=next_timeout + (MAX_TX_DURATION / 2))

    # Allow the wait to be cut short when a project is removed.
    try:
      yield self._stop_event.wait(timeout=time_to_wait)
    except gen.TimeoutError:
      raise gen.Return()

  @gen.coroutine
  def _remove_path(self, tx_path):
    """ Removes a ZooKeeper node.

    Args:
      tx_path: A string specifying the path to delete.
    """
    try:
      yield self._tornado_zk.delete(tx_path)
    except NoNodeError:
      pass
    except NotEmptyError:
      yield self._thread_pool.submit(self._zk_client.delete, tx_path,
                                     recursive=True)

  @gen.coroutine
  def _resolve_txid(self, tx_path, composite_indexes):
    """ Cleans up a transaction if it has expired.

    Args:
      tx_path: A string specifying the location of the ZooKeeper node.
      composite_indexes: A list of CompositeIndex objects.
    Returns:
      The transaction start time if still valid, None if invalid because this
      method will also delete it.
    """
    tx_data = yield self._tornado_zk.get(tx_path)
    tx_time = float(tx_data[0])

    _, container, tx_node = tx_path.rsplit('/', 2)
    tx_node_id = int(tx_node.lstrip(COUNTER_NODE_PREFIX))
    container_count = int(container[len(CONTAINER_PREFIX):] or 1)
    if tx_node_id < 0:
      yield self._remove_path(tx_path)
      raise gen.Return()

    container_size = MAX_SEQUENCE_COUNTER + 1
    automatic_offset = (container_count - 1) * container_size
    txid = self._txid_manual_offset + automatic_offset + tx_node_id

    if txid < 1:
      yield self._remove_path(tx_path)
      raise gen.Return()

    # If the transaction is still valid, return the time it was created.
    if tx_time + MAX_TX_DURATION >= time.time():
      raise gen.Return(tx_time)

    yield self._batch_resolver.resolve(txid, composite_indexes)
    yield self._remove_path(tx_path)
    yield self._batch_resolver.cleanup(txid)

  @gen.coroutine
  def _fetch_and_clean(self, worker_index, worker_count):
    """ Cleans up expired transactions.

    Args:
      worker_index: An integer specifying this worker's index.
      worker_count: An integer specifying the number of total workers.
    Returns:
      A float specifying the time of the oldest valid transaction as a unix
      timestamp.
    """
    self._txids_cleaned = 0
    self._oldest_valid_tx_time = time.time()

    children = []
    for index, container in enumerate(self._containers):
      container_path = '/'.join([self._project_node, container])
      new_children = yield self._tornado_zk.get_children(container_path)

      if not new_children and index < len(self._containers) - 1:
        self._inactive_containers.add(container)

      children.extend(['/'.join([container_path, node])
                       for node in new_children])

    logger.debug(
      'Found {} transaction IDs for {}'.format(len(children), self.project_id))

    if not children:
      raise gen.Return(self._oldest_valid_tx_time)

    # Refresh these each time so that the indexes are fresh.
    encoded_indexes = yield self._thread_pool.submit(
      self._db_access.get_indices, self.project_id)
    composite_indexes = [CompositeIndex(index) for index in encoded_indexes]

    for tx_path in children:
      tx_node_id = int(tx_path.split('/')[-1].lstrip(COUNTER_NODE_PREFIX))
      # Only resolve transactions that this worker has been assigned.
      if tx_node_id % worker_count != worker_index:
        continue

      yield self._worker_queue.put((tx_path, composite_indexes))

    yield self._worker_queue.join()

    if self._txids_cleaned > 0:
      logger.info('Cleaned up {} expired txids for {}'.format(
        self._txids_cleaned, self.project_id))

    raise gen.Return(self._oldest_valid_tx_time)
Code example #19
File: async_task_manager.py  Project: rydzykje/aucote
class AsyncTaskManager(object):
    """
    Aucote uses asynchronous task executed in ioloop. Some of them,
    especially scanners, should finish before ioloop will stop

    This class should be accessed by instance class method, which returns global instance of task manager

    """
    _instances = {}

    TASKS_POLITIC_WAIT = 0
    TASKS_POLITIC_KILL_WORKING_FIRST = 1
    TASKS_POLITIC_KILL_PROPORTIONS = 2
    TASKS_POLITIC_KILL_WORKING = 3

    def __init__(self, parallel_tasks=10):
        self._shutdown_condition = Event()
        self._stop_condition = Event()
        self._cron_tasks = {}
        self._parallel_tasks = parallel_tasks
        self._tasks = Queue()
        self._task_workers = {}
        self._events = {}
        self._limit = self._parallel_tasks
        self._next_task_number = 0
        self._toucan_keys = {}

    @classmethod
    def instance(cls, name=None, **kwargs):
        """
        Return instance of AsyncTaskManager

        Returns:
            AsyncTaskManager

        """
        if cls._instances.get(name) is None:
            cls._instances[name] = AsyncTaskManager(**kwargs)
        return cls._instances[name]

    @property
    def shutdown_condition(self):
        """
        Event which is resolved if every job is done and AsyncTaskManager is ready to shutdown

        Returns:
            Event
        """
        return self._shutdown_condition

    def start(self):
        """
        Start CronTabCallback tasks

        Returns:
            None

        """
        for task in self._cron_tasks.values():
            task.start()

        for number in range(self._parallel_tasks):
            self._task_workers[number] = IOLoop.current().add_callback(
                partial(self.process_tasks, number))

        self._next_task_number = self._parallel_tasks

    def add_crontab_task(self, task, cron, event=None):
        """
        Add function to scheduler and execute at cron time

        Args:
            task (function):
            cron (str): crontab value
            event (Event): event which prevent from running task with similar aim, eg. security scans

        Returns:
            None

        """
        if event is not None:
            event = self._events.setdefault(event, Event())
        self._cron_tasks[task] = AsyncCrontabTask(cron, task, event)

    @gen.coroutine
    def stop(self):
        """
        Stop crontab tasks and wait for them to finish

        Returns:
            None

        """
        for task in self._cron_tasks.values():
            task.stop()
        IOLoop.current().add_callback(self._prepare_shutdown)
        yield [self._stop_condition.wait(), self._tasks.join()]
        self._shutdown_condition.set()

    def _prepare_shutdown(self):
        """
        Check if ioloop can be stopped

        Returns:
            None

        """
        if any(task.is_running() for task in self._cron_tasks.values()):
            IOLoop.current().add_callback(self._prepare_shutdown)
            return

        self._stop_condition.set()

    def clear(self):
        """
        Clear the registered crontab tasks and reset the stop/shutdown conditions

        Returns:
            None

        """
        self._cron_tasks = {}
        self._shutdown_condition.clear()
        self._stop_condition.clear()

    async def process_tasks(self, number):
        """
        Process the task queue. Every task is executed in a separate thread (_Executor)

        """
        log.info("Starting worker %s", number)
        while True:
            try:
                item = self._tasks.get_nowait()
                try:
                    log.debug("Worker %s: starting %s", number, item)
                    thread = _Executor(task=item, number=number)
                    self._task_workers[number] = thread
                    thread.start()

                    while thread.is_alive():
                        await sleep(0.5)
                except Exception:
                    log.exception("Worker %s: exception occurred", number)
                finally:
                    log.debug("Worker %s: %s finished", number, item)
                    self._tasks.task_done()
                    tasks_per_scan = (
                        '{}: {}'.format(scanner, len(tasks))
                        for scanner, tasks in self.tasks_by_scan.items())
                    log.debug("Tasks left in queue: %s (%s)",
                              self.unfinished_tasks, ', '.join(tasks_per_scan))
                    self._task_workers[number] = None
            except QueueEmpty:
                await gen.sleep(0.5)
                if self._stop_condition.is_set() and self._tasks.empty():
                    return
            finally:
                if self._limit < len(self._task_workers):
                    break

        del self._task_workers[number]

        log.info("Closing worker %s", number)

    def add_task(self, task):
        """
        Add task to the queue

        Args:
            task:

        Returns:
            None

        """
        self._tasks.put(task)

    @property
    def unfinished_tasks(self):
        """
        Tasks which are still being processed or are waiting in the queue

        Returns:
            int

        """
        return self._tasks._unfinished_tasks

    @property
    def tasks_by_scan(self):
        """
        Returns queued tasks grouped by scan
        """
        tasks = self._tasks._queue

        return_value = {}

        for task in tasks:
            return_value.setdefault(task.context.scanner.NAME, []).append(task)

        return return_value

    @property
    def cron_tasks(self):
        """
        List of cron tasks

        Returns:
            list

        """
        return self._cron_tasks.values()

    def cron_task(self, name):
        for task in self._cron_tasks.values():
            if task.func.NAME == name:
                return task

    def change_throttling_toucan(self, key, value):
        self.change_throttling(value)

    def change_throttling(self, new_value):
        """
        Change the throttling value, clamped between 0 and 1.

        The behaviour of the algorithm is described in docs/throttling.md.

        Only working tasks are stopped here; idle workers stop by themselves.

        """
        if new_value > 1:
            new_value = 1
        if new_value < 0:
            new_value = 0

        new_value = round(new_value * 100) / 100

        old_limit = self._limit
        self._limit = round(self._parallel_tasks * float(new_value))

        working_tasks = [
            number for number, task in self._task_workers.items()
            if task is not None
        ]
        current_tasks = len(self._task_workers)

        task_politic = cfg['service.scans.task_politic']

        if task_politic == self.TASKS_POLITIC_KILL_WORKING_FIRST:
            tasks_to_kill = current_tasks - self._limit
        elif task_politic == self.TASKS_POLITIC_KILL_PROPORTIONS:
            tasks_to_kill = round((old_limit - self._limit) *
                                  len(working_tasks) / self._parallel_tasks)
        elif task_politic == self.TASKS_POLITIC_KILL_WORKING:
            tasks_to_kill = (old_limit - self._limit) - (
                len(self._task_workers) - len(working_tasks))
        else:
            tasks_to_kill = 0

        log.debug('%s tasks will be killed', tasks_to_kill)

        for number in working_tasks:
            if tasks_to_kill <= 0:
                break
            self._task_workers[number].stop()
            tasks_to_kill -= 1

        self._limit = round(self._parallel_tasks * float(new_value))

        current_tasks = len(self._task_workers)

        for number in range(self._limit - current_tasks):
            self._task_workers[self._next_task_number] = None
            IOLoop.current().add_callback(
                partial(self.process_tasks, self._next_task_number))
            self._next_task_number += 1
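
A possible way to wire AsyncTaskManager into an application, sketched under the assumption that an IOLoop is already running and that aucote's AsyncCrontabTask and task objects are available; the crontab job and schedule below are placeholders:

# Hypothetical usage sketch; the crontab job, schedule and task object are
# placeholders, and an already-running IOLoop is assumed.
manager = AsyncTaskManager.instance(parallel_tasks=4)

def nightly_scan():
    pass  # placeholder crontab job

manager.add_crontab_task(nightly_scan, '0 2 * * *')
manager.start()                 # starts cron tasks and worker coroutines

# Regular tasks are queued and picked up by the workers:
# manager.add_task(some_task)

# On shutdown, stop cron tasks and wait for queued work to drain:
# yield manager.stop()          # inside a coroutine
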
Code Example #20
File: scraper.py  Project: andres-de-castro/scraping
class Scraper():

    def __init__(
                self,
                destinations=None,
                transform=None,
                headers={},
                max_clients=50,
                maxsize=50,
                connect_timeout=1200,
                request_timeout=600,):

        """Instantiate a tornado async http client to do multiple concurrent requests"""

        if None in [destinations, transform]:
            sys.stderr.write('You must pass both a collection of URLs and a transform function\n')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout

        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)

        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=50)
        self.destinations = destinations
        self.transform = transform
        self.headers = headers
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        while True:
            url = yield self.queue.get()
            try:
                request = HTTPRequest(url,
                                      connect_timeout=connect_timeout,
                                      request_timeout=request_timeout,
                                      method="GET",
                                      headers=headers)
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))

            future = self.http_client.fetch(request)

            def done_callback(future):
                body = future.result().body
                url = future.result().effective_url
                transform(body, url=url)
                self.queue.task_done()

            try:
                future.add_done_callback(done_callback)
            except Exception as e:
                sys.stderr.write(str(e))
                self.queue.put(url)
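
Because the constructor fills the queue, spawns the consumer and runs the IOLoop until queue.join() resolves, using the class is a single blocking call; a hedged sketch with placeholder URLs and transform:

# Hypothetical usage of Scraper; the URLs and the transform are placeholders.
def save(body, url=None):
    # The transform receives the response body and the effective URL.
    print(url, len(body))

urls = ['https://example.com/a', 'https://example.com/b']

# Blocks until every queued URL has been fetched and transformed.
Scraper(destinations=urls, transform=save)
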
Code Example #21
File: spider.py  Project: winstonf88/pyjobs
class BaseSpider(object):
    url_parser = None

    def __init__(self, engine, concurrent=3):
        self.engine = engine
        self.http = httpclient.AsyncHTTPClient()
        self.queue = Queue()
        self.concurrency = concurrent

    @property
    def hostname(self):
        return self.url_parser.hostname

    @property
    def url_root(self):
        return self.url_parser.url_root

    @property
    def base_url(self):
        return self.url_parser.base_url

    @gen.coroutine
    def __worker(self):
        """Consumes the queue."""
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def crawl(self, description, location):
        """Starts crawling the specified URL."""
        url = self.url_parser(description, location)
        self.queue.put(url)
        self.engine.notify_started(self)
        for _ in range(self.concurrency):
            self.__worker()
        yield self.queue.join()
        self.engine.notify_finished(self)

    @gen.coroutine
    def fetch_url(self):
        """Retrieves a URL from the queue and returns the parsed data."""
        url = yield self.queue.get()
        logger.info('fetching %s' % url)
        try:
            response = yield self.http.fetch(url)
            soup = BeautifulSoup(response.body)
            logger.info('got response %s' % url)

            urls = yield self.fetch_links(response, soup)
            for new_url in urls:
                logger.debug('Added %s to queue' % new_url)
                yield self.queue.put(new_url)

            data = yield self.parse_response(response, soup)
            logger.info('Parsed response for %s' % url)
        except (httpclient.HTTPError, ValueError):
            message = 'HTTP Error: (%s)' % url
            self.engine.write_message(message, self.engine.STATUS_ERROR)
        else:
            self.engine.write_data(data)
        finally:
            self.queue.task_done()

    @gen.coroutine
    def fetch_links(self, response, soup):
        """Fetch URLs to be added to the queue."""
        raise gen.Return([])

    def parse_response(self, response, soup):
        """Extract information from the response, return should be a 
        list of dict's.
        
        Sample dict:
        {
            'title': 'Job Title',
            'company': 'Company Name',
            'location': 'City/State/Country',
            'tags': ['tag1', 'tag2', 'tag3'],
            'category': 'Software Developer',
            'origin': 'Name of the origin website',
            'url': 'Link to the complete job description',
        }
        """
        raise NotImplementedError
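
A concrete spider only has to provide a url_parser and the two parsing hooks; a minimal hedged subclass sketch in which the URL builder and CSS selectors are invented for illustration:

# Hypothetical subclass sketch; the url_parser object and the CSS selectors
# are invented for illustration and do not come from pyjobs.
class ExampleSpider(BaseSpider):
    url_parser = None  # would be a project-specific URL builder object

    @gen.coroutine
    def fetch_links(self, response, soup):
        # Follow pagination links (selector is a placeholder).
        links = [a['href'] for a in soup.select('a.next-page') if a.get('href')]
        raise gen.Return(links)

    @gen.coroutine
    def parse_response(self, response, soup):
        # Build the list of job dicts described in the docstring above.
        jobs = [{'title': el.get_text(strip=True),
                 'url': response.effective_url,
                 'origin': self.hostname}
                for el in soup.select('h2.job-title')]
        raise gen.Return(jobs)
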
Code Example #22
File: batched.py  Project: wanjinchang/distributed
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """
    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None

        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == 'close':
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())

                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait('close')
                    self._broken = True
                    break

                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait('close')
                    self.send_q.put_nowait('close')
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError('Batch Stream is Closed')
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == 'close':
            raise StreamClosedError('Batched Stream is Closed')
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
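
A hedged usage sketch from inside another coroutine, assuming stream is an already-connected IOStream understood by distributed's read()/write() helpers:

# Hypothetical usage; `stream` is assumed to be an already-connected IOStream
# compatible with distributed's read()/write() helpers.
@gen.coroutine
def echo_once(stream):
    batched = BatchedStream(stream, interval=10)   # batch messages for ~10 ms
    yield batched.send({'op': 'ping'})
    reply = yield batched.recv()
    yield batched.flush()    # wait until every queued message has been written
    yield batched.close()
    raise gen.Return(reply)
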
Code Example #23
class BlogBackup(object):
    _default_dir_name = "seg_blog_backup"

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    @staticmethod
    def parse_token_from_html(content):
        overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
        overall_res = overall_pat.search(content)
        if overall_res:
            overall_content = overall_res.group()
            # remove /* */ type annotation
            filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
            str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
            filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
            ret = "".join(str_list)

            if filter_list:
                for m, n in filter_list:
                    ret = ret[: int(m)] + ret[int(n) :]
            if len(ret) == 32:
                return ret

        raise PageHtmlChanged("website login token has changed")

    def _get_user_cookies(self):
        s = requests.Session()
        s.headers.update(headers)
        rep = s.get(target_url)
        post_url = "%s%s?_=%s" % (target_url, login_api_path, self.parse_token_from_html(rep.text))
        data = {"mail": self.username, "password": self.passwd}
        s.post(post_url, data=data)
        return s.cookies

    def __init__(self, **conf):
        self.username = conf["username"]
        self.passwd = conf["passwd"]
        self.save_path = conf.get("save_path")
        self._q = Queue()
        self._cookies = self._get_user_cookies()
        self._parse_save_path()

    @gen.coroutine
    def run(self):
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d(".stream-list__item > .summary > h2 > a")
        for link in link_elements:
            yield self._q.put(d(link).attr("href"))

        next_ele = d(".pagination li.next a")
        if next_ele:
            next_page_url = target_url + next_ele.attr("href")
            yield self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                real_file_name = os.path.join(self.save_path, title + ".md")
                logger.info("is backup essay: %s" % title)
                with open(real_file_name, "w") as f:
                    f.writelines(content.encode("utf8"))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
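
A hedged sketch of driving this backup class from a script; the credentials and save path are placeholders:

# Hypothetical driver; the credentials and save_path are placeholders.
from tornado.ioloop import IOLoop

backup = BlogBackup(username='user@example.com',
                    passwd='secret',
                    save_path='/tmp/seg_blog_backup')
IOLoop.current().run_sync(backup.run)
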
Code Example #24
class BlogBackup(object):
    _default_dir_name = 'seg_blog_backup'

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and \
                    os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError(
                    "'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    def _get_user_cookies(self):
        url = target_url + login_page_path
        self.driver.get(url)
        try:
            user_input = self.driver.find_element_by_name('mail')
            passwd_input = self.driver.find_element_by_name('password')
            submit_btn = self.driver.find_element_by_class_name('pr20')
        except NoSuchElementException:
            raise PageHtmlChanged(
                "%s login page structure have changed!" % _domain)

        user_input.send_keys(self.username)
        passwd_input.send_keys(self.passwd)
        submit_btn.click()
        try:
            WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
        except TimeoutException:
            raise Exception("Wrong username or password!")

        WebDriverWait(self.driver, timeout=10).until(has_page_load)
        try_times = 0
        while True:
            time.sleep(1)
            if url != self.driver.current_url:
                return self.driver.get_cookies()

            try_times += 1
            if try_times > 10:
                raise Exception("Getting cookie info failed!")

    def _get_driver(self):
        if self.phantomjs_path:
            try:
                return webdriver.PhantomJS(
                    executable_path=self.phantomjs_path,
                    service_log_path=os.path.devnull)
            except WebDriverException:
                raise PhantomjsPathError("Phantomjs locate path invalid!")
        else:
            return webdriver.PhantomJS(service_log_path=os.path.devnull)

    def __init__(self, **conf):
        self.username = conf['username']
        self.passwd = conf['passwd']
        self.phantomjs_path = conf.get('phantomjs_path')
        self.save_path = conf.get('save_path')
        self._q = Queue()

        self._parse_save_path()
        self.driver = self._get_driver()
        self._cookies = self._get_user_cookies()

    @gen.coroutine
    def run(self):
        self.__filter_cookies()

        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    def __filter_cookies(self):
        self._cookies = {k['name']: k['value'] for k in self._cookies if
                         k['domain'] == _domain}

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d('.stream-list__item > .summary > h2 > a')
        for link in link_elements:
            yield self._q.put(d(link).attr('href'))

        next_ele = d('.pagination li.next a')
        if next_ele:
            next_page_url = target_url + next_ele.attr('href')
            yield self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                file_name = title + '.md'
                real_file_name = os.path.join(self.save_path, file_name)
                with open(real_file_name, 'w') as f:
                    f.writelines(content.encode('utf8'))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
Code Example #25
File: site_check.py  Project: andjelx/pycode
def main():
    # Start consumer without waiting
    # Tornado framework used for async IO
    # http://www.tornadoweb.org/en/stable/index.html
    q = Queue()

    @gen.coroutine
    def consumer():
        item = yield q.get()
        try:
            code = False
            try:
                response = yield httpclient.AsyncHTTPClient().fetch(item)
                codes = ['200', '301', '302']
                code = any(s in response.headers['Status'] for s in codes)
                rcode = response.code
                if DEBUG:
                    fname = re.match(r'http://([\w+|.]+)/',item).group(1)
                    fname = os.path.join(DEBUG_DIR,fname.replace(".","_"))
                    with open(fname, 'w') as f:
                        for k,v in response.headers.get_all():
                            f.write(k+' '+v+'\n')
                        f.write('\n')
                        f.write(response.body)
            except Exception as e:
                code = False
                rcode = str(e)
            
            print('%s,%s,%s,"%s"' % 
                            (datetime.now(), item, code, rcode))
            # Append to DOMAINS found URL 
            if code:
                DOMAINS[RESULT[item]].append(item)            
        
        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield consumer()

    @gen.coroutine
    def producer():
        if DEBUG and not os.path.exists(DEBUG_DIR):
            print('Creating debug out dir: %s' % DEBUG_DIR)
            os.makedirs(DEBUG_DIR)
  
        # Open and process file if supplied
        if len(sys.argv) >= 2:
            with open(sys.argv[1]) as f:
                for line in f:    
                    DOMAINS[line.strip()]= []
        else:
            print("Domains list file wasn't provided")
            print("Usage: %s <domains.txt> [ report.txt ]" % sys.argv[0])
            sys.exit(2)
        # Generate processing list 
        for d in DOMAINS.keys():
            for url in generate_url_list(d):
                q.put(url)

    yield producer()  # Wait for producer to put all tasks.
    # Start workers, then wait for the work queue to be empty. 
    for _ in range(concurrency):
        worker()
    
    yield q.join() # Wait for consumer to finish all tasks.
    
    # Output results
    if len(sys.argv) >= 3:
        f = open(sys.argv[2],'w')
    else:
        f = sys.stdout

    for key, val in DOMAINS.items():
        if DOMAINS[key]:
            DOMAINS[key] = '"'+" ".join(val)+'"'
        else:
            DOMAINS[key] = 'No'
    out = "\n".join([",".join([key, str(val)]) for key, val in DOMAINS.items()]) + '\n'
    
    f.write(out)
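
main() yields on the producer and on q.join(), so it has to be a coroutine; a hedged sketch of the entry point, assuming the missing @gen.coroutine decorator:

# Hypothetical entry point; assumes main is decorated with @gen.coroutine
# (its body yields, so it cannot be called as a plain function).
if __name__ == '__main__':
    from tornado.ioloop import IOLoop
    IOLoop.current().run_sync(main)
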
Code Example #26
class TaskLogger(object):
    def __init__(self,
                 task_id,
                 engine=EngineType.REQUESTS,
                 io_loop=None,
                 task_url=TASK_URL,
                 wrap=False,
                 tenant=None):
        self.task_id = task_id
        self.task_url = task_url
        self._seq = 0
        self._partial_log_url = self._get_partial_url('log')
        self._partial_result_url = self._get_partial_url('result')

        self.wrap = wrap
        if wrap and tenant:
            self._partial_log_url = update_query_params(
                self._partial_log_url, {'tenant': tenant})
            self._partial_result_url = update_query_params(
                self._partial_result_url, {'tenant': tenant})

        if engine == EngineType.REQUESTS:
            self.log = self._log_by_requests
            self.result = self._result_by_requests
        elif engine == EngineType.TORNADO:
            io_loop = io_loop if io_loop else IOLoop.current()
            self._http_client = AsyncHTTPClient(io_loop=io_loop)
            self._queue = Queue()
            self.log = self._log_by_tornado
            self.result = self._result_by_tornado
        else:
            raise TaskLoggerError('',
                                  reason='engine only supports {}'.format(
                                      EngineType.types_str()))

    def _get_partial_url(self, partial_name):
        url = urljoin(self.task_url, partial_name)
        url = update_query_params(url, {'task_id': self.task_id})
        return url

    def _get_log_url(self, seq):
        url = update_query_params(self._partial_log_url, {'seq': seq})
        return url

    def _get_result_url(self, seq, exit_code=0):
        url = update_query_params(self._partial_result_url, {
            'seq': seq,
            'exit_code': exit_code
        })
        return url

    def _log_by_requests(self, log):
        self._seq += 1
        log_url = self._get_log_url(self._seq)
        data = self._create_log(log, self._seq)
        self._send_by_requests(log_url, data)

    def _result_by_requests(self, result, exit_code=0):
        self._seq += 1
        result_url = self._get_result_url(self._seq, exit_code)
        data = self._create_result(result, self._seq, exit_code=exit_code)
        self._send_by_requests(result_url, data)

    @staticmethod
    def _send_by_requests(url, data):
        res = requests.post(url, data=data, verify=False)
        if res.status_code != 200:
            raise TaskLoggerError(data, reason=res.reason)

    @gen.coroutine
    def _log_by_tornado(self, log):
        yield self._queue.put(1)
        self._seq += 1
        log_url = self._get_log_url(self._seq)
        data = self._create_log(log, self._seq)
        try:
            yield self._send_by_tornado(log_url, data)
        finally:
            yield self._queue.get()
            self._queue.task_done()

    @gen.coroutine
    def _result_by_tornado(self, result, exit_code=0):
        yield self._queue.join()
        self._seq += 1
        result_url = self._get_result_url(self._seq, exit_code)
        data = self._create_result(result, self._seq, exit_code=exit_code)
        yield self._send_by_tornado(result_url, data)

    @gen.coroutine
    def _send_by_tornado(self, url, data):
        try:
            response = yield self._http_client.fetch(
                url,
                method='POST',
                headers={'Content-Type': 'application/json'},
                validate_cert=False,
                body=data)
        except Exception as exc:
            if hasattr(exc, 'response') and exc.response:
                exc = 'url:{}, exc:{}, body:{}'.format(url, exc,
                                                       exc.response.body)
            raise TaskLoggerError(data, str(exc))
        else:
            if response.code != 200:
                raise TaskLoggerError(data, reason=response.body)

    def _create_log(self, log, seq):
        assert isinstance(log, basestring)
        log = log + '\n'
        if self.wrap:
            log_msg = TaskLogMessage(task_id=self.task_id, log=log, seq=seq)
            data = json_encode({'messages': log_msg})
        else:
            data = log
        return data

    def _create_result(self, result, seq, exit_code):
        assert isinstance(result, basestring)
        result = result + '\n'
        if self.wrap:
            result_msg = TaskResultMessage(task_id=self.task_id,
                                           result=result,
                                           seq=seq,
                                           exit_code=exit_code)
            data = json_encode({'messages': result_msg})
        else:
            data = result
        return data
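
A hedged sketch contrasting the two engines; the task ID is a placeholder and the endpoint comes from the module's TASK_URL default:

# Hypothetical usage; the task ID is a placeholder.
# Requests engine: log() and result() post synchronously.
sync_logger = TaskLogger('task-123', engine=EngineType.REQUESTS)
sync_logger.log('step 1 done')
sync_logger.result('finished', exit_code=0)

# Tornado engine: both calls are coroutines, and result() waits for the
# internal queue to drain, so they must be yielded inside a coroutine.
@gen.coroutine
def report(async_logger):
    yield async_logger.log('step 1 done')
    yield async_logger.result('finished', exit_code=0)
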
Code Example #27
File: publishers.py  Project: hazmat345/brew-view
class TornadoPikaPublisher(BeergardenPublisher, PikaClient):
    def __init__(self, **kwargs):
        self.logger = logging.getLogger(__name__)

        self._shutdown_timeout = timedelta(
            seconds=kwargs.pop('shutdown_timeout', 5))
        self._work_queue = Queue()
        self._connection = None
        self._channel = None

        self.coroutiner = CoroutineMaker({
            'TornadoConnection': 'on_open_callback',
            'channel': 'on_open_callback'
        })

        # Trying to get super() to work with incompatible signatures is a nightmare
        BeergardenPublisher.__init__(self)
        PikaClient.__init__(self, **kwargs)

        IOLoop.current().spawn_callback(self._process)

    def shutdown(self):
        return self._work_queue.join(timeout=self._shutdown_timeout)

    @coroutine
    def _open_connection(self):
        self._connection = yield self.coroutiner.convert(TornadoConnection)(
            parameters=self._conn_params, stop_ioloop_on_close=False)

    @coroutine
    def _open_channel(self):
        self._channel = yield self.coroutiner.convert(
            self._connection.channel)()

    @coroutine
    def _process(self):

        while True:
            item = yield self._work_queue.get()

            try:
                if not self._connection or not self._connection.is_open:
                    yield self._open_connection()
                if not self._channel or not self._channel.is_open:
                    yield self._open_channel()

                yield getattr(self._channel, item[0])(**item[1])
            finally:
                self._work_queue.task_done()

    def publish(self, message, **kwargs):
        """Publish a message.

        :param message: The message to publish
        :param kwargs: Additional message properties
        :Keyword Arguments:
            * *routing_key* --
              Routing key to use when publishing
            * *headers* --
              Headers to be included as part of the message properties
            * *expiration* --
              Expiration to be included as part of the message properties
        :return: None
        """
        self._work_queue.put(('basic_publish', {
            'exchange': self._exchange,
            'routing_key': kwargs['routing_key'],
            'body': message,
            'properties': BasicProperties(
                app_id='beer-garden',
                content_type='text/plain',
                headers=kwargs.pop('headers', None),
                expiration=kwargs.pop('expiration', None))
        }))

    def _event_publish_args(self, event, **kwargs):

        # Main thing we need to do here is figure out the appropriate routing key
        args = {}
        if event.metadata and 'routing_key' in event.metadata:
            args['routing_key'] = event.metadata['routing_key']
        elif 'request' in kwargs:
            request = kwargs['request']
            args['routing_key'] = get_routing_key('request', request.system,
                                                  request.system_version,
                                                  request.instance_name)
        else:
            args['routing_key'] = 'beergarden'

        return args
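
Since publish() only enqueues a ('basic_publish', kwargs) tuple for the _process coroutine, callers never touch the channel directly; a hedged sketch in which the connection keyword arguments are placeholders that depend on PikaClient:

# Hypothetical usage; connection kwargs depend on PikaClient and are omitted.
publisher = TornadoPikaPublisher(shutdown_timeout=5)
publisher.publish('time for a brew', routing_key='beergarden')

# shutdown() returns the work queue's join() future; inside a coroutine:
# yield publisher.shutdown()
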
Code Example #28
File: batched.py  Project: broxtronix/distributed
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """

    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.0
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None

        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == "close":
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())

                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self._broken = True
                    break

                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self.send_q.put_nowait("close")
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError("Batch Stream is Closed")
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == "close":
            raise StreamClosedError("Batched Stream is Closed")
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
Code Example #29
class SQSSource(object):
    """Implementation of ISource that receives messages from a SQS queue.
    """

    max_delete_delay = 5

    def __init__(self, logger, loop, gate, sqs_client, metric_prefix='source'):
        self.gate = gate
        self.collector = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.end_of_input = Event()
        self.input_error = Event()
        self.state = RUNNING
        self._delete_queue = Queue()
        self._should_flush_queue = Event()
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self.loop.spawn_callback(self.onInput)
        self.loop.spawn_callback(self._onDelete)

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        self.logger.warning('Closing source')
        yield self._delete_queue.join(timeout)

    @gen.coroutine
    def _flush_delete_batch(self, batch_size):
        delete_batch = [
            self._delete_queue.get_nowait()
            for pos in range(min(batch_size, self.collector.max_messages))
        ]
        try:
            response = yield self.collector.delete_message_batch(*delete_batch)
        except SQSError as err:
            lmsg = 'Error encountered deleting processed messages in SQS: %s'
            self.logger.exception(lmsg, err)
            self.input_error.set()

            for msg in delete_batch:
                self._delete_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.input_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to delete: %s', req.Id)
                    self._delete_queue.put_nowait(req)

    @gen.coroutine
    def _onDelete(self):
        respawn = True
        while respawn:
            try:
                qsize = self._delete_queue.qsize()
                # This will keep flushing until clear,
                # including items that show up in between flushes
                while qsize > 0:
                    yield self._flush_delete_batch(qsize)
                    qsize = self._delete_queue.qsize()
                self._should_flush_queue.clear()
                yield self._should_flush_queue.wait()
            except Exception as err:
                self.logger.exception(err)
                self.input_error.set()
                respawn = False

    @gen.coroutine
    def onInput(self):
        respawn = True
        retry_timeout = INITIAL_TIMEOUT
        # We use an algorithm similar to TCP window scaling,
        # so that we request fewer messages when we encounter
        # back pressure from our gate/drain and request more
        # when we flushed a complete batch
        window_size = self.collector.max_messages
        while respawn:
            try:
                response = yield self.collector.receive_message_batch(
                    max_messages=window_size, )
                if response.Messages:
                    # We need to have low latency to delete messages
                    # we've processed
                    retry_timeout = INITIAL_TIMEOUT
                else:
                    retry_timeout = min(retry_timeout * 2, MAX_TIMEOUT)
                    yield gen.sleep(retry_timeout.total_seconds())

                sent_full_batch = True
                for position, msg in enumerate(response.Messages):
                    try:
                        self.gate.put_nowait(msg)
                    except QueueFull:
                        self.logger.debug('Gate queue full; yielding')
                        sent_full_batch = False
                        # TODO: is it worth trying to batch and schedule
                        #       a flush at this point instead of many
                        #       single deletes?
                        yield self.gate.put(msg)
                    self._should_flush_queue.set()
                    self._delete_queue.put_nowait(msg)
                    statsd.increment('%s.queued' % self.metric_prefix,
                                     tags=[self.sender_tag])

                # If we were able to flush the entire batch without waiting,
                # increase our window size (up to max_messages)
                if sent_full_batch and \
                   window_size < self.collector.max_messages:
                    window_size += 1
                # Otherwise ask for less next time
                elif not sent_full_batch and window_size > 1:
                    window_size -= 1
            except Exception as err:
                self.logger.exception(err)
                self.input_error.set()
                respawn = False
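
The receive loop above grows its request window by one after a batch that fit into the gate without blocking and shrinks it by one after back pressure, keeping it within [1, max_messages]; a minimal standalone sketch of that adjustment rule, with illustrative names:

# Minimal sketch of the window adjustment used in onInput; the function name
# is illustrative, not part of the source class.
def adjust_window(window_size, sent_full_batch, max_messages):
    if sent_full_batch and window_size < max_messages:
        return window_size + 1   # whole batch fit without blocking: widen
    if not sent_full_batch and window_size > 1:
        return window_size - 1   # gate pushed back: narrow
    return window_size
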