Example #1
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()
    fetched_links = set()
    
    sq = queues.Queue()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            # print('fetching %s' % current_url)
            fetching.add(current_url)
            links = yield get_links_from_url(current_url)
            for link in links:
                fetched_links.add(link)
            fetched.add(current_url)

        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    for base_url in base_url_list:
        q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for _ in xrange(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))

    @gen.coroutine
    def sworker():
        while True:
            current_url = yield sq.get()
            try:
                # print('fetching %s' % current_url)
                fetching.add(current_url)
                recruit_info = yield get_res_from_url(current_url)
                print(recruit_info)
            finally:
                sq.task_done()

    # process all collected links and extract the job info
    for link in fetched_links:
        sq.put(link)
    for link in fetched_links:
        sworker()
    yield sq.join(timeout=timedelta(seconds=600))
    print('--OVER--')
Example #2
    def __init__(self, context, detach=False):
        """
        Initialize the Engine with the executor context
        :param context: the parade context to boot engine
        :param detach: the flag to indicate the engine is executed in detached mode or not
        :return: the initialized engine
        """
        self.context = context

        # the thread pool to convert block execution of task into async process
        self.thread_pool = ThreadPoolExecutor(4)
        self.wait_queue = queues.Queue()
        self.exec_queue = queues.Queue()

        def engine_loop():
            _ioloop = ioloop.IOLoop.current()
            _ioloop.add_callback(self.daemon_loop)
            _ioloop.start()

        # if the detached mode is enabled
        # use a separate thread to boot the io-loop
        self.loop_thread = None
        if detach:
            logger.debug("engine running in detach mode")
            self.loop_thread = threading.Thread(target=engine_loop)
            self.loop_thread.start()
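The daemon_loop driven by this engine is not included in the snippet. A minimal sketch of the pattern the thread pool exists for, turning a blocking call into something the IOLoop can await, assuming a hypothetical run_blocking helper that is not part of the original code:

from concurrent.futures import ThreadPoolExecutor
from tornado import gen

thread_pool = ThreadPoolExecutor(4)

@gen.coroutine
def run_blocking(func, *args, **kwargs):
    # Offload the blocking call to the thread pool; a gen.coroutine can yield
    # a concurrent.futures.Future, so the IOLoop resumes us when it completes.
    result = yield thread_pool.submit(func, *args, **kwargs)
    raise gen.Return(result)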
Example #3
 def __init__(self, base_url, concurrency=10):
     self.q = queues.Queue()
     self.q2 = queues.Queue()
     self.start = time.time()
     self.fetching = set()
     self.fetched = set()
     self.base_url = base_url
     self.concurrency = concurrency
     self.i = 0
Example #4
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    newq = queues.Queue()

    @tornado.gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = yield get_pictures_from_url(current_url)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)

        finally:
            q.task_done()

    @tornado.gen.coroutine
    def download_pic():
        current_url = yield newq.get()
        try:
            print(current_url)
        finally:
            newq.task_done()

    @tornado.gen.coroutine
    def downloader():
        while True:
            yield download_pic()

    @tornado.gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' %
          (time.time() - start, len(fetched)))
Example #5
def mainx():
    start = time.time()
    fetched = 0
    client = MongoClient('mongodb://localhost:27017/')
    db = client['posts']
    cookies = {
        'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf':
        'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7'
        'C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
        'wordpress_0efdf49af511fd88681529ef8c2e5fbf':
        'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7'
        'C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'
    }
    post_queue = queues.Queue()
    page_queue = queues.Queue()
    for i in range(1, 69):
        page_url = "http://python.jobbole.com/all-posts/page/{page}/".format(
            page=i)
        page_queue.put(page_url)
        print(page_url)

    @gen.coroutine
    def posts_url_worker():
        while True:
            page = yield page_queue.get()
            urls = yield get_posts_url_from_page(page)
            for u in urls:
                post_queue.put(u)
            page_queue.task_done()

    @gen.coroutine
    def post_data_worker():
        while True:
            url = yield post_queue.get()
            post = yield get_post_data_from_url(url, cookies)
            nonlocal fetched
            fetched += 1
            db.posts.insert_one(post)
            post_queue.task_done()

    for _ in range(concurrency):
        posts_url_worker()
    for _ in range(concurrency):
        post_data_worker()

    yield page_queue.join()
    yield post_queue.join()
    # yield q.join(timeout=timedelta(seconds=300))
    print('Crawled %s articles in %d seconds total.' % (fetched, time.time() - start))
Example #6
def elysium(mocker):

    config = Config(mocker.Mock())
    sentry_wrapper = mocker.Mock()

    mocker.patch.object(_AppsCache,
                        'make_control_ch',
                        return_value=make_mock_channel_with(0))

    node = mocker.Mock()
    node.start_app = mocker.Mock(side_effect=make_mock_channels_list_with(
        xrange(count_apps(to_run_apps))))

    node.control = mocker.Mock(side_effect=make_mock_channels_list_with(
        xrange(count_apps(to_run_apps))))

    submitter = mocker.Mock()
    submitter.post_committed_state = mocker.Mock(
        return_value=make_mock_channel_with(0))

    return burlak.AppsElysium(
        Context(
            LoggerSetup(make_logger_mock(mocker), False),
            config,
            '0',
            sentry_wrapper,
            mocker.Mock(),
        ), CommittedState(), node, queues.Queue(), submitter)
Example #7
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        stream = yield TCPClient().connect(options.host, options.port)
        print("Connected (#%s)" % current_url)
        yield stream.write(("%d\n" % current_url).encode())
        reply = yield stream.read_until(b"\n")
        print("Response from server (#%s)" % current_url)
        q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    for _ in range(10000):
        q.put(_)


    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()

    yield q.join(timeout=timedelta(seconds=300))
Example #8
 def __init__(self, access_key, brain_api_url, driver, recording_file):
     self.access_key = access_key
     self.brain_api_url = brain_api_url
     self.driver = driver
     self.recording_file = recording_file
     if self.recording_file:
         self.recording_queue = queues.Queue()
Example #9
    def call(self, api_function, *args, **kwargs):
        """Call a boto2 or boto3 api function.

        Simply invoke this with an api method and its args and kwargs.

        The api function call is coordinated synchronously across all
        calls to this `api_call_queue`, and they will run in order.

        I.e., if you invoke this right after another coroutine invoked this,
        it will block until that other coroutine's call completed.

        If the call ends up being rate limited,
        it will backoff and try again continuously.

        By serializing the api calls to the specific method,
        this prevents a stampeding herd effect that you'd normally get
        with infinite retries.

        There is no limit or timeout on how many times it will retry,
        so in practice this may block an extremely long time if all responses
        are rate limit exceptions.

        Any other failures, like connection timeouts or read timeouts,
        will bubble up immediately and won't be retried here.
        """
        result_queue = queues.Queue(maxsize=1)
        yield self._queue.put((result_queue, api_function, args, kwargs))
        result = yield result_queue.get()
        if isinstance(result, Exception):
            raise result
        raise gen.Return(result)
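The worker that drains self._queue is not part of this snippet. A minimal sketch of what it might look like, where _is_rate_limit_error and the fixed one-second backoff are assumptions rather than the project's actual helpers:

    @gen.coroutine
    def _process_queue(self):
        # Serve queued calls one at a time so calls through this
        # api_call_queue run strictly in order.
        while True:
            result_queue, api_function, args, kwargs = yield self._queue.get()
            while True:
                try:
                    result = api_function(*args, **kwargs)
                except Exception as e:
                    if _is_rate_limit_error(e):  # hypothetical predicate
                        yield gen.sleep(1.0)     # hypothetical backoff
                        continue
                    result = e  # other errors are handed back to the caller
                break
            # call() re-raises Exceptions and returns any other result.
            yield result_queue.put(result)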
Example #10
    def post(self, *args, **kwargs):
        # we will use plagiarism system permission
        import probCrawler
        self._q = queues.Queue()
        self.filterOJ = set([])
        self.infoDict = {}
        #email = self.get_argument('email')
        self.query = probCrawler.crawler(queryName='')
        #accessKey = self.get_argument('accessKey')
        from codePlag.models import filePermitArchive
        # need check right first
        #if not filePermitArchive.filter(email=email,isAuth=True,isBanned=False,auth_key=accessKey).exists():
        #    self._reason = '传递的Email值或者准入密钥有问题/您的Email没有验证成功'
        #    self._status_code = 500
        #    self.write_error(500)
        #    return
        bulkAccount = self.get_argument('namelist')
        # split the text according to lines
        accountList = bulkAccount.split()
        if len(accountList) > 50:
            self._reason = '批量查询账号超过最大检索数50'
            self._status_code = 500
            self.write_error(500)
        # print(accountList)
        print(accountList)
        for account in accountList:
            self.genTask(name=account)

        yield self._q.join(timeout=timedelta(seconds=100))
        print(self.infoDict)
        import tornado.escape
        self.write(tornado.escape.json_encode(self.infoDict))
Example #11
    def genTask(self, name=''):
        self._q = queues.Queue()
        name = self.name
        import probCrawler
        self.name = name

        # traverse non-auth oj rule
        for oj, website, acRegex, submitRegex in self.query.getNoAuthRules():
            success = False
            otherInfo = 0
            # build the URL
            url = website % name
            # put into queue
            yield self._q.put((oj, url))
        # ACDream,Codeforces,vjudge,uestc
        oj = 'uestc'
        url = 'http://acm.uestc.edu.cn/user/userCenterData/%s' % name
        yield self._q.put((oj, url))
        oj = 'acdream'
        url = 'http://acdream.info/user/%s' % name
        yield self._q.put((oj, url))
        oj = 'vjudge'
        url = 'https://cn.vjudge.net/user/solveDetail/%s' % name
        yield self._q.put((oj, url))
        oj = 'codeforces'
        loopFlag = True
        loopTimes = 0
        count = 1000
        startItem = 1 + loopTimes * count
        url = 'http://codeforces.com/api/user.status?handle=%s&from=%s' % (
            name, startItem)
        yield self._q.put((oj, url))

        for _ in range(100):  # start 100 worker coroutines
            self.worker()
Example #12
async def main():
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        # producer
        if current_url in seen_set:
            return
        print(f"fetching: {current_url}")
        seen_set.add(current_url)
        next_urls = await get_url_links(current_url)
        for new_url in next_urls:
            if new_url.startswith(base_url):
                # put it into the queue; awaiting lets this coroutine yield to a get() coroutine when the item cannot be enqueued immediately
                await q.put(new_url)

    async def worker():
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print("Exception:", e)
            finally:
                # mark one queued item as consumed
                q.task_done()

    await q.put(base_url)
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join()
    for _ in range(concurrency):
        await q.put(None)
    await workers
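get_url_links is not defined in the snippet; one possible implementation, an assumption rather than the original helper, using Tornado's AsyncHTTPClient and the standard-library HTMLParser:

from html.parser import HTMLParser
from urllib.parse import urljoin

from tornado import httpclient


class _LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


async def get_url_links(url):
    # Fetch the page and return the absolute URLs of all <a href> links.
    response = await httpclient.AsyncHTTPClient().fetch(url, raise_error=False)
    parser = _LinkParser()
    body = response.body or b""
    parser.feed(body.decode(errors="ignore"))
    return [urljoin(url, link) for link in parser.links]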
Example #13
    def __init__(self, stream, server):
        self.name = None
        self.is_alive = True
        self.is_ready = False

        self.stream = stream
        self.message_queue = queues.Queue()
        self.server = server

        self.inital_data = None
        self.inital_data_loaded = locks.Condition()

        self.handlers = {
            (message_type.CHAT, chat_message.BROADCAST):
            self.broadcast_chat,
            (message_type.LOBBY, lobby_message.READY):
            self.ready,
            (message_type.LOBBY, lobby_message.START_GAME):
            self.load_inital_data,
            (message_type.LOBBY, lobby_message.UNREADY):
            self.unready,
            (message_type.GAME, game_message.READY):
            self.game_ready,
            (message_type.GAME, game_message.UPDATE):
            self.send_to_game((message_type.GAME, game_message.UPDATE)),
            (message_type.GAME, game_message.CALL):
            self.send_to_game((message_type.GAME, game_message.CALL)),
            (message_type.GAME, game_message.ACTION_APPEND):
            self.send_to_game((message_type.GAME, game_message.ACTION_APPEND)),
            (message_type.GAME, game_message.ACTION_REMOVE):
            self.send_to_game((message_type.GAME, game_message.ACTION_REMOVE)),
        }
Example #14
def readone(comm):
    """
    Read one message at a time from a comm that reads lists of
    messages.
    """
    try:
        q = _readone_queues[comm]
    except KeyError:
        q = _readone_queues[comm] = queues.Queue()

        @gen.coroutine
        def background_read():
            while True:
                try:
                    messages = yield comm.read()
                except CommClosedError:
                    break
                for msg in messages:
                    q.put_nowait(msg)
            q.put_nowait(None)
            del _readone_queues[comm]

        background_read()

    msg = yield q.get()
    if msg is None:
        raise CommClosedError
    else:
        raise gen.Return(msg)
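A small usage sketch for readone, reusing the names already in scope in the snippet (readone, CommClosedError, gen) and assuming a comm object whose read() coroutine returns lists of messages:

@gen.coroutine
def handle_messages(comm):
    # Consume messages from the comm one at a time until it closes.
    try:
        while True:
            msg = yield readone(comm)
            print("got message:", msg)
    except CommClosedError:
        print("comm closed")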
Example #15
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            print('fetching {}'.format(current_url))
            fetching.add(current_url)
            urls = yield get_link_from_url(current_url)
            fetched.add(current_url)
            for new_url in urls:
                if new_url.startswith(BASE_URL) and new_url not in fetched:
                    yield q.put(new_url)
        finally:
            # decrement the count of unfinished tasks
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(BASE_URL)

    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in {} seconds fetched {} urls'.format(time.time() - start,
                                                      len(fetched)))
Example #16
 def test_blocking_get_wait(self):
     q = queues.Queue()  # type: queues.Queue[int]
     q.put(0)
     self.io_loop.call_later(0.01, q.put, 1)
     self.io_loop.call_later(0.02, q.put, 2)
     self.assertEqual(0, (yield q.get(timeout=timedelta(seconds=1))))
     self.assertEqual(1, (yield q.get(timeout=timedelta(seconds=1))))
Example #17
 def __init__(self, urls, concurrency):
     urls.reverse()
     self.urls = urls
     self.concurrency = concurrency
     self._q = queues.Queue()
     self._fetching = set()
     self._fetched = set()
Example #18
 def test_put_timeout_preempted(self):
     q = queues.Queue(1)  # type: queues.Queue[int]
     q.put_nowait(0)
     put = q.put(1, timeout=timedelta(seconds=0.01))
     q.get()
     yield gen.sleep(0.02)
     yield put  # No TimeoutError.
Example #19
    def test_order(self):
        q = queues.Queue()  # type: queues.Queue[int]
        for i in [1, 3, 2]:
            q.put_nowait(i)

        items = [q.get_nowait() for _ in range(3)]
        self.assertEqual([1, 3, 2], items)
Example #20
    def __init__(self, endpoints=LOCATOR_DEFAULT_ENDPOINTS, io_loop=None):
        self.io_loop = io_loop or IOLoop.current()
        self.endpoints = endpoints
        self._lock = Lock()

        self.counter = itertools.count(1)

        self.pipe = None
        self.target = Defaults.app
        self.verbosity = DEBUG_LEVEL
        self.queue = queues.Queue(10000)

        # level could be reset from update_verbosity in the future
        if not fallback_logger.handlers:
            sh = logging.StreamHandler()
            sh.setFormatter(
                logging.Formatter(
                    fmt=
                    "[%(asctime)s.%(msecs)d] %(levelname)s fallback %(message)s",
                    datefmt="%z %d/%b/%Y:%H:%M:%S"))
            sh.setLevel(logging.DEBUG)
            fallback_logger.addHandler(sh)

        self._send()
        try:
            uuid = Defaults.uuid
            self._defaultattrs = [("uuid", uuid)]
        except GetOptError:
            self._defaultattrs = []
Example #21
    def __procon(self):
        """
        Producer/consumer loop with bounded concurrency (parallelism).
        """
        start = time.time()

        q = queues.Queue(maxsize=self.concurrency)

        @gen.coroutine
        def worker():
            while True:
                d = yield q.get()
                try:
                    yield self.consumer(d)
                except Exception as e:
                    print("PRO_CON:" + str(e), d)
                q.task_done()

        # consumer worker
        for i in range(self.concurrency):
            worker()

        # producer
        for d in self.producer():
            yield q.put(d)

        yield q.join(timeout=timedelta(seconds=300))
        print("concurrency task done in %d seconds" % (time.time() - start))
Example #22
    def __init__(self, name, countries, session=new_session()):
        """
        Create a search worker.
        :param name: name of the worker
        :param countries: the countries this search worker should search, given as a list
        """
        super(SearchWorker, self).__init__()
        self.name = name
        self.queue = queues.Queue()
        self.countries = countries
        self.current_searching_countries = []
        self.skip = 0
        self.cookies = None
        self.session = session
        self.client = httpclient.AsyncHTTPClient()
        self.search_done = False
        self.cookies_update_lock = locks.Lock()
        self.cookies_updating = False
        self.countries_cache = {}

        self.total_index = 0
        workers = []
        # each page returns about 10 detail links on average, so 10 detail workers keep the queue roughly balanced
        for i in range(10):
            workers.append(DetailWorker(self, i, self.session))
        self.workers = workers
Example #23
File: httpd.py Project: pkkr/kevin
    def get(self):
        print("plain opened")
        self.job = None

        try:
            job_id = self.request.query_arguments["job"][0]
        except (KeyError, IndexError):
            self.write(b"no job id given\n")
            return

        job_id = job_id.decode(errors='replace')
        try:
            self.job = jobs.get_existing(job_id)
        except ValueError:
            self.write(("no such job: " + repr(job_id) + "\n").encode())
            return
        else:
            self.queue = queues.Queue()
            self.job.watch(self)

        while True:
            update = yield self.queue.get()
            if update is StopIteration:
                return
            if isinstance(update, StdOut):
                self.write(update.data.encode())
                self.flush()
Example #24
    def __init__(
        self,
        sessionstore: "SessionManager",
        sid: uuid.UUID,
        hang_interval: int,
        timout: int,
        tid: uuid.UUID,
        endpoint_names: Set[str],
        nodename: str,
        disable_expire_check: bool = False,
    ) -> None:
        self._sid = sid
        self._interval = hang_interval
        self._timeout = timout
        self._sessionstore: SessionManager = sessionstore
        self._seen: float = time.time()
        self._callhandle = None
        self.expired: bool = False

        self.tid: uuid.UUID = tid
        self.endpoint_names: Set[str] = endpoint_names
        self.nodename: str = nodename

        self._replies: Dict[uuid.UUID, asyncio.Future] = {}

        # Disable expiry in certain tests
        if not disable_expire_check:
            self.check_expire()
        self._queue: queues.Queue[common.Request] = queues.Queue()

        self.client = ReturnClient(str(sid), self)
Example #25
def main():

    q = queues.Queue()
    start = time.time()
    fetching, fetched, fetched_page_set = set(), set(), set()

    @gen.coroutine
    def fetch_url():
        current_url_type, current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            if current_url_type == 0 and get_page(
                    current_url) not in fetched_page_set:
                fetching.add(current_url)
                logging.debug('fetching {0}'.format(current_url))
                urls = yield get_reviewers(current_url_type, current_url)
                fetched.add(current_url)
                fetched_page_set.add(get_page(current_url))

                for (url_type, url) in urls:
                    if (url in fetched) or (url_type == 0 and get_page(url)
                                            in fetched_page_set):
                        continue
                    yield q.put((url_type, url))
            elif current_url_type == 1 and current_url not in fetched:
                fetching.add(current_url)
                logging.debug('fetching {0}'.format(current_url))
                yield get_contact(current_url)
                fetched.add(current_url)
        except Exception as e:
            logging.error('error: {0}, {1}'.format(e.message, current_url))
            q.put((current_url_type, current_url))
            fetching.remove(current_url)
        finally:
            q.task_done()
Example #26
    def check_instances(self):
        """
        Put the instances to be monitored into a queue,
        then consume the queue and run an alert check on each instance.
        """
        q = queues.Queue()
        monitoring, monitored = set(), set()
        instances = yield get_instances()
        for instance in instances:
            q.put(instance)

        @gen.coroutine
        def _judge_and_alert():
            current_instance = yield q.get()
            try:
                if current_instance in monitoring:
                    return
                monitoring.add(current_instance)
                yield judge_and_alert(current_instance)
                monitored.add(current_instance)
                logging.info("{0} was checked!".format(current_instance))
            finally:
                q.task_done()

        @gen.coroutine
        def _worker():
            while True:
                yield _judge_and_alert()

        for _ in range(ALERT_CONCURRENCY_NUM):
            _worker()
        # checks on all instances must finish within one monitoring cycle
        yield q.join(timeout=timedelta(seconds=ALERT_CHECK_CYCLE))
        assert monitoring == monitored
        logging.info("Current monitoring cycle is done!")
Example #27
 def test_blocking_put_wait(self):
     q = queues.Queue(1)  # type: queues.Queue[int]
     q.put_nowait(0)
     self.io_loop.call_later(0.01, q.get)
     self.io_loop.call_later(0.02, q.get)
     futures = [q.put(0), q.put(1)]
     self.assertFalse(any(f.done() for f in futures))
     yield futures
Example #28
 def test_put_with_getters(self):
     q = queues.Queue()  # type: queues.Queue[int]
     get0 = q.get()
     get1 = q.get()
     yield q.put(0)
     self.assertEqual(0, (yield get0))
     yield q.put(1)
     self.assertEqual(1, (yield get1))
Example #29
 def __init__(self, config, port=None):
     super().__init__(config, port)
     self.port = port if port is not None else self.config.server.port
     self.writing_queue = queues.Queue()
     self.callbacks = [
         queue_writer.QueueWriter(self.writing_queue, self.db).process
     ]
     self.path = home.HomeHandler.PATH
Example #30
 def __init__(self, config, port=8889):
     super().__init__(config, port)
     self.port = port if port is not None else self.config.messaging.port
     self.sender = sms_sender.get(self.config)
     self.queue = queues.Queue()
     self.scheduler = scheduler.MessageScheduler(
         config=self.config, db=self.db_factory.create(), queue=self.queue)
     self.callbacks = [self.process]