Example #1
    def run(self):
        """This method puts the daemon into a poll/action loop.

        This method should not be extended or overridden. Instead,
        implementations of this daemon should implement the 'get_message()'
        and 'handle_message()' methods.

        This loop makes use of a gevent Pool to manage maximum greenthread
        concurrency. The behaviour of the pool, and greenthreads in general, is
        such that there must be a cooperative yield in order for gevent to
        switch context into another greenthread.

        This loop, by default, will only yield on an empty message and when the
        Pool has allocated the maximum allowed greenthreads. To yield the
        loop after each message, set the aggressive_yield bit to True.
        """

        pool = gevent.pool.Pool(size=self.pool_size)

        while True:

            message = self.get_message()
            LOG.debug(
                "Daemon (%r) got message (%r).",
                self.pidfile,
                message
            )

            if message is None:

                LOG.debug(
                    "Daemon (%r) received no message. Going idle for (%r).",
                    self.pidfile,
                    self.idle_time
                )

                self.sleep(self.idle_time)
                continue

            LOG.debug(
                "Daemon (%r) attempting to start new greenthread with (%r) "
                "active and (%r) free",
                self.pidfile,
                pool.size - pool.free_count(),
                pool.free_count()
            )
            pool.spawn(self.handle_message, message)

            if self.aggressive_yield is True:

                gevent.sleep(0)
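The docstring above only asks implementers to supply get_message() and handle_message(); run() does the pooling. A self-contained sketch of the same poll-and-spawn pattern outside the daemon class, with a made-up in-memory queue standing in for the real message source (all names below are illustrative, not from the original project):

import gevent
import gevent.pool
import gevent.queue

work = gevent.queue.Queue()
pool = gevent.pool.Pool(size=4)

def handle_message(message):
    # Placeholder handler; any blocking gevent call in here yields cooperatively.
    gevent.sleep(0)
    print('handled %r' % message)

for item in range(10):
    work.put(item)

while not work.empty():
    message = work.get()
    # Pool.spawn() itself yields once every slot is busy, which is the
    # "yield when the Pool has allocated the maximum" behaviour described above.
    pool.spawn(handle_message, message)

pool.join()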
Example #2
def cron(pool):
    """
    Assigned to a worker to perform the following tasks periodically to
    maintain continuous network-wide connections:

    [Master]
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Signals listener to get reachable nodes from opendata set
    4) Sets bestblockhash in Redis

    [Master/Slave]
    1) Spawns workers to establish and maintain connection with reachable nodes
    """
    publish_key = 'snapshot:{}'.format(hexlify(CONF['magic_number']))
    snapshot = None

    while True:
        if CONF['master']:
            new_snapshot = get_snapshot()

            if new_snapshot != snapshot:
                nodes = get_nodes(new_snapshot)
                if len(nodes) == 0:
                    continue

                logging.info("New snapshot: %s", new_snapshot)
                snapshot = new_snapshot

                logging.info("Nodes: %d", len(nodes))

                reachable_nodes = set_reachable(nodes)
                logging.info("New reachable nodes: %d", reachable_nodes)

                # Allow connections to stabilize before publishing snapshot
                gevent.sleep(CONF['socket_timeout'])
                REDIS_CONN.publish(publish_key, int(time.time()))

            connections = REDIS_CONN.scard('open')
            logging.info("Connections: %d", connections)

            set_bestblockhash()

        for _ in xrange(min(REDIS_CONN.scard('reachable'), pool.free_count())):
            pool.spawn(task)

        workers = CONF['workers'] - pool.free_count()
        logging.info("Workers: %d", workers)

        gevent.sleep(CONF['cron_delay'])
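The spawn loop at the bottom is the part that uses free_count(): each cycle it only starts as many task() greenlets as the pool can absorb right now. The same idiom in isolation, with a plain integer standing in for REDIS_CONN.scard('reachable') (names are illustrative, not from the project):

import gevent
import gevent.pool

pool = gevent.pool.Pool(size=8)

def task():
    # Stand-in for establishing and maintaining one connection.
    gevent.sleep(0.1)

reachable = 20  # stand-in for REDIS_CONN.scard('reachable')

# Never ask for more greenlets than there are free slots, so this loop never blocks.
for _ in range(min(reachable, pool.free_count())):
    pool.spawn(task)

pool.join()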
Example #3
    def test_stderr_raising(self):
        if greentest.PYPY:
            # Does not work on PyPy
            return
        # testing that really egregious errors in the error handling code
        # (that prints tracebacks to stderr) don't cause the pool to lose
        # any members
        import sys
        pool = self.klass(size=1)

        # we're going to do this by causing the traceback.print_exc in
        # safe_apply to raise an exception and thus exit _main_loop
        normal_err = sys.stderr
        try:
            sys.stderr = FakeFile()
            waiter = pool.spawn(crash)
            with gevent.Timeout(2):
                self.assertRaises(RuntimeError, waiter.get)
            # the pool should have something free at this point since the
            # waiter returned
            # pool.Pool change: if an exception is raised during execution of a link,
            # the rest of the links are scheduled to be executed on the next hub iteration
            # this introduces a delay in updating pool.sem which makes pool.free_count() report 0
            # therefore, sleep:
            gevent.sleep(0)
            self.assertEqual(pool.free_count(), 1)
            # shouldn't block when trying to get
            t = gevent.Timeout.start_new(0.1)
            try:
                pool.apply(gevent.sleep, (0, ))
            finally:
                t.cancel()
        finally:
            sys.stderr = normal_err
            pool.join()
Example #4
    def test_stderr_raising(self):
        # testing that really egregious errors in the error handling code
        # (that prints tracebacks to stderr) don't cause the pool to lose
        # any members
        import sys
        pool = self.klass(size=1)

        # we're going to do this by causing the traceback.print_exc in
        # safe_apply to raise an exception and thus exit _main_loop
        normal_err = sys.stderr
        try:
            sys.stderr = FakeFile()
            waiter = pool.spawn(crash)
            with gevent.Timeout(2):
                self.assertRaises(RuntimeError, waiter.get)
            # the pool should have something free at this point since the
            # waiter returned
            # pool.Pool change: if an exception is raised during execution of a link,
            # the rest of the links are scheduled to be executed on the next hub iteration
            # this introduces a delay in updating pool.sem which makes pool.free_count() report 0
            # therefore, sleep:
            gevent.sleep(0)
            self.assertEqual(pool.free_count(), 1)
            # shouldn't block when trying to get
            with gevent.Timeout.start_new(0.1):
                pool.apply(gevent.sleep, (0, ))
        finally:
            sys.stderr = normal_err
            pool.join()
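The two versions of this test above differ only in how the 0.1-second guard around pool.apply() is written; the context-manager form cancels the timer on exit automatically. A standalone illustration of both forms (the sleep durations are arbitrary):

import gevent

# Context-manager form: raises gevent.Timeout if the block overruns,
# and cancels the timer automatically when the block exits normally.
with gevent.Timeout(0.5):
    gevent.sleep(0.1)

# Explicit form used in the earlier test:
t = gevent.Timeout.start_new(0.5)
try:
    gevent.sleep(0.1)
finally:
    t.cancel()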
Example #5
def cron(pool):
    """
    Assigned to a worker to perform the following tasks periodically to
    maintain continuous network-wide connections:

    [Master]
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Signals listener to get reachable nodes from opendata set

    [Master/Slave]
    1) Spawns workers to establish and maintain connection with reachable nodes
    """
    snapshot = None

    while True:
        if SETTINGS['master']:
            new_snapshot = get_snapshot()

            if new_snapshot != snapshot:
                nodes = get_nodes(new_snapshot)
                if len(nodes) == 0:
                    continue

                logging.info("New snapshot: {}".format(new_snapshot))
                snapshot = new_snapshot

                logging.info("Nodes: {}".format(len(nodes)))

                reachable_nodes = set_reachable(nodes)
                logging.info("New reachable nodes: {}".format(reachable_nodes))

                # Allow connections to stabilize before publishing snapshot
                gevent.sleep(SETTINGS['cron_delay'])
                REDIS_CONN.publish('snapshot', int(time.time()))

            connections = REDIS_CONN.scard('open')
            logging.info("Connections: {}".format(connections))

        for _ in xrange(min(REDIS_CONN.scard('reachable'), pool.free_count())):
            pool.spawn(task)

        workers = SETTINGS['workers'] - pool.free_count()
        logging.info("Workers: {}".format(workers))

        gevent.sleep(SETTINGS['cron_delay'])
Example #6
def init():

    #queue init
    #main.queue.put("")
    #main.pool.spawn(getLink).join()

    #give worker pool
    print('start crawling')
    #while not pool.free_count() == 15:
    while not queue.empty():
        gevent.sleep(0.8)
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(getData)

    #wait for everything complete
    pool.join()
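The while-loop above only terminates once the queue drains, so every getData() worker is expected to consume from it. A self-contained sketch of that contract (the URLs and the fetch step are placeholders, not the original crawler):

import gevent
import gevent.pool
import gevent.queue

queue = gevent.queue.Queue()
pool = gevent.pool.Pool(15)

def getData():
    # Each worker takes at most one queued item, so queue.qsize() shrinks
    # and the spawning loop below can eventually exit.
    try:
        url = queue.get_nowait()
    except gevent.queue.Empty:
        return
    gevent.sleep(0.1)  # placeholder for the real fetch/parse work

for n in range(30):
    queue.put('http://example.com/page/%d' % n)

while not queue.empty():
    gevent.sleep(0.8)
    for _ in range(min(queue.qsize(), pool.free_count())):
        pool.spawn(getData)

pool.join()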
Example #7
def cron(pool):
    """
    Assigned to a worker to perform the following tasks periodically to
    maintain continuous network-wide connections:
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Spawns workers to establish and maintain connection with reachable nodes
    4) Signals listener to get reachable nodes from opendata set
    """
    snapshot = None

    while True:
        logging.debug("")

        new_snapshot = get_snapshot()
        if new_snapshot != snapshot:
            logging.info("New snapshot: {}".format(new_snapshot))

            nodes = get_nodes(new_snapshot)
            if len(nodes) == 0:
                continue
            logging.info("Nodes: {}".format(len(nodes)))

            snapshot = new_snapshot

            reachable_nodes = set_reachable(nodes)
            logging.info("Reachable nodes: {}".format(reachable_nodes))

            try:
                SETTINGS['keepalive'] = int(REDIS_CONN.get('elapsed'))
            except TypeError as err:
                logging.warning(err)
            logging.debug("Keepalive: {}".format(SETTINGS['keepalive']))

            for _ in xrange(reachable_nodes):
                pool.spawn(task)

            gevent.sleep(SETTINGS['cron_delay'])

            REDIS_CONN.publish('snapshot', int(time.time()))
            workers = SETTINGS['workers'] - pool.free_count()
            logging.info("Workers: {}".format(workers))
            logging.info("Connections: {}".format(REDIS_CONN.scard('open')))
        else:
            gevent.sleep(SETTINGS['cron_delay'])
Example #8
def cron(pool):
    """
    Assigned to a worker to perform the following tasks periodically to
    maintain continuous network-wide connections:
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Spawns workers to establish and maintain connection with reachable nodes
    4) Signals listener to get reachable nodes from opendata set
    """
    snapshot = None

    while True:
        logging.debug("")

        new_snapshot = get_snapshot()
        if new_snapshot != snapshot:
            logging.info("New snapshot: {}".format(new_snapshot))

            nodes = get_nodes(new_snapshot)
            if len(nodes) == 0:
                continue
            logging.info("Nodes: {}".format(len(nodes)))

            snapshot = new_snapshot

            reachable_nodes = set_reachable(nodes)
            logging.info("Reachable nodes: {}".format(reachable_nodes))

            SETTINGS['keepalive'] = int(REDIS_CONN.get('elapsed'))
            logging.debug("Keepalive: {}".format(SETTINGS['keepalive']))

            for _ in xrange(reachable_nodes):
                pool.spawn(task)

            gevent.sleep(SETTINGS['cron_delay'])

            REDIS_CONN.publish('snapshot', int(time.time()))
            workers = SETTINGS['workers'] - pool.free_count()
            logging.info("Workers: {}".format(workers))
            logging.info("Connections: {}".format(REDIS_CONN.scard('open')))
        else:
            gevent.sleep(SETTINGS['cron_delay'])
Example #9
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)

    func = lambda x: queue.put_nowait((parse_comp, {
        'url': domain + x.xpath('./@href')[0],
        'name': x.xpath('./text()')[0]
    }))
    [
        func(x) for x in tree.xpath('//div[@class="st-text"]//td/a')
        if x.xpath('./text()') != []
    ]

    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
Example #10
    def scheduler():
        """Coordinate downloading in greenlet threads.
        When the worker queue fills up, the scheduler will block on the put() operation.
        If the job queue is empty and no workers are active the pool is stopped."""
        while True:
            # join dead greenlets
            for greenlet in list(pool):
                if greenlet.dead:
                    pool.discard(greenlet)

            try:
                url = inq.get_nowait()
            except queue.Empty:
                # No urls remaining
                if pool.free_count() != pool.size:
                    worker_finished.wait()
                    worker_finished.clear()
                else:
                    # No workers left, shutting down.
                    pool.join()
                    return True
            else:
                # spawn worker for url
                pool.spawn(worker, url)
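The queue.Empty branch above parks the scheduler on worker_finished until some worker signals it, so the worker side has to set that event when it finishes. A hypothetical worker matching that contract (worker_finished is assumed to be the same module-level gevent Event the scheduler waits on; the body is illustrative):

import gevent
import gevent.event

worker_finished = gevent.event.Event()

def worker(url):
    try:
        gevent.sleep(0.1)  # placeholder for the actual download of `url`
    finally:
        # Wake the scheduler so it re-checks the queue and the pool's free count.
        worker_finished.set()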
Example #11
        if result != '':
            print 'Found [%s][%s] in %s' % (result, link, tag)
            a += 1
            if tag in json_dict:
                json_dict[tag].append((result, link, imgs[i]))
            else:
                json_dict[tag] = list()
                json_dict[tag].append((result, link, imgs[i]))

r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags]

for t in tags:
    url = t[0]
    result = regex.findall(t[1])
    # print url, result
    # scrape(url[0], result[0])
    queue.put((url[0], result[0]))


while not queue.empty() and not pool.full():
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()
print a
print 'Time Taken : ', datetime.now() - start_time
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)
Example #12
            break

        print "job done"
        handler.log("job done")
        print "so far crawled %s pages" % crawled
        handler.log("so far crawled %s pages" % crawled)


queue.put(start_url_1)
queue.put(start_url_2)
pool.spawn(crawler)
handler = Handler()

print 'starting Crawler...'
handler.log('starting Crawler...')
while not queue.empty() and not pool.free_count() == workers_count:
    gevent.sleep(0.8)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)


#wait for jobs to finish
pool.join()
print "Done"
handler.log("Done+\n")
print '\n'
print "collected %s imgs" % ITEMS_COUNT
handler.log("collected %s imgs" % ITEMS_COUNT)
print "see generated output and log files"

handler.close() #close the IO files
Example #13
 def log_qsize(wait):
     while True:
         log.info(u'{}: number of concurrent delete tasks = {}'.format(
             group_id, pool_size - pool.free_count()))
         gevent.sleep(wait)
Example #14
    def _download_res(
            self,
            filepath,
            rate,
            uploading=True,
            callback=None,
            cb_kwargs=None):
        try:
            peers = self.get_peers()
            self._record_get_peer_ts()
            peers_num = len(peers)
            count = 0

            # just get resource size
            while True:
                ip, port = peers[count]
                logger.info('get resource size')
                try:
                    ret = self._requests_session.get(
                        "{protocol}{ip}:{port}/?{res}"
                        .format(
                            protocol=PROTOCOL, ip=ip,
                            port=port,
                            res=urlencode({'res_url': self.upload_res_url})),
                        stream=True,
                        headers={"Range": "bytes=0-0"},
                        timeout=1)

                    if ret.ok:
                        # e.g. Content-Range: bytes 0-0/17
                        content_range = ret.headers.get("Content-Range")
                        res_length = content_range.split('/')[-1]
                        break
                    else:
                        logger.warn(
                            'get piece from ip: %s port: %s error, code: %s ' %
                            (ip, port, ret.status_code))
                        count += 1
                        self.del_from_tracker(ip=ip, peer_port=port)
                except ConnectionError:
                    logger.warn(
                        'get piece from ip: %s port: %s error ConnectionError'
                        % (ip, port))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
                except Timeout:
                    logger.warn(
                        'get piece from ip: %s port: %s error Timeout' %
                        (ip, port))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
                finally:
                    if count >= peers_num:
                        logger.warn("No peers available")
                        peers = self.get_peers()
                        peers_num = len(peers)
                        count = 0

            logger.info('%s is size of %s' %
                        (self.upload_res_url, sizeof_fmt_human(res_length)))

            self.piece_file = PieceFile(res_length, filepath)

            pool_work_num = 15
            pool_q_size = pool_work_num * 2
            pool = gevent.pool.Pool(pool_work_num)
            self.start_ready_upload_thread()

            if rate:
                self.download_rate = rate
            else:
                rate = self.download_rate

            if rate:
                self.token_bucket = TokenBucket(rate)

            while self.piece_file.has_unalloc():
                args_list = list()
                for peer in peers:
                    if peer not in self._peer_in_conn:
                        args_list.append((peer, None))
                [pool.apply_async(self._download_piece_thread, *args)
                    for args in args_list[:pool_q_size]]
                # update peers if peer run out
                while pool.full():
                    gevent.sleep(0.2)

                if not self.piece_file.has_empty():
                    pool.join()

                logger.debug(
                    'test get_empty_block: %s' %
                    self.piece_file.get_empty_piece())

                logger.debug('peer in connection:  %s' % self._peer_in_conn)
                if self.piece_file.has_unalloc():
                    try:
                        tv = self._get_last_get_peer_tv()
                        if tv < GET_PEER_INTERVAL:
                            gevent.sleep(GET_PEER_INTERVAL - tv)
                        g = gevent.spawn(self.get_peers)
                        peers = g.get()
                        self._record_get_peer_ts()
                    except NoPeersFound:
                        # if pool.workRequests:
                        if pool_work_num - pool.free_count() > 0:
                            # some remained piece maybe on the way
                            pool.join()
                            if self.piece_file.has_unalloc():
                                tv = self._get_last_get_peer_tv()
                                if tv > GET_PEER_INTERVAL:
                                    gevent.sleep(GET_PEER_INTERVAL - tv)
                                g = gevent.spawn(self.get_peers)
                                peers = g.get()
                                self._record_get_peer_ts()
                            else:
                                break
                        else:
                            logger.error("no worker running, and get no peers")
                            raise
                else:
                    break

            logger.info('File is complete, size: %s' %
                        self.piece_file.get_real_filesize())

        except NoPeersFound:
            if self.fallback:
                logger.info('Use fallback way to get resource')
                try:
                    res_length = get_res_length(self.res_url)
                except ConnectionError:
                    raise OriginURLConnectError(self.res_url)
                logger.info(
                    'get resource length %s' %
                    sizeof_fmt_human(res_length))
                if not self.piece_file:
                    self.piece_file = PieceFile(res_length, filepath)

                self.start_ready_upload_thread()
                http_download_to_piecefile(
                        self.res_url, self.piece_file)
            else:
                self.deactivate()
                raise

        # self.piece_file.tofile()
        self.res_is_downloaded = True
        if callback:
            logger.info('Run callback')
            callback(**cb_kwargs)
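The size probe near the top requests a single byte with a Range header and reads the total length out of the Content-Range header of the response. The same trick in isolation with plain requests (the URL is a placeholder, and a server that ignores Range will simply not send Content-Range):

import requests

resp = requests.get('http://example.com/file.bin',
                    stream=True,
                    headers={'Range': 'bytes=0-0'},
                    timeout=1)
# A ranged response carries e.g. "Content-Range: bytes 0-0/17";
# the total size is the figure after the slash.
content_range = resp.headers.get('Content-Range')
if content_range is not None:
    total_size = int(content_range.split('/')[-1])
    print(total_size)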
Example #15
 def log_qsize(wait):
     while True:
         log.info(u'{}: number of concurrent delete tasks = {}'
                  .format(group_id, pool_size - pool.free_count()))
         gevent.sleep(wait)
Example #16
                      (response.status_code, url))

        except gevent.queue.Empty:
            print('queue empty')
            break


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('USAGE:\n\t%s <base_url> <entry_path>' % sys.argv[0])
        sys.exit(1)

    if validators.url(sys.argv[1]) != True:
        print('Invalid Url')
        sys.exit(1)

    queue.put(getUrl(sys.argv[2]))
    pool.spawn(crawler)

    while 1:
        if queue.empty() and pool.free_count() == WORKER_COUNT:
            print('No more links left and nothing running')
            break

        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(crawler)
        gevent.sleep(0.1)

    # Wait for everything to complete
    pool.join()