Example #1
def is_green_pool_free(pool):
    """
    Return True if the provided green pool is free, False otherwise.
    """
    if CONCURRENCY_LIBRARY == "eventlet":
        # eventlet's GreenPool.free() returns the number of free slots (an int)
        return pool.free() > 0
    elif CONCURRENCY_LIBRARY == "gevent":
        return not pool.full()
    else:
        raise ValueError("Unsupported concurrency library")
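
The snippet assumes a module-level CONCURRENCY_LIBRARY switch and a pool created by the matching library. A minimal sketch of that setup (the pool size is illustrative):

CONCURRENCY_LIBRARY = "gevent"   # or "eventlet"

if CONCURRENCY_LIBRARY == "eventlet":
    import eventlet
    pool = eventlet.GreenPool(100)
elif CONCURRENCY_LIBRARY == "gevent":
    import gevent.pool
    pool = gevent.pool.Pool(100)

if is_green_pool_free(pool):
    print("the pool has at least one free slot")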
Example #4
def crawler(u):
    '''A very simple pooled gevent web crawler'''
    global crawled

    # Crawl the page, print the status
    response = requests.get(u)
    print(response.status_code, u)

    # Extract some links to follow (response.text, not .content: the
    # str pattern would not match bytes on Python 3)
    for link in re.findall('<a href="(http.*?)"', response.text):

        # Limit to 10 pages (ignores links when the pool is already full)
        if crawled < 10 and not pool.full():
            crawled += 1
            pool.spawn(crawler, link)
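
The crawler leans on two module-level names the excerpt does not show: a gevent pool and the `crawled` counter. A minimal sketch of the assumed setup (pool size and seed URL are illustrative):

from gevent import monkey
monkey.patch_all()           # patch sockets before requests is imported

import re

import gevent.pool
import requests

pool = gevent.pool.Pool(4)   # at most 4 concurrent fetches
crawled = 0

pool.spawn(crawler, 'http://example.com/')
pool.join()                  # wait until every spawned crawler has finished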
Example #5
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)

    # Queue one (parse_comp, kwargs) job per link that has link text.
    for x in tree.xpath('//div[@class="st-text"]//td/a'):
        if x.xpath('./text()'):
            queue.put_nowait((parse_comp, {
                'url': domain + x.xpath('./@href')[0],
                'name': x.xpath('./text()')[0]
            }))

    # Drain the queue in batches no larger than the pool's free capacity.
    while not queue.empty() and not pool.full():
        for _ in range(min(queue.qsize(), pool.free_count())):
            func, kwargs = queue.get_nowait()
            pool.spawn(func, kwargs)  # spawn() already adds and starts the greenlet
    pool.join()
    print('Time Taken : ', datetime.now() - startTime)
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
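
Everything global in the function above comes from module scope. A hypothetical setup, with placeholder URLs and a stub for parse_comp, which the listing does not include:

import json
from datetime import datetime

import gevent.pool
import gevent.queue
import requests
from lxml import html

base_url = 'http://example.com/companies'   # placeholder listing page
domain = 'http://example.com'               # placeholder link prefix
session = requests.Session()
queue = gevent.queue.Queue()
pool = gevent.pool.Pool(10)
data = {}

def parse_comp(kwargs):
    # Stub: the real parser (not shown) scrapes kwargs['url'] into `data`.
    pass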
Example #6
def green_producer(iterator, map_func, queue, consumer_instance, pool_size,
                   consumer_args=(), consumer_kwargs=None):
    """A map function:

    Map pool.spawn(func, iterated_element) on iterator
    until the gevent pool is full or the iterator is exhausted.

    Put the return value (a gevent.event.AsyncResult) of each spawned func call
    into the queue.

    Whenever the pool is full, switch control to the consumer_instance greenlet.

    When control switches back to this instance, fill up the pool again.
    """
    log.debug('initializing producer.  This will only appear once per greenlet')
    consumer_kwargs = consumer_kwargs or {}  # avoid a mutable default argument
    consumer_instance.switch(*consumer_args, **consumer_kwargs)
    pool = gevent.pool.Pool(pool_size)  # honor the pool_size parameter
    for elem in iterator:
        log.debug("Queued job for elem {elem}".format(**locals()))
        queue.put(pool.spawn(map_func, elem))
        if pool.full():
            consumer_instance.switch()
    pool.join()  # a Pool is not a list of greenlets, so join it directly
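
The explicit switch() choreography hands control back and forth by hand. For comparison, a sketch of the same fill-then-drain flow built from gevent primitives alone, which is simpler though not identical (all names here are illustrative):

import gevent
import gevent.pool
import gevent.queue

def producer_consumer(iterator, map_func, handle, pool_size=7):
    """pool.spawn blocks while the pool is full, so filling and draining
    interleave without any manual switch() calls."""
    pool = gevent.pool.Pool(pool_size)
    results = gevent.queue.Queue()

    def consume():
        for job in results:      # iterating a gevent Queue blocks when empty
            handle(job.get())    # wait for that spawned job's return value

    consumer = gevent.spawn(consume)
    for elem in iterator:
        results.put(pool.spawn(map_func, elem))  # blocks while the pool is full
    pool.join()
    results.put(StopIteration)   # a StopIteration item ends the queue iteration
    consumer.join()

# e.g. producer_consumer(range(100), lambda n: n * n, print, pool_size=7)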
Example #7
def wait_available(pool, pool_name):
    """Block until the pool has a free slot, counting each stall in statsd."""
    statsd = stats.get_statsd_client()
    if pool.full():
        statsd.incr('%s.pool.full' % pool_name)
        pool.wait_available()
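
A typical call site, sketched on the assumption that `stats.get_statsd_client()` returns a standard statsd client; `jobs`, `handle`, and the pool size are placeholders:

import gevent.pool

pool = gevent.pool.Pool(50)

for job in jobs:                       # jobs: any iterable of work items
    wait_available(pool, 'fetcher')    # stall (and record it) while saturated
    pool.spawn(handle, job)
pool.join()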
Example #8
def _spawn_gm_worker(self):
    global pool
    # Only start another worker when the shared pool has a free slot.
    if not pool.full():
        gm_worker = _GMWorker(self.parent)
        gm_worker.register_task('message.send', send)
        pool.add(gevent.spawn(gm_worker.work, time_to_work=20))
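
Note the non-blocking spawn order: `pool.spawn` would block while the pool is full, whereas `gevent.spawn` plus `pool.add` never waits, so the `full()` guard makes this safe to call from a periodic loop. A hypothetical driver (interval and pool size are illustrative):

import gevent
import gevent.pool

pool = gevent.pool.Pool(5)        # module-level, shared with _spawn_gm_worker

def keep_pool_topped_up(spawn_worker, interval=1.0):
    # spawn_worker: e.g. a bound _spawn_gm_worker method (hypothetical wiring)
    while True:
        spawn_worker()            # silently does nothing while the pool is full
        gevent.sleep(interval)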
Example #9
    def _download_res(
            self,
            filepath,
            rate,
            uploading=True,
            callback=None,
            cb_kwargs=None):
        try:
            peers = self.get_peers()
            self._record_get_peer_ts()
            peers_num = len(peers)
            count = 0

            # just get resource size
            while True:
                ip, port = peers[count]
                logger.info('get resource size')
                try:
                    ret = self._requests_session.get(
                        "{protocol}{ip}:{port}/?{res}"
                        .format(
                            protocol=PROTOCOL, ip=ip,
                            port=port,
                            res=urlencode({'res_url': self.upload_res_url})),
                        stream=True,
                        headers={"Range": "bytes=0-0"},
                        timeout=1)

                    if ret.ok:
                        # e.g. "Content-Range: bytes 0-0/17" -> total size 17
                        content_range = ret.headers.get("Content-Range")
                        res_length = int(content_range.split('/')[-1])
                        break
                    else:
                        logger.warning(
                            'get piece from ip: %s port: %s error, code: %s ' %
                            (ip, port, ret.status_code))
                        count += 1
                        self.del_from_tracker(ip=ip, peer_port=port)
                except ConnectionError:
                    logger.warning(
                        'get piece from ip: %s port: %s error ConnectionError'
                        % (ip, port))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
                except Timeout:
                    logger.warning(
                        'get piece from ip: %s port: %s error Timeout' %
                        (ip, port))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
                finally:
                    if count >= peers_num:
                        logger.warning("No peers available")
                        # every known peer failed; ask the tracker for a new list
                        peers = self.get_peers()
                        peers_num = len(peers)
                        count = 0

            logger.info('%s is size of %s' %
                        (self.upload_res_url, sizeof_fmt_human(res_length)))

            self.piece_file = PieceFile(res_length, filepath)

            pool_work_num = 15
            pool_q_size = pool_work_num * 2
            pool = gevent.pool.Pool(pool_work_num)
            self.start_ready_upload_thread()

            if rate:
                self.download_rate = rate
            else:
                rate = self.download_rate

            if rate:
                self.token_bucket = TokenBucket(rate)

            while self.piece_file.has_unalloc():
                args_list = list()
                for peer in peers:
                    if peer not in self._peer_in_conn:
                        args_list.append((peer, None))
                for args in args_list[:pool_q_size]:
                    pool.apply_async(self._download_piece_thread, *args)
                # wait for a free slot while the pool is saturated
                while pool.full():
                    gevent.sleep(0.2)

                if not self.piece_file.has_empty():
                    pool.join()

                logger.debug(
                    'test get_empty_block: %s' %
                    self.piece_file.get_empty_piece())

                logger.debug('peer in connection:  %s' % self._peer_in_conn)
                if self.piece_file.has_unalloc():
                    try:
                        tv = self._get_last_get_peer_tv()
                        if tv < GET_PEER_INTERVAL:
                            gevent.sleep(GET_PEER_INTERVAL - tv)
                        g = gevent.spawn(self.get_peers)
                        peers = g.get()
                        self._record_get_peer_ts()
                    except NoPeersFound:
                        if pool_work_num - pool.free_count() > 0:
                            # some remaining pieces may still be on the way
                            pool.join()
                            if self.piece_file.has_unalloc():
                                tv = self._get_last_get_peer_tv()
                                if tv < GET_PEER_INTERVAL:
                                    gevent.sleep(GET_PEER_INTERVAL - tv)
                                g = gevent.spawn(self.get_peers)
                                peers = g.get()
                                self._record_get_peer_ts()
                            else:
                                break
                        else:
                            logger.error("no worker running, and get no peers")
                            raise
                else:
                    break

            logger.info('File is complete, size: %s' %
                        self.piece_file.get_real_filesize())

        except NoPeersFound:
            if self.fallback:
                logger.info('Use fallback way to get resource')
                try:
                    res_length = get_res_length(self.res_url)
                except ConnectionError:
                    raise OriginURLConnectError(self.res_url)
                logger.info(
                    'get resource length %s' %
                    sizeof_fmt_human(res_length))
                if not self.piece_file:
                    self.piece_file = PieceFile(res_length, filepath)

                self.start_ready_upload_thread()
                http_download_to_piecefile(
                        self.res_url, self.piece_file)
            else:
                self.deactivate()
                raise

        # self.piece_file.tofile()
        self.res_is_downloaded = True
        if callback:
            logger.info('Run callback')
            callback(**cb_kwargs)
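
The rate limiting above depends on a TokenBucket class that the excerpt does not define. A minimal sketch of the classic algorithm (the class name matches the call site; the consume() interface is an assumption, not the project's actual API):

import time

class TokenBucket(object):
    """Classic token bucket: refills `rate` tokens per second, capped at `rate`."""

    def __init__(self, rate):
        self.rate = float(rate)
        self.tokens = float(rate)
        self.last = time.time()

    def consume(self, n):
        """Take n tokens if available; return True on success."""
        now = time.time()
        self.tokens = min(self.rate, self.tokens + (now - self.last) * self.rate)
        self.last = now
        if self.tokens >= n:
            self.tokens -= n
            return True
        return False

A caller would gevent.sleep() briefly and retry whenever consume() returns False.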
Example #11
def wait_available(pool, pool_name):
    """Block until the pool has a free slot; False means shutdown was requested."""
    statsd = stats.get_statsd_client()
    if pool.full():
        statsd.incr('%s.pool.full' % pool_name)
        pool.wait_available()
    return not STATE['shutdown']
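
The boolean return turns the helper into a combined throttle and shutdown check. A hypothetical dispatch loop (`jobs`, `handle`, and the metric name are placeholders):

import gevent.pool

pool = gevent.pool.Pool(20)

for job in jobs:                          # jobs: any iterable of work items
    if not wait_available(pool, 'sender'):
        break                             # shutdown requested while waiting
    pool.spawn(handle, job)
pool.join()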
Example #12
        if result != '':
            print('Found [%s][%s] in %s' % (result, link, tag))
            a += 1
            # group hits by tag, creating the list on first use
            json_dict.setdefault(tag, []).append((result, link, imgs[i]))

r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags]

for t in tags:
    url = t[0]
    result = regex.findall(t[1])
    # print url, result
    # scrape(url[0], result[0])
    queue.put((url[0], result[0]))


while not queue.empty() and not pool.full():
    for _ in range(min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()
print(a)
print('Time Taken : ', datetime.now() - start_time)
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)
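
The `worker` that the loop spawns is not part of this excerpt. A plausible shape, given that the queue holds `(url, tag)` pairs and a `scrape(url, tag)` call appears commented out above (purely illustrative):

def worker():
    # Hypothetical: each greenlet drains jobs until the queue is empty.
    while not queue.empty():
        url, tag = queue.get_nowait()
        scrape(url, tag)    # scrape() is referenced but not shown in the excerpt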