Example #1
def batch_fetch(self, queue, event, linger_ms, max_queued_messages):
    # Wait up to linger_ms for more messages unless the queue is already full.
    if queue.qsize() < max_queued_messages:
        event.wait(linger_ms / 1000)
    # A set event means an early flush was requested; clear it for the next round.
    if event.is_set():
        event.clear()
    # Drain everything currently queued into a single batch.
    batch_msgs = [queue.get() for _ in range(queue.qsize())]
    return batch_msgs
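A minimal usage sketch for this batching pattern, assuming a standard queue.Queue and a threading.Event that producers set to force an early flush; the BatchSender wrapper and the parameter values are illustrative, not taken from the original code:

import queue
import threading


class BatchSender:
    # Hypothetical host class; batch_fetch is the method shown above.
    def batch_fetch(self, q, event, linger_ms, max_queued_messages):
        if q.qsize() < max_queued_messages:
            event.wait(linger_ms / 1000)
        if event.is_set():
            event.clear()
        return [q.get() for _ in range(q.qsize())]


q = queue.Queue()
flush_now = threading.Event()
for i in range(3):
    q.put('msg-%d' % i)

# Blocks for at most 500 ms (or less, if another thread sets flush_now),
# then drains whatever is queued into one batch.
batch = BatchSender().batch_fetch(q, flush_now, 500, 100)
print(batch)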
Example #2
    def close(self):
        # Close the queue.  There are 2 possibilities:

        # 1. The file buffer is non-empty and there's a greenlet
        #    emptying it.  (See the feed greenlet in the put method.)
        #    The greenlet is blocked putting data in the underlying
        #    queue.  We can set size to -1, marking us as closed, and
        #    close the file.  The greenlet will check size before
        #    trying to read the file again.

        # 2. The file buffer is empty and there's no running greenlet.
        #    We can set the size to -1 and close the file.

        # In either case, we'll empty the underlying queue, both for
        # cleanliness and to unblock a greenlet, if there is one, so
        # it can die a normal death.

        if self.size < 0:
            return # already closed

        self.size = -1
        self.file.close()

        queue = self.queue
        while queue.qsize():
            queue.get()
        self.size_bytes = 0
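The comments describe a handshake between close() and a feed greenlet spawned in put(): close() marks the object as closed, then drains the bounded queue so a blocked producer can wake up, notice the flag, and exit. A small self-contained gevent sketch of that handshake (the feeder function and the closed flag are illustrative stand-ins, not the original class):

import gevent
import gevent.queue

q = gevent.queue.Queue(maxsize=1)
closed = [False]


def feeder():
    # Stands in for the feed greenlet: it blocks on put() when the queue is
    # full and re-checks the closed flag before doing any more work.
    for chunk in ('a', 'b', 'c'):
        if closed[0]:
            return            # close() ran; die a normal death
        q.put(chunk)          # blocks while the queue is full


g = gevent.spawn(feeder)
gevent.sleep(0)               # let the feeder fill the queue and block

# The close() pattern from above: mark closed, then drain to unblock the feeder.
closed[0] = True
while q.qsize():
    q.get()
g.join()
print('feeder exited cleanly')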
Example #3
def flush(self):
    """Forces a flush from the internal queue to the server"""
    queue = self.queue
    size = queue.qsize()
    queue.join(JOIN_TIMEOUT_SECONDS)
    # Note that this message may not be precise, because of threading.
    self.log.debug('successfully flushed about %s items.', size)
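For queue.join() to return, a consumer has to call task_done() once per item it takes off the queue; note also that the timeout argument above implies a custom queue class, since the standard library's Queue.join() accepts no timeout. A minimal sketch of the consumer side that makes a flush like this meaningful (the worker and its body are illustrative):

import queue
import threading

q = queue.Queue()


def upload_worker():
    while True:
        item = q.get()
        try:
            pass  # send `item` to the server here
        finally:
            q.task_done()  # lets q.join() return once every item is processed


threading.Thread(target=upload_worker, daemon=True).start()

for i in range(10):
    q.put(i)

q.join()  # blocks until task_done() has been called for all 10 items
print('flushed')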
Example #4
def crawler(n):
    """ this is the worker routine, the heart of this solution

    the job is performed by the following steps:
    1. take a URL from the queue
    2. make a request to this URL
    3. mark it as visited
    4. check whether the response is OK to be parsed
    5. if the URL corresponds to a product page, then extract data from it
    6. extract more URLs from the current page and add them to the queue

    this is repeated continuously until the queue is empty
    """
    while True:
        logger.info(
            'links: [%d] pending, [%d] discovered, [%d] visited'
            % (queue.qsize(), len(discovered), len(visited))
        )
        url = queue.get()
        logger.info('crawler [%d] took [%s] from queue' % (n, url))
        response = requests.get(url, verify=False)  # no SSL validation
        visited.append(url)
        if response.status_code == requests.codes.ok:
            soup = Soup(response.content)
            if is_valid_product_page(url, response):
                data = extract_product_data(url, soup)
                csv.write(CSV_FORMAT % data)
            discover_links(url, soup)
        else:
            logger.warning('response not ok for [%s]' % url)
        queue.task_done()
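A hedged sketch of how workers like this are usually wired up, building on the function above and its module-level globals; it assumes the shared queue is a standard Queue seeded with start URLs (the stdlib module is imported under an alias because the global is also named queue), and the thread count and seed URL are illustrative:

import queue as queue_module
import threading

queue = queue_module.Queue()
queue.put('https://example.com/catalog')  # illustrative seed URL

for n in range(4):
    # each crawler loops forever, so daemon threads let the process exit later
    threading.Thread(target=crawler, args=(n,), daemon=True).start()

queue.join()  # returns once task_done() has been called for every URL taken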
Example #5
def manage_webhook_data(queue):
    while True:
        qsize = queue.qsize()
        if qsize > 5000:
            log.warning("Queue length is at %s... this may be causing "
                        + "a significant delay in notifications.", qsize)
        data = queue.get(block=True)
        obj = Events.event_factory(data)
        if obj is not None:
            for name, mgr in managers.iteritems():
                mgr.update(obj)
                log.debug("Distributing event {} to manager {}.".format(
                    obj.id, name))
            log.debug("Finished distributing event: {}".format(obj.id))
Example #6
def init():

    #queue init
    #main.queue.put("")
    #main.pool.spawn(getLink).join()

    # feed the worker pool
    print('start crawling')
    #while not pool.free_count() == 15:
    while not queue.empty():
        gevent.sleep(0.8)
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(getData)

    # wait for everything to complete
    pool.join()
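getData itself is not shown; a hypothetical stub of what this loop expects, i.e. a short greenlet body that takes one item off the shared gevent queue and processes it (the URL handling here is illustrative):

import gevent.queue
import requests


def getData():
    # Hypothetical worker matching the spawn loop above: pop one URL, fetch it.
    # Real code would also parse the page and queue.put() any new links found.
    try:
        url = queue.get_nowait()
    except gevent.queue.Empty:
        return
    response = requests.get(url)
    print(url, response.status_code)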
Example #7
    def runloop(self):
        i = self.i
        queue = self.queue
        multiplier = self.options.multiplier
        rate_limit = self.options.rate_limit
        location_grep = self.options.location_grep
        incoming_requests_counter = Counter('input')
        record_file = None
        if self.options.record_file:
            record_file = open(self.options.record_file, 'w')

        if statsd_client:

            def on_tick():
                statsd_client.incr('backlog', queue.qsize())

            incoming_requests_counter.on_tick = on_tick

        drop_counter = Counter('dropped')
        multiplied_output_counter = Counter()
        logger.info('Listener %d started', i)
        len_limit = self.options.backlog - self.options.backlog_breathing_space

        while self.running:
            q = self.next_query()
            if not q:
                continue
            if location_grep and not self.filter_by_location(q, location_grep):
                continue
            if record_file:
                print >> record_file, q
            incoming_requests_counter.count()
            logger.debug('Listener %d got %s', i, q)
            if queue:
                for _ in xrange(multiplier):
                    multiplied_output_counter.count()
                    if rate_limit > 0 and multiplied_output_counter.v >= rate_limit:
                        continue
                    if queue.qsize() > len_limit:
                        drop_counter.count()
                    else:
                        queue.put(q)
Example #8
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)

    # Queue a (parse_comp, details-dict) task for every company link on the page.
    for x in tree.xpath('//div[@class="st-text"]//td/a'):
        if x.xpath('./text()'):
            queue.put_nowait((parse_comp, {
                'url': domain + x.xpath('./@href')[0],
                'name': x.xpath('./text()')[0]
            }))

    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
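parse_comp is not shown; a hypothetical sketch of a callable matching the way it is queued here, i.e. taking the {'url': ..., 'name': ...} dict built above and recording results in the global data (assumed here to be a dict; the fields extracted are purely illustrative):

def parse_comp(item):
    # Hypothetical detail-page parser for the (func, details-dict) tuples queued
    # in scrape_base_url: fetch the linked page and stash whatever is scraped.
    page = html.fromstring(session.get(item['url']).text)
    data[item['name']] = {
        'url': item['url'],
        'title': page.findtext('.//title'),  # real code would extract more fields
    }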
Example #9
def init_queue_with_item(queue, item=None):
    # drain out queue
    while queue.qsize() > 0:
        queue.get()
    if item is not None:  # `if item:` would silently drop falsy items such as 0 or ''
        queue.put(item)
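A quick usage sketch with a standard queue.Queue (values are illustrative):

from queue import Queue

q = Queue()
q.put('stale-1')
q.put('stale-2')

init_queue_with_item(q, 'fresh')
print(q.qsize())  # 1 -- the old entries are drained, only 'fresh' remains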
Example #10
        print "job done"
        handler.log("job done")
        print "so far crawled %s pages" % crawled
        handler.log("so far crawled %s pages" % crawled)


queue.put(start_url_1)
queue.put(start_url_2)
pool.spawn(crawler)
handler = Handler()

print 'starting Crawler...'
handler.log('starting Crawler...')
while not queue.empty() and not pool.free_count() == workers_count:
    gevent.sleep(0.8)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)


#wait for jobs to finish
pool.join()
print "Done"
handler.log("Done+\n")
print '\n'
print "collected %s imgs" % ITEMS_COUNT
handler.log("collected %s imgs" % ITEMS_COUNT)
print "see generated output and log files"

handler.close() #close the IO files

Example #11
def on_tick():
    statsd_client.incr('backlog', queue.qsize())
Example #12
        }
    try:
        r = requests.get('http://store.nike.com/cn/zh_cn/', proxies=proxies, timeout=(3, 1))
    except requests.exceptions.ConnectionError:
        return
    except requests.exceptions.ReadTimeout:
        return
    end = time.time()
    delay = '{:.0f}ms'.format((end-start)*1000)
    queue.put([index, delay])

if __name__ == '__main__':
    with open('give.txt', 'r') as f:
        ips = f.read().strip().split('\n')
    pool = gevent.pool.Pool(len(ips))
    queue = gevent.queue.Queue()
    for index, ip in enumerate(ips):
        pool.apply_async(ip_delay, (index, ip))
    pool.join()

    # ip_delay(00, None)
    nums = []
    while True:
        if queue.qsize() > 0:
            task = queue.get()
            print(task)
            nums.append(task[0])
        else:
            break
    nums.sort()
    print(nums)
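Because pool.join() has already returned there are no producers left, so the qsize()-then-get() drain above is safe here; in general that pair can race with other greenlets, and the usual alternative is get_nowait() with the Empty exception. An equivalent drain written that way (same behaviour in this script, just a different idiom):

    nums = []
    while True:
        try:
            task = queue.get_nowait()
        except gevent.queue.Empty:
            break
        print(task)
        nums.append(task[0])
    nums.sort()
    print(nums)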