def crawler(n):
    """
    this is the worker routine, the heart of this solution
    the job is performed by the following steps:
    1. take an url from the queue
    2. make a request to this url
    3. mark it as visited
    4. check whether the response is ok to be parsed
    5. if the url corresponds to a product page, then extract data from it
    6. extract more urls from the current page and add them to the queue
    this is repeated continuously until the queue is empty

    :param n: numeric id of this crawler, used only for log messages
    """
    while True:
        logger.info(
            'links: [%d] pending, [%d] discovered, [%d] visited' %
            (queue.qsize(), len(discovered), len(visited))
        )
        url = queue.get()
        logger.info('crawler [%d] took [%s] from queue' % (n, url))
        try:
            response = requests.get(url, verify=False)  # no SSL validation
            visited.append(url)
            if response.status_code == requests.codes.ok:
                soup = Soup(response.content)
                if is_valid_product_page(url, response):
                    data = extract_product_data(url, soup)
                    csv.write(CSV_FORMAT % data)
                # keep feeding the queue even for non-product pages
                discover_links(url, soup)
            else:
                logger.warning('response not ok for [%s]' % url)
        except Exception:
            # a network error or parse failure must not kill the worker;
            # log it and move on to the next url
            logger.exception('crawler [%d] failed on [%s]' % (n, url))
        finally:
            # always acknowledge the item, otherwise queue.join() would
            # block forever after any failed url
            queue.task_done()
def spawner(queue):
    """Consume items from *queue* forever, handing each one to a fresh
    ``http_getter`` greenlet appended to the global ``gs`` list.

    Returns when the gevent hub raises ``LoopExit`` (nothing left to
    wait on), which is treated as the shutdown signal.
    """
    while True:
        try:
            job = queue.get()
        except hub.LoopExit:
            # hub has no more events pending; shut this spawner down
            logger.error("exit getter spawner...")
            return
        # acknowledge the item, then fan it out to a worker greenlet
        queue.task_done()
        gs.append(gevent.spawn(http_getter, job))
def worker(self, thread_id, queue):  # pylint: disable=unused-argument
    """Endlessly consume spot instance requests from *queue*.

    Each item is passed to ``process_spot_instance_request``; any failure
    is logged and swallowed so the worker keeps running, and every item
    is acknowledged via ``task_done()`` regardless of outcome.
    """
    while True:
        try:
            self.process_spot_instance_request(queue.get())
        except Exception:
            self._logger.exception(
                'Exception while processing spot instance request')
        finally:
            queue.task_done()
def converter(queue):
    """Worker loop: pull conversion jobs off *queue* and run ffmpeg on each.

    Each job is a dict with keys 'source_file' (an open file object,
    presumably the downloaded flv), 'target_file', 'acodec' and 'quality'.
    A sentinel value of ``StopIteration`` (the class itself, compared with
    ``==``) terminates the loop.

    NOTE(review): this is Python-2-era code — ``sys.exc_clear()`` and
    indexing an IOError as ``ex[0]`` for its errno only work on Python 2.
    """
    LOGGER.debug('converter started')
    while True:
        data = queue.get()
        LOGGER.debug('new data for conversion')
        if data == StopIteration:
            # sentinel: acknowledge it and shut this worker down
            queue.task_done()
            break
        LOGGER.debug('flv file: %s' % path.abspath(data['source_file'].name))
        LOGGER.debug('target file: %s' % data['target_file'])
        # audio-only extraction: -vn drops video, -y overwrites the target
        ffmpeg_args = [
            'ffmpeg', '-i', path.abspath(data['source_file'].name),
            '-vn', '-acodec', data['acodec'], '-aq', data['quality'],
            '-y', data['target_file']
        ]
        p = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        # make both pipes non-blocking so reads below return EAGAIN
        # instead of blocking the whole process (cooperative with gevent)
        fcntl.fcntl(p.stdin, fcntl.F_SETFL, O_NONBLOCK)
        fcntl.fcntl(p.stdout, fcntl.F_SETFL, O_NONBLOCK)
        p.stdin.close()
        output = ""  # accumulated ffmpeg output (stdout+stderr merged)
        while True:
            try:
                chunk = p.stdout.read(4096)
                if not chunk:
                    # EOF: ffmpeg closed its end, conversion finished
                    break
                output += chunk
            except IOError:
                ex = sys.exc_info()[1]
                if ex[0] != errno.EAGAIN:
                    # real I/O error, not just "no data yet" — propagate
                    raise
                sys.exc_clear()
                # yield to the gevent hub until the pipe is readable again
                socket.wait_read(p.stdout.fileno())
        p.stdout.close()
        data['source_file'].close()
        LOGGER.debug('convertion done')
        queue.task_done()