def take_action(self, parsed_args):
    self.log.debug('take_action(%s)', parsed_args)
    digits = self.app.client_manager.get_meta1_digits()
    workers_count = parsed_args.workers

    conf = {'namespace': self.app.client_manager.namespace}
    if parsed_args.proxy:
        conf.update({'proxyd_url': parsed_args.proxy})
    else:
        ns_conf = load_namespace_conf(conf['namespace'])
        proxy = ns_conf.get('proxy')
        conf.update({'proxyd_url': proxy})

    workers = list()
    with green.ContextPool(workers_count) as pool:
        pile = GreenPile(pool)
        prefix_queue = Queue(16)

        # Prepare some workers
        for i in range(workers_count):
            w = WarmupWorker(conf, self.log)
            workers.append(w)
            pile.spawn(w.run, prefix_queue)

        # Feed the queue
        trace_increment = 0.01
        trace_next = trace_increment
        sent, total = 0, float(count_prefixes(digits))
        for prefix in generate_prefixes(digits):
            sent += 1
            prefix_queue.put(prefix)
            # Display the progression
            ratio = float(sent) / total
            if ratio >= trace_next:
                self.log.info("... %d%%", int(ratio * 100.0))
                trace_next += trace_increment

        self.log.debug("Send the termination marker")
        prefix_queue.join()

    self.log.info("All the workers are done")
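WarmupWorker is not shown above; only the run(queue) contract is implied by pile.spawn(w.run, prefix_queue). A minimal consumer-side sketch with a hypothetical body:

class WarmupWorker(object):
    """Hypothetical sketch of the consumer side of prefix_queue."""

    def __init__(self, conf, log):
        self.conf = conf
        self.log = log

    def run(self, queue):
        while True:
            prefix = queue.get()
            try:
                # A real worker would contact the directory for this
                # prefix; logging stands in for that here.
                self.log.debug("warming up prefix %s", prefix)
            finally:
                # Lets prefix_queue.join() in take_action() return once
                # every queued prefix has been handled.
                queue.task_done()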
class ECWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls, chunk, sysmeta, reqid=None):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        h = {}
        h["transfer-encoding"] = "chunked"
        h[chunk_headers["content_id"]] = sysmeta['id']
        h[chunk_headers["content_path"]] = sysmeta['content_path']
        h[chunk_headers["content_chunkmethod"]] = sysmeta['chunk_method']
        h[chunk_headers["container_id"]] = sysmeta['container_id']
        h[chunk_headers["chunk_pos"]] = chunk["pos"]
        h[chunk_headers["chunk_id"]] = chunk_path
        h[chunk_headers["content_policy"]] = sysmeta['policy']
        h[chunk_headers["content_version"]] = sysmeta['version']
        if reqid:
            h['X-oio-req-id'] = reqid
        # in the trailer
        # metachunk_size & metachunk_hash
        h["Trailer"] = (chunk_headers["metachunk_size"],
                        chunk_headers["metachunk_hash"])
        with ConnectionTimeout(io.CONNECTION_TIMEOUT):
            conn = io.http_connect(
                parsed.netloc, 'PUT', parsed.path, h)
            conn.chunk = chunk
        return cls(chunk, conn)

    def start(self, pool):
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)
        # spawn the send coroutine
        pool.spawn(self._send)

    def _send(self):
        # this is the send coroutine loop
        while True:
            # fetch input data from the queue
            d = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(d), d)
                try:
                    with ChunkWriteTimeout(io.CHUNK_TIMEOUT):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(d)
                except (Exception, ChunkWriteTimeout) as e:
                    self.failed = True
                    msg = str(e)
                    logger.warn("Failed to write to %s (%s)",
                                self.chunk, msg)
                    self.chunk['error'] = msg
            self.queue.task_done()

    def wait(self):
        # wait until all data in the queue
        # has been processed by the send coroutine
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'],
                            metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'],
                            metachunk_hash),
            '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        # read the HTTP response from the connection
        with Timeout(io.CHUNK_TIMEOUT):
            self.resp = self.conn.getresponse()
        return self.resp
class EcChunkWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn, write_timeout=None, **_kwargs):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()
        self.write_timeout = write_timeout or io.CHUNK_TIMEOUT
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls, chunk, sysmeta, reqid=None,
                connection_timeout=None, write_timeout=None, **_kwargs):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        hdrs = headers_from_object_metadata(sysmeta)
        if reqid:
            hdrs['X-oio-req-id'] = reqid

        hdrs[chunk_headers["chunk_pos"]] = chunk["pos"]
        hdrs[chunk_headers["chunk_id"]] = chunk_path

        # in the trailer
        # metachunk_size & metachunk_hash
        hdrs["Trailer"] = ', '.join((chunk_headers["metachunk_size"],
                                     chunk_headers["metachunk_hash"],
                                     chunk_headers["chunk_hash"]))
        with green.ConnectionTimeout(
                connection_timeout or io.CONNECTION_TIMEOUT):
            conn = io.http_connect(parsed.netloc, 'PUT', parsed.path, hdrs)
            conn.chunk = chunk
        return cls(chunk, conn, write_timeout=write_timeout)

    def start(self, pool):
        """Spawn the send coroutine"""
        pool.spawn(self._send)

    def _send(self):
        """Send coroutine loop"""
        while True:
            # fetch input data from the queue
            data = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(data), data)
                try:
                    with green.ChunkWriteTimeout(self.write_timeout):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(data)
                except (Exception, green.ChunkWriteTimeout) as exc:
                    self.failed = True
                    msg = str(exc)
                    logger.warn("Failed to write to %s (%s)",
                                self.chunk, msg)
                    self.chunk['error'] = 'write: %s' % msg
            self.queue.task_done()

    def wait(self):
        """
        Wait until all data in the queue
        has been processed by the send coroutine
        """
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        """Send metachunk_size and metachunk_hash as trailers"""
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'],
                            metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'],
                            metachunk_hash),
            '%s: %s\r\n' % (chunk_headers['chunk_hash'],
                            self.checksum.hexdigest()),
            '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        """Read the HTTP response from the connection"""
        # As the server may buffer data before writing it to non-volatile
        # storage, we don't know if we have to wait while sending data or
        # while reading response, thus we apply the same timeout to both.
        with Timeout(self.write_timeout):
            return self.conn.getresponse()
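The call order for this writer (and, with small differences, for the older ECWriter variants in this section) is implicit in the methods above. A minimal driver sketch, assuming a green pool and chunk/sysmeta dicts built by the surrounding upload machinery; write_meta_chunk and data_iter are hypothetical names:

def write_meta_chunk(pool, chunk, sysmeta, data_iter, reqid=None):
    """Hypothetical driver showing the intended life cycle."""
    writer = EcChunkWriter.connect(chunk, sysmeta, reqid=reqid)
    writer.start(pool)              # spawn the send coroutine
    size = 0
    for fragment in data_iter:      # producer side of the queue
        writer.send(fragment)
        writer.checksum.update(fragment)
        size += len(fragment)
    writer.wait()                   # drain the queue
    # In the real code the metachunk hash covers the whole metachunk,
    # not just this chunk; using writer.checksum is a simplification.
    writer.finish(size, writer.checksum.hexdigest())
    return writer.getresponse()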
def main():
    args = options()
    global ACCOUNT, PROXY, QUEUE, NS, VERBOSE, TIMEOUT
    global COUNTERS, ELECTIONS
    ACCOUNT = args.account
    NS = args.namespace
    VERBOSE = args.verbose
    TIMEOUT = args.timeout
    PROXY = ObjectStorageApi(NS)
    ELECTIONS = AtomicInteger()

    num_worker_threads = int(args.max_worker)
    print("Using %d workers" % num_worker_threads)

    total_objects = {'size': 0, 'files': 0, 'elapsed': 0}
    total_containers = {'size': 0, 'files': 0, 'elapsed': 0}

    for path in args.path:
        path = path.rstrip('/')
        if '/' in path:
            bucket, path = path.split('/', 1)
        else:
            bucket = path
            path = ""

        containers = []

        QUEUE = Queue()
        pool = eventlet.GreenPool(num_worker_threads)
        for i in range(num_worker_threads):
            pool.spawn(worker_objects)

        COUNTERS = AtomicInteger()
        _bucket = container_hierarchy(bucket, path)
        # we don't use placeholders, we use the path as a prefix
        for entry in full_list(prefix=container_hierarchy(bucket, path)):
            name, _files, _size, _ = entry
            if name != _bucket and not name.startswith(_bucket + '%2F'):
                continue
            if _files:
                QUEUE.put(name)
            containers.append(name)

        # we have to wait for all objects to be deleted
        print("Waiting flush of objects")
        report = args.report
        while not QUEUE.empty():
            ts = time.time()
            while time.time() - ts < report and not QUEUE.empty():
                time.sleep(1)
            diff = time.time() - ts
            val = COUNTERS.reset()
            elections = ELECTIONS.reset()
            print("Objects: %5.2f / Size: %5.2f" % (
                      val[0] / diff, val[1] / diff),
                  "Elections failed: %5.2f/s total: %d" % (
                      elections[0] / diff, ELECTIONS.total()[0]),
                  " " * 20, end='\r')
            sys.stdout.flush()

        print("Waiting end of workers")
        QUEUE.join()
        val = COUNTERS.total()
        total_objects['files'] += val[0]
        total_objects['size'] += val[1]
        total_objects['elapsed'] += COUNTERS.time()

        COUNTERS = AtomicInteger()
        QUEUE = Queue()
        for i in range(num_worker_threads):
            pool.spawn(worker_container)

        print("We have to delete", len(containers), "containers")
        for container in containers:
            QUEUE.put(container)

        while not QUEUE.empty():
            ts = time.time()
            while time.time() - ts < report and not QUEUE.empty():
                time.sleep(1)
            diff = time.time() - ts
            val = COUNTERS.reset()
            elections = ELECTIONS.reset()
            print("Containers: %5.2f" % (val[0] / diff),
                  "Elections failed: %5.2f/s total: %d" % (
                      elections[0] / diff, ELECTIONS.total()[0]),
                  " " * 20, end='\r')
            sys.stdout.flush()

        QUEUE.join()
        val = COUNTERS.total()
        total_containers['files'] += val[0]
        total_containers['size'] += val[1]
        total_containers['elapsed'] += COUNTERS.time()

    print("""
Objects:
- ran during {o[elapsed]:5.2f}
- {o[files]} objects removed (size {size})
- {o_file_avg:5.2f} objects/s ({o_size_avg} avg. size/s)
""".format(o=total_objects,
           size=show(total_objects['size'], True),
           o_file_avg=total_objects['files'] / total_objects['elapsed'],
           o_size_avg=show(
               total_objects['size'] / total_objects['elapsed'], True)))

    print("""
Containers:
- ran during {o[elapsed]:5.2f}
- {o[files]} containers
- {o_file_avg:5.2f} containers/s
""".format(o=total_containers,
           o_file_avg=total_containers['files'] /
           total_containers['elapsed']))

    print("Elections failed: %d" % ELECTIONS.total()[0])
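worker_objects and worker_container are defined elsewhere in the script. A minimal sketch of the object worker, assuming PROXY's object_list/object_delete are the standard oio ObjectStorageApi calls; AtomicInteger exposing an add(count, size) method is an assumption:

def worker_objects():
    """Hypothetical sketch; the real worker also times its work and
    counts failed elections. AtomicInteger.add() is an assumed API."""
    while True:
        container = QUEUE.get()
        try:
            listing = PROXY.object_list(ACCOUNT, container)
            for obj in listing['objects']:
                PROXY.object_delete(ACCOUNT, container, obj['name'])
                COUNTERS.add(1, obj['size'])
        finally:
            # Lets QUEUE.join() in main() return once every queued
            # container has been emptied.
            QUEUE.task_done()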
class Crawler(object):
    """
    A crawler will traverse all the pages of a site and process the
    content in a defined way.

    :param init_urls: the very first urls to start with.
    :param q: the queue that stores all urls to be crawled
    :param urls: a set that stores all urls already crawled
    """
    def __init__(self, init_urls, max_workers=200):
        self.init_urls = init_urls
        self.max_workers = max_workers
        self.q = Queue()
        self.urls = set()
        self.s = requests.Session()
        self.root_hosts = set()
        for url in init_urls:
            self.q.put(url)
            self.urls.add(url)
            self.root_hosts.add(get_netloc(url))

    def url_allowed(self, url):
        """Check if the given url will be crawled.

        Currently, only urls that belong to the same host as init_urls
        are allowed.
        """
        return get_netloc(url) in self.root_hosts

    def save(self, response):
        """Save data at the given url."""
        raise NotImplementedError(
            "Please implement your own save logic in subclass.")

    def parse(self, response):
        self.save(response)
        new_links = set()
        for url in self.find_links(response):
            if url not in self.urls and self.url_allowed(url):
                new_links.add(url)
                self.urls.add(url)
                self.q.put(url)
        if new_links:
            print("Found %d new urls to crawl" % len(new_links))

    def fetch(self, url):
        """Fetch the content of the url from the network."""
        response = self.s.get(url)
        print("Getting content from %s, length: %d"
              % (url, len(response.content)))
        return response

    def work(self, i):
        """Define the work process.

        Retrieve a url from the queue, fetch the content from it,
        process it and put new urls to crawl into the queue.
        Continue the process until all pages are crawled.

        :param i: indicates the worker number
        """
        while True:
            url = self.q.get()
            print("Worker %d: Getting url %s from queue." % (i, url))
            response = self.fetch(url)
            self.parse(response)
            self.q.task_done()

    def run(self):
        """Start the crawling process.

        This is the main entrance for our crawler. It will start
        several workers, crawling in parallel.
        """
        pool = eventlet.GreenPool()
        start = time.time()
        for i in range(self.max_workers):
            pool.spawn(self.work, i)
        self.q.join()
        end = time.time()
        print("Finished crawling, took %s seconds." % str(end - start))
        print("Have fun hacking!")
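Crawler is abstract as written: save() raises NotImplementedError and find_links() is referenced but never defined. A hypothetical subclass sketch providing both; the regex link extraction and file naming below are simplistic stand-ins, not the author's implementation:

import os
import re

class FileSaverCrawler(Crawler):
    """Hypothetical subclass that dumps each fetched page to disk."""

    LINK_RE = re.compile(r'href="(https?://[^"]+)"')

    def save(self, response):
        # Derive a flat file name from the url; crude but enough
        # for a demo.
        name = re.sub(r'\W+', '_', response.url)[:100] + '.html'
        with open(os.path.join('/tmp/crawl', name), 'wb') as out:
            out.write(response.content)

    def find_links(self, response):
        # Stands in for real HTML parsing of the fetched page.
        return self.LINK_RE.findall(response.text)


if __name__ == '__main__':
    os.makedirs('/tmp/crawl', exist_ok=True)
    FileSaverCrawler(['https://example.com/']).run()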
class ECWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls, chunk, sysmeta):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        h = {}
        h["transfer-encoding"] = "chunked"
        h[chunk_headers["content_id"]] = sysmeta['id']
        h[chunk_headers["content_path"]] = sysmeta['content_path']
        h[chunk_headers["content_chunkmethod"]] = sysmeta['chunk_method']
        h[chunk_headers["container_id"]] = sysmeta['container_id']
        h[chunk_headers["chunk_pos"]] = chunk["pos"]
        h[chunk_headers["chunk_id"]] = chunk_path
        h[chunk_headers["content_policy"]] = sysmeta['policy']
        h[chunk_headers["content_version"]] = sysmeta['version']
        # in the trailer
        # metachunk_size & metachunk_hash
        h["Trailer"] = (chunk_headers["metachunk_size"],
                        chunk_headers["metachunk_hash"])
        with ConnectionTimeout(io.CONNECTION_TIMEOUT):
            conn = io.http_connect(parsed.netloc, 'PUT', parsed.path, h)
            conn.chunk = chunk
        return cls(chunk, conn)

    def start(self, pool):
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)
        # spawn the send coroutine
        pool.spawn(self._send)

    def _send(self):
        # this is the send coroutine loop
        while True:
            # fetch input data from the queue
            d = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(d), d)
                try:
                    with ChunkWriteTimeout(io.CHUNK_TIMEOUT):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(d)
                except (Exception, ChunkWriteTimeout) as e:
                    self.failed = True
                    msg = str(e)
                    logger.warn("Failed to write to %s (%s)",
                                self.chunk, msg)
                    self.chunk['error'] = msg
            self.queue.task_done()

    def wait(self):
        # wait until all data in the queue
        # has been processed by the send coroutine
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'],
                            metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'],
                            metachunk_hash),
            '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        # read the HTTP response from the connection
        with Timeout(io.CHUNK_TIMEOUT):
            self.resp = self.conn.getresponse()
        return self.resp