Esempio n. 1
0
    def take_action(self, parsed_args):
        self.log.debug('take_action(%s)', parsed_args)
        digits = self.app.client_manager.get_meta1_digits()
        workers_count = parsed_args.workers

        conf = {'namespace': self.app.client_manager.namespace}
        if parsed_args.proxy:
            conf.update({'proxyd_url': parsed_args.proxy})
        else:
            ns_conf = load_namespace_conf(conf['namespace'])
            proxy = ns_conf.get('proxy')
            conf.update({'proxyd_url': proxy})

        workers = list()
        with green.ContextPool(workers_count) as pool:
            pile = GreenPile(pool)
            prefix_queue = Queue(16)

            # Prepare some workers
            for i in range(workers_count):
                w = WarmupWorker(conf, self.log)
                workers.append(w)
                pile.spawn(w.run, prefix_queue)

            # Feed the queue
            trace_increment = 0.01
            trace_next = trace_increment
            sent, total = 0, float(count_prefixes(digits))
            for prefix in generate_prefixes(digits):
                sent += 1
                prefix_queue.put(prefix)
                # Display the progression
                ratio = float(sent) / total
                if ratio >= trace_next:
                    self.log.info("... %d%%", int(ratio * 100.0))
                    trace_next += trace_increment

            self.log.debug("Send the termination marker")
            prefix_queue.join()

        self.log.info("All the workers are done")
Esempio n. 2
0
class ECWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls, chunk, sysmeta, reqid=None):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        h = {}
        h["transfer-encoding"] = "chunked"
        h[chunk_headers["content_id"]] = sysmeta['id']
        h[chunk_headers["content_path"]] = sysmeta['content_path']
        h[chunk_headers["content_chunkmethod"]] = sysmeta['chunk_method']
        h[chunk_headers["container_id"]] = sysmeta['container_id']
        h[chunk_headers["chunk_pos"]] = chunk["pos"]
        h[chunk_headers["chunk_id"]] = chunk_path
        h[chunk_headers["content_policy"]] = sysmeta['policy']
        h[chunk_headers["content_version"]] = sysmeta['version']
        if reqid:
            h['X-oio-req-id'] = reqid

        # in the trailer
        # metachunk_size & metachunk_hash
        h["Trailer"] = (chunk_headers["metachunk_size"],
                        chunk_headers["metachunk_hash"])
        with ConnectionTimeout(io.CONNECTION_TIMEOUT):
            conn = io.http_connect(
                parsed.netloc, 'PUT', parsed.path, h)
            conn.chunk = chunk
        return cls(chunk, conn)

    def start(self, pool):
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)
        # spawn the send coroutine
        pool.spawn(self._send)

    def _send(self):
        # this is the send coroutine loop
        while True:
            # fetch input data from the queue
            d = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(d), d)
                try:
                    with ChunkWriteTimeout(io.CHUNK_TIMEOUT):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(d)
                except (Exception, ChunkWriteTimeout) as e:
                    self.failed = True
                    msg = str(e)
                    logger.warn("Failed to write to %s (%s)", self.chunk, msg)
                    self.chunk['error'] = msg

            self.queue.task_done()

    def wait(self):
        # wait until all data in the queue
        # has been processed by the send coroutine
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'],
                            metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'],
                            metachunk_hash),
            '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        # read the HTTP response from the connection
        with Timeout(io.CHUNK_TIMEOUT):
            self.resp = self.conn.getresponse()
            return self.resp
Esempio n. 3
0
class EcChunkWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn, write_timeout=None, **_kwargs):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()
        self.write_timeout = write_timeout or io.CHUNK_TIMEOUT
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls,
                chunk,
                sysmeta,
                reqid=None,
                connection_timeout=None,
                write_timeout=None,
                **_kwargs):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        hdrs = headers_from_object_metadata(sysmeta)
        if reqid:
            hdrs['X-oio-req-id'] = reqid

        hdrs[chunk_headers["chunk_pos"]] = chunk["pos"]
        hdrs[chunk_headers["chunk_id"]] = chunk_path

        # in the trailer
        # metachunk_size & metachunk_hash
        hdrs["Trailer"] = ', '.join(
            (chunk_headers["metachunk_size"], chunk_headers["metachunk_hash"],
             chunk_headers["chunk_hash"]))
        with green.ConnectionTimeout(connection_timeout
                                     or io.CONNECTION_TIMEOUT):
            conn = io.http_connect(parsed.netloc, 'PUT', parsed.path, hdrs)
            conn.chunk = chunk
        return cls(chunk, conn, write_timeout=write_timeout)

    def start(self, pool):
        """Spawn the send coroutine"""
        pool.spawn(self._send)

    def _send(self):
        """Send coroutine loop"""
        while True:
            # fetch input data from the queue
            data = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(data), data)
                try:
                    with green.ChunkWriteTimeout(self.write_timeout):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(data)
                except (Exception, green.ChunkWriteTimeout) as exc:
                    self.failed = True
                    msg = str(exc)
                    logger.warn("Failed to write to %s (%s)", self.chunk, msg)
                    self.chunk['error'] = 'write: %s' % msg

            self.queue.task_done()

    def wait(self):
        """
        Wait until all data in the queue
        has been processed by the send coroutine
        """
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        """Send metachunk_size and metachunk_hash as trailers"""
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'], metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'], metachunk_hash),
            '%s: %s\r\n' %
            (chunk_headers['chunk_hash'], self.checksum.hexdigest()), '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        """Read the HTTP response from the connection"""
        # As the server may buffer data before writing it to non-volatile
        # storage, we don't know if we have to wait while sending data or
        # while reading response, thus we apply the same timeout to both.
        with Timeout(self.write_timeout):
            return self.conn.getresponse()
Esempio n. 4
0
def main():
    args = options()

    global ACCOUNT, PROXY, QUEUE, NS, VERBOSE, TIMEOUT
    global COUNTERS, ELECTIONS
    ACCOUNT = args.account
    NS = args.namespace
    VERBOSE = args.verbose
    TIMEOUT = args.timeout
    PROXY = ObjectStorageApi(NS)
    ELECTIONS = AtomicInteger()

    num_worker_threads = int(args.max_worker)
    print("Using %d workers" % num_worker_threads)

    total_objects = {'size': 0, 'files': 0, 'elapsed': 0}
    total_containers = {'size': 0, 'files': 0, 'elapsed': 0}

    for path in args.path:
        path = path.rstrip('/')
        if '/' in path:
            bucket, path = path.split('/', 1)
        else:
            bucket = path
            path = ""

        containers = []

        QUEUE = Queue()
        pool = eventlet.GreenPool(num_worker_threads)

        for i in range(num_worker_threads):
            pool.spawn(worker_objects)

        COUNTERS = AtomicInteger()
        _bucket = container_hierarchy(bucket, path)
        # we don't use placeholders, we use prefix path as prefix
        for entry in full_list(prefix=container_hierarchy(bucket, path)):
            name, _files, _size, _ = entry
            if name != _bucket and not name.startswith(_bucket + '%2F'):
                continue

            if _files:
                QUEUE.put(name)

            containers.append(name)

        # we have to wait all objects
        print("Waiting flush of objects")

        report = args.report

        while not QUEUE.empty():
            ts = time.time()
            while time.time() - ts < report and not QUEUE.empty():
                time.sleep(1)
            diff = time.time() - ts
            val = COUNTERS.reset()
            elections = ELECTIONS.reset()
            print("Objects: %5.2f / Size: %5.2f" %
                  (val[0] / diff, val[1] / diff),
                  "Elections failed: %5.2f/s total: %d" %
                  (elections[0] / diff, ELECTIONS.total()[0]),
                  " " * 20,
                  end='\r')
            sys.stdout.flush()

        print("Waiting end of workers")
        QUEUE.join()

        val = COUNTERS.total()
        total_objects['files'] += val[0]
        total_objects['size'] += val[1]
        total_objects['elapsed'] += COUNTERS.time()

        COUNTERS = AtomicInteger()

        QUEUE = Queue()
        for i in range(num_worker_threads):
            pool.spawn(worker_container)

        print("We have to delete", len(containers), "containers")

        for container in containers:
            QUEUE.put(container)

        while not QUEUE.empty():
            ts = time.time()
            while time.time() - ts < report and not QUEUE.empty():
                time.sleep(1)
            diff = time.time() - ts
            val = COUNTERS.reset()
            elections = ELECTIONS.reset()
            print("Containers: %5.2f" % (val[0] / diff),
                  "Elections failed: %5.2f/s total: %d" %
                  (elections[0] / diff, ELECTIONS.total()[0]),
                  " " * 20,
                  end='\r')
            sys.stdout.flush()

        QUEUE.join()
        val = COUNTERS.total()
        total_containers['files'] += val[0]
        total_containers['size'] += val[1]
        total_containers['elapsed'] += COUNTERS.time()

    print("""
Objects:
    - ran during {o[elapsed]:5.2f}
    - {o[files]} objects removed (size {size})
    - {o_file_avg:5.2f} objects/s ({o_size_avg} avg. size/s)
""".format(o=total_objects,
           size=show(total_objects['size'], True),
           o_file_avg=total_objects['files'] / total_objects['elapsed'],
           o_size_avg=show(total_objects['size'] / total_objects['elapsed'],
                           True)))

    print("""
Containers:
    - ran during {o[elapsed]:5.2f}
    - {o[files]} containers
    - {o_file_avg:5.2f} containers/s
""".format(o=total_containers,
           o_file_avg=total_containers['files'] / total_containers['elapsed']))

    print("Elections failed: %d" % ELECTIONS.total()[0])
Esempio n. 5
0
class Crawler(object):
    """
    A crawler will traverse all the pages of a site and process the content
    in a defined way.

    :param init_urls: the very first urls to start with.
    :param q: the queue that stores all urls to be crawled
    :param urls: a set stores all urls already crawled
    """

    def __init__(self, init_urls, max_workers=200):
        self.init_urls = init_urls
        self.max_workers = max_workers
        self.q = Queue()
        self.urls = set()
        self.s = requests.Session()
        self.root_hosts = set()
        for url in init_urls:
            self.q.put(url)
            self.urls.add(url)
            self.root_hosts.add(get_netloc(url))

    def url_allowed(self, url):
        """Check if given url will be crawled.

        Current, only if the url belongs to the same host as init_urls.
        """
        return get_netloc(url) in self.root_hosts


    def save(self, response):
        """Save data at the given url."""
        raise NotImplementedError(
            "Please implement your own save logic in subclass.")

    def parse(self, response):
        self.save(response)

        new_links = set()

        for url in self.find_links(response):
            if url not in self.urls and self.url_allowed(url):
                new_links.add(url)
                self.urls.add(url)
                self.q.put(url)
        if len(new_links) != 0:
            print("Find %d new urls to crawl" % len(new_links))


    def fetch(self, url):
        """Fetch content of the url from network."""

        response = self.s.get(url)
        print("Getting content from %s, length: %d" % (url,
                                                       len(response.content)))
        return response

    def work(self, i):
        """Define the work process.

        Retrieve a url from queue, fetch the content from it,
        process it and get new urls to crawl.
        Continue the process until all pages are crawled.

        :param i: indicate the worker number
        """
        while True:
            url = self.q.get()
            print("Worker %d: Getting url %s from queue." % (i, url))
            response = self.fetch(url)
            self.parse(response)
            self.q.task_done()

    def run(self):
        """Start the crawling process.

        This is the main entrance for our crawler. It will start several
        workers, crawling in parallel.
        """
        pool = eventlet.GreenPool()
        start = time.time()
        for i in range(self.max_workers):
            pool.spawn(self.work, i)

        self.q.join()
        end = time.time()

        print("Finished crawling, takes %s seconds." % str(end - start))
        print("Have fun hacking!")
Esempio n. 6
0
class ECWriter(object):
    """
    Writes an EC chunk
    """
    def __init__(self, chunk, conn):
        self._chunk = chunk
        self._conn = conn
        self.failed = False
        self.bytes_transferred = 0
        self.checksum = hashlib.md5()

    @property
    def chunk(self):
        return self._chunk

    @property
    def conn(self):
        return self._conn

    @classmethod
    def connect(cls, chunk, sysmeta):
        raw_url = chunk["url"]
        parsed = urlparse(raw_url)
        chunk_path = parsed.path.split('/')[-1]
        h = {}
        h["transfer-encoding"] = "chunked"
        h[chunk_headers["content_id"]] = sysmeta['id']
        h[chunk_headers["content_path"]] = sysmeta['content_path']
        h[chunk_headers["content_chunkmethod"]] = sysmeta['chunk_method']
        h[chunk_headers["container_id"]] = sysmeta['container_id']
        h[chunk_headers["chunk_pos"]] = chunk["pos"]
        h[chunk_headers["chunk_id"]] = chunk_path
        h[chunk_headers["content_policy"]] = sysmeta['policy']
        h[chunk_headers["content_version"]] = sysmeta['version']

        # in the trailer
        # metachunk_size & metachunk_hash
        h["Trailer"] = (chunk_headers["metachunk_size"],
                        chunk_headers["metachunk_hash"])
        with ConnectionTimeout(io.CONNECTION_TIMEOUT):
            conn = io.http_connect(parsed.netloc, 'PUT', parsed.path, h)
            conn.chunk = chunk
        return cls(chunk, conn)

    def start(self, pool):
        # we use eventlet Queue to pass data to the send coroutine
        self.queue = Queue(io.PUT_QUEUE_DEPTH)
        # spawn the send coroutine
        pool.spawn(self._send)

    def _send(self):
        # this is the send coroutine loop
        while True:
            # fetch input data from the queue
            d = self.queue.get()
            # use HTTP transfer encoding chunked
            # to write data to RAWX
            if not self.failed:
                # format the chunk
                to_send = "%x\r\n%s\r\n" % (len(d), d)
                try:
                    with ChunkWriteTimeout(io.CHUNK_TIMEOUT):
                        self.conn.send(to_send)
                        self.bytes_transferred += len(d)
                except (Exception, ChunkWriteTimeout) as e:
                    self.failed = True
                    msg = str(e)
                    logger.warn("Failed to write to %s (%s)", self.chunk, msg)
                    self.chunk['error'] = msg

            self.queue.task_done()

    def wait(self):
        # wait until all data in the queue
        # has been processed by the send coroutine
        if self.queue.unfinished_tasks:
            self.queue.join()

    def send(self, data):
        # do not send empty data because
        # this will end the chunked body
        if not data:
            return
        # put the data to send into the queue
        # it will be processed by the send coroutine
        self.queue.put(data)

    def finish(self, metachunk_size, metachunk_hash):
        parts = [
            '0\r\n',
            '%s: %s\r\n' % (chunk_headers['metachunk_size'], metachunk_size),
            '%s: %s\r\n' % (chunk_headers['metachunk_hash'], metachunk_hash),
            '\r\n'
        ]
        to_send = "".join(parts)
        self.conn.send(to_send)

    def getresponse(self):
        # read the HTTP response from the connection
        with Timeout(io.CHUNK_TIMEOUT):
            self.resp = self.conn.getresponse()
            return self.resp