class Replicator(Thread):
    """
	Thread responsible for sending compressed batches to each replica in synchronized
	new threads
	"""
    def __init__(self, replicas, payid, total_batches):
        super().__init__(name="Replicator")
        self.progress = ProgressMeter(total_batches, "Replicator")
        self.replicas = replicas
        self.payid = payid
        self.__queue = Queue()  #maybe cache

    def send_handler(payid, msock, shape, compressed):
        """
		Send to a replicas the current chunk
		
		Args:
			payid (Payload.Id) : data network id
			msock (network.Socket): socket to send data.
			shape (tuple): batch's number of lines and columns
			compressed (bytes): compressed batch data
		"""
        msock.send(Payload(payid, (shape, compressed)))

    def run(self):
        i = 0
        threads = [None for _ in self.replicas]
        while True:
            should_stop, shape, compressed = self.__queue.get()
            if should_stop: break
            print(f"t={i} sending {shape[0]} points")
            #shuffles replicas to reduce same interface distribution problem
            for j, replica in enumerate(outplace_shuffle(self.replicas)):
                thread = Thread(name=f"Replicator.send_handler-{i,j}",
                                target=Replicator.send_handler,
                                args=(self.payid, replica, shape, compressed))
                threads[j] = thread
                thread.start()
            for thread in threads:
                thread.join()
            self.progress.update(1)
            i += 1

    def add_job(self, shape, compressed):
        self.__queue.put((False, shape, compressed))

    def join(self):
        self.__queue.put((True, None, None))
        super().join()
Esempio n. 2
0
class Bucket:
    """
	Manages each bucket entry to caclulate best silhouette for each batch index

	Args:
		batch_size (int): size of each chunk of the dataset
		total_batches (int): total number of batches
	"""
    def __init__(self, max_size, total_batches):
        self.lock = Lock()
        self.data = {}  #t -> Bucket.Entry
        self.max_size = max_size
        self.total_batches = total_batches
        self.progress = ProgressMeter(total_batches, "Bucket")

    def add(self, t, k, sil, msock):
        with self.lock:
            entry = self.data.get(t)
            if entry == None:  #insert new entry
                entry = BucketEntry(sil=sil,
                                    k=k,
                                    counter=1,
                                    msock=msock,
                                    timer=Timer(),
                                    proc_info=get_proc_info())
                self.data[t] = entry
                entry.timer.start()
            else:  #update entry
                entry.counter += 1
                if sil > entry.sil:
                    entry.sil = sil
                    entry.k = k
                    entry.msock = msock
                elif sil == entry.sil and k <= entry.k:  #collision
                    entry.sil = sil
                    entry.k = k
                    entry.msock = msock

            isfull = entry.counter == self.max_size
            if isfull:
                self.progress.update(1)
                entry.timer.stop()
            if entry.counter > self.max_size:
                raise RuntimeError("Unexpected error: bucket already full")
            return isfull

    def get(self, t):
        with self.lock:
            return self.data.get(t)

    def to_dicts(self):
        return [
            dict(batch_counter=t,
                 silhouette=entry.sil,
                 k=entry.k,
                 entry_counter=entry.counter,
                 time_start=entry.timer.beginning,
                 time_end=entry.timer.end,
                 time=entry.timer.t,
                 rss=entry.proc_info.rss,
                 data_write=entry.proc_info.data_write,
                 data_read=entry.proc_info.data_write)
            for t, entry in self.data.items()
        ]