class Replicator(Thread):
    """
    Thread responsible for sending compressed batches to each replica in
    synchronized new threads.

    Args:
        replicas (list): sockets of the replica nodes to broadcast to
        payid (Payload.Id): data network id stamped on every outgoing payload
        total_batches (int): expected number of batches (progress reporting)
    """

    def __init__(self, replicas, payid, total_batches):
        super().__init__(name="Replicator")
        self.progress = ProgressMeter(total_batches, "Replicator")
        self.replicas = replicas
        self.payid = payid
        # Work queue of (should_stop, shape, compressed) tuples; the
        # (True, None, None) sentinel enqueued by join() makes run() exit.
        self.__queue = Queue()

    #maybe cache
    @staticmethod
    def send_handler(payid, msock, shape, compressed):
        """
        Send the current chunk to one replica.

        Decorated @staticmethod: it takes no instance state and is launched
        as a thread target via Replicator.send_handler — without the
        decorator, an instance call would misbind `self` as `payid`.

        Args:
            payid (Payload.Id) : data network id
            msock (network.Socket): socket to send data.
            shape (tuple): batch's number of lines and columns
            compressed (bytes): compressed batch data
        """
        msock.send(Payload(payid, (shape, compressed)))

    def run(self):
        """Consume queued batches, fanning each one out to all replicas."""
        i = 0
        threads = [None for _ in self.replicas]
        while True:
            should_stop, shape, compressed = self.__queue.get()
            if should_stop:
                break
            print(f"t={i} sending {shape[0]} points")
            #shuffles replicas to reduce same interface distribution problem
            for j, replica in enumerate(outplace_shuffle(self.replicas)):
                thread = Thread(name=f"Replicator.send_handler-{i,j}",
                                target=Replicator.send_handler,
                                args=(self.payid, replica, shape, compressed))
                threads[j] = thread
                thread.start()
            # Wait for the full fan-out before reporting this batch done.
            for thread in threads:
                thread.join()
            self.progress.update(1)
            i += 1

    def add_job(self, shape, compressed):
        """Enqueue one batch (shape, compressed bytes) for replication."""
        self.__queue.put((False, shape, compressed))

    def join(self, timeout=None):
        """
        Signal the worker to stop, then wait for it to finish.

        Accepts `timeout` (forwarded to Thread.join) so the override stays
        signature-compatible with the base class contract.
        """
        self.__queue.put((True, None, None))
        super().join(timeout)
class Bucket:
    """
    Manages each bucket entry to calculate the best silhouette for each
    batch index. All mutating/reading accessors are guarded by a lock so
    concurrent reporter threads can call add()/get() safely.

    Args:
        max_size (int): number of reports expected per batch index before
            its entry is considered full
        total_batches (int): total number of batches
    """

    def __init__(self, max_size, total_batches):
        self.lock = Lock()
        self.data = {}  # t -> BucketEntry
        self.max_size = max_size
        self.total_batches = total_batches
        self.progress = ProgressMeter(total_batches, "Bucket")

    def add(self, t, k, sil, msock):
        """
        Record one (k, silhouette) report for batch index `t`, keeping the
        best one seen so far: highest silhouette, ties broken toward the
        smaller k.

        Args:
            t (int): batch index
            k (int): candidate cluster count for this report
            sil (float): silhouette score for this report
            msock: socket of the node that produced the report

        Returns:
            bool: True exactly when the entry reaches `max_size` reports.

        Raises:
            RuntimeError: if more than `max_size` reports arrive for `t`.
        """
        with self.lock:
            entry = self.data.get(t)
            if entry is None:
                #insert new entry
                entry = BucketEntry(sil=sil, k=k, counter=1, msock=msock,
                                    timer=Timer(), proc_info=get_proc_info())
                self.data[t] = entry
                entry.timer.start()
            else:
                #update entry
                entry.counter += 1
                # New best: strictly better silhouette, or an equal
                # silhouette with a smaller-or-equal k (collision).
                if sil > entry.sil or (sil == entry.sil and k <= entry.k):
                    entry.sil = sil
                    entry.k = k
                    entry.msock = msock
            isfull = entry.counter == self.max_size
            if isfull:
                self.progress.update(1)
                entry.timer.stop()
            if entry.counter > self.max_size:
                raise RuntimeError("Unexpected error: bucket already full")
            return isfull

    def get(self, t):
        """Return the BucketEntry for batch index `t`, or None if absent."""
        with self.lock:
            return self.data.get(t)

    def to_dicts(self):
        """Flatten every entry into a plain dict for reporting/serialization."""
        return [
            dict(batch_counter=t,
                 silhouette=entry.sil,
                 k=entry.k,
                 entry_counter=entry.counter,
                 time_start=entry.timer.beginning,
                 time_end=entry.timer.end,
                 time=entry.timer.t,
                 rss=entry.proc_info.rss,
                 data_write=entry.proc_info.data_write,
                 # was proc_info.data_write — copy-paste bug fixed
                 data_read=entry.proc_info.data_read)
            for t, entry in self.data.items()
        ]