def train_step(peer, t, PSS):
    """Run one or several gradient-exchange training rounds for `peer`.

    Args:
        peer: the local node; trains locally, broadcasts its gradients and
            aggregates the gradients received from its active neighbors.
        t: a single round number, or an iterable of round numbers
            (`range` or `tqdm` progress iterator).
        PSS: peer-sampling service; currently unused — the networkUpdate
            step is disabled (see commented call at the end of the loop).
    """
    # Accept either one round number or an iterable of rounds.
    T = t if isinstance(t, (tqdm, range)) else [t]
    for t in T:
        # Train locally for one epoch.
        peer.train_one_epoch()
        # Broadcast local gradients to a random fraction of the neighbors.
        active = active_peers(peer.neighbors, peer.params.frac)
        msg = protocol.train_step(t, peer.get_gradients())
        peer.broadcast(msg, active)
        # Wait (up to 3s, polling every 0.05s) for enough gradient
        # messages tagged with the current round number t.
        wait_until(enough_grads, 3, 0.05, peer, t, len(active))
        if t not in peer.V:
            # No message arrived for this round; aggregate over an empty set.
            peer.V[t] = []
            log('error', f"{peer} received no messages in round {t}")
        else:
            log(
                'log',
                f"{peer} -- T= {t} -- Got enough messages : {len(peer.V[t])}.")
        # Aggregate the received gradients.
        v_t = collaborativeUpdateLight(peer, t)
        # Apply the aggregated update; evaluate every 10 rounds.
        # TODO Review update function
        update_model(peer, v_t, evaluate=(t % 10 == 0))
        # Start accepting gradients from the next round and free this round's buffer.
        peer.current_round = t + 1
        del peer.V[t]
        # networkUpdate(peer, t, PSS)
    return
def train_step(peer, t):
    """Run one or several model-exchange training rounds for `peer`.

    Unlike the gradient-based variant, this one broadcasts the full model
    parameters to the active neighbors and aggregates the received models.

    Args:
        peer: the local node.
        t: a single round number, or an iterable of round numbers
            (`range` or `tqdm` progress iterator).
    """
    # Accept either one round number or an iterable of rounds.
    T = t if isinstance(t, (tqdm, range)) else [t]
    for t in T:
        # Train locally for E (one) epoch.
        peer.train_one_epoch()
        # Broadcast the current model to all my active neighbors.
        active = active_peers(peer.neighbors, peer.params.frac)
        # TODO check exchanging grads instead of model params.
        msg = protocol.train_step(t, peer.get_model_params())
        peer.broadcast(msg, active)
        # Wait for enough updates labeled with round number t.
        wait_until(enough_received, WAIT_TIMEOUT, WAIT_INTERVAL, peer, t,
                   len(active))
        if t not in peer.V:
            # No message arrived for this round; aggregate over an empty set.
            peer.V[t] = []
            log('error', f"{peer} received no messages in round {t}.")
        else:
            log(
                'log',
                f"{peer} got {len(peer.V[t])}/{len(active)} messages in round {t}."
            )
        # Estimate sigma in the first round (presumably a no-op in later
        # rounds — confirm in estimate_sigma's implementation).
        estimate_sigma(peer)
        # Aggregate the received models.
        v = collaborativeUpdate(peer, t)
        # Apply the update; evaluate every EVAL_ROUND rounds.
        update_model(peer, v, evaluate=(t % EVAL_ROUND == 0))
        # Start accepting gradients from the next round and free this round's buffer.
        peer.current_round = t + 1
        del peer.V[t]
    return
def train_step(peer, t):
    """Run one or several model-exchange training rounds for `peer`,
    updating the progress bar (when `t` is a tqdm iterator) as it goes.

    Args:
        peer: the local node.
        t: a single round number, or an iterable of round numbers
            (`range` or `tqdm` progress iterator).
    """
    # Accept either one round number or an iterable of rounds.
    T = t if isinstance(t, (tqdm, range)) else [t]
    for t in T:
        # Train locally for E (one) epoch.
        peer.train_one_epoch()
        # Broadcast the current model to all my active neighbors.
        active = active_peers(peer.neighbors, peer.params.frac)
        msg = protocol.train_step(t, peer.get_model_params())
        peer.broadcast(msg, active)
        # Wait for enough updates labeled with round number t.
        wait_until(enough_received, WAIT_TIMEOUT, WAIT_INTERVAL, peer, t,
                   len(active))
        if t not in peer.V:
            # No message arrived for this round; aggregate over an empty set.
            peer.V[t] = []
            peer.log('error', f"{peer} received no messages in round {t}.")
        # Aggregate the received models.
        w_t = collaborativeUpdateLight(peer, t)
        # Surface evaluation status on the progress bar when applicable.
        if isinstance(T, tqdm):
            T.set_postfix_str(f"{peer} running evaluation in round {t}..." if (
                t % EVAL_ROUND) == 0 else "")
        # TODO Review update function
        update_model(peer, w_t, evaluate=(t % EVAL_ROUND == 0), t=t)
        # Start accepting gradients from the next round and free this round's buffer.
        peer.current_round = t + 1
        del peer.V[t]
    return
def train_step(peer: Node, t, args):
    """Run one or several rounds of server-coordinated (federated) training.

    The node identified by `args.server_id` acts as the server: it waits for
    every neighbor's model, aggregates them with GAR and broadcasts a global
    model. Every other node acts as a worker: it installs the latest global
    model, trains for `args.epochs` epochs, and sends its model to the server.

    Args:
        peer: the local node (server or worker depending on its id).
        t: a single round number, or an iterable of round numbers
            (`range` or `tqdm` progress iterator).
        args: run configuration (uses `server_id` and `epochs`).
    """
    # Accept either one round number or an iterable of rounds.
    T = t if isinstance(t, (tqdm, range)) else [t]
    for t in T:
        if peer.id == args.server_id:
            # --- Server role ---
            # Wait (extended timeout) until a model arrived from every neighbor.
            wait_until(enough_received, WAIT_TIMEOUT * 100, WAIT_INTERVAL * 10,
                       peer, t, len(peer.neighbors))
            # Aggregate the received worker models.
            w = GAR(peer, [v for i, v in peer.V[t]])
            # NOTE(review): the broadcast sends the model *before* the new
            # aggregate is installed below — confirm this ordering is intended.
            msg = protocol.train_step(t, peer.get_model_params())  # not grads
            peer.broadcast(msg)
            peer.set_model_params(w)
            if t % EVAL_ROUND == 0:
                t_eval = peer.evaluate(peer.inference, one_batch=True)
                peer.params.logs.append(t_eval)
        else:
            # --- Worker role ---
            if t > 0:
                # Wait for the global model of the previous round and install it.
                wait_until(server_received, WAIT_TIMEOUT * 100,
                           WAIT_INTERVAL * 10, peer, t)
                w_server = peer.V[t - 1][0][1]
                peer.set_model_params(w_server)
            train_for_x_epoch(peer, args.epochs)
            msg = protocol.train_step(t, peer.get_model_params())  # not grads
            server = peer.neighbors[0]
            peer.send(server, msg)
    return
def populate(self, info):
    """Ask the remote peer to populate itself with `info` and wait for the ack.

    Sends a `populate` RPC, then blocks up to `conf.FUNC_TIMEOUT` seconds for
    the callback to come back. Logs success, remote failure, or timeout.
    """
    self.send(protocol.call_method("populate", info))
    done = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "populate")
    if done and self.callbacks['populate']['s']:
        # Success: clean up the callback slot before logging.
        del self.callbacks['populate']
        log('success', f"{self} populated successfully")
    elif done:
        # Remote call returned but reported a failure; include the remote
        # message (consistent with connect()).
        log("error", f"Error populating {self}: {self.callbacks['populate']['m']}")
    else:
        log('warning', f"Calling populate() timeout after {conf.FUNC_TIMEOUT} seconds")
def fit(self, inference):
    """Ask the remote peer to fit its model and return the training history.

    Sends a `fit` RPC, then blocks up to `conf.FUNC_TIMEOUT` seconds for the
    callback. Returns the history list on success, None on failure or timeout.
    """
    self.send(protocol.call_method("fit", inference))
    done = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "fit")
    if done and self.callbacks['fit']['s']:
        history = self.callbacks['fit']['m']
        # Success: clean up the callback slot.
        del self.callbacks['fit']
        return history
    elif done:
        # Bug fix: a completed-but-failed call previously fell into the
        # timeout branch and was mislabeled as a timeout. Log the remote
        # error instead (consistent with connect()/populate()).
        log("error", self.callbacks['fit']['m'])
        return None
    else:
        log('warning', f"Calling fit() timeout after {conf.FUNC_TIMEOUT} seconds")
        return None
def connect(self, neighbor):
    """Ask the remote peer to connect to `neighbor` and wait for the result.

    Returns True on success (recording the neighbor id locally), False on
    remote failure or timeout.
    """
    request = protocol.call_method("connect", neighbor.id, neighbor.host,
                                   neighbor.port)
    self.send(request)
    done = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "connect")
    # Timed out waiting for the callback.
    if not done:
        log('warning', f"Calling connect() timeout after {conf.FUNC_TIMEOUT} seconds")
        return False
    cb = self.callbacks['connect']
    if cb['s']:
        # Success: remember the neighbor and release the callback slot.
        self.neighbors.append(neighbor.id)
        del self.callbacks["connect"]
        return True
    # Remote side reported a failure; surface its message.
    log("error", cb['m'])
    return False
def edge_devices(args, count=1, rand_ids=False):
    """Launch a bridge and wait for `count` edge devices to join.

    Returns the Bridge launcher on success, or None when count < 1.
    Exits the program on configuration errors or when not all devices join
    within `conf.LAUNCHER_TIMEOUT` seconds.
    """
    if count < 1:
        return None
    # Edge devices require the NumPy engine and message passing.
    if conf.ML_ENGINE != "NumPy":
        log('error', "Mobile devices currently only support NumPy based ML")
        exit()
    if args.mp == 0:
        log('error', "You need to use message passing when edge devices are involved")
        exit()
    launcher = Bridge(count, args, rand_ids=rand_ids)
    launcher.start()
    wait_until(launcher.bridged, conf.LAUNCHER_TIMEOUT, 1)
    joined = len(launcher.bridges)
    if joined == count:
        log('success', "All edge devices joined successfully")
    elif joined == 0:
        log('error', f"No device joined in {conf.LAUNCHER_TIMEOUT} seconds")
        launcher.stop()
        exit()
    else:
        log('error', f"Only {joined} devices joined after waiting for {conf.LAUNCHER_TIMEOUT} seconds")
        # Bug fix: previously the launcher was left running on this path;
        # stop it before exiting, consistent with the zero-joined branch.
        launcher.stop()
        exit()
    return launcher
def wait_method(self, method):
    """Block until the remote `method` call returns; warn on timeout."""
    completed = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, method)
    if completed:
        return
    log('warning', f"Calling execute({method}) timeout after {conf.FUNC_TIMEOUT} seconds")