def __init__(self): if _FTLIB_INSTALLED: self._ftlib = BasicFTLib() self._ftlib.init(consensus="shared_storage", framework="dummy_NCCL") else: self._ftlib = None
def __init__(self, service_name=None):
    if _FTLIB_INSTALLED:
        connection_try_num = 0
        while True:
            try:
                peer_list = list(self._get_peer_set(service_name))
            except Exception:
                if (
                    connection_try_num * 5
                    > _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS
                ):
                    logger.error(
                        "Cannot connect to FTLib consensus service in %s "
                        "seconds",
                        str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                    )
                    self._ftlib = None
                    return
                # sleep for 5s and try again
                logger.warning(
                    "Cannot connect to FTLib consensus service, "
                    "trying again."
                )
                connection_try_num += 1
                time.sleep(5)
            else:
                break
        self._ftlib = BasicFTLib(
            consensus="gossip",
            commlib="pytorch",
            consensus_init_kwargs={
                "known_addr_list": peer_list,
                "custom_bind_addr": socket.gethostbyname(
                    socket.gethostname()
                ),
            },
        )
        connection_try_num = 0
        while peer_list and not self._ftlib.consensus_joined():
            logger.warning("Retry building consensus...")
            try:
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name))
                )
            except Exception:
                if (
                    connection_try_num * 5
                    > _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS
                ):
                    logger.error(
                        "Cannot join FTLib consensus service in %s "
                        "seconds",
                        str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                    )
                    self._ftlib = None
                    return
                logger.warning(
                    "Cannot join FTLib consensus service, trying again."
                )
                connection_try_num += 1
                time.sleep(5)
    else:
        logger.warning(
            "FTLib is not installed. The CollectiveCommunicator "
            "may not work as expected"
        )
        self._ftlib = None
def __init__(self): if _FTLIB_INSTALLED: self._ftlib = BasicFTLib() self._ftlib.init(consensus="gossip", commlib="pytorch") else: logger.warning( "FTLib is not installed. The CollectiveCommunicator " "may not work as expected") self._ftlib = None
class CollectiveCommunicator(object):
    def __init__(self):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib()
            self._ftlib.init(consensus="gossip", commlib="pytorch")
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected"
            )
            self._ftlib = None

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s"
                % (op, _SUPPORTED_ALLREDUCE_OPS)
            )
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            res = self._ftlib.allreduce_average(data)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(
                "FTLib is not installed. "
                "Default to succeeded for testing purposes"
            )
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def broadcast(self, data, root_ip):
        if self._ftlib is not None:
            res = self._ftlib.broadcast(data, root_ip)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(
                "FTLib is not installed. "
                "Default to succeeded for testing purposes"
            )
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        return CollectiveCommunicatorStatus.SUCCEEDED

    def has_new_worker_joining(self):
        return True
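A minimal usage sketch of the class above, assuming it is instantiated inside a worker process where `_FTLIB_INSTALLED` and `CollectiveCommunicatorStatus` are already defined in the module; the gradient values are illustrative only:

import numpy as np

# Hypothetical caller code, not part of the class definition above.
communicator = CollectiveCommunicator()

# Average a local gradient array across all workers.
local_grads = np.array([0.1, 0.2, 0.3], dtype=np.float64)
status, averaged = communicator.allreduce(local_grads, op="MEAN")

if status == CollectiveCommunicatorStatus.SUCCEEDED:
    print("averaged gradients:", averaged)
else:
    print("allreduce failed; the caller should retry or rebuild consensus")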
def __init__(self, service_name=None):
    if _FTLIB_INSTALLED:
        self._ftlib = BasicFTLib(
            consensus="gossip",
            commlib="pytorch",
            consensus_init_kwargs={
                "known_addr_list": list(self._get_peer_set(service_name)),
                "custom_bind_addr": socket.gethostbyname(
                    socket.gethostname()
                ),
            },
        )
        while not self._ftlib.consensus_joined():
            logger.warning("Retry building consensus...")
            self._ftlib.manual_join(
                known_addr_list=list(self._get_peer_set(service_name))
            )
    else:
        logger.warning(
            "FTLib is not installed. The CollectiveCommunicator "
            "may not work as expected"
        )
        self._ftlib = None
class CollectiveCommunicator(object):
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            connection_try_num = 0
            while True:
                try:
                    peer_list = list(self._get_peer_set(service_name))
                except Exception:
                    if (
                        connection_try_num * 5
                        > _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS
                    ):
                        logger.error(
                            "Cannot connect to FTLib consensus service in %s "
                            "seconds",
                            str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                        )
                        self._ftlib = None
                        return
                    # sleep for 5s and try again
                    logger.info(
                        "Cannot connect to FTLib consensus service, "
                        "trying again."
                    )
                    connection_try_num += 1
                    time.sleep(5)
                else:
                    break
            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": peer_list,
                    "custom_bind_addr": socket.gethostbyname(
                        socket.gethostname()
                    ),
                },
            )
            while peer_list and not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name))
                )
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected"
            )
            self._ftlib = None

    def tf_allreduce(self, grads, op="MEAN"):
        if grads is None:
            logger.error("Grads is required for tf_allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, grads
        # convert tf.Tensor to numpy
        numpy_data = [g.numpy() for g in grads]
        return self.allreduce(numpy_data, op)

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s"
                % (op, _SUPPORTED_ALLREDUCE_OPS)
            )
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            status, res = self._ftlib.wait_gradients_ready(params=data)
            if (
                status == FTCollectiveStatus.SUCCESS
                and res == CommLibStatus.SUCCESS
                or status == FTCollectiveStatus.NO_NEED
            ):
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def tf_broadcast(self, params, src_rank):
        for p in params:
            data = p.numpy()
            status, data = self.broadcast(data, src_rank)
            if status == CollectiveCommunicatorStatus.SUCCEEDED:
                p.assign(data)
            else:
                return status
        return CollectiveCommunicatorStatus.SUCCEEDED

    def broadcast(self, data, src_rank):
        if self._ftlib is not None:
            status, _ = self._ftlib.broadcast(data, src_rank)
            if status == FTCollectiveStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        if self._ftlib is not None:
            status, _ = self._ftlib.barrier()
            if status == FTCollectiveStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED
            else:
                return CollectiveCommunicatorStatus.FAILED
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED

    def is_initialized(self):
        """This will be `False` in three situations:

        * New workers report joining in
        * Collective-communication operations fail or time out
        * Liveness probe fails for existing workers
        """
        if self._ftlib is not None:
            return self._ftlib.initialized
        else:
            return True

    def _get_peer_set(self, svc_name):
        if svc_name is None:
            return None
        my_ip = socket.gethostbyname(socket.gethostname())
        temp_set = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
        peer_set = {peer[-1][0] for peer in temp_set if peer[-1][0] != my_ip}
        return peer_set
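For reference, the DNS-based peer discovery used by `_get_peer_set` can be exercised on its own. The sketch below mirrors that method as a standalone function; the service name `"my-training-svc"` is a placeholder standing in for whatever headless service fronts the worker pods:

import socket


def get_peer_set(svc_name):
    # Resolve the service name to the IPs of all backing pods and
    # drop our own address, mirroring CollectiveCommunicator._get_peer_set.
    my_ip = socket.gethostbyname(socket.gethostname())
    addr_info = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
    return {info[-1][0] for info in addr_info if info[-1][0] != my_ip}


if __name__ == "__main__":
    # Placeholder service name; substitute the actual consensus service.
    print(get_peer_set("my-training-svc"))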
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


if __name__ == "__main__":
    logging.info("start!")

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(consensus="shared_storage", commlib="pytorch")

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            time.sleep(0.5)
if __name__ == "__main__": args = parser.parse_args() logging.info("start!") logging.info("joining: {}".format(args.svc_name)) epochs = 1 # initialize the fault-tolerant library with consensus # and framework options ftlib = BasicFTLib( consensus="gossip", commlib="pytorch", consensus_init_kwargs={ "known_addr_list": list(get_peer_set(args.svc_name)) }, ) a_ground_truth = np.double(1.2) b_ground_truth = np.double(-3.7) c_ground_truth = np.double(4.9) target_func = ( lambda x: a_ground_truth * x * x + b_ground_truth * x + c_ground_truth) train_loader = torch.utils.data.DataLoader( SyntheticData( lambda x: target_func(x) + 10.0 * (np.double(np.random.rand()) - 0.5),
    args = parser.parse_args()
    known_addr_list = (
        args.known_nodes.split(",") if args.known_nodes != "" else []
    )

    logging.info("start!")
    logging.info("joining: {}".format(known_addr_list))

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(
        consensus="gossip",
        commlib="pytorch",
        consensus_init_kwargs={"known_addr_list": known_addr_list},
    )

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # move data to device (CPU or GPU)
            data, target = data.to(device), target.to(device)
            # clear gradients
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='pytorch')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
def dummy_update(): logging.info("dummy update") time.sleep(0.5) if __name__ == "__main__": logging.info("start!") epochs = 1 dl = dummy_dataloader(10) # initialize the fault-tolerant library with consensus # and framework options ftlib = BasicFTLib(consensus="shared_storage", commlib="NCCL") for _ in range(epochs): for batch in dl: dummy_forward() dummy_backward() if ftlib.skip_allreduce: logging.info("skip allreduce") dummy_update() continue else: res = ftlib.wait_gradients_ready() if res == FTAllReduceStatus.NO_NEED: logging.critical( "cannot use average_gradient when there is no need")
    time.sleep(5)


def dummy_update():
    logging.info("dummy update")
    time.sleep(0.5)


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1
    dl = dummy_dataloader(10)

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='dummy_NCCL')

    for _ in range(epochs):
        for batch in dl:
            dummy_forward()
            dummy_backward()
            if ftlib.skip_allreduce:
                logging.info("skip allreduce")
                dummy_update()
                continue
            else:
                res = ftlib.wait_weights_ready()
                if res == FTAllReduceStatus.NO_NEED:
                    logging.critical(
class CollectiveCommunicator(object):
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": list(
                        self._get_peer_set(service_name)
                    ),
                    "custom_bind_addr": socket.gethostbyname(
                        socket.gethostname()
                    ),
                },
            )
            while not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name))
                )
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected"
            )
            self._ftlib = None

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s"
                % (op, _SUPPORTED_ALLREDUCE_OPS)
            )
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            res = self._ftlib.wait_gradients_ready(data)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def broadcast(self, data, src_rank):
        if self._ftlib is not None:
            res = self._ftlib.broadcast(data, src_rank)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        if self._ftlib is not None:
            res = self._ftlib.barrier()
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED
            else:
                return CollectiveCommunicatorStatus.FAILED
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED

    def is_initialized(self):
        """This will be `False` in three situations:

        * New workers report joining in
        * Collective-communication operations fail or time out
        * Liveness probe fails for existing workers
        """
        if self._ftlib is not None:
            return self._ftlib.initialized
        else:
            return True

    def _get_peer_set(self, svc_name):
        if svc_name is None:
            return None
        my_ip = socket.gethostbyname(socket.gethostname())
        temp_set = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
        peer_set = {peer[-1][0] for peer in temp_set if peer[-1][0] != my_ip}
        return peer_set
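A hedged end-to-end sketch of how a worker might use this version of the class: construct it with the consensus service name, receive initial parameters from the source rank via `broadcast`, and synchronize at a `barrier` before training. The service name, source rank, and parameter array are placeholders, not values prescribed by the library:

import numpy as np

# "worker-svc" is a placeholder for the headless service the workers share.
communicator = CollectiveCommunicator(service_name="worker-svc")

# Receive initial parameters broadcast from rank 0 (illustrative values).
params = np.zeros(4, dtype=np.float64)
status, params = communicator.broadcast(params, src_rank=0)

if status == CollectiveCommunicatorStatus.SUCCEEDED:
    # Wait for all workers to reach this point before starting training.
    communicator.barrier()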