def test_convert_gpu_info_to_metrics(self):
    info = {
        '1': {'gpuUtil': u'98', 'gpuMemUtil': u'97'},
        '0': {'gpuUtil': u'100', 'gpuMemUtil': u'99'}
    }

    metrics = gpu_exporter.convert_gpu_info_to_metrics(info)

    self.assertEqual(5, len(metrics))
    self.assertIn(Metric("nvidiasmi_attached_gpus", {}, 2), metrics)
    self.assertIn(
        Metric("nvidiasmi_utilization_gpu", {"minor_number": "0"}, "100"),
        metrics)
    self.assertIn(
        Metric("nvidiasmi_utilization_memory", {"minor_number": "0"}, "99"),
        metrics)
    self.assertIn(
        Metric("nvidiasmi_utilization_gpu", {"minor_number": "1"}, "98"),
        metrics)
    self.assertIn(
        Metric("nvidiasmi_utilization_memory", {"minor_number": "1"}, "97"),
        metrics)
def train(net: torch.nn.Module, dataset: torch.utils.data.Dataset, cfg: EasyDict):
    dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True,
                            num_workers=0)
    loss_metric = Metric()
    # optimizer = torch.optim.SGD(net.parameters(), lr=cfg.lr)
    total_iter = len(dataloader)

    print('--------------------- start train ------------------------')
    for epoch in range(cfg.epoch):
        # halve the learning rate every epoch
        optimizer = torch.optim.SGD(net.parameters(), lr=(cfg.lr / pow(2, epoch)))

        for idx, sample in enumerate(dataloader):
            sentence, label = sample['sentence'], sample['label']
            logit = net(sentence)

            optimizer.zero_grad()
            loss = F.cross_entropy(logit, label)
            loss_metric.update(loss)
            loss.backward()
            optimizer.step()

            if (idx + 1) % cfg.print_freq == 0:
                print(f'epoch {epoch} iter {idx + 1}/{total_iter} loss {loss_metric.value()}')

    torch.save(net.state_dict(), '../../model/health_consult_classification.pth')
    print('save model to ../../model/health_consult_classification.pth')
    print('------------------------------ end train --------------------------------')
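# The trainer above only assumes a `Metric` helper exposing `update()` and `value()`.
# Below is a minimal sketch of such a running-average loss tracker; the project's
# actual Metric class may behave differently (e.g. windowed averaging), so treat
# the class body as an illustrative assumption, not the real implementation.
class RunningLossMetric:
    """Tracks the running mean of the scalar losses fed to update()."""

    def __init__(self):
        self._total = 0.0
        self._count = 0

    def update(self, loss):
        # accept either a plain float or a 0-dim torch tensor
        self._total += float(loss.detach()) if hasattr(loss, 'detach') else float(loss)
        self._count += 1

    def value(self):
        return self._total / max(self._count, 1)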
def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        for id in gpuIds:
            if gpuInfos:
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = id

                result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

        result.append(Metric("container_CPUPerc", otherLabels, stats[container]["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels, stats[container]["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels, stats[container]["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels, stats[container]["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels, stats[container]["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels, stats[container]["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels, stats[container]["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels, stats[container]["MemPerc"]))

    return result
def test_export_metrics_to_file(self):
    metrics = []
    metrics.append(Metric("foo", {"bar": 2}, "3"))
    metrics.append(Metric("bar", {}, "4"))

    with tempfile.NamedTemporaryFile() as f:
        utils.export_metrics_to_file(f.name, metrics)
        lines = f.readlines()
        self.assertEqual("foo{bar=\"2\"} 3", lines[0].strip())
        self.assertEqual("bar 4", lines[1].strip())
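# The test above pins down the on-disk format: one metric per line in the
# Prometheus text exposition style, e.g. `foo{bar="2"} 3`. The sketch below is one
# way an exporter could satisfy it; the real utils.export_metrics_to_file may add
# HELP/TYPE comments or atomic-rename semantics, and the .name/.labels/.value
# attribute names are assumptions.
def export_metrics_to_file_sketch(path, metrics):
    with open(path, "w") as f:
        for m in metrics or []:
            if m.labels:
                labels = ",".join('{}="{}"'.format(k, v) for k, v in m.labels.items())
                f.write("{}{{{}}} {}\n".format(m.name, labels, m.value))
            else:
                f.write("{} {}\n".format(m.name, m.value))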
def convert_gpu_info_to_metrics(gpuInfos):
    if gpuInfos is None:
        return None

    result = [Metric("nvidiasmi_attached_gpus", {}, len(gpuInfos))]

    for minorNumber, info in gpuInfos.items():
        label = {"minor_number": minorNumber}
        result.append(
            Metric("nvidiasmi_utilization_gpu", label, info["gpuUtil"]))
        result.append(
            Metric("nvidiasmi_utilization_memory", label, info["gpuMemUtil"]))

    return result
def to_metric(self):
    label = {
        "service_name": self.service_name,
        "name": self.name,
        "state": self.state,
        "ready": self.ready
    }
    return Metric("pai_container_count", label, 1)
def test_metrics_eq(self):
    self.assertEqual(Metric("foo", {"abc": "1"}, "3"),
                     Metric("foo", {"abc": "1"}, "3"))
    self.assertNotEqual(Metric("foo", {"abc": "1"}, "3"),
                        Metric("bar", {"abc": "1"}, "3"))
    self.assertNotEqual(Metric("foo", {"abc": "2"}, "3"),
                        Metric("foo", {"abc": "1"}, "3"))
    self.assertNotEqual(Metric("foo", {"abc": "2"}, "3"),
                        Metric("foo", {"abc": "2"}, "5"))
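# The equality tests above require value semantics for Metric: two instances
# compare equal iff name, labels and value all match. A minimal class that would
# pass them is sketched below; the project's own Metric class may carry extra
# behaviour, so treat this as an illustrative assumption.
class MetricSketch(object):
    def __init__(self, name, labels, value):
        self.name = name
        self.labels = labels
        self.value = value

    def __eq__(self, other):
        return (isinstance(other, MetricSketch)
                and self.name == other.name
                and self.labels == other.labels
                and self.value == other.value)

    def __ne__(self, other):  # needed under python 2 for assertNotEqual
        return not self.__eq__(other)

    def __repr__(self):
        return "Metric({}, {}, {})".format(self.name, self.labels, self.value)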
def collect_docker_daemon_status(configFilePath):
    metrics = []

    cluster_config = common.load_yaml_file(configFilePath)
    node_configs = cluster_config['machine-list']

    username = ""
    password = ""
    port = ""
    if "default-machine-properties" in cluster_config:
        if "username" in cluster_config["default-machine-properties"]:
            username = cluster_config["default-machine-properties"]["username"]
        if "password" in cluster_config["default-machine-properties"]:
            password = cluster_config["default-machine-properties"]["password"]
        if "sshport" in cluster_config["default-machine-properties"]:
            port = cluster_config["default-machine-properties"]["sshport"]

    cmd = "sudo systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"
    errorNodeCount = 0

    for node_config in node_configs:
        ip = node_config["hostip"]
        label = {"instance": ip}

        try:
            if "username" not in node_config or "password" not in node_config or "sshport" not in node_config:
                node_config["username"] = username
                node_config["password"] = password
                node_config["port"] = port

            flag = common.ssh_shell_paramiko(node_config, cmd)
            if not flag:
                errorNodeCount += 1
                # single node docker health
                metrics.append(Metric("node_current_docker_error", label, 1))
        except Exception as e:
            logger.exception("ssh to %s failed", ip)
            errorNodeCount += 1
            metrics.append(Metric("node_current_docker_error", label, 1))

    if errorNodeCount > 0:
        metrics.append(Metric("docker_error_node_count", {}, errorNodeCount))

    return metrics
def to_metric(self): label = {"name": self.name, "phase": self.phase} if self.host_ip is not None: label["host_ip"] = self.host_ip for k, v in self.condition_map.items(): label[k] = v return Metric("pai_pod_count", label, 1)
def collect_k8s_componentStaus(address, nodesJsonObject):
    metrics = []
    emptyLabel = {}

    # 1. check api server
    try:
        apiServerHealthy = requests.get("{}/healthz".format(address)).text
        if apiServerHealthy != "ok":
            # api server health status, 1 is error
            metrics.append(
                Metric("apiserver_current_status_error", emptyLabel, 1))
    except Exception as e:
        logger.exception("get api server status failed")
        metrics.append(Metric("apiserver_current_status_error", emptyLabel, 1))

    # 2. check etcd
    try:
        etcdHealthy = requests.get("{}/healthz/etcd".format(address)).text
        if etcdHealthy != "ok":
            # etcd health status, 1 is error
            metrics.append(Metric("etcd_current_status_error", emptyLabel, 1))
    except Exception as e:
        logger.exception("get etcd status failed")
        metrics.append(Metric("etcd_current_status_error", emptyLabel, 1))

    # 3. check kubelet
    nodeItems = nodesJsonObject["items"]
    kubeletErrorCount = 0

    for name in nodeItems:
        ip = name["metadata"]["name"]
        label = {"node": ip}

        try:
            kubeletHealthy = requests.get("http://{}:{}/healthz".format(ip, 10255)).text
            if kubeletHealthy != "ok":
                # each node kubelet health status, 1 is error
                metrics.append(Metric("kubelet_current_status_error", label, 1))
                kubeletErrorCount += 1
        except Exception as e:
            kubeletErrorCount += 1
            logger.exception("get kubelet status failed")
            metrics.append(Metric("kubelet_current_status_error", label, 1))

    metrics.append(
        Metric("current_status_error_kubelet_count", emptyLabel,
               kubeletErrorCount))

    return metrics
def main(argv):
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    iter = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1

            gpu_infos = gpu_singleton.try_get()
            docker_status = docker_status_singleton.try_get()

            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path, [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")
        finally:
            end = datetime.datetime.now()
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       (end - start).seconds)
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)
            time.sleep(time_sleep_s)
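# main() only relies on utils.Singleton exposing a non-blocking try_get(); the
# None checks suggest the wrapped collector runs out-of-band and the latest
# result is handed back when available. The class below is a hypothetical
# illustration of that contract (background refresh thread plus last-known
# value), not the project's actual helper; the interval_s parameter is invented.
import threading

class SingletonSketch(object):
    def __init__(self, collector, name="singleton", interval_s=30):
        self._collector = collector
        self._value = None
        self._interval_s = interval_s
        self._thread = threading.Thread(target=self._loop, name=name, daemon=True)
        self._thread.start()

    def _loop(self):
        while True:
            try:
                self._value = self._collector()
            except Exception:
                pass  # keep the last known value if a collection fails
            time.sleep(self._interval_s)

    def try_get(self):
        # returns immediately; may be None until the first collection completes
        return self._value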
def collect_healthz(metric_name, address, port, url):
    label = {"address": address, "error": "ok"}

    try:
        healthy = requests.get("http://{}:{}{}".format(address, port, url)).text
        if healthy != "ok":
            label["error"] = healthy
    except Exception as e:
        label["error"] = str(e)
        logger.exception("requesting %s:%d%s failed", address, port, url)

    return Metric(metric_name, label, 1)
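# One possible way to wire collect_healthz up, mirroring the endpoints probed in
# collect_k8s_componentStaus above (API server /healthz, etcd /healthz/etcd, and
# the kubelet read-only port 10255). The metric names, the API server port, and
# the function name below are purely illustrative assumptions.
def collect_k8s_component_healthz_example(api_server_ip, api_server_port, node_ips):
    metrics = [
        collect_healthz("k8s_api_server_count", api_server_ip, api_server_port, "/healthz"),
        collect_healthz("k8s_etcd_count", api_server_ip, api_server_port, "/healthz/etcd"),
    ]
    for ip in node_ips:
        metrics.append(collect_healthz("k8s_kubelet_count", ip, 10255, "/healthz"))
    return metrics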
def parse_nodes_status(nodesJsonObject):
    nodeItems = nodesJsonObject["items"]
    metrics = []
    readyNodeCount = 0
    dockerError = 0

    for name in nodeItems:
        # 1. check each node status
        for condition in name["status"]["conditions"]:
            if "Ready" == condition["type"]:
                readyStatus = condition["status"]
                if readyStatus != "True":
                    # node status, value 1 is error
                    label = {"node": name["metadata"]["name"]}
                    metrics.append(Metric("node_current_notready", label, 1))
                else:
                    readyNodeCount += 1

    metrics.append(
        Metric("notready_node_count", {}, len(nodeItems) - readyNodeCount))

    return metrics
def generate_zombie_count(stats, type1_zombies, type2_zombies):
    """ There are two types of zombie:
        1. container which output "USER COMMAND END" but did not exit
           for a long period of time
        2. yarn container exited but job container didn't """
    exited_containers = set(filter(is_container_exited, stats.keys()))
    logger.debug("exited_containers is %s", exited_containers)

    now = datetime.datetime.now()
    zombie_count1 = generate_zombie_count_type1(type1_zombies,
                                                exited_containers, now)
    zombie_count2 = generate_zombie_count_type2(type2_zombies, stats, now)

    return [
        Metric("zombie_container_count", {}, zombie_count1 + zombie_count2)
    ]
def collect_docker_daemon_status(hosts):
    metrics = []

    cmd = "sudo systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"

    for host in hosts:
        label = {"ip": host["hostip"], "error": "ok"}

        try:
            flag = common.ssh_shell_paramiko(host, cmd)
            if not flag:
                label["error"] = "config"  # configuration is not correct
        except Exception as e:
            label["error"] = str(e)
            logger.exception("ssh to %s failed", host["hostip"])

        metrics.append(Metric("docker_daemon_count", label, 1))

    return metrics
def collect_docker_daemon_status():
    """ check docker daemon status in current host """
    cmd = "systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"
    error = "ok"

    try:
        logger.info("call systemctl to get docker status")
        out = utils.check_output(cmd, shell=True)

        if "active" not in out:
            error = "inactive"
    except subprocess.CalledProcessError as e:
        logger.exception("command '%s' return with error (code %d): %s",
                         cmd, e.returncode, e.output)
        error = str(e)
    except OSError as e:
        if e.errno == os.errno.ENOENT:
            logger.warning("systemctl not found")
        error = e.strerror

    return Metric("docker_daemon_count", {"error": error}, 1)
def evaluate(data_loader):
    meter = Metric(mode=args.mode)
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for idx, (img, segm) in enumerate(data_loader):
            img = img.cuda()
            segm = segm.cuda()
            outputs = model(img)
            loss = criterion(outputs, segm)

            # move predictions and targets back to CPU before releasing the GPU copies
            outputs = outputs.detach().cpu()
            segm = segm.detach().cpu()
            del img

            meter.update(segm, outputs)
            total_loss += loss.item()

    if args.mode == 'cls':
        tn, tp = meter.get_metrics()
        return total_loss / len(data_loader), tn, tp
    else:
        dices, iou = meter.get_metrics()
        dice, dice_neg, dice_pos = dices
        torch.cuda.empty_cache()
        return total_loss / len(data_loader), iou, dice, dice_neg, dice_pos
def parse_pods_status(podsJsonObject):
    kube_pod_status_probe_not_ready = 0
    kube_pod_status_phase_failed = 0
    kube_pod_status_phase_unknown = 0
    pod_container_status_waiting = 0
    pod_container_status_terminated = 0
    pod_container_status_not_ready = 0
    pod_container_status_restarted_pod_count = 0

    metrics = []
    serviceMetrics = collections.defaultdict(lambda: Service())
    existServiceKey = {}

    podItems = podsJsonObject["items"]
    for pod in podItems:
        # all / per pod phase failed/unknown/not ready (condition)
        serviceName = ""
        if "generateName" in pod["metadata"]:
            serviceName = pod["metadata"]["generateName"]
        else:
            serviceName = pod["metadata"]["name"]

        service = serviceMetrics[serviceName]

        status = pod["status"]
        phase = status["phase"]
        conditions = status["conditions"]
        ready = "True"
        init = "True"
        scheduled = "True"

        # 1. check not ready pod
        for condition in conditions:
            if condition["type"] == "Ready":
                ready = condition["status"]
            elif condition["type"] == "Initialized":
                init = condition["status"]
            elif condition["type"] == "PodScheduled":
                scheduled = condition["status"]

        # NOTE: this map will be reused in multiple metrics, do not modify this map
        label = {
            "pod": pod["metadata"]["name"],
            "hostip": pod["status"]["hostIP"]
        }

        if ready != "True" and init == "True" and scheduled == "True":
            kube_pod_status_probe_not_ready += 1
            # specific pod hit a readiness probe failure; condition is not ready, value is 1
            metrics.append(Metric("pod_current_probe_not_ready", label, 1))
            service.kube_pod_status_probe_not_ready += 1

        # 2. check failed phase pods
        if phase == "Failed":
            kube_pod_status_phase_failed += 1
            # specific pod phase became failed, value is 1
            metrics.append(Metric("pod_current_phase_failed", label, 1))
            service.kube_pod_status_phase_failed += 1

        # 3. check unknown phase pods
        if phase == "Unknown":
            kube_pod_status_phase_unknown += 1
            # specific pod phase became unknown, value is 1
            metrics.append(Metric("pod_current_phase_unknown", label, 1))
            service.kube_pod_status_phase_unknown += 1

        containerStatus = status["containerStatuses"]

        # 4. check pod containers running/waiting/terminated status
        for perContainerStatus in containerStatus:
            containerReady = perContainerStatus["ready"]
            restartCount = perContainerStatus["restartCount"]

            containerLabel = copy.deepcopy(label)
            containerLabel["container"] = perContainerStatus["name"]

            if not containerReady:
                pod_container_status_not_ready += 1
                # specific pod contains a container whose status is not ready, value is 1
                metrics.append(
                    Metric("container_current_not_ready", containerLabel, 1))
                service.pod_container_status_not_ready += 1

            state = perContainerStatus["state"]
            if "terminated" in state:
                pod_container_status_terminated += 1
                # specific pod container status is terminated, value is 1
                metrics.append(
                    Metric("container_current_terminated", containerLabel, 1))
                service.pod_container_status_terminated += 1

            if "waiting" in state:
                pod_container_status_waiting += 1
                # specific pod container status is waiting, value is 1
                metrics.append(
                    Metric("container_current_waiting", containerLabel, 1))
                service.pod_container_status_waiting += 1

            if restartCount > 0:
                pod_container_status_restarted_pod_count += 1
                # specific pod's container restart total count
                metrics.append(
                    Metric("container_accumulation_restart_total",
                           containerLabel, restartCount))
                service.pod_container_status_restart_total += 1

    # service level aggregation metrics
    for serviceName, service in serviceMetrics.items():
        label = {"service": serviceName}

        # each service's pods with readiness probe failures (condition not ready), total count
        if service.kube_pod_status_probe_not_ready != 0:
            metrics.append(
                Metric("service_current_probe_not_ready_pod_count", label,
                       service.kube_pod_status_probe_not_ready))

        # each service's pods whose phase became failed, total count
        if service.kube_pod_status_phase_failed != 0:
            metrics.append(
                Metric("service_current_phase_failed_pod_count", label,
                       service.kube_pod_status_phase_failed))

        # each service's pods whose phase became unknown, total count
        if service.kube_pod_status_phase_unknown != 0:
            metrics.append(
                Metric("service_current_phase_unknown_pod_count", label,
                       service.kube_pod_status_phase_unknown))

        # each service's waiting containers, total count
        if service.pod_container_status_waiting != 0:
            metrics.append(
                Metric("service_current_waiting_container_count", label,
                       service.pod_container_status_waiting))

        # each service's terminated containers, total count
        if service.pod_container_status_terminated != 0:
            metrics.append(
                Metric("service_current_terminated_container_count", label,
                       service.pod_container_status_terminated))

        # each service's not-ready containers, total count
        if service.pod_container_status_not_ready != 0:
            metrics.append(
                Metric("service_current_probe_not_ready_pod_count", label,
                       service.pod_container_status_not_ready))

        # each service's container restarts, total count
        if service.pod_container_status_restart_total != 0:
            metrics.append(
                Metric("service_restarted_container_count", label,
                       service.pod_container_status_restart_total))

    emptyLabel = {}
    metrics.append(
        Metric("cluster_current_probe_not_ready_pod_count", emptyLabel,
               kube_pod_status_probe_not_ready))
    metrics.append(
        Metric("cluster_current_phase_failed_pod_count", emptyLabel,
               kube_pod_status_phase_failed))
    metrics.append(
        Metric("cluster_phase_unknown_pod_count", emptyLabel,
               kube_pod_status_phase_unknown))
    metrics.append(
        Metric("cluster_current_status_not_ready_container_count", emptyLabel,
               pod_container_status_not_ready))
    metrics.append(
        Metric("cluster_current_terminated_container_count", emptyLabel,
               pod_container_status_terminated))
    metrics.append(
        Metric("cluster_current_waiting_container_count", emptyLabel,
               pod_container_status_waiting))
    metrics.append(
        Metric("cluster_container_once_restarted_pod_count", emptyLabel,
               pod_container_status_restarted_pod_count))

    return metrics
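# parse_pods_status aggregates per-service counters on the `Service()` objects
# created by the defaultdict. A minimal sketch of the attributes it relies on is
# shown below; the project's actual Service class may carry more fields, so treat
# the class body as an illustrative assumption.
class ServiceSketch(object):
    def __init__(self):
        self.kube_pod_status_probe_not_ready = 0
        self.kube_pod_status_phase_failed = 0
        self.kube_pod_status_phase_unknown = 0
        self.pod_container_status_not_ready = 0
        self.pod_container_status_terminated = 0
        self.pod_container_status_waiting = 0
        self.pod_container_status_restart_total = 0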
def train(opt, AMP, WdB, ralph_path, train_data_path, train_data_list,
          test_data_path, test_data_list, experiment_name, train_batch_size,
          val_batch_size, workers, lr, valInterval, num_iter, wdbprj,
          continue_model='', finetune=''):
    HVD3P = pO.HVD or pO.DDP

    os.makedirs(f'./saved_models/{experiment_name}', exist_ok=True)

    # if OnceExecWorker and WdB:
    #     wandb.init(project=wdbprj, name=experiment_name)
    #     wandb.config.update(opt)

    # load supplied ralph
    with open(ralph_path, 'r') as f:
        ralph_train = json.load(f)

    print('[4] IN TRAIN; BEFORE MAKING DATASET')
    train_dataset = ds_load.myLoadDS(train_data_list, train_data_path, ralph=ralph_train)
    valid_dataset = ds_load.myLoadDS(test_data_list, test_data_path, ralph=ralph_train)

    # SAVE RALPH FOR LATER USE
    # with open(f'./saved_models/{experiment_name}/ralph.json', 'w+') as f:
    #     json.dump(train_dataset.ralph, f)

    print('[5] DATASET DONE LOADING')

    if OnceExecWorker:
        print(pO)
        print('Alphabet :', len(train_dataset.alph), train_dataset.alph)
        for d in [train_dataset, valid_dataset]:
            print('Dataset Size :', len(d.fns))
            # print('Max LbW : ', max(list(map(len, d.tlbls))))
            # print('#Chars : ', sum([len(x) for x in d.tlbls]))
            # print('Sample label :', d.tlbls[-1])
            # print("Dataset :", sorted(list(map(len, d.tlbls))))
            print('-' * 80)

    if opt.num_gpu > 1:
        workers = workers * (1 if HVD3P else opt.num_gpu)

    if HVD3P:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=opt.world_size, rank=opt.rank)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset, num_replicas=opt.world_size, rank=opt.rank)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True if not HVD3P else False,
        pin_memory=True,
        num_workers=int(workers),
        sampler=train_sampler if HVD3P else None,
        worker_init_fn=WrkSeeder,
        collate_fn=ds_load.SameTrCollate)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=val_batch_size,
        pin_memory=True,
        num_workers=int(workers),
        sampler=valid_sampler if HVD3P else None)

    model = OrigamiNet()
    model.apply(init_bn)

    # load finetune ckpt
    if finetune != '':
        model = load_finetune(model, finetune)

    model.train()

    if OnceExecWorker:
        import pprint
        [print(k, model.lreszs[k]) for k in sorted(model.lreszs.keys())]

    biparams = list(
        dict(filter(lambda kv: 'bias' in kv[0], model.named_parameters())).values())
    nonbiparams = list(
        dict(filter(lambda kv: 'bias' not in kv[0], model.named_parameters())).values())

    if not pO.DDP:
        model = model.to(device)
    else:
        model.cuda(opt.rank)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=10**(-1 / 90000))

    # if OnceExecWorker and WdB:
    #     wandb.watch(model, log="all")

    if pO.HVD:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        # optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=hvd.Compression.fp16)

    if pO.DDP and opt.rank != 0:
        random.seed()
        np.random.seed()

    # if AMP:
    #     model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if pO.DP:
        model = torch.nn.DataParallel(model)
    elif pO.DDP:
        model = pDDP(model, device_ids=[opt.rank], output_device=opt.rank,
                     find_unused_parameters=False)

    model_ema = ModelEma(model)

    if continue_model != '':
        if OnceExecWorker:
            print(f'loading pretrained model from {continue_model}')
        checkpoint = torch.load(
            continue_model,
            map_location=f'cuda:{opt.rank}' if HVD3P else None)
        model.load_state_dict(checkpoint['model'], strict=True)
        optimizer.load_state_dict(checkpoint['optimizer'])
        model_ema._load_checkpoint(continue_model,
                                   f'cuda:{opt.rank}' if HVD3P else None)

    criterion = torch.nn.CTCLoss(reduction='none', zero_infinity=True).to(device)
    converter = CTCLabelConverter(train_dataset.ralph.values())

    if OnceExecWorker:
        with open(f'./saved_models/{experiment_name}/opt.txt', 'a') as opt_file:
            opt_log = '------------ Options -------------\n'
            args = vars(opt)
            for k, v in args.items():
                opt_log += f'{str(k)}: {str(v)}\n'
            opt_log += '---------------------------------------\n'
            opt_log += gin.operative_config_str()
            opt_file.write(opt_log)
            # if WdB:
            #     wandb.config.gin_str = gin.operative_config_str().splitlines()

        print(optimizer)
        print(opt_log)

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    best_CER = 1e+6
    i = 0
    gAcc = 1
    epoch = 1
    btReplay = False and AMP
    max_batch_replays = 3

    if HVD3P:
        train_sampler.set_epoch(epoch)
    titer = iter(train_loader)

    while True:
        start_time = time.time()

        model.zero_grad()
        train_loss = Metric(pO, 'train_loss')

        for j in trange(valInterval, leave=False, desc='Training'):
            # Load a batch
            try:
                image_tensors, labels, fnames = next(titer)
            except StopIteration:
                epoch += 1
                if HVD3P:
                    train_sampler.set_epoch(epoch)
                titer = iter(train_loader)
                image_tensors, labels, fnames = next(titer)

            # log filenames
            # fnames = [f'{i}___{fname}' for fname in fnames]
            # with open(f'./saved_models/{experiment_name}/filelog.txt', 'a+') as f:
            #     f.write('\n'.join(fnames) + '\n')

            # Move to device
            image = image_tensors.to(device)
            text, length = converter.encode(labels)
            batch_size = image.size(0)

            replay_batch = True
            maxR = 3
            while replay_batch and maxR > 0:
                maxR -= 1

                # Forward pass
                preds = model(image, text).float()
                preds_size = torch.IntTensor([preds.size(1)] * batch_size).to(device)
                preds = preds.permute(1, 0, 2).log_softmax(2)

                if i == 0 and OnceExecWorker:
                    print('Model inp : ', image.dtype, image.size())
                    print('CTC inp : ', preds.dtype, preds.size(), preds_size[0])

                # To avoid a ctc_loss issue, disable cudnn for the computation of the ctc_loss
                torch.backends.cudnn.enabled = False
                cost = criterion(preds, text.to(device), preds_size,
                                 length.to(device)).mean() / gAcc
                torch.backends.cudnn.enabled = True

                train_loss.update(cost)

                # cost tracking?
                # with open(f'./saved_models/{experiment_name}/steplog.txt', 'a+') as f:
                #     f.write(f'Step {i} cost: {cost}\n')

                optimizer.zero_grad()
                default_optimizer_step = optimizer.step  # added for batch replay

                # Backward and step
                if not AMP:
                    cost.backward()
                    replay_batch = False
                else:
                    # with amp.scale_loss(cost, optimizer) as scaled_loss:
                    #     scaled_loss.backward()
                    #     if pO.HVD: optimizer.synchronize()
                    # if optimizer.step is default_optimizer_step or not btReplay:
                    #     replay_batch = False
                    # elif maxR > 0:
                    #     optimizer.step()
                    pass

            if btReplay:
                pass  # amp._amp_state.loss_scalers[0]._loss_scale = mx_sc

            if (i + 1) % gAcc == 0:
                if pO.HVD and AMP:
                    with optimizer.skip_synchronize():
                        optimizer.step()
                else:
                    optimizer.step()

                model.zero_grad()
                model_ema.update(model, num_updates=i / 2)

                if (i + 1) % (gAcc * 2) == 0:
                    lr_scheduler.step()

            i += 1

        # validation part
        if True:
            elapsed_time = time.time() - start_time
            start_time = time.time()

            model.eval()
            with torch.no_grad():
                valid_loss, current_accuracy, current_norm_ED, ted, bleu, preds, labels, infer_time = validation(
                    model_ema.ema, criterion, valid_loader, converter, opt, pO)
            model.train()

            v_time = time.time() - start_time

            if OnceExecWorker:
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    checkpoint = {
                        'model': model.state_dict(),
                        'state_dict_ema': model_ema.ema.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }
                    torch.save(
                        checkpoint,
                        f'./saved_models/{experiment_name}/best_norm_ED.pth')

                if ted < best_CER:
                    best_CER = ted

                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy

                out = f'[{i}] Loss: {train_loss.avg:0.5f} time: ({elapsed_time:0.1f},{v_time:0.1f})'
                out += f' vloss: {valid_loss:0.3f}'
                out += f' CER: {ted:0.4f} NER: {current_norm_ED:0.4f} lr: {lr_scheduler.get_lr()[0]:0.5f}'
                out += f' bAcc: {best_accuracy:0.1f}, bNER: {best_norm_ED:0.4f}, bCER: {best_CER:0.4f}, B: {bleu*100:0.2f}'
                print(out)

                with open(f'./saved_models/{experiment_name}/log_train.txt', 'a') as log:
                    log.write(out + '\n')

                # if WdB:
                #     wandb.log({'lr': lr_scheduler.get_lr()[0], 'It': i, 'nED': current_norm_ED, 'B': bleu*100,
                #                'tloss': train_loss.avg, 'AnED': best_norm_ED, 'CER': ted, 'bestCER': best_CER, 'vloss': valid_loss})

        if DEBUG:
            print(
                f'[!!!] Iteration check. Value of i: {i} | Value of num_iter: {num_iter}'
            )

        # Changed i == num_iter to i >= num_iter and added the num_iter > 0 condition
        if num_iter > 0 and i >= num_iter:
            print('end the training')
            # sys.exit()
            break
def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for id in gpuIds:
                if gpuInfos:
                    logger.info(gpuInfos)
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, stats["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels, stats["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels, stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels, stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, stats["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels, stats["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels, stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels, stats["BlockIO"]["out"]))

    return result
def main():
    if len(sys.argv) != 3:
        raise ValueError(
            'Please specify two arguments: problem name and experiments configuration file'
        )

    allowed_experiments = {'icd9': ICD9_SETUP, 'mortality': MORTALITY_SETUP}
    experiment_setup = allowed_experiments.get(sys.argv[1], None)
    if experiment_setup is None:
        raise ValueError(
            f'Wrong problem name. Allowed values are: {list(allowed_experiments.keys())}'
        )

    configuration_filename = os.path.join(os.getenv('CODE'), sys.argv[2])
    if not os.path.exists(configuration_filename):
        raise ValueError(
            'Specified experiments configuration file does not exist.')

    with open(configuration_filename) as f:
        config_as_json = json.load(f)
    configurations = denormalize_config(config_as_json)

    model_config = Config()
    hook = sy.TorchHook(torch)

    X, y, folds = load_data(experiment_setup)

    metric_list = [
        Metric('accuracy', metrics.accuracy_score, use_soft=False),
        Metric('precision', metrics.precision_score, use_soft=False),
        Metric('recall', metrics.recall_score, use_soft=False),
        Metric('f1_score', metrics.f1_score, use_soft=False),
        Metric('roc_auc', metrics.roc_auc_score, use_soft=True),
        Metric('average_precision', metrics.average_precision_score,
               use_soft=True),
    ]

    results_folder = os.getenv('RESULTS')
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    results_filename = os.path.join(results_folder,
                                    experiment_setup.results_filename + '.csv')

    fieldnames = list(configurations[0].keys())
    fieldnames += [metric.name for metric in metric_list]
    fieldnames += [
        'collecting_datasets', 'training', 'training_per_epoch', 'prediction',
        'task'
    ]

    with open(results_filename, mode='w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

    node_distribution_str2func = {
        'beta_center': beta_center,
        'beta_right_skewed': beta_right_skewed,
        'beta_left_skewed': beta_left_skewed,
        'linear': linear,
        'uniform': uniform,
    }

    repetitions = len(folds)
    for i, experiment_config in enumerate(configurations):
        for j, (train_idx, valid_idx, test_idx) in enumerate(folds):
            print(
                f'config: {i+1}/{len(configurations)} repetition: {j+1}/{repetitions}',
                flush=True)
            print(experiment_config, flush=True)

            not_federated = experiment_config['nodes_type'] == 'no_nodes'
            if not_federated:
                experiment = NotFederatedExperiment(model_config)
            elif experiment_config['nodes_type'] == 'real':
                experiment = FederatedExperiment(
                    hook, model_config, experiment_config['num_of_workers'],
                    node_distribution_str2func[
                        experiment_config['node_distribution']],
                    use_real_workers=True)
            elif experiment_config['nodes_type'] == 'virtual':
                experiment = FederatedExperiment(
                    hook, model_config, experiment_config['num_of_workers'],
                    node_distribution_str2func[
                        experiment_config['node_distribution']],
                    use_real_workers=False)
            else:
                raise ValueError(
                    'Wrong nodes type. Allowed values are: "no_nodes", "real" or "virtual"'
                )

            model = build_model(model_config,
                                n_features=X.shape[1],
                                output_size=experiment_setup.output_size)

            train_idx = np.concatenate((train_idx, valid_idx))
            if experiment_config['train_size'] is not None:
                train_idx = train_idx[:experiment_config['train_size']]

            queue = Queue()
            p = Process(target=run_experiment,
                        args=(queue, not_federated, experiment, model, X, y,
                              train_idx, test_idx, metric_list,
                              experiment_setup.output_size))
            p.start()
            p.join()
            results = queue.get()

            for name, value in experiment_config.items():
                for k in range(experiment_setup.output_size):
                    results[k][name] = value

            with open(results_filename, mode='a') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                for k in range(experiment_setup.output_size):
                    writer.writerow(results[k])

            del experiment, p, queue
def to_metric(self): label = {"name": self.name} for k, v in self.condition_map.items(): label[k] = v return Metric("pai_node_count", label, 1)
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # Get network consumption ourselves: since all our services/jobs run in
        # the host network, the network statistics reported by docker are not
        # specific to a single container.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)

        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                pid, debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result