Example 1
    def test_convert_gpu_info_to_metrics(self):
        info = {
            '1': {
                'gpuUtil': u'98',
                'gpuMemUtil': u'97'
            },
            '0': {
                'gpuUtil': u'100',
                'gpuMemUtil': u'99'
            }
        }
        metrics = gpu_exporter.convert_gpu_info_to_metrics(info)
        self.assertEqual(5, len(metrics))

        self.assertIn(Metric("nvidiasmi_attached_gpus", {}, 2), metrics)
        self.assertIn(
            Metric("nvidiasmi_utilization_gpu", {"minor_number": "0"}, "100"),
            metrics)
        self.assertIn(
            Metric("nvidiasmi_utilization_memory", {"minor_number": "0"},
                   "99"), metrics)
        self.assertIn(
            Metric("nvidiasmi_utilization_gpu", {"minor_number": "1"}, "98"),
            metrics)
        self.assertIn(
            Metric("nvidiasmi_utilization_memory", {"minor_number": "1"},
                   "97"), metrics)
Example 2
def train(net: torch.nn.Module, dataset: torch.utils.data.Dataset, cfg: EasyDict):
    dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=0)
    loss_metric = Metric()
    # optimizer = torch.optim.SGD(net.parameters(), lr=cfg.lr)
    total_iter = len(dataloader)  # number of batches per epoch

    print('--------------------- start train ------------------------')
    for epoch in range(cfg.epoch):
        optimizer = torch.optim.SGD(net.parameters(), lr=(cfg.lr / pow(2, epoch)))
        for idx, sample in enumerate(dataloader):
            sentence, label = sample['sentence'], sample['label']
            logit = net(sentence)

            optimizer.zero_grad()
            loss = F.cross_entropy(logit, label)
            loss_metric.update(loss)
            loss.backward()
            optimizer.step()

            if (idx + 1) % cfg.print_freq == 0:
                print(f'epoch {epoch} iter {idx + 1}/{total_iter} loss {loss_metric.value()}')

    torch.save(net.state_dict(), '../../model/health_consult_classification.pth')
    print('save model to ../../model/health_consult_classification.pth')
    print('------------------------------ end train --------------------------------')
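
Here Metric() is used as a running loss meter through update() and value(). A minimal sketch under that assumption (not the repository's actual class):

class Metric:
    """Running average of a scalar loss; sketches the update()/value() API used above."""

    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, loss):
        # accepts a plain float or a 0-dim torch tensor
        self.total += float(loss)
        self.count += 1

    def value(self):
        return self.total / max(self.count, 1)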
Example 3
def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        for id in gpuIds:
            if gpuInfos:
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = id

                result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

        result.append(Metric("container_CPUPerc", otherLabels, stats[container]["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels, stats[container]["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels, stats[container]["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels, stats[container]["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels, stats[container]["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels, stats[container]["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels, stats[container]["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels, stats[container]["MemPerc"]))

    return result
Example 4
    def test_export_metrics_to_file(self):
        metrics = []
        metrics.append(Metric("foo", {"bar": 2}, "3"))
        metrics.append(Metric("bar", {}, "4"))
        with tempfile.NamedTemporaryFile() as f:
            utils.export_metrics_to_file(f.name, metrics)
            lines = f.readlines()
            self.assertEqual("foo{bar=\"2\"} 3", lines[0].strip())
            self.assertEqual("bar 4", lines[1].strip())
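
The expected lines follow the Prometheus text exposition format, name{label="value"} value. A sketch of how utils.export_metrics_to_file could serialize metrics this way, assuming Metric exposes name, labels and value (the real helper may differ):

def export_metrics_to_file(path, metrics):
    # Write one Prometheus text-format line per metric; sketch only.
    with open(path, "w") as f:
        for metric in metrics or []:
            if metric.labels:
                labels = ",".join(
                    '{}="{}"'.format(k, v) for k, v in metric.labels.items())
                f.write("{}{{{}}} {}\n".format(metric.name, labels, metric.value))
            else:
                f.write("{} {}\n".format(metric.name, metric.value))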
Example 5
def convert_gpu_info_to_metrics(gpuInfos):
    if gpuInfos is None:
        return None

    result = [Metric("nvidiasmi_attached_gpus", {}, len(gpuInfos))]

    for minorNumber, info in gpuInfos.items():
        label = {"minor_number": minorNumber}
        result.append(
            Metric("nvidiasmi_utilization_gpu", label, info["gpuUtil"]))
        result.append(
            Metric("nvidiasmi_utilization_memory", label, info["gpuMemUtil"]))

    return result
Example 6
    def to_metric(self):
        label = {
            "service_name": self.service_name,
            "name": self.name,
            "state": self.state,
            "ready": self.ready
        }
        return Metric("pai_container_count", label, 1)
Example 7
    def test_metrics_eq(self):
        self.assertEqual(Metric("foo", {"abc": "1"}, "3"),
                         Metric("foo", {"abc": "1"}, "3"))

        self.assertNotEqual(Metric("foo", {"abc": "1"}, "3"),
                            Metric("bar", {"abc": "1"}, "3"))
        self.assertNotEqual(Metric("foo", {"abc": "2"}, "3"),
                            Metric("foo", {"abc": "1"}, "3"))
        self.assertNotEqual(Metric("foo", {"abc": "2"}, "3"),
                            Metric("foo", {"abc": "2"}, "5"))
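
The equality checks above treat Metric as a plain value object. A minimal sketch of such a container, assuming the fields name, labels and value (the project's actual class may differ):

import collections

# Hypothetical Metric value object: a namedtuple compares field by field,
# which is the behaviour assertEqual/assertNotEqual rely on here.
Metric = collections.namedtuple("Metric", ["name", "labels", "value"])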
Example 8
def collect_docker_daemon_status(configFilePath):
    metrics = []

    cluster_config = common.load_yaml_file(configFilePath)
    node_configs = cluster_config['machine-list']
    username = ""
    password = ""
    port = ""

    if "default-machine-properties" in cluster_config:
        if "username" in cluster_config["default-machine-properties"]:
            username = cluster_config["default-machine-properties"]["username"]
        if "password" in cluster_config["default-machine-properties"]:
            password = cluster_config["default-machine-properties"]["password"]
        if "sshport" in cluster_config["default-machine-properties"]:
            port = cluster_config["default-machine-properties"]["sshport"]

    cmd = "sudo systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"
    errorNodeCount = 0

    for node_config in node_configs:
        ip = node_config["hostip"]
        label = {"instance": ip}

        try:
            if "username" not in node_config or "password" not in node_config or "sshport" not in node_config:
                node_config["username"] = username
                node_config["password"] = password
                node_config["port"] = port

            flag = common.ssh_shell_paramiko(node_config, cmd)
            if not flag:
                errorNodeCount += 1
                # single node docker health
                metrics.append(Metric("node_current_docker_error", label, 1))
        except Exception as e:
            logger.exception("ssh to %s failed", ip)
            errorNodeCount += 1
            metrics.append(Metric("node_current_docker_error", label, 1))

    if errorNodeCount > 0:
        metrics.append(Metric("docker_error_node_count", {}, errorNodeCount))

    return metrics
Example 9
    def to_metric(self):
        label = {"name": self.name, "phase": self.phase}

        if self.host_ip is not None:
            label["host_ip"] = self.host_ip

        for k, v in self.condition_map.items():
            label[k] = v

        return Metric("pai_pod_count", label, 1)
Example 10
def collect_k8s_componentStatus(address, nodesJsonObject):
    metrics = []

    emptyLabel = {}

    # 1. check api server
    try:
        apiServerHealthy = requests.get("{}/healthz".format(address)).text

        if apiServerHealthy != "ok":
            # api server health status, 1 is error
            metrics.append(
                Metric("apiserver_current_status_error", emptyLabel, 1))
    except Exception as e:
        logger.exception("get api server status failed")
        metrics.append(Metric("apiserver_current_status_error", emptyLabel, 1))

    # 2. check etcd
    try:
        etcdHealthy = requests.get("{}/healthz/etcd".format(address)).text

        if etcdHealthy != "ok":
            # etcd health status, 1 is error
            metrics.append(Metric("etcd_current_status_error", emptyLabel, 1))
    except Exception as e:
        logger.exception("get etcd status failed")
        metrics.append(Metric("etcd_current_status_error", emptyLabel, 1))

    # 3. check kubelet
    nodeItems = nodesJsonObject["items"]
    kubeletErrorCount = 0

    for name in nodeItems:
        ip = name["metadata"]["name"]

        label = {"node": ip}

        try:
            kubeletHealthy = requests.get("http://{}:{}/healthz".format(
                ip, 10255)).text

            if kubeletHealthy != "ok":
                # each node kubelet health status, 1 is error
                metrics.append(Metric("kubelet_current_status_error", label,
                                      1))
                kubeletErrorCount += 1
        except Exception as e:
            kubeletErrorCount += 1
            logger.exception("get kubelet status failed")
            metrics.append(Metric("kubelet_current_status_error", label, 1))

    metrics.append(
        Metric("current_status_error_kubelet_count", emptyLabel,
               kubeletErrorCount))

    return metrics
Example 11
def main(argv):
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    iter = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1
            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path,
                                             [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")
        finally:
            end = datetime.datetime.now()

            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       (end - start).seconds)
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
Example 12
def collect_healthz(metric_name, address, port, url):
    label = {"address": address, "error": "ok"}

    try:
        healthy = requests.get("http://{}:{}{}".format(address, port,
                                                       url)).text

        if healthy != "ok":
            label["error"] = healthy
    except Exception as e:
        label["error"] = str(e)
        logger.exception("requesting %s:%d%s failed", address, port, url)

    return Metric(metric_name, label, 1)
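
A hypothetical call; the metric name, address, port and path below are placeholders, not taken from the source:

metric = collect_healthz("k8s_api_server_count", "127.0.0.1", 8080, "/healthz")
print(metric)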
Example 13
def parse_nodes_status(nodesJsonObject):
    nodeItems = nodesJsonObject["items"]

    metrics = []
    readyNodeCount = 0
    dockerError = 0

    for name in nodeItems:
        # 1. check each node status
        for condition in name["status"]["conditions"]:
            if "Ready" == condition["type"]:
                readyStatus = condition["status"]
                if readyStatus != "True":
                    # node status, value 1 is error
                    label = {"node": name["metadata"]["name"]}
                    metrics.append(Metric("node_current_notready", label, 1))
                else:
                    readyNodeCount += 1

    metrics.append(
        Metric("notready_node_count", {},
               len(nodeItems) - readyNodeCount))
    return metrics
Example 14
def generate_zombie_count(stats, type1_zombies, type2_zombies):
    """
    There are two types of zombie containers:
        1. a container that printed "USER COMMAND END" but has not exited for a long period of time
        2. yarn container exited but job container didn't
    """
    exited_containers = set(filter(is_container_exited, stats.keys()))
    logger.debug("exited_containers is %s", exited_containers)

    now = datetime.datetime.now()
    zombie_count1 = generate_zombie_count_type1(type1_zombies,
                                                exited_containers, now)
    zombie_count2 = generate_zombie_count_type2(type2_zombies, stats, now)

    return [
        Metric("zombie_container_count", {}, zombie_count1 + zombie_count2)
    ]
Example 15
def collect_docker_daemon_status(hosts):
    metrics = []

    cmd = "sudo systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"

    for host in hosts:
        label = {"ip": host["hostip"], "error": "ok"}

        try:
            flag = common.ssh_shell_paramiko(host, cmd)
            if not flag:
                label["error"] = "config"  # configuration is not correct
        except Exception as e:
            label["error"] = str(e)
            logger.exception("ssh to %s failed", host["hostip"])

        metrics.append(Metric("docker_daemon_count", label, 1))

    return metrics
Example 16
def collect_docker_daemon_status():
    """ check docker daemon status in current host """
    cmd = "systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"
    error = "ok"

    try:
        logger.info("call systemctl to get docker status")

        out = utils.check_output(cmd, shell=True)

        if "active" not in out:
            error = "inactive"
    except subprocess.CalledProcessError as e:
        logger.exception("command '%s' returned with error (code %d): %s", cmd,
                         e.returncode, e.output)
        error = str(e)
    except OSError as e:
        if e.errno == os.errno.ENOENT:
            logger.warning("systemctl not found")
        error = str(e)

    return Metric("docker_daemon_count", {"error": error}, 1)
Example 17
def evaluate(data_loader):
    meter = Metric(mode=args.mode)
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for idx, (img, segm) in enumerate(data_loader):
            img = img.cuda()
            segm = segm.cuda()
            outputs = model(img)
            loss = criterion(outputs, segm)
            del img
            del segm
            outputs = outputs.detach().cpu()
            segm = segm.detach().cpu()
            meter.update(segm, outputs)
            total_loss += loss.item()
        if args.mode == 'cls':
            tn, tp = meter.get_metrics()
            return total_loss / len(data_loader), tn, tp
        else:
            dices, iou = meter.get_metrics()
            dice, dice_neg, dice_pos = dices
            torch.cuda.empty_cache()
            return total_loss / len(data_loader), iou, dice, dice_neg, dice_pos
Example 18
def parse_pods_status(podsJsonObject):
    kube_pod_status_probe_not_ready = 0
    kube_pod_status_phase_failed = 0
    kube_pod_status_phase_unknown = 0
    pod_container_status_waiting = 0
    pod_container_status_terminated = 0
    pod_container_status_not_ready = 0
    pod_container_status_restarted_pod_count = 0

    metrics = []

    serviceMetrics = collections.defaultdict(lambda: Service())
    existServiceKey = {}

    podItems = podsJsonObject["items"]

    for pod in podItems:
        # per-pod checks: phase Failed / Unknown and not-ready condition
        serviceName = ""

        if "generateName" in pod["metadata"]:
            serviceName = pod["metadata"]["generateName"]
        else:
            serviceName = pod["metadata"]["name"]

        service = serviceMetrics[serviceName]

        status = pod["status"]
        phase = status["phase"]
        conditions = status["conditions"]
        ready = "True"
        init = "True"
        scheduled = "True"

        # 1. check not ready pod
        for condition in conditions:
            if condition["type"] == "Ready":
                ready = condition["status"]
            elif condition["type"] == "Initialized":
                init = condition["status"]
            elif condition["type"] == "PodScheduled":
                scheduled = condition["status"]

        # NOTE: this map will be reused in multiple metrics, do not modify this map
        label = {
            "pod": pod["metadata"]["name"],
            "hostip": pod["status"]["hostIP"]
        }

        if ready != "True" and init == "True" and scheduled == "True":
            kube_pod_status_probe_not_ready += 1
            # this pod's readiness probe failed: Ready condition is not True, value is 1
            metrics.append(Metric("pod_current_probe_not_ready", label, 1))
            service.kube_pod_status_probe_not_ready += 1

        # 2. check failed phase pods
        if phase == "Failed":
            kube_pod_status_phase_failed += 1
            # this pod's phase is Failed, value is 1
            metrics.append(Metric("pod_current_phase_failed", label, 1))
            service.kube_pod_status_phase_failed += 1

        # 3. check unknown phase pods
        if phase == "Unknown":
            kube_pod_status_phase_unknown += 1
            # this pod's phase is Unknown, value is 1
            metrics.append(Metric("pod_current_phase_unknown", label, 1))
            service.kube_pod_status_phase_unknown += 1

        containerStatus = status["containerStatuses"]

        # 4. check pod containers running/waiting/terminated status
        for perContainerStatus in containerStatus:
            containerReady = perContainerStatus["ready"]
            restartCount = perContainerStatus["restartCount"]

            containerLabel = copy.deepcopy(label)
            containerLabel["container"] = perContainerStatus["name"]

            if not containerReady:
                pod_container_status_not_ready += 1
                # a container in this pod is not ready, value is 1
                metrics.append(
                    Metric("container_current_not_ready", containerLabel, 1))
                service.pod_container_status_not_ready += 1

            state = perContainerStatus["state"]
            if "terminated" in state:
                pod_container_status_terminated += 1
                # a container in this pod is in terminated state, value is 1
                metrics.append(
                    Metric("container_current_terminated", containerLabel, 1))
                service.pod_container_status_terminated += 1

            if "waiting" in state:
                pod_container_status_waiting += 1
                # a container in this pod is in waiting state, value is 1
                metrics.append(
                    Metric("container_current_waiting", containerLabel, 1))
                service.pod_container_status_waiting += 1

            if restartCount > 0:
                pod_container_status_restarted_pod_count += 1
                # cumulative restart count of this pod's container
                metrics.append(
                    Metric("container_accumulation_restart_total",
                           containerLabel, restartCount))
                service.pod_container_status_restart_total += 1

    # service level aggregation metrics
    for serviceName, service in serviceMetrics.items():
        label = {"service": serviceName}
        # per service: pods whose readiness probe failed (not ready), total count
        if service.kube_pod_status_probe_not_ready != 0:
            metrics.append(
                Metric("service_current_probe_not_ready_pod_count", label,
                       service.kube_pod_status_probe_not_ready))
        # per service: pods in Failed phase, total count
        if service.kube_pod_status_phase_failed != 0:
            metrics.append(
                Metric("service_current_phase_failed_pod_count", label,
                       service.kube_pod_status_phase_failed))
        # per service: pods in Unknown phase, total count
        if service.kube_pod_status_phase_unknown != 0:
            metrics.append(
                Metric("service_current_phase_unknown_pod_count", label,
                       service.kube_pod_status_phase_unknown))
        # per service: containers in waiting state, total count
        if service.pod_container_status_waiting != 0:
            metrics.append(
                Metric("service_current_waiting_container_count", label,
                       service.pod_container_status_waiting))
        # per service: containers in terminated state, total count
        if service.pod_container_status_terminated != 0:
            metrics.append(
                Metric("service_current_terminated_container_count", label,
                       service.pod_container_status_terminated))
        # per service: containers that are not ready, total count
        if service.pod_container_status_not_ready != 0:
            metrics.append(
                Metric("service_current_probe_not_ready_pod_count", label,
                       service.pod_container_status_not_ready))
        # per service: containers that have restarted, total count
        if service.pod_container_status_restart_total != 0:
            metrics.append(
                Metric("service_restarted_container_count", label,
                       service.pod_container_status_restart_total))

    emptyLabel = {}
    metrics.append(
        Metric("cluster_current_probe_not_ready_pod_count", emptyLabel,
               kube_pod_status_probe_not_ready))
    metrics.append(
        Metric("cluster_current_phase_failed_pod_count", emptyLabel,
               kube_pod_status_phase_failed))
    metrics.append(
        Metric("cluster_phase_unknown_pod_count", emptyLabel,
               kube_pod_status_phase_unknown))
    metrics.append(
        Metric("cluster_current_status_not_ready_container_count", emptyLabel,
               pod_container_status_not_ready))
    metrics.append(
        Metric("cluster_current_terminated_container_count", emptyLabel,
               pod_container_status_terminated))
    metrics.append(
        Metric("cluster_current_waiting_container_count", emptyLabel,
               pod_container_status_waiting))
    metrics.append(
        Metric("cluster_container_once_restarted_pod_count", emptyLabel,
               pod_container_status_restarted_pod_count))

    return metrics
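
For reference, the fields read above imply a Kubernetes pod-list shaped input; a minimal hand-written example with hypothetical values that this function would accept:

# Hypothetical pod list containing only the fields parse_pods_status reads.
pods_json = {
    "items": [{
        "metadata": {"name": "node-exporter-abcde", "generateName": "node-exporter-"},
        "status": {
            "phase": "Running",
            "hostIP": "10.0.0.1",
            "conditions": [
                {"type": "Initialized", "status": "True"},
                {"type": "PodScheduled", "status": "True"},
                {"type": "Ready", "status": "False"},
            ],
            "containerStatuses": [{
                "name": "node-exporter",
                "ready": False,
                "restartCount": 2,
                "state": {"waiting": {"reason": "CrashLoopBackOff"}},
            }],
        },
    }]
}

metrics = parse_pods_status(pods_json)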
Example 19
def train(opt,
          AMP,
          WdB,
          ralph_path,
          train_data_path,
          train_data_list,
          test_data_path,
          test_data_list,
          experiment_name,
          train_batch_size,
          val_batch_size,
          workers,
          lr,
          valInterval,
          num_iter,
          wdbprj,
          continue_model='',
          finetune=''):

    HVD3P = pO.HVD or pO.DDP

    os.makedirs(f'./saved_models/{experiment_name}', exist_ok=True)

    # if OnceExecWorker and WdB:
    #     wandb.init(project=wdbprj, name=experiment_name)
    #     wandb.config.update(opt)

    # load supplied ralph
    with open(ralph_path, 'r') as f:
        ralph_train = json.load(f)

    print('[4] IN TRAIN; BEFORE MAKING DATASET')
    train_dataset = ds_load.myLoadDS(train_data_list,
                                     train_data_path,
                                     ralph=ralph_train)
    valid_dataset = ds_load.myLoadDS(test_data_list,
                                     test_data_path,
                                     ralph=ralph_train)

    # SAVE RALPH FOR LATER USE
    # with open(f'./saved_models/{experiment_name}/ralph.json', 'w+') as f:
    #     json.dump(train_dataset.ralph, f)

    print('[5] DATASET DONE LOADING')
    if OnceExecWorker:
        print(pO)
        print('Alphabet :', len(train_dataset.alph), train_dataset.alph)
        for d in [train_dataset, valid_dataset]:
            print('Dataset Size :', len(d.fns))
            # print('Max LbW : ',max(list(map(len,d.tlbls))) )
            # print('#Chars : ',sum([len(x) for x in d.tlbls]))
            # print('Sample label :',d.tlbls[-1])
            # print("Dataset :", sorted(list(map(len,d.tlbls))) )
            print('-' * 80)

    if opt.num_gpu > 1:
        workers = workers * (1 if HVD3P else opt.num_gpu)

    if HVD3P:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=opt.world_size, rank=opt.rank)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset, num_replicas=opt.world_size, rank=opt.rank)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True if not HVD3P else False,
        pin_memory=True,
        num_workers=int(workers),
        sampler=train_sampler if HVD3P else None,
        worker_init_fn=WrkSeeder,
        collate_fn=ds_load.SameTrCollate)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=val_batch_size,
        pin_memory=True,
        num_workers=int(workers),
        sampler=valid_sampler if HVD3P else None)

    model = OrigamiNet()
    model.apply(init_bn)

    # load finetune ckpt
    if finetune != '':
        model = load_finetune(model, finetune)

    model.train()

    if OnceExecWorker:
        import pprint
        [print(k, model.lreszs[k]) for k in sorted(model.lreszs.keys())]

    biparams = list(
        dict(filter(lambda kv: 'bias' in kv[0],
                    model.named_parameters())).values())
    nonbiparams = list(
        dict(filter(lambda kv: 'bias' not in kv[0],
                    model.named_parameters())).values())

    if not pO.DDP:
        model = model.to(device)
    else:
        model.cuda(opt.rank)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=10**(-1 /
                                                                     90000))

    # if OnceExecWorker and WdB:
    #     wandb.watch(model, log="all")

    if pO.HVD:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        # optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=hvd.Compression.fp16)

    if pO.DDP and opt.rank != 0:
        random.seed()
        np.random.seed()

    # if AMP:
    #     model, optimizer = amp.initialize(model, optimizer, opt_level = "O1")
    if pO.DP:
        model = torch.nn.DataParallel(model)
    elif pO.DDP:
        model = pDDP(model,
                     device_ids=[opt.rank],
                     output_device=opt.rank,
                     find_unused_parameters=False)

    model_ema = ModelEma(model)

    if continue_model != '':
        if OnceExecWorker:
            print(f'loading pretrained model from {continue_model}')
        checkpoint = torch.load(
            continue_model, map_location=f'cuda:{opt.rank}' if HVD3P else None)
        model.load_state_dict(checkpoint['model'], strict=True)
        optimizer.load_state_dict(checkpoint['optimizer'])
        model_ema._load_checkpoint(continue_model,
                                   f'cuda:{opt.rank}' if HVD3P else None)

    criterion = torch.nn.CTCLoss(reduction='none',
                                 zero_infinity=True).to(device)
    converter = CTCLabelConverter(train_dataset.ralph.values())

    if OnceExecWorker:
        with open(f'./saved_models/{experiment_name}/opt.txt',
                  'a') as opt_file:
            opt_log = '------------ Options -------------\n'
            args = vars(opt)
            for k, v in args.items():
                opt_log += f'{str(k)}: {str(v)}\n'
            opt_log += '---------------------------------------\n'
            opt_log += gin.operative_config_str()
            opt_file.write(opt_log)
            # if WdB:
            #     wandb.config.gin_str = gin.operative_config_str().splitlines()

        print(optimizer)
        print(opt_log)

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    best_CER = 1e+6
    i = 0
    gAcc = 1
    epoch = 1
    btReplay = False and AMP
    max_batch_replays = 3

    if HVD3P: train_sampler.set_epoch(epoch)
    titer = iter(train_loader)

    while (True):
        start_time = time.time()

        model.zero_grad()
        train_loss = Metric(pO, 'train_loss')

        for j in trange(valInterval, leave=False, desc='Training'):

            # Load a batch
            try:
                image_tensors, labels, fnames = next(titer)
            except StopIteration:
                epoch += 1
                if HVD3P: train_sampler.set_epoch(epoch)
                titer = iter(train_loader)
                image_tensors, labels, fnames = next(titer)

            # log filenames
            # fnames = [f'{i}___{fname}' for fname in fnames]
            # with open(f'./saved_models/{experiment_name}/filelog.txt', 'a+') as f:
            #     f.write('\n'.join(fnames) + '\n')

            # Move to device
            image = image_tensors.to(device)
            text, length = converter.encode(labels)
            batch_size = image.size(0)

            replay_batch = True
            maxR = 3
            while replay_batch and maxR > 0:
                maxR -= 1

                # Forward pass
                preds = model(image, text).float()
                preds_size = torch.IntTensor([preds.size(1)] *
                                             batch_size).to(device)
                preds = preds.permute(1, 0, 2).log_softmax(2)

                if i == 0 and OnceExecWorker:
                    print('Model inp : ', image.dtype, image.size())
                    print('CTC inp : ', preds.dtype, preds.size(),
                          preds_size[0])

                # To avoid a ctc_loss issue, disable cuDNN while computing the ctc_loss
                torch.backends.cudnn.enabled = False
                cost = criterion(preds, text.to(device), preds_size,
                                 length.to(device)).mean() / gAcc
                torch.backends.cudnn.enabled = True

                train_loss.update(cost)

                # cost tracking?
                # with open(f'./saved_models/{experiment_name}/steplog.txt', 'a+') as f:
                #     f.write(f'Step {i} cost: {cost}\n')

                optimizer.zero_grad()
                default_optimizer_step = optimizer.step  # added for batch replay

                # Backward and step
                if not AMP:
                    cost.backward()
                    replay_batch = False
                else:
                    # with amp.scale_loss(cost, optimizer) as scaled_loss:
                    #     scaled_loss.backward()
                    #     if pO.HVD: optimizer.synchronize()

                    # if optimizer.step is default_optimizer_step or not btReplay:
                    #     replay_batch = False
                    # elif maxR>0:
                    #     optimizer.step()
                    pass

            if btReplay:
                pass  #amp._amp_state.loss_scalers[0]._loss_scale = mx_sc

            if (i + 1) % gAcc == 0:

                if pO.HVD and AMP:
                    with optimizer.skip_synchronize():
                        optimizer.step()
                else:
                    optimizer.step()

                model.zero_grad()
                model_ema.update(model, num_updates=i / 2)

                if (i + 1) % (gAcc * 2) == 0:
                    lr_scheduler.step()

            i += 1

        # validation part
        if True:

            elapsed_time = time.time() - start_time
            start_time = time.time()

            model.eval()
            with torch.no_grad():

                valid_loss, current_accuracy, current_norm_ED, ted, bleu, preds, labels, infer_time = validation(
                    model_ema.ema, criterion, valid_loader, converter, opt, pO)

            model.train()
            v_time = time.time() - start_time

            if OnceExecWorker:
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    checkpoint = {
                        'model': model.state_dict(),
                        'state_dict_ema': model_ema.ema.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }
                    torch.save(
                        checkpoint,
                        f'./saved_models/{experiment_name}/best_norm_ED.pth')

                if ted < best_CER:
                    best_CER = ted

                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy

                out = f'[{i}] Loss: {train_loss.avg:0.5f} time: ({elapsed_time:0.1f},{v_time:0.1f})'
                out += f' vloss: {valid_loss:0.3f}'
                out += f' CER: {ted:0.4f} NER: {current_norm_ED:0.4f} lr: {lr_scheduler.get_lr()[0]:0.5f}'
                out += f' bAcc: {best_accuracy:0.1f}, bNER: {best_norm_ED:0.4f}, bCER: {best_CER:0.4f}, B: {bleu*100:0.2f}'
                print(out)

                with open(f'./saved_models/{experiment_name}/log_train.txt',
                          'a') as log:
                    log.write(out + '\n')

                # if WdB:
                #     wandb.log({'lr': lr_scheduler.get_lr()[0], 'It':i, 'nED': current_norm_ED,  'B':bleu*100,
                #     'tloss':train_loss.avg, 'AnED': best_norm_ED, 'CER':ted, 'bestCER':best_CER, 'vloss':valid_loss})

        if DEBUG:
            print(
                f'[!!!] Iteration check. Value of i: {i} | Value of num_iter: {num_iter}'
            )

        # Change i == num_iter to i >= num_iter
        # Add num_iter > 0 condition
        if num_iter > 0 and i >= num_iter:
            print('end the training')
            #sys.exit()
            break
Example 20
def collect_job_metrics(gpuInfos):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:] # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for id in gpuIds:
                if gpuInfos:
                    logger.info(gpuInfos)
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, stats["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels, stats["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels, stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels, stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, stats["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels, stats["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels, stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels, stats["BlockIO"]["out"]))

    return result
Example 21
def main():
    if len(sys.argv) != 3:
        raise ValueError(
            'Please specify two arguments: problem name and experiments configuration file'
        )

    allowed_experiments = {'icd9': ICD9_SETUP, 'mortality': MORTALITY_SETUP}
    experiment_setup = allowed_experiments.get(sys.argv[1], None)
    if experiment_setup is None:
        raise ValueError(
            f'Wrong problem name. Allowed values are: {list(allowed_experiments.keys())}'
        )

    configuration_filename = os.path.join(os.getenv('CODE'), sys.argv[2])
    if not os.path.exists(configuration_filename):
        raise ValueError(
            'Specified experiments configuration file does not exist.')

    with open(configuration_filename) as f:
        config_as_json = json.load(f)
    configurations = denormalize_config(config_as_json)

    model_config = Config()
    hook = sy.TorchHook(torch)

    X, y, folds = load_data(experiment_setup)

    metric_list = [
        Metric('accuracy', metrics.accuracy_score, use_soft=False),
        Metric('precision', metrics.precision_score, use_soft=False),
        Metric('recall', metrics.recall_score, use_soft=False),
        Metric('f1_score', metrics.f1_score, use_soft=False),
        Metric('roc_auc', metrics.roc_auc_score, use_soft=True),
        Metric('average_precision',
               metrics.average_precision_score,
               use_soft=True),
    ]

    results_folder = os.getenv('RESULTS')

    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    results_filename = os.path.join(results_folder,
                                    experiment_setup.results_filename + '.csv')

    fieldnames = list(configurations[0].keys())
    fieldnames += [metric.name for metric in metric_list]
    fieldnames += [
        'collecting_datasets', 'training', 'training_per_epoch', 'prediction',
        'task'
    ]
    with open(results_filename, mode='w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

    node_distribution_str2func = {
        'beta_center': beta_center,
        'beta_right_skewed': beta_right_skewed,
        'beta_left_skewed': beta_left_skewed,
        'linear': linear,
        'uniform': uniform,
    }

    repetitions = len(folds)
    for i, experiment_config in enumerate(configurations):
        for j, (train_idx, valid_idx, test_idx) in enumerate(folds):
            print(
                f'config: {i+1}/{len(configurations)} repetition: {j+1}/{repetitions}',
                flush=True)
            print(experiment_config, flush=True)

            not_federated = experiment_config['nodes_type'] == 'no_nodes'
            if not_federated:
                experiment = NotFederatedExperiment(model_config)
            elif experiment_config['nodes_type'] == 'real':
                experiment = FederatedExperiment(
                    hook,
                    model_config,
                    experiment_config['num_of_workers'],
                    node_distribution_str2func[
                        experiment_config['node_distribution']],
                    use_real_workers=True)
            elif experiment_config['nodes_type'] == 'virtual':
                experiment = FederatedExperiment(
                    hook,
                    model_config,
                    experiment_config['num_of_workers'],
                    node_distribution_str2func[
                        experiment_config['node_distribution']],
                    use_real_workers=False)
            else:
                raise ValueError(
                    'Wrong nodes type. Allowed values are: "no_nodes", "real" or "virtual"'
                )

            model = build_model(model_config,
                                n_features=X.shape[1],
                                output_size=experiment_setup.output_size)

            train_idx = np.concatenate((train_idx, valid_idx))
            if experiment_config['train_size'] is not None:
                train_idx = train_idx[:experiment_config['train_size']]

            queue = Queue()
            p = Process(target=run_experiment,
                        args=(queue, not_federated, experiment, model, X, y,
                              train_idx, test_idx, metric_list,
                              experiment_setup.output_size))
            p.start()
            p.join()
            results = queue.get()

            for name, value in experiment_config.items():
                for k in range(experiment_setup.output_size):
                    results[k][name] = value

            with open(results_filename, mode='a') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                for k in range(experiment_setup.output_size):
                    writer.writerow(results[k])

            del experiment, p, queue
Example 22
    def to_metric(self):
        label = {"name": self.name}
        for k, v in self.condition_map.items():
            label[k] = v

        return Metric("pai_node_count", label, 1)
Example 23
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption; since all our services/jobs run in the host network,
        # the network statistics from docker are not specific to one container, so we
        # have to collect them ourselves.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result