def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """Build the cluster layout from paddlecloud environment variables.

    Node information is read from the environment, not from the arguments:
    ``PADDLE_TRAINERS`` (comma-separated node IPs), ``POD_IP`` (IP of this
    node) and ``PADDLE_TRAINER_ID`` (rank of this node). ``args_node_ips``
    and ``args_node_ip`` are only compared with the environment so that a
    mismatch can be reported through a warning.

    Args:
        args_node_ips: string, user-supplied comma-separated node IPs.
        args_node_ip: string, user-supplied IP of the current node.
        args_port: first trainer port; ``None`` falls back to 6170 unless a
            cloud-specified port is adopted.
        selected_gpus: sized collection; one consecutive port is allocated
            per entry.

    Returns:
        ``(cluster, pod)`` where ``cluster`` is the first value returned by
        ``get_cluster`` and ``pod`` is ``cluster.pods[node_rank]``.

    Raises:
        AssertionError: if ``PADDLE_TRAINERS``, ``POD_IP`` or
            ``PADDLE_TRAINER_ID`` is unset.
        ValueError: if ``PADDLE_TRAINER_ID`` is not an integer.
    """
    # You can automatically get ip info while using paddlecloud multi nodes mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    # NOTE(review): this tests the environment's POD_IP against loopback,
    # whereas the cluster_node_ips check below tests the user argument
    # against loopback -- confirm which of the two is intended here.
    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning(
            "Please NOTE: When using paddlecloud, node_ip is "
            "automatically got from POD_IP. Your input node_ip: {} doesn't equals to "
            "node_ip: {} from paddlecloud environment.".format(args_node_ip,
                                                               node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node)."
            "Your input cluster_node_ips: {} doesn't equals to IPs: {} from "
            "paddlecloud environment.".format(args_node_ips, node_ips))

    started_port = args_port
    print("num_nodes:", num_nodes)
    if num_nodes > 1:
        # Best effort: the cloud-provided port is optional. Only the integer
        # parsing is guarded, so unrelated errors are no longer swallowed.
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
        except ValueError as e:
            logger.warning(
                "PADDLE_PORT/TRAINER_PORTS_NUM is unset or not an integer "
                "({}); the cloud specified port is ignored.".format(e))
        else:
            # Adopt the cloud port only when the cloud reserved at least one
            # port per selected GPU and it differs from the requested port.
            if paddle_port_num >= len(
                    selected_gpus) and paddle_port != args_port:
                logger.warning(
                    "Use Cloud specified port:{}.".format(paddle_port))
                started_port = paddle_port

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args:node_ips:{} "
                 "        node_ip:{} node_rank:{} started_port:{}".format(
                     node_ips, node_ip, node_rank, started_port))

    # One consecutive port per selected GPU, starting at started_port.
    ports = list(range(started_port, started_port + len(selected_gpus)))
    cluster, _ = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]
# Example #2
# 0
def get_cluster_from_args(selected_gpus):
    """Describe a single-node cluster on 127.0.0.1 for the selected GPUs.

    Asks ``find_free_ports`` for ``len(selected_gpus)`` ports and forwards
    them (as a list, or ``None`` when none were returned) to ``get_cluster``,
    whose result is returned unchanged.
    """
    local_ip = '127.0.0.1'
    all_ips = '127.0.0.1'

    ip_list = [piece.strip() for piece in all_ips.split(',')]

    # Raises ValueError if the local IP is not one of the cluster IPs.
    ip_list.index(local_ip)

    found_ports = find_free_ports(len(selected_gpus))
    port_list = None if found_ports is None else list(found_ports)
    return get_cluster(ip_list, local_ip, port_list, selected_gpus)
def get_cluster_from_args(selected_gpus):
    """Describe a single-node cluster on 127.0.0.1 for the selected GPUs.

    Asks ``find_free_ports`` for ``len(selected_gpus)`` ports, builds one
    list of ``"ip:port"`` endpoints per node and passes them to
    ``get_cluster``, whose result is returned unchanged.

    Raises:
        RuntimeError: if ``find_free_ports`` returns ``None``. Previously
            this surfaced as an opaque "'NoneType' object is not iterable"
            TypeError while the endpoints were being built.
    """
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]

    # Raises ValueError if node_ip is not one of the cluster IPs.
    node_ips.index(node_ip)

    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is None:
        raise RuntimeError(
            "find_free_ports returned None: cannot build trainer endpoints "
            "for {} selected gpu(s).".format(len(selected_gpus)))
    free_ports = list(free_ports)

    # One endpoint list per node, each holding one "ip:port" per free port.
    trainer_endpoints = [["%s:%d" % (ip, port) for port in free_ports]
                         for ip in node_ips]
    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
# Example #4
# 0
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """Build the cluster layout from paddlecloud environment variables.

    Node information is read from the environment, not from the arguments:
    ``PADDLE_TRAINERS`` (comma-separated node IPs), ``POD_IP`` (IP of this
    node), ``PADDLE_TRAINER_ID`` (rank of this node) and
    ``TRAINER_PORTS_NUM`` (number of ports per node). ``args_node_ips`` and
    ``args_node_ip`` are only compared with the environment so that a
    mismatch can be reported through a warning.

    When ``DISTRIBUTED_TRAINER_ENDPOINTS`` is set, it is split into
    ``TRAINER_PORTS_NUM`` endpoints per node. Otherwise endpoints are built
    from consecutive ports, one per selected GPU, on every node.

    Args:
        args_node_ips: string, user-supplied comma-separated node IPs.
        args_node_ip: string, user-supplied IP of the current node.
        args_port: int, first trainer port; ``None`` falls back to 6170
            unless a cloud-specified port is adopted.
        selected_gpus: list; one consecutive port is allocated per entry
            when endpoints are not supplied by the environment.

    Returns:
        ``(cluster, pod)`` where ``cluster`` is the first value returned by
        ``get_cluster`` and ``pod`` is ``cluster.pods[node_rank]``.

    Raises:
        AssertionError: if a required environment variable is unset, or if
            the number of entries in ``DISTRIBUTED_TRAINER_ENDPOINTS`` is
            not ``num_nodes * TRAINER_PORTS_NUM``.
        ValueError: if ``PADDLE_TRAINER_ID`` or ``TRAINER_PORTS_NUM`` is not
            an integer.
    """
    # You can automatically get ip info while using paddlecloud multi nodes mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    # Check for presence BEFORE converting: int(None) raises TypeError, which
    # used to pre-empt the assertion and its message.
    paddle_ports_num = os.getenv("TRAINER_PORTS_NUM")
    assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None"
    paddle_ports_num = int(paddle_ports_num)

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    # NOTE(review): this tests the environment's POD_IP against loopback,
    # whereas the cluster_node_ips check below tests the user argument
    # against loopback -- confirm which of the two is intended here.
    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning(
            "Please NOTE: When using paddlecloud, node_ip is "
            "automatically got from POD_IP. Your input node_ip: {} doesn't equals to "
            "node_ip: {} from paddlecloud environment.".format(args_node_ip,
                                                               node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node)."
            "Your input cluster_node_ips: {} doesn't equals to IPs: {} from "
            "paddlecloud environment.".format(args_node_ips, node_ips))

    # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4
    # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8"
    trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS")
    if trainer_endpoints is None:
        started_port = args_port
        if num_nodes > 1:
            # Best effort: the cloud-provided port is optional. Only the
            # integer parsing is guarded, so unrelated errors are no longer
            # swallowed.
            try:
                paddle_port = int(os.getenv("PADDLE_PORT", ""))
            except ValueError as e:
                logger.warning(
                    "PADDLE_PORT is unset or not an integer ({}); the cloud "
                    "specified port is ignored.".format(e))
            else:
                # Adopt the cloud port only when the cloud reserved at least
                # one port per selected GPU and it differs from args_port.
                if paddle_ports_num >= len(
                        selected_gpus) and paddle_port != args_port:
                    logger.warning("Use Cloud specified port:{}.".format(
                        paddle_port))
                    started_port = paddle_port

        if started_port is None:
            started_port = 6170
        # One consecutive port per selected GPU, reused on every node.
        ports = list(range(started_port, started_port + len(selected_gpus)))
        trainer_endpoints = [["%s:%d" % (ip, port) for port in ports]
                             for ip in node_ips]
    else:
        trainer_endpoints_ori = trainer_endpoints.split(",")
        assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori), (
            "DISTRIBUTED_TRAINER_ENDPOINTS has {} endpoints, expected "
            "{} nodes * {} ports".format(
                len(trainer_endpoints_ori), num_nodes, paddle_ports_num))
        # Regroup the flat endpoint list into one chunk per node.
        trainer_endpoints = [
            trainer_endpoints_ori[i * paddle_ports_num:(i + 1) *
                                  paddle_ports_num] for i in range(num_nodes)
        ]

    logger.debug("parsed from args: node_ips:{} "
                 "        node_ip:{} node_rank:{} trainer_endpoints:{}"
                 .format(node_ips, node_ip, node_rank, trainer_endpoints))

    cluster, _ = get_cluster(node_ips, node_ip, trainer_endpoints,
                             selected_gpus)
    return cluster, cluster.pods[node_rank]