def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]
    # Raises ValueError early if the current node is not part of the cluster.
    node_ips.index(node_ip)

    # Reserve one free port per selected GPU for the trainer endpoints.
    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)
    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]
    # Raises ValueError early if the current node is not part of the cluster.
    node_ips.index(node_ip)

    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)

    # Build one "ip:port" endpoint list per node in the cluster.
    trainer_endpoints = []
    for ip in node_ips:
        trainer_endpoints.append(
            ["%s:%d" % (ip, port) for port in free_ports])
    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
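# Both variants above rely on a find_free_ports helper that is not shown in
# this excerpt. The sketch below is a hypothetical stand-in (the name and the
# set return type follow the call sites above, which do list(free_ports) and
# .pop()): it asks the OS for unused ports by binding to port 0. This is a
# best-effort reservation, not a guarantee that the ports stay free.
import socket
from contextlib import closing


def find_free_ports(num):
    def _free_port():
        # Bind to port 0 so the kernel assigns any currently free port.
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            s.bind(('', 0))
            return s.getsockname()[1]

    ports = set()
    while len(ports) < num:
        ports.add(_free_port())
    return ports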
    def test_communicator(self):
        # Script for the parameter-server half of the test; it is written to
        # a file and launched as a separate process below.
        run_server_cmd = """
from __future__ import print_function

import sys
import os
import time
import threading
import subprocess
import unittest
import numpy

import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
import paddle.distributed.fleet as fleet

from test_communicator_geo import TestCommunicatorGeoEnd2End

paddle.enable_static()


class RunServer(TestCommunicatorGeoEnd2End):
    def runTest(self):
        pass


os.environ["TRAINING_ROLE"] = "PSERVER"
half_run_server = RunServer()
half_run_server.run_ut()
"""

        server_file = "run_server_for_communicator_geo.py"
        with open(server_file, "w") as wb:
            wb.write(run_server_cmd)

        port = find_free_ports(1).pop()

        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PORT"] = str(port)
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:{}".format(
            port)

        # Launch the pserver half in a subprocess with the same interpreter.
        _python = sys.executable
        ps_cmd = "{} {}".format(_python, server_file)
        ps_proc = subprocess.Popen(
            ps_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        # Give the parameter server time to start before running the trainer.
        time.sleep(5)

        os.environ["TRAINING_ROLE"] = "TRAINER"
        self.run_ut()

        # Tear down the server process and clean up the generated script.
        ps_proc.kill()
        ps_proc.wait()
        outs, errs = ps_proc.communicate()

        if os.path.exists(server_file):
            os.remove(server_file)
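# The test above gives the parameter server a fixed 5-second head start via
# time.sleep(5). A hypothetical, more robust alternative is to poll the
# pserver port until it accepts connections; wait_for_port below is an
# illustrative helper, not part of the original test.
import socket
import time


def wait_for_port(port, host="127.0.0.1", timeout=30.0):
    # Returns True once the port accepts a TCP connection, False on timeout.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True
        except OSError:
            time.sleep(0.5)
    return False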