Example #1
def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]

    # Raises ValueError early if the local IP is not in the cluster list.
    node_ips.index(node_ip)

    # Reserve one free port per selected GPU on this machine.
    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)
    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
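
Both helpers used above, find_free_ports and get_cluster, come from the surrounding Paddle launch utilities and are not shown here. As a rough idea of what the port helper does, the following is a minimal stand-in built on the standard library; it is an assumption about the behavior, not Paddle's actual implementation.

# Minimal stand-in for find_free_ports (assumed behavior, not Paddle's code):
# bind to port 0 so the OS picks an unused port, and collect results until
# the requested number of distinct ports has been gathered.
import socket
from contextlib import closing

def find_free_ports(num):
    ports = set()
    while len(ports) < num:
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            s.bind(("127.0.0.1", 0))
            ports.add(s.getsockname()[1])
    return ports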

Example #2

def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]

    # Raises ValueError early if the local IP is not in the cluster list.
    node_ips.index(node_ip)

    # Reserve one free port per selected GPU on this machine.
    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)

    # Build one "ip:port" endpoint list per node from the reserved ports.
    trainer_endpoints = []
    for ip in node_ips:
        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
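
The only difference from Example #1 is that this variant assembles the endpoints itself, one sub-list of "ip:port" strings per node, before handing them to get_cluster. A quick sketch with hypothetical port numbers shows the resulting shape.

# Hypothetical inputs, only to illustrate the shape of trainer_endpoints.
node_ips = ["127.0.0.1"]
free_ports = [6170, 6171]

trainer_endpoints = []
for ip in node_ips:
    trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])

print(trainer_endpoints)  # [['127.0.0.1:6170', '127.0.0.1:6171']]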

Example #3

    def test_communicator(self):
        run_server_cmd = """
from __future__ import print_function

import sys
import os

import time
import threading
import subprocess
import unittest
import numpy

import paddle
import paddle.fluid as fluid

from paddle.fluid.communicator import Communicator
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
import paddle.distributed.fleet as fleet

from test_communicator_geo import TestCommunicatorGeoEnd2End

paddle.enable_static()

class RunServer(TestCommunicatorGeoEnd2End):
    def runTest(self):
        pass

os.environ["TRAINING_ROLE"] = "PSERVER"

half_run_server = RunServer()
half_run_server.run_ut()
"""

        # Write the pserver half of the test to a standalone helper script.
        server_file = "run_server_for_communicator_geo.py"
        with open(server_file, "w") as wb:
            wb.write(run_server_cmd)

        # Reserve a free port and advertise it to the pserver via environment variables.
        port = find_free_ports(1).pop()

        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PORT"] = str(port)
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:{}".format(port)

        # Launch the parameter server script as a separate Python process.
        _python = sys.executable
        ps_cmd = "{} {}".format(_python, server_file)

        ps_proc = subprocess.Popen(
            ps_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        # Give the parameter server a few seconds to start up.
        time.sleep(5)

        # Switch the current process to the trainer role and run the test body.
        os.environ["TRAINING_ROLE"] = "TRAINER"

        self.run_ut()

        # Tear down the server process; communicate() also waits for it to exit.
        ps_proc.kill()
        outs, errs = ps_proc.communicate()

        # Remove the temporary server script.
        if os.path.exists(server_file):
            os.remove(server_file)
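
Stripped of the Paddle specifics, the test follows a common pattern: write a helper script to disk, start it in a subprocess, do the in-process work, then kill the child, collect its output, and delete the script. A self-contained sketch of that pattern is below; the file name and child body are made up for the demo.

# Generic launch/teardown pattern (hypothetical file name and child script).
import os
import subprocess
import sys

script = "child_for_pattern_demo.py"
with open(script, "w") as f:
    f.write("print('child started')\n")

proc = subprocess.Popen(
    [sys.executable, script],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)

# ... the parent process would do its own work here ...

proc.kill()
outs, errs = proc.communicate()  # communicate() also reaps the child

if os.path.exists(script):
    os.remove(script)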