Ejemplo n.º 1
0
    def _create_sockets(self):
        """Set up the three sockets every worker owns at startup.

        (1) request_master_socket: sends job address to master node.
        (2) reply_job_socket: receives job_address from subprocess.
        (3) kill_job_socket: receives commands to kill the job from jobs.

        When a job starts, a new heartbeat socket is created to receive
        heartbeat signals from the job.
        """
        self.worker_ip = get_ip_address()

        # REQ socket used to push job addresses to the master node.
        request_socket = self.ctx.socket(zmq.REQ)
        request_socket.linger = 0
        # Give the master 0.5 s to respond so a dead master is detected.
        request_socket.setsockopt(zmq.RCVTIMEO, 500)
        request_socket.connect("tcp://" + self.master_address)
        self.request_master_socket = request_socket

        # REP socket on a random port; subprocesses report job addresses here.
        reply_socket = self.ctx.socket(zmq.REP)
        reply_socket.linger = 0
        port = reply_socket.bind_to_random_port("tcp://*")
        self.reply_job_socket = reply_socket
        self.reply_job_address = "{}:{}".format(self.worker_ip, port)

        # REP socket on a random port; jobs send kill commands here.
        kill_socket = self.ctx.socket(zmq.REP)
        kill_socket.linger = 0
        port = kill_socket.bind_to_random_port("tcp://*")
        self.kill_job_socket = kill_socket
        self.kill_job_address = "{}:{}".format(self.worker_ip, port)
Ejemplo n.º 2
0
def status():
    """Report the status of every active xparl cluster found on this host.

    Scans the process list for running ``remote/start.py --name worker``
    processes, extracts each cluster's master address from the command
    line, then asks the master over ZMQ for monitor information and
    echoes a summary per cluster.
    """
    # Platform-specific process listing; both variants grep for the worker
    # start command so the --address argument can be recovered.
    if _IS_WINDOWS:
        cmd = r'''wmic process where "commandline like '%remote\\start.py --name worker --address%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''
    else:
        cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address'

    content = os.popen(cmd).read().strip()
    # Each distinct master address corresponds to one cluster.
    pattern = re.compile('--address (.*?) --cpu')
    clusters = set(pattern.findall(content))
    if len(clusters) == 0:
        click.echo('No active cluster is found.')
    else:
        ctx = zmq.Context()
        status = []
        for cluster in clusters:
            # Search the process list again to find the monitor port that
            # was passed alongside this cluster's address.
            if _IS_WINDOWS:
                cmd = r'''wmic process where "commandline like '%address {}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
                    cluster)
            else:
                cmd = r'ps -ef | grep address\ {}'.format(cluster)
            content = os.popen(cmd).read()
            pattern = re.compile('--monitor_port (.*?)\n', re.S)
            monitors = pattern.findall(content)

            if len(monitors):
                # The captured group is "port --address master_address",
                # hence the three-way split below.
                monitor_port, _, master_address = monitors[0].split(' ')
                monitor_address = "{}:{}".format(get_ip_address(),
                                                 monitor_port)
                socket = ctx.socket(zmq.REQ)
                # Give the master 10 s to answer before reporting failure.
                socket.setsockopt(zmq.RCVTIMEO, 10000)
                socket.connect('tcp://{}'.format(master_address))
                try:
                    socket.send_multipart([STATUS_TAG])
                    monitor_info = to_str(socket.recv_multipart()[1])
                except zmq.error.Again as e:
                    # Receive timed out; skip this cluster but keep going.
                    click.echo(
                        'Can not connect to cluster {}, please try later.'.
                        format(master_address))
                    socket.close(0)
                    continue
                msg = """
            # Cluster {} {}

            # If you want to check cluster status, please view: http://{}
            """.format(master_address, monitor_info, monitor_address)
                status.append(msg)
                socket.close(0)
            else:
                msg = """
            # Cluster {} fails to start the cluster monitor.
                """.format(cluster)
                status.append(msg)

        for monitor_status in status:
            click.echo(monitor_status)
Ejemplo n.º 3
0
    def __init__(self, port, monitor_port=None):
        """Bind the client-facing REP socket and initialize bookkeeping.

        Args:
            port: port this master listens on for client requests.
            monitor_port: port of the cluster monitor page (may be None).
        """
        self.ctx = zmq.Context()
        self.master_ip = get_ip_address()
        self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port)
        log_dir = '~/.parl_data/master/{}_{}'.format(self.master_ip, port)
        logger.set_dir(os.path.expanduser(log_dir))

        # REP socket answering client requests on the given port.
        client_socket = self.ctx.socket(zmq.REP)
        client_socket.bind("tcp://*:{}".format(port))
        client_socket.linger = 0
        self.client_socket = client_socket
        self.port = port

        self.job_center = JobCenter(self.master_ip)
        self.cluster_monitor = ClusterMonitor()
        self.master_is_alive = True
        # Per-hostname integer counter (defaultdict(int)).
        self.client_hostname = defaultdict(int)
Ejemplo n.º 4
0
    def __init__(self, worker_address, log_server_address):
        """
        Args:
            worker_address(str): worker_address for sending job information(e.g, pid)
            log_server_address(str): address of the log server (stored for later use).

        Attributes:
            pid (int): Job process ID.
            max_memory (float): Maximum memory (MB) can be used by each remote instance.
        """
        # None means no memory limit is enforced for this job.
        self.max_memory = None

        # Pipes through which the child process reports the job address
        # and job id back to this (parent) process.
        self.job_address_receiver, job_address_sender = Pipe()
        self.job_id_receiver, job_id_sender = Pipe()

        self.worker_address = worker_address
        self.log_server_address = log_server_address
        self.job_ip = get_ip_address()
        self.pid = os.getpid()

        # The actual task loop runs in a separate process (self.run).
        self.run_job_process = Process(target=self.run,
                                       args=(job_address_sender,
                                             job_id_sender))
        self.run_job_process.start()
        """
        NOTE:
            In Windows, it will raise errors when creating threading.Lock before starting multiprocess.Process.
        """
        self.lock = threading.Lock()
        self._create_sockets()

        # Record the initial RSS in MB so later memory usage can be
        # measured relative to it.
        process = psutil.Process(self.pid)
        self.init_memory = float(process.memory_info()[0]) / (1024**2)

        # Block until the task process exits, then notify the worker to
        # kill this job and terminate immediately.
        self.run_job_process.join()

        with self.lock:
            self.kill_job_socket.send_multipart(
                [remote_constants.KILLJOB_TAG,
                 to_byte(self.job_address)])
            try:
                _ = self.kill_job_socket.recv_multipart()
            except zmq.error.Again as e:
                # Best-effort: the worker may already be gone.
                pass
            os._exit(0)
Ejemplo n.º 5
0
    def run(self, job_address_sender, job_id_sender):
        """An infinite loop waiting for a new task.

        Args:
            job_address_sender(sending end of multiprocessing.Pipe): send job address of reply_socket to main process.
            job_id_sender(sending end of multiprocessing.Pipe): send the generated job id to the main process.
        """
        ctx = zmq.Context()

        # create the reply_socket on a random port; remote requests for
        # this job arrive through it.
        reply_socket = ctx.socket(zmq.REP)
        job_port = reply_socket.bind_to_random_port(addr="tcp://*")
        reply_socket.linger = 0
        job_ip = get_ip_address()
        job_address = "{}:{}".format(job_ip, job_port)

        # Job id = address with ':' replaced by '_' plus a timestamp;
        # it doubles as the log directory name.
        job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
        self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
        logger.set_dir(self.log_dir)
        logger.info(
            "[Job] Job {} initialized. Reply heartbeat socket Address: {}.".
            format(job_id, job_address))

        # Hand the address and id back to the parent process.
        job_address_sender.send(job_address)
        job_id_sender.send(job_id)

        try:
            # receive source code from the actor and append them to the environment variables.
            envdir = self.wait_for_files(reply_socket, job_address)
            sys.path.insert(0, envdir)
            os.chdir(envdir)

            obj = self.wait_for_connection(reply_socket)
            assert obj is not None
            self.single_task(obj, reply_socket, job_address)
        except Exception as e:
            # Any failure resets the job; log the reason and traceback.
            logger.error(
                "Error occurs when running a single task. We will reset this job. \nReason:{}"
                .format(e))
            traceback_str = str(traceback.format_exc())
            logger.error("traceback:\n{}".format(traceback_str))
Ejemplo n.º 6
0
def start_worker(address, cpu_num, log_server_port_range):
    """Start a local worker process attached to the master at ``address``.

    Args:
        address (str): "ip:port" of the master node.
        cpu_num: number of CPUs this worker contributes; falsy values
            (None/0) are forwarded as an empty string so the started
            script uses its own default.
        log_server_port_range (str): "start-end" range from which the log
            server port is drawn.

    Raises:
        Exception: if the master at ``address`` cannot be reached.
    """
    start, end = parse_port_range(log_server_port_range)
    log_server_port = get_port_from_range(start, end)

    if not is_master_started(address):
        raise Exception(
            "Worker can not connect to the master node, " +
            "please check if the input address {} ".format(address) +
            "is correct.")
    # Falsy cpu_num (None/0) becomes an empty string on the command line.
    cpu_num = str(cpu_num) if cpu_num else ''
    # When run from compiled scripts.pyc, locate start.py next to it.
    start_file = __file__.replace('scripts.pyc', 'start.py')
    start_file = start_file.replace('scripts.py', 'start.py')

    # Fix: cpu_num is already a string at this point; the original
    # wrapped it in a redundant second str() call.
    command = [
        sys.executable, start_file, "--name", "worker", "--address", address,
        "--cpu_num", cpu_num, "--log_server_port",
        str(log_server_port)
    ]
    p = subprocess.Popen(command)

    if not is_log_server_started(get_ip_address(), log_server_port):
        click.echo("# Fail to start the log server.")
Ejemplo n.º 7
0
    def _reply_heartbeat(self):
        """Reply heartbeat signals to the master node.

        Binds a REP socket to a random port, publishes the address via
        ``self.reply_master_heartbeat_address``, then answers the master's
        heartbeat requests until a receive times out or either side is
        flagged dead.
        """
        socket = self.ctx.socket(zmq.REP)
        socket.linger = 0
        # A missed heartbeat within this window means the master is gone.
        socket.setsockopt(zmq.RCVTIMEO,
                          remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
        reply_master_heartbeat_port =\
            socket.bind_to_random_port(addr="tcp://*")
        self.reply_master_heartbeat_address = "{}:{}".format(
            get_ip_address(), reply_master_heartbeat_port)
        # Let the waiting thread know the heartbeat address is ready.
        self.heartbeat_socket_initialized.set()
        connected = False
        while self.client_is_alive and self.master_is_alive:
            try:
                message = socket.recv_multipart()
                elapsed_time = datetime.timedelta(seconds=int(time.time() -
                                                              self.start_time))
                socket.send_multipart([
                    remote_constants.HEARTBEAT_TAG,
                    to_byte(self.executable_path),
                    to_byte(str(self.actor_num)),
                    to_byte(str(elapsed_time)),
                    to_byte(str(self.log_monitor_url)),
                ])  # TODO: remove additional information
                # Fix: the original never set this flag, so the
                # "still alive" warning branch below was unreachable.
                connected = True
            except zmq.error.Again:
                # Fix: the two message halves previously concatenated
                # without a separating space ("master.Please ...").
                if connected:
                    logger.warning("[Client] Cannot connect to the master. "
                                   "Please check if it is still alive.")
                else:
                    logger.warning(
                        "[Client] Cannot connect to the master. "
                        "Please check the firewall between client and master.(e.g., ping the master IP)"
                    )
                self.master_is_alive = False
        socket.close(0)
        logger.warning("Client exit replying heartbeat for master.")
Ejemplo n.º 8
0
def start_master(port, cpu_num, monitor_port, debug, log_server_port_range):
    """Launch a Parl cluster master, a local worker, and the monitor.

    Args:
        port: port the master listens on; must be free.
        cpu_num: CPUs for the local worker; None means all cores, 0 skips
            starting a local worker.
        monitor_port: port for the cluster-monitor page; a free port is
            picked when falsy.
        debug: when truthy, sets the DEBUG environment variable.
        log_server_port_range: "start-end" range to draw the log-server
            port from.

    Raises:
        Exception: if ``port`` or ``monitor_port`` is already in use.
    """
    if debug:
        os.environ['DEBUG'] = 'True'

    if not is_port_available(port):
        raise Exception(
            "The master address localhost:{} is already in use.".format(port))

    if monitor_port and not is_port_available(monitor_port):
        raise Exception(
            "The input monitor port localhost:{} is already in use.".format(
                monitor_port))

    cpu_num = int(
        cpu_num) if cpu_num is not None else multiprocessing.cpu_count()
    # When run from compiled scripts.pyc, locate the sibling scripts.
    start_file = __file__.replace('scripts.pyc', 'start.py')
    start_file = start_file.replace('scripts.py', 'start.py')
    monitor_file = __file__.replace('scripts.pyc', 'monitor.py')
    monitor_file = monitor_file.replace('scripts.py', 'monitor.py')

    monitor_port = monitor_port if monitor_port else get_free_tcp_port()
    start, end = parse_port_range(log_server_port_range)
    log_server_port = get_port_from_range(start, end)
    # Re-draw until the log server port collides with neither the master
    # port nor the monitor port.
    while log_server_port == monitor_port or log_server_port == port:
        log_server_port = get_port_from_range(start, end)

    master_command = [
        sys.executable,
        start_file,
        "--name",
        "master",
        "--port",
        port,
        "--monitor_port",
        monitor_port,
    ]
    worker_command = [
        sys.executable, start_file, "--name", "worker", "--address",
        "localhost:" + str(port), "--cpu_num",
        str(cpu_num), '--log_server_port',
        str(log_server_port)
    ]
    monitor_command = [
        sys.executable, monitor_file, "--monitor_port",
        str(monitor_port), "--address", "localhost:" + str(port)
    ]

    FNULL = open(os.devnull, 'w')

    # Redirect the output to DEVNULL to solve the warning log.
    _ = subprocess.Popen(master_command,
                         stdout=FNULL,
                         stderr=subprocess.STDOUT)

    if cpu_num > 0:
        # Sleep 1s for master ready
        time.sleep(1)
        _ = subprocess.Popen(worker_command,
                             stdout=FNULL,
                             stderr=subprocess.STDOUT)

    if _IS_WINDOWS:
        # TODO(@zenghsh3) redirecting stdout of monitor subprocess to FNULL will cause occasional failure
        tmp_file = tempfile.TemporaryFile()
        _ = subprocess.Popen(monitor_command, stdout=tmp_file)
        tmp_file.close()
    else:
        _ = subprocess.Popen(monitor_command,
                             stdout=FNULL,
                             stderr=subprocess.STDOUT)
    FNULL.close()

    if cpu_num > 0:
        monitor_info = """
            # The Parl cluster is started at localhost:{}.

            # A local worker with {} CPUs is connected to the cluster.    

            # Starting the cluster monitor...""".format(
            port,
            cpu_num,
        )
    else:
        monitor_info = """
            # The Parl cluster is started at localhost:{}.

            # Starting the cluster monitor...""".format(port)
    click.echo(monitor_info)

    # check if monitor is started by looking for its process in the
    # platform's process listing (up to 3 tries, 3 s apart).
    monitor_is_started = False
    if _IS_WINDOWS:
        cmd = r'''wmic process where "commandline like '%remote\\monitor.py --monitor_port {} --address localhost:{}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
            monitor_port, port)
    else:
        cmd = r'ps -ef | grep -v grep | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format(
            monitor_port, port)
    for i in range(3):
        check_monitor_is_started = os.popen(cmd).read()
        if len(check_monitor_is_started) > 0:
            monitor_is_started = True
            break
        time.sleep(3)

    master_ip = get_ip_address()
    if monitor_is_started:
        start_info = """
        ## If you want to check cluster status, please view:

            http://{}:{}

        or call:

            xparl status""".format(master_ip, monitor_port)
    else:
        start_info = "# Fail to start the cluster monitor."

    monitor_info = """
        {}

        ## If you want to add more CPU resources, please call:

            xparl connect --address {}:{}

        ## If you want to shutdown the cluster, please call:

            xparl stop        
        """.format(start_info, master_ip, port)
    click.echo(monitor_info)

    if not is_log_server_started(master_ip, log_server_port):
        click.echo("# Fail to start the log server.")