Example #1
    def get_request(self, tag, *args, **kwargs):
        # Find the first suitable request for the given tag. There is no safety
        # net, so if the request classes are non-exhaustive in their combined
        # accept pattern, parameters not caught will not return a Request.
        try:
            req_class_list = self.cls_mapping[tag]
        except KeyError:
            Logger().warning(
                "Unable to find collective list in the cls_mapping for tag %s"
                % tag)
            return None

        for req_class in req_class_list:
            obj = req_class.accept(self.communicator,
                                   self.communicator.mpi.settings, self.cache,
                                   *args, **kwargs)
            if obj:
                # Set the tag on the object.
                obj.tag = tag

                # Add the object to the MPI environment and send the start signal.
                with self.mpi.unstarted_collective_requests_lock:
                    self.mpi.unstarted_collective_requests.append(obj)

                    # Signal
                    self.mpi.unstarted_collective_requests_has_work.set()
                    self.mpi.has_work_event.set()

                return obj

        # Note: If we define a safety net we could select the first / last class
        # and initialize that.
        Logger().warning(
            "Unable to initialize the collective request for tag %s. I suspect failure from this point"
            % tag)
Example #2
 def _version_check(self):
     """
     Check that the required Python version is installed
     """
     (major, minor, _, _, _) = sys.version_info
     if (major == 2 and minor < 6) or major < 2:
         Logger().error("pupyMPI requires Python 2.6 (you may have to kill processes manually)")
         sys.exit(1)
     elif (major, minor) != (2, 6):
         Logger().warn("pupyMPI is only certified to run on Python 2.6")
Example #3
    def _handle_writelist(self, writelist):
        for write_socket in writelist:
            removal = []
            with self.socket_to_request_lock:
                try:
                    request_list = self.socket_to_request[write_socket]
                except Exception as e:
                    #Logger().debug("rank:%i trying to find %s on socket_to_request:%s" % (self.rank, write_socket, self.socket_to_request ) )
                    raise e
            for request in request_list:
                if request.status == "cancelled":
                    removal.append(request)
                elif request.status == "new":
                    # Send the data on the socket
                    try:
                        if request.multi:
                            utils.robust_send(write_socket, request.header)
                            utils.robust_send_multi(write_socket, request.data)
                        else:
                            utils.robust_send_multi(
                                write_socket, [request.header] + request.data)
                    except socket.error as e:
                        Logger().error("got:%s for socket:%s with data:%s" %
                                       (e, write_socket, request.data))
                        # TODO: Make sure we really want to continue here, instead of reacting
                        # Send went wrong, do not update, but hope for better luck next time
                        continue
                        #raise e
                    except Exception as e:
                        Logger().error(
                            "Other exception got:%s for socket:%s with header:%s payload:%s"
                            % (e, write_socket, request.header, request.data))
                        # Send went wrong, do not update, but hope for better luck next time
                        raise e

                    removal.append(request)

                    if request.acknowledge:
                        # update status to wait for acknowledgement
                        request.update("unacked")
                    else:
                        # update status and signal anyone waiting on this request
                        request.update("ready")
                else:
                    pass
Example #4
    def add_out_request(self, request):
        """
        Put a requested out operation (eg. send) on the out list
        """

        # Create the proper data structure and pickle the data
        #request.prepare_send()

        # Find a socket and port of recipient process
        connection_info = self.network.all_procs[
            request.global_rank]['connection_info']
        connection_type = self.network.all_procs[
            request.global_rank]['connection_type']

        # TODO: This call should be extended to allow asking for a persistent connection
        client_socket, newly_created = self.socket_pool.get_socket(
            request.global_rank, connection_info, connection_type)
        # If the connection is a new connection it is added to the socket lists of the respective thread(s)
        if newly_created:
            self.network.t_in.add_in_socket(client_socket)
            self.network.t_out.add_out_socket(client_socket)

        with self.socket_to_request_lock:
            try:
                # The socket already exists; just add another request to the list
                self.network.t_out.socket_to_request[client_socket].append(request)
                self.outbound_requests += 1
            except Exception as e:  # This should not happen
                Logger().error(
                    "Network-thread (%s) got error: %s of type: %s, socket_to_request was: %s"
                    % (self.type, e, type(e), self.network.t_out.socket_to_request))
Example #5
def ssh(host, arguments, x_forward, process_io, logdir, rank):
    """Process starter using ssh through subprocess. No loadbalancing yet."""
    logger = Logger()

    # We join the sys.path here to allow user modifications to PYTHONPATH to take effect remotely
    python_path = os.path.dirname(
        os.path.abspath(__file__)) + "/../" + ":" + ":".join(sys.path)
    sshexec_str = "ssh %s%s \"PYTHONPATH=%s %s\"" % (
        ("-XY " if x_forward else ""), host, python_path, ' '.join(arguments))
    #if rank == 0:
    #    logger.debug("Starting remote process: %s with process_io type %s" % (sshexec_str, process_io))

    # With 'none', 'direct' or 'remotefile' the network is closed for I/O: nothing is
    # displayed or written on the mpirun side. With 'remotefile' a file is created on
    # the remote process machine only.
    if process_io in ('none', 'direct', 'remotefile'):
        target = None
    elif process_io == 'asyncdirect':  # uses io forwarder and prints to console
        target = subprocess.PIPE
    elif process_io == 'localfile':  # writes to a file on the mpirun machine only
        try:
            target = open(os.path.join(logdir, "mpi.rank%s.log" % rank), "w")
            io_target_list.append(target)
        except IOError:
            raise MPIException(
                "Local directory not writeable - check that this path exists and is writeable:\n%s"
                % logdir)
    else:
        raise MPIException("Unsupported I/O type: '%s'" % process_io)

    # Execute the ssh command in a subprocess
    p = subprocess.Popen(sshexec_str, shell=True, stdout=target, stderr=target)
    process_list.append(p)
    return p
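
A hedged usage sketch (the host name and the remote argument list below are made up for illustration; process_list and io_target_list are the module-level lists the starter appends to):

    p = ssh("node1.example.org",
            ["python", "some_pupympi_bootstrap.py", "--rank=0"],  # hypothetical remote command
            x_forward=False, process_io="localfile", logdir="/tmp/pupylogs", rank=0)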
Example #6
    def add_accepted_socket(self, socket_connection, global_rank):
        """
        Add a socket connection to the pool, where the connection is the returned
        value from a socket.accept - that is, we are at the receiving end of a
        connection attempt.
        """
        if global_rank >= 0 and self.readonly:
            #Logger().debug("Bad conn to rank %i with metainfo:%s and sockets:%s" % (global_rank, self.metainfo, self.sockets))
            raise Exception(
                "Can't add accepted socket. We're in readonly mode")

        #Logger().debug("SocketPool.add_accepted_socket: Adding socket connection for rank %d: %s" % (global_rank, socket_connection))
        with self.sockets_lock:
            known_socket = self._get_socket_for_rank(global_rank)

        # TODO: Move this check under the if known_socket: condition since it is more specialized (i.e. saves an if-comparison in the normal case)
        if known_socket == socket_connection:
            Logger().error(
                "SocketPool.add_accepted_socket: Trying to add a socket_connection that is already in the pool?!"
            )
            return

        if known_socket:
            # When two procs send to each other simultaneously the result can be
            # adding a duplicate connection
            #Logger().debug("Already a socket in the pool:%s for an accepted connection:%s to rank:%i" % (known_socket,socket_connection,global_rank))
            pass

        self._add(global_rank, socket_connection, False)
Example #7
def create_random_socket(min=10000, max=30000):
    """
    A simple helper method for creating a socket,
    binding it to a random free port within the specified range.
    """
    logger = Logger()
    used = []

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Enable TCP_NODELAY to improve performance when sending one-off packets:
    # small writes are pushed out immediately instead of being buffered until
    # more data accumulates or an ACK arrives (Nagle's algorithm)
    # XXX: If you remove this, remember to do so in socketpool as well.
    sock.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
    #sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)

    hostname = socket.gethostname()
    port_no = None

    while True:
        port_no = random.randint(min, max)
        if port_no in used:
            #logger.debug("get_socket: We know port %d is already in use, try a new one" % port_no)
            continue

        try:
            #logger.debug("get_socket: Trying to bind on port %d" % port_no)
            sock.bind((hostname, port_no))
            break
        except socket.error:
            #logger.debug("get_socket: Permission error on port %d, trying a new one" % port_no)
            used.append(port_no)  # Mark the port as used (or no good or whatever)
            continue

    return sock, hostname, port_no
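
A short usage sketch, mirroring the migrate utility in Example #29: the helper returns the bound socket together with the hostname and port it picked, so the caller can start listening right away.

    sock, hostname, port_no = create_random_socket()
    sock.listen(5)  # backlog of 5 here; Example #29 uses len(ranks)
    print "Listening on %s:%d" % (hostname, port_no)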
Example #8
    def handle_system_message(self, rank, command, raw_data, connection):
        """
        Handle a system message. We define a list of read only commands and all
        others are considered writeable. The raw data contains a security
        component we need to check in the case of a write command.

        This method returns a boolean indicating if the command was actually
        tried.
        """
        read_only = (constants.CMD_PING, constants.CMD_READ_REGISTER)
        commands = (constants.CMD_CONFIG, constants.CMD_ABORT, constants.CMD_PING,
                    constants.CMD_MIGRATE_PACK, constants.CMD_READ_REGISTER,
                    constants.CMD_CONN_CLOSE)

        data = utils.pickle.loads(raw_data)
        user_data = None
        security_component = None

        if isinstance(data, tuple):
            security_component, user_data = data
        else:
            security_component = data

        # Security check.
        if command not in read_only:
            if security_component != self.get_security_component() and rank < 0:
                Logger().warning(
                    "Failed security check in system command. Expected security "
                    "component was %s but received %s for command %s"
                    % (self.get_security_component(), raw_data, command))
                return False

        # Check we have a system command
        if command in commands:
            with self.pending_systems_commands_lock:
                self.pending_systems_commands.append((command, connection, user_data))
            return True
        else:
            print "Error: Unknown system command"
            return False
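
For reference, the write path expects raw_data to be a pickled tuple of (security_component, user_data); this is how the migrate utility in Example #29 builds its payload. A minimal sketch, assuming utils.pickle exposes the standard pickle interface:

    data = (security_component, (hostname, port_no))  # command-specific user_data
    raw_data = utils.pickle.dumps(data)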
Example #9
 def select_out(self):
     try:
         #return select.select( [], self.sockets_out, self.sockets_out)
         return select.select([], self.sockets_out, self.sockets_out, 1)
     except Exception as e:
         Logger().error(
             "Network-thread (%s) Got exception: %s of type: %s" %
             (self.type, e, type(e)))
Example #10
 def close_all_sockets(self):
     """
     Close all sockets in the socketpool
     """
     for s in self.sockets:
         try:
             #s.shutdown(2)
             s.close()
         except Exception as e:
             Logger().debug("Got error when closing socket: %s" % e)
Example #11
def get_communicator_class(socket_poll_method=False):
    c_class = None

    if socket_poll_method:
        poll_method_exists = getattr(select, socket_poll_method, None)
        if not poll_method_exists:
            Logger().warn(
                "Socket poll method '%s' is not supported on this system - falling back to automatic selection."
                % socket_poll_method)
            socket_poll_method = False

    if socket_poll_method == "epoll":
        c_class = CommunicationHandlerEpoll

    elif socket_poll_method == "poll":
        c_class = CommunicationHandlerPoll

    elif socket_poll_method == "select":
        c_class = CommunicationHandlerSelect

    else:
        if socket_poll_method:
            Logger().warn(
                "Unknown socket poll method '%s' - falling back to automatic selection."
                % socket_poll_method)

        epoll = getattr(select, "epoll", None)
        if epoll:
            c_class = CommunicationHandlerEpoll

        poll = getattr(select, "poll", None)
        if poll and not c_class:
            c_class = CommunicationHandlerPoll

        if not c_class:
            c_class = CommunicationHandlerSelect

    #Logger().debug("Found communicator class of type %s, called with socket_poll_method parameter %s" % (c_class, socket_poll_method))
    return c_class
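
Usage is straightforward: pass one of 'epoll', 'poll' or 'select' to force a specific poll method, or nothing to let the function pick the best one available on the system.

    handler_class = get_communicator_class("epoll")  # falls back with a warning if unsupported
    auto_class = get_communicator_class()            # automatic selection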
Example #12
    def _handle_readlist(self, readlist):

        for read_socket in readlist:
            add_to_pool = False

            if read_socket in (self.main_receive_socket, self.unix_socket):
                try:
                    # _ is sender_address
                    (conn, _) = read_socket.accept()

                    self.network.t_in.add_in_socket(conn)
                    self.network.t_out.add_out_socket(conn)
                    add_to_pool = True
                except socket.error as e:
                    # We try to accept, but if accept fails maybe it is just data coming in?
                    Logger().error(
                        "accept() threw: %s on the main recv socket:%s" %
                        (e, read_socket))
                    continue
                except Exception as e:
                    Logger().error(
                        "_handle_readlist: Unknown error. Error was: %s" % e)
                    continue
Example #13
    def accept_msg(self, rank, raw_data, msg_type):
        # Do not do anything if the request is completed.
        if self._finished.is_set():
            return False

        # Deserialize data
        data = utils.deserialize_message(raw_data, msg_type)

        if self.phase == "up":
            if rank not in self.missing_children:
                return False

            # Remove the rank from the missing children.
            self.missing_children.remove(rank)

            # Add the data to the list of received data
            if self.partial:
                # For a partial reduce we didn't get a dict but just the reduced data
                self.received_data[rank] = data
            else:
                self.received_data.update(data)

            # When the list of missing children is empty we have received from
            # every child and can reduce the data and send to the parent.
            if not self.missing_children:
                # Add our own data element
                self.received_data[self.rank] = self.data

                # reduce the data
                if self.partial:
                    self.data = reduce_elementwise(self.received_data.values(),
                                                   self.operation)
                else:
                    self.data = self.received_data

                # forward to the parent.
                self.to_parent()
            return True
        elif self.phase == "down":
            if rank != self.parent:
                return False

            self.data = data
            self.to_children()
            return True
        else:
            Logger().warning("Accept_msg in unknown phase: %s" % self.phase)

        return False
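
The up-phase relies on reduce_elementwise to fold the children's payloads together elementwise. pupyMPI ships its own implementation; the following is only a minimal sketch of the idea, assuming equal-length sequences and a binary operation:

    def reduce_elementwise(sequences, operation):
        # Fold 'operation' over corresponding elements of every sequence,
        # e.g. operation=max over [[1, 5], [3, 2]] yields [3, 5]
        return [reduce(operation, column) for column in zip(*sequences)]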
Example #14
    def accept_msg(self, rank, raw_data, msg_type):
        # Do not do anything if the request is completed.
        if self._finished.is_set():
            return False

        if self.phase == "up":
            if rank not in self.missing_children:
                return False

            # Remove the rank from the missing children.
            self.missing_children.remove(rank)

            desc = self.topology.descendants(rank)
            # All the ranks having a payload in this message, kept sorted
            all_ranks = sorted([rank] + desc)

            # Store payloads in their proper positions
            for i, r in enumerate(all_ranks):
                begin = i * self.chunksize
                end = begin + self.chunksize
                # Add the data to the list of received data
                self.data_list[r] = raw_data[begin:end]

            # When the list of missing children is empty we have received from
            # every child and can reduce the data and send to the parent.
            if not self.missing_children:
                # forward to the parent.
                self.to_parent()
            return True

        elif self.phase == "down":
            if rank != self.parent:
                return False

            # FIXME: just fix!
            # NOTE: Unless we are sure that no reduce operation can change the
            # msg_type or chunksize we should really store them again here
            #Logger().debug("STORING msg_type:%s vs. old:%s type(raw_data):%s len(raw_data):%s raw_data:%s" % (msg_type, self.msg_type, type(raw_data), len(raw_data), raw_data))
            self.data_list = [raw_data]  # boxing
            self.to_children()
            return True
        else:
            Logger().warning("Accept_msg in unknown phase: %s" % self.phase)

        return False
Example #15
def find_mapper(module_or_func):
    mod = __import__("mpi.lib.hostfile.mappers", fromlist=["mappers"])
    mapper = getattr(mod, module_or_func, None)

    if not mapper:
        # Try to import the module.
        if module_or_func.find(".") == -1:
            raise Exception("Can't import a custom hostmapper. Maybe you supplied something in a bad format")

        try:
            split = module_or_func.split(".")
            mod = __import__(".".join(split[:-1]), fromlist=[split[-2]])
            func = split[-1]
            mapper = getattr(mod, func, None)
        except Exception as e:
            Logger().warn("Can't import the custom module. The exception raised is %s" % e)

    return mapper
Example #16
def wait_for_shutdown(process_list):
    """
    Go through list of processes and make sure they all have terminated
    """
    logger = Logger()
    exit_codes = []
    while process_list:
        remove = []
        for p in process_list:
            returncode = p.poll()
            #logger.debug("Got return code: %s" % returncode)

            if returncode is None:  # still alive
                pass
            elif returncode == 0:  # exited correctly
                exit_codes += [returncode]
                remove.append(p)
                #process_list.remove( p )
                logger.debug(
                    "A process exited with a status of 0; we have %i left."
                    % (len(process_list) - len(remove)))
            else:  # error code
                exit_codes += [returncode]
                remove.append(p)
                #process_list.remove( p )
                logger.debug(
                    "A process exited with return code %d; we have %i left."
                    % (returncode, len(process_list) - len(remove)))

        # We remove outside iteration over list just to be safe
        for p in remove:
            process_list.remove(p)

        time.sleep(1)

    # The target list is empty unless the option process_io=localfile is specified,
    # in which case we close the file descriptors of all the log files made
    for t in io_target_list:
        t.close()

    return exit_codes
Example #17
    def match_pending(self, request):
        """
        Tries to match a pending request with something in the received data.

        If the received data is found we remove it from the list.

        If found, the request is updated with the data, and the match status is
        returned from the function so the caller can remove the request from
        its own list.
        """
        match = False
        with self.received_data_lock:
            #Logger().debug("-- Match pending has lock! received_data:%s" % self.received_data)
            for element in self.received_data:
                (sender, tag, acknowledge, communicator_id, message) = element
                
                # For a message to match
                # 1) it must be within the same communicator
                # 2) participant must match or any rank have been specified
                # 3) tag must match OR if any-tag has been specified the message should just be any user tag (ie. non-negative)
                if request.communicator.id == communicator_id \
                and request.participant in (sender, constants.MPI_SOURCE_ANY) \
                and ( (request.tag == tag) or (request.tag == constants.MPI_TAG_ANY and tag > 0) ):
                    # Incoming synchronized communication requires acknowledgement
                    if acknowledge:
                        Logger().debug("SSEND RECEIVED request: %s" % request)
                        # Generate an acknowledge message as an isend
                        # NOTE: Consider using an empty message string, to save (a little) resources
                        self.communicators[communicator_id]._isend( "ACKNOWLEDGEMENT", sender, constants.TAG_ACK)
                    
                    matched_element = element
                    match = True
                    request.update(status="ready", data=message)
                    break  # We can only find matching data for one request, and we have it
            
            if match:
                self.received_data.remove(matched_element)
        return match
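
The three matching rules in the comment above can be read as a standalone predicate; a minimal sketch restating them (constants as in pupyMPI's constants module):

    def matches(request, sender, tag, communicator_id):
        # (1) same communicator, (2) sender matches or any-source was posted,
        # (3) tag matches, or any-tag was posted and the tag is a user tag (> 0)
        return (request.communicator.id == communicator_id
                and request.participant in (sender, constants.MPI_SOURCE_ANY)
                and (request.tag == tag
                     or (request.tag == constants.MPI_TAG_ANY and tag > 0)))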
Example #18
    def generate_settings(self, settings):
        # We first import our normal settings packed with the mpi environment. These
        # will make a good base for all the functionality here. If the user supplies
        # any other settings files these will override the ones in our module.
        from mpi import settings as base_settings
        self.settings = base_settings

        if settings:
            settings = settings.strip().strip(", ")
            modules = settings.split(",")
            for module in modules:
                # help people a bit: strip whitespace and a trailing .py
                module = module.strip()
                if module.endswith(".py"):
                    module = module[:-3]

                try:
                    mod = __import__(module)
                    self.settings.__dict__.update(mod.__dict__)

                except ImportError:
                    #Logger().debug("Can not import a settings module by the name of %s" % module)
                    pass
                except Exception as e:
                    Logger().error("Something went very wrong with your settings module: %s" % e)
Example #19
    def resume_packed_state(self):
        from mpi import dill
        obj = dill.loads(self.resume_state)
        session_data = obj['session']

        # Import everything from the user module. This is important as the user
        # might have defined objects / classes etc deleted as part of the
        # pickle process.
        user_module = obj['mpi']['user_module']

        try:
            user_module = __import__(user_module)
            import __main__

            for k in user_module.__dict__:
                if k not in __main__.__dict__:
                    __main__.__dict__[k] = user_module.__dict__[k]

            user_module.__dict__.update(__main__.__dict__)

        except Exception as e:
            Logger().warning("Can't import the user module: %s. This might not be a problem, but it is safer to resume with your script on your PYTHONPATH." % user_module)
            print e
Example #20
def round_robin(hosts, total_cpu, max_cpu, np=1, overmapping=True):
    l = generate_localhost_data(hosts, np)
    if l: return l

    if np > total_cpu:
        # Overmapping.
        if not overmapping or np > max_cpu:
            raise HostfileMapException("Number of processes exceeds the maximum allowed CPUs")

        Logger().warning("Not enough hosts. Overmapping in effect. ")

    mapped_hosts = []
    host_count = {}
    rank = 0
    done = False

    while not done:
        for host in hosts:
            hostname = host['host']

            if hostname not in host_count:
                host_count[hostname] = 0

            # Just check that this host allows more virtual CPUs on it
            if host_count[hostname] < host['max_cpu']:
                host_count[hostname] += 1
            else:
                continue

            mapped_hosts.append( (hostname, rank) )
            rank += 1

            if rank == np:
                done = True
                break

    return mapped_hosts
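
A worked example with hypothetical host names (the Logger singleton must already be initialised, since the function logs when overmapping kicks in): six processes round-robined over two hosts that each allow four virtual CPUs.

    hosts = [{'host': 'node1', 'cpu': 2, 'max_cpu': 4},
             {'host': 'node2', 'cpu': 2, 'max_cpu': 4}]
    print round_robin(hosts, total_cpu=4, max_cpu=8, np=6)
    # [('node1', 0), ('node2', 1), ('node1', 2), ('node2', 3), ('node1', 4), ('node2', 5)]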
Example #21
def robust_send_multi(socket, messages):
    """
    Experimental cousin of robust_send.
    If we can agree that the overhead of always considering messages a list is
    negligible, this can be folded into the regular robust_send.

    TODO: Check (eg. with wireshark) if every send produces a TCP packet or if
    several messages can be packed into one TCP packet (which we hope is what happens)
    """
    for message in messages:
        target = len(message)  # how many bytes to send
        transmitted_bytes = 0

        try:
            while target > transmitted_bytes:
                delta = socket.send(message)
                transmitted_bytes += delta

                if target > transmitted_bytes:  # Rare case, so we only slice when a send was partial
                    # Slice off only what this send call actually transmitted
                    message = message[delta:]
                    #Logger().debug("Message sliced because it was too large for one send.")
        except Exception as e:
            Logger().error("BAD multisend caller:%s msg type:%s len:%s of %i in all - msg:%s error:%s"
                           % (whosdaddy(), type(message), target, len(messages), message, e))
            raise e
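
As used in the write handler of Example #3, the caller passes a list of already-serialized byte strings; a minimal usage sketch (the connected socket and the prepared request are assumed to exist):

    # request.header and request.data are byte strings prepared elsewhere (cf. Example #3)
    robust_send_multi(write_socket, [request.header] + request.data)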
Example #22
def parse_hostfile(filepath="hostfile", limit_to=None):
    config = ConfigParser.SafeConfigParser()
    config.read(filepath)

    # The returned data will be a tuple containing
    # the hosts, the sum of CPUs and the sum of allowed overmapped CPUs
    sum_cpu = 0
    sum_maxcpu = 0
    hosts = []

    # As a default, we use all the sections defining
    # hosts. If - on the other hand - the user defined
    # a section called [ActiveNodes], only the sections
    # mentioned there will be read.
    sections = config.sections()

    # Filter the sections by looking in the ActiveNodes,
    # so only a subset of the section will be used.
    if "ActiveNodes" in sections:
        sections.remove("ActiveNodes")
        try:
            active_sections = config.get("ActiveNodes", "active")
            active_sections = [s.strip() for s in active_sections.split(",")]

            # Test if there is global overlap
            if not all([s in sections for s in active_sections]):
                raise Exception(
                    "There were sections defined in ActiveNodes that do not exist"
                )

            sections = active_sections
        except ConfigParser.NoOptionError:
            pass

    if limit_to:
        # Rebuild the list instead of removing entries while iterating over it
        sections = [s for s in sections if s in limit_to]

    defaults = {'cpu': 0, 'max_cpu': 0}
    if "Defaults" in config.sections():
        # Fetch the default keys.
        for key in defaults:
            defaults[key] = config.getint("Defaults", key)

    # We are now ready to parse the remaining sections
    for section in sections:
        try:
            nodes = config.get(section, "nodes").split(",")
        except ConfigParser.NoOptionError:
            Logger().warning(
                "Found section %s in hostfile, but it does not include any nodes. "
                "This section will not contribute anything to the later process "
                "to host mapping." % section)
            continue
        for node in nodes:
            node = node.strip()

            # Use the defaults as defaults (wauw). Override them after.
            s = copy.copy(defaults)
            s["host"] = node
            for key in defaults:
                try:
                    s[key] = config.getint(section, key)
                except ConfigParser.NoOptionError:
                    pass

            # Aggregate some key information
            sum_maxcpu += s["max_cpu"]
            sum_cpu += s["cpu"]

            hosts.append(s)

    if sum_cpu > sum_maxcpu:
        Logger().warn(
            "Hostfile parser detected that the hostfile specifies more actual "
            "CPUs than virtual (overmapped) ones")

    return hosts, sum_cpu, sum_maxcpu
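
To make the parser's expectations concrete, here is a sketch of a hostfile it would accept (section and host names are made up): [Defaults] supplies fallback cpu/max_cpu values, each remaining section lists its nodes, and the optional [ActiveNodes] section narrows which sections are read.

    [Defaults]
    cpu = 1
    max_cpu = 2

    [cluster-a]
    nodes = node1, node2
    cpu = 4
    max_cpu = 8

    [cluster-b]
    nodes = node3

    [ActiveNodes]
    active = cluster-a, cluster-b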
Example #23
def parse(parser):
    """
    Parses the above parser (and more maybe) and handle other elements like
    creating Loggers with the proper parameter etc. This method should be
    used to avoid a lot of duplicate code.
    """
    options, args = parser.parse_args()

    # We expect exactly one data file argument.
    if len(args) != 1:
        parser.error("No data file supplied!")

    handle_file = args[0]
    if not Handle.valid_handle_file(handle_file):
        parser.error("Invalid data file")

    # Check that the supplied tag mapper is actually a file
    if options.tag_mapper and not os.path.isfile(options.tag_mapper):
        parser.error("No such tag mapper file: %s" % options.tag_mapper)

    if not options.tag_mapper:
        # Look for a default tag mapper
        potential_mapper = handle_file + ".tagmapper"
        if os.path.isfile(potential_mapper):
            options.tag_mapper = potential_mapper

    # Clean the filter test.
    options.test_filter = filter(
        None, [s.strip().lower() for s in options.test_filter.split(",")])

    # Create a negative test case if we try to use datasize. This is not supported by barrier.
    if "datasize" in (options.x_data, options.y_data):
        options.test_filter.append(":barrier")

    # Set up a logger based on the arguments above. This might seem stupid
    # as it is not returned from the call, but as the Logger is a singleton
    # it is possible to do a simple Logger() call later.
    from mpi.logger import Logger
    logfile = "pupyplot.log"
    if options.logfile:
        logfile = options.logfile

    verbosity = 1
    if options.debug:
        verbosity = 3
    elif options.verbose:
        verbosity = 2

    Logger(logfile, "pupyplot", options.debug, verbosity, not options.verbose)

    # Normalize the raw filters.
    raw_filters = []
    for f in filter(None, [f.strip() for f in options.raw_filters.split(";")]):
        # For now we only have one filter type (equal). We identify this by
        # a simple string. Parser people would probably not like this
        t = f.split(":")
        vals = filter(None, [f.strip() for f in t[1].split(",")])
        if len(t) == 2:
            raw_filters.append((t[0], "EQ", vals))

    options.raw_filters = raw_filters

    return options, args
Example #24
def terminate_children():
    logger = Logger()
    for p in process_list:
        logger.debug("Killing %s" % p)
        p.terminate()
Example #25
    'avg_time': 'time',
    'min_time': 'time',
    'max_time': 'time',
    'throughput': 'throughput',
    'nodes': 'number',
}

if __name__ == "__main__":
    # Receive the parser and groups so we can add further elements
    # if we want
    parser, groups = plot_parser()

    # Parse it. This will setup logging and other things.
    options, args = parse(parser)

    Logger().debug("Command line arguments parsed.")

    # Object creation, used to keep, filter, aggregate, validate data.
    handle = Handle(args[0])

    tag_mapper = {}
    if options.tag_mapper:
        tag_mapper = get_tag_mapping(options.tag_mapper)

    # to extract and filter the data.
    ds = DataSupplier(handle.getdata())
    ds.set_raw_filters(options.raw_filters)

    # It should be possible to limit the tests to one single test. How should
    # this be done?
    rt = 0
Example #26
def generate_localhost_data(hosts, np):
    if not hosts:
        Logger().warning("No hostfile. Overmapping on localhost. Unless you are developing right now, this might not be what you want.")
        return [("localhost", i) for i in range(np)]
Example #27
 def close_all_sockets(self):
     for s in self.sockets_in + self.sockets_out:
         try:
             s.close()
         except Exception as e:
             Logger().error("Got error when closing socket: %s" % e)
Example #28
            try:
                rank, msg_type, tag, ack, comm_id, coll_class_id, raw_data = get_raw_message(
                    conn, self.network.mpi.settings.SOCKET_RECEIVE_BYTECOUNT)
            except MPIException as e:
                # Broken connection is ok when shutdown is going on
                if self.shutdown_event.is_set():
                    break  # We don't care about incoming during shutdown
                else:
                    # TODO: We should check for a specific Exception thrown from get_raw_message to signify when other side has closed connection
                    # We have no way of knowing whether other party has reached shutdown or this was indeed an error
                    # so we just try listening to next socket
                    continue
            except Exception as e:
                Logger().error(
                    "_handle_readlist: Unexpected error thrown from get_raw_message. Error was: %s"
                    % e)
                continue

            # Now that we know the rank of sender we can add the socket to the pool
            if add_to_pool:
                self.network.socket_pool.add_accepted_socket(conn, rank)

            # user messages have a msg_type field at or above CMD_RAWTYPE
            if msg_type >= constants.CMD_RAWTYPE:
                try:
                    with self.network.mpi.raw_data_lock:
                        self.network.mpi.raw_data_queue.append(
                            (rank, msg_type, tag, ack, comm_id, coll_class_id,
                             raw_data))
                        self.network.mpi.raw_data_has_work.set()
Example #29
def main():
    Logger("migrate", "migrate", True, True, True)

    options, args = parse_extended_args()

    ranks = options.ranks
    hostinfo = options.hostinfo
    bypass = options.bypass

    # Create a socket we can receive results from.
    sock, hostname, port_no = create_random_socket()
    sock.listen(len(ranks))

    all_data = {
        'procs': {},
        'mpirun_args': options.mpirun_args,
    }

    # Start a thread for receiving.
    receiver = Receiver(sock, len(ranks), all_data)
    receiver.start()
    receiver.start_event.wait()

    senders = []

    for participant in hostinfo:
        remote_host, remote_port, rank, security_component, avail = participant

        succ = True
        if not bypass:
            succ = avail_or_error(avail, rank, constants.CMD_MIGRATE_PACK)

        if not succ:
            sys.exit(1)

        # Data to send is a tuple with the security component, and then
        # command specific data
        data = (security_component, (hostname, port_no))

        sender = Sender(remote_host, remote_port, data)
        sender.start()
        senders.append(sender)

    # Join all the sender threads.
    for s in senders:
        s.wait()
        s.join()

    # Wait until everybody sent back.
    receiver.wait()
    receiver.join()

    # Write the final data to a file
    import tempfile
    _, filename = tempfile.mkstemp(prefix="pupy")

    fh = open(filename, "wb")
    dill.dump(all_data, fh)

    fh.close()

    print "Halted system saved to file: ", filename

    sys.exit(0)
Example #30
    def __init__(self):
        """
        Initializes the MPI environment. This will give each process a separate
        rank in the MPI_COMM_WORLD communicator along with the total number of
        processes in the communicator. Both attributes can be read just after
        startup::

            from mpi import MPI

            mpi = MPI()
            rank = mpi.MPI_COMM_WORLD.rank()
            size = mpi.MPI_COMM_WORLD.size()

            print "Proc %d of %d started" % (rank, size)

            mpi.finalize()

        """
        Thread.__init__(self)

        self.name = "MPI" # Thread name

        # Startup time. Used in Wtime() implementation.
        self.startup_timestamp = time.time()

        # Event for handling thread packing.
        self.packing = threading.Event()

        # Data structures for jobs.
        # The locks are for guarding the data structures
        # The events are for signalling change in data structures

        # Pending requests are receive requests where the data may or may not have arrived
        self.pending_requests = []
        self.pending_requests_lock = threading.Lock()
        self.pending_requests_has_work = threading.Event()

        # Raw data are messages that have arrived but not been unpickled yet
        self.raw_data_queue = []
        self.raw_data_lock = threading.Lock()
        self.raw_data_has_work = threading.Event()

        # Received data are messages that have arrived and are unpickled
        # (ie. ready for matching with a posted recv request).
        # There are no events as this is handled through the "pending_request_" event.
        self.received_data = []
        self.received_data_lock = threading.Lock()

        # General event to wake up main mpi thread
        self.has_work_event = threading.Event()

        # Shutdown signals
        self.shutdown_event = threading.Event() # MPI finalize has been called, shutdown in progress

        # Lock and counter for enumerating request ids
        self.current_request_id_lock = threading.Lock()
        self.current_request_id = 0

        # Pending system commands. These will be executed at first chance we have (we
        # need access to the user code). We also have a lock around the list, to ensure
        # proper access.
        self.pending_systems_commands = []
        self.pending_systems_commands_lock = threading.Lock()

        # Unstarted collective requests.
        self.unstarted_collective_requests = []
        self.unstarted_collective_requests_lock = threading.Lock()
        self.unstarted_collective_requests_has_work = threading.Event()

        # When the collective requests are started they are moved to this queue
        # until they are finished.
        self.pending_collective_requests = []

        self.received_collective_data_lock = threading.Lock()
        self.received_collective_data = []
        self.pending_collective_requests_has_work = threading.Event()

        # The settings module. This will be handled properly by the
        # function ``generate_settings``.
        self.settings = None
        self.config_callbacks = []

        # Append callbacks
        from mpi.settings import standard_callbacks
        self.config_callbacks.extend(standard_callbacks)

        options = self.parse_options()

        # TODO: See if logger initialisations below here shouldn't be refactored into one

        # Decide how to deal with I/O
        if options.process_io == "remotefile":
            # Initialise the logger
            import os
            logger = Logger(os.path.join(options.logdir,"remotelog"), "proc-%d" % options.rank, options.debug, options.verbosity, True)
            filename = constants.DEFAULT_LOGDIR+'mpi.local.rank%s.log' % options.rank


            logger.debug("Opening file for I/O: %s" % filename)
            try:
                output = open(filename, "w")
            except:
                raise MPIException("File for I/O not writeable - check that this path exists and is writeable:\n%s" % constants.DEFAULT_LOGDIR)

            sys.stdout = output
            sys.stderr = output
        elif options.process_io == "none":
            # Initialise the logger
            logger = Logger(options.logdir+"mpi", "proc-%d" % options.rank, options.debug, options.verbosity, True)
            logger.debug("Closing stdout")
            sys.stdout = None
        else:
            # Initialise the logger
            logger = Logger(options.logdir+"mpi", "proc-%d" % options.rank, options.debug, options.verbosity, options.quiet)

        # TODO: Put this info under settings when they start to work properly
        #       Also we should check that the path here is accessible and valid
        # if filepath starts with something else than / it is a relative path and we assume it relative to pupympi dir
        if not options.logdir.startswith('/'):
            _BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            self.logdir = os.path.join(_BASE,options.logdir)
        else:
            self.logdir = options.logdir

        # Parse and save settings.
        self.generate_settings(options.settings)

        # Attributes for the security component.
        self.disable_utilities = options.disable_utilities
        self.security_component = None

        # First check for required Python version
        self._version_check()

        # Check for yappi support
        self._yappi_enabled = False
        if options.yappi:
            try:
                import yappi
                self._yappi_enabled = True
                self._yappi_sorttype = yappi.SORTTYPE_NCALL

                if options.yappi_sorttype:
                    if options.yappi_sorttype == 'name':
                        self._yappi_sorttype = yappi.SORTTYPE_NAME
                    elif options.yappi_sorttype == 'ncall':
                        self._yappi_sorttype = yappi.SORTTYPE_NCALL
                    elif options.yappi_sorttype == 'ttotal':
                        self._yappi_sorttype = yappi.SORTTYPE_TTOTAL
                    elif options.yappi_sorttype == 'tsub':
                        self._yappi_sorttype = yappi.SORTTYPE_TSUB
                    elif options.yappi_sorttype == 'tavg':
                        self._yappi_sorttype = yappi.SORTTYPE_TAVG
                    else:
                        logger.warn("Unknown yappi sorttype '%s' - defaulting to ncall." % options.yappi_sorttype)

            except ImportError:
                logger.warn("Yappi is not supported on this system. Statistics will not be logged.")
                self._yappi_enabled = False

        # Start built-in profiling facility
        self._profiler_enabled = False
        if options.enable_profiling:
            if self._yappi_enabled:
                logger.warn("Running yappi and pupyprof simultaneously is unpossible. Pupyprof has been disabled.");
            else:
                try:
                    import pupyprof
                    self._profiler_enabled = True
                except ImportError:
                    logger.warn("Pupyprof is not supported on this system. Tracefile will not be generated");
                    self._profiler_enabled = False

        # Set a resume parameter indicating if we are resuming a packed job.
        # This will be changed (maybe) in the network startup.
        self.resume = False

        # Enable a register for the users to put values in. This register can be read
        # with the readregister.py script found in bin/utils/
        self.user_register = {}

        # Place to keep functions needed when packing / unpacking the running MPI
        # instance. The best place to start is migrate.py
        self.migrate_onpack = None

        self.network = Network(self, options)

        # Create the initial global Group, and assign the network all_procs as members
        world_Group = Group(options.rank)
        world_Group.members = self.network.all_procs

        # Create the initial communicator MPI_COMM_WORLD. It is initialized with
        # the rank of the process that holds it and size.
        # The members are filled out after the network is initialized.
        self.communicators = {}

        self.MPI_COMM_WORLD = Communicator(self, options.rank, options.size, self.network, world_Group, comm_root=None)

        # Tell the network about the global MPI_COMM_WORLD, and let it start to
        # listen on the corresponding network channels
        self.network.MPI_COMM_WORLD = self.MPI_COMM_WORLD

        # Change the contents of sys.argv runtime, so the user processes
        # can't see all the mpi specific parameters we start with.
        user_options = [sys.argv[0]]
        user_options.extend(sys.argv[sys.argv.index("--")+1:])
        sys.argv = user_options

        # Set up the global mpi constants
        constants.MPI_GROUP_EMPTY = Group()

        self.daemon = True

        resumer = None
        if self.resume:
            resumer = self.resume_packed_state()

        self.start()

        # Make every node connect to each other if settings specify it
        if not options.disable_full_network_startup:
            self.network.start_full_network()

        self.initinfo = (self.MPI_COMM_WORLD, self.MPI_COMM_WORLD.rank(), self.MPI_COMM_WORLD.size())

        # Set a static attribute on the class so we know it is initialised.
        self.__class__._initialized = True

        if self._profiler_enabled:
            pupyprof.start()

        if self.resume and resumer:
            resumer(self)