Beispiel #1
0
class DataFetcher(DataFetcherBase):
    """
    Implementation of the data fetcher reacting on data sent by another
    hidra instance.
    """
    def __init__(self, datafetcher_base_config):
        """Initialise the fetcher, check its configuration and set up transfer.

        Args:
            datafetcher_base_config: Configuration object passed on unchanged
                to DataFetcherBase.__init__.
        """

        # Set before the base class is initialised, so both attributes
        # already exist while DataFetcherBase.__init__ runs.
        self.f_descriptors = {}
        self.transfer = None

        DataFetcherBase.__init__(self, datafetcher_base_config, name=__name__)

        # Attributes the base class provides from here on:
        #   self.config_all - all configurations
        #   self.config_df - the config of the datafetcher
        #   self.config - the module specific config
        #   self.df_type -  the name of the datafetcher module
        #   self.log_queue
        #   self.log

        # Most recently received metadata/data pair (filled by get_metadata)
        self.data_r = None
        self.metadata_r = None

        # Declare what has to be configured, then let check_config verify
        # that the module specific config actually contains it.
        self.set_required_params()
        self.check_config()

        self._setup()

    def set_required_params(self):
        """
        Declare the configuration parameters this data fetcher cannot run
        without.

        On Windows "datafetcher_port" is additionally required in the fetcher
        section; on every other platform "ipc_dir" and "main_pid" are
        additionally required in the network section.
        """

        network_params = ["ext_ip"]
        fetcher_params = [
            "status_check_resp_port", "confirmation_resp_port", "context"
        ]

        if utils.is_windows():
            fetcher_params.append("datafetcher_port")
        else:
            network_params.extend(["ipc_dir", "main_pid"])

        # The fetcher specific parameters are nested below the name of the
        # datafetcher module.
        self.required_params = {
            "network": network_params,
            "datafetcher": ["store_data", {self.df_type: fetcher_params}],
        }

    def _setup(self):
        """Create the transfer, connect it and enable its reply options.

        The transfer is connected via ipc to the endpoint "<main_pid>_out"
        inside the configured ipc directory. Afterwards the "status_check"
        and "confirmation" options are set, each with the external ip and
        the matching response port from the module specific config.
        """
        self.transfer = Transfer("STREAM", use_log=self.log_queue)

        net_cfg = self.config_all["network"]

        # NOTE(review): set_required_params does not demand "ipc_dir" and
        # "main_pid" on Windows, but both are read here unconditionally -
        # confirm how Windows is meant to be handled.
        ipc_endpoint = "{}_{}".format(net_cfg["main_pid"], "out")
        ipc_address = [net_cfg["ipc_dir"], ipc_endpoint]
        self.transfer.start(ipc_address,
                            protocol="ipc",
                            data_con_style="connect")

        # status_check: enable status check requests from any sender
        # confirmation: enable confirmation reply if this is requested in a
        #               received data packet
        reply_options = (
            ("status_check", "status_check_resp_port"),
            ("confirmation", "confirmation_resp_port"),
        )
        for option_name, port_key in reply_options:
            self.transfer.setopt(option=option_name,
                                 value=[
                                     net_cfg["ext_ip"],
                                     self.config[port_key]
                                 ])

    def get_metadata(self, targets, metadata):
        """Implementation of the abstract method get_metadata.

        Fetches the next metadata/data pair from the transfer, derives the
        source and target file paths from the received metadata and, if
        there are targets, fills in fields the sender left out.

        Args:
            targets (list): The target list this file is supposed to go.
            metadata (dict): The metadata announced for the file. It is only
                compared against the received metadata, never modified.
        """

        timeout = 10000

        # Get new data
        # NOTE(review): the result is used without checking whether
        # anything was received within the timeout - confirm what
        # Transfer.get returns in that case.
        received = self.transfer.get(timeout)
        self.metadata_r, self.data_r = received

        # A mismatch is only reported; processing continues with the
        # received metadata to prevent mismatch of metadata and data.
        identifying_keys = ("relative_path", "source_path", "filename")
        if any(metadata[key] != self.metadata_r[key]
               for key in identifying_keys):
            self.log.error("Received metadata do not match data")

        # TODO handle case if file type requested by target does not match

        # pylint: disable=attribute-defined-outside-init

        # Build source file
        self.source_file = generate_filepath(self.metadata_r["source_path"],
                                             self.metadata_r)

        # Build target file
        # if local_target is not set (== None) generate_filepath returns None
        self.target_file = generate_filepath(self.config_df["local_target"],
                                             self.metadata_r)

        # The received metadata only have to be complete if they are passed
        # on to somebody.
        if not targets:
            return

        meta = self.metadata_r

        # a missing file size is only reported, there is no fallback value
        if "filesize" not in meta:
            self.log.error("Received metadata do not contain 'filesize'")

        if "file_mod_time" not in meta:
            self.log.error("Received metadata do not contain "
                           "'file_mod_time'. Setting it to current time")
            meta["file_mod_time"] = time.time()

        if "file_create_time" not in meta:
            self.log.error("Received metadata do not contain "
                           "'file_create_time'. Setting it to current "
                           "time")
            meta["file_create_time"] = time.time()

        if "chunksize" not in meta:
            self.log.error("Received metadata do not contain 'chunksize'. "
                           "Setting it to locally configured one")
            meta["chunksize"] = self.config_df["chunksize"]

    def send_data(self, targets, metadata, open_connections):
        """Implementation of the abstract method send_data.

        Packs the metadata and data received by get_metadata into one
        multipart message and passes it to all targets which requested data.
        Failures while packing or sending are logged and not propagated.

        Args:
            targets (list): The target list this file is supposed to go.
            metadata (dict): The dictionary with the metadata of the file
                             (unused, the received metadata are sent).
            open_connections (dict): The dictionary containing all open zmq
                                     connections.
        """
        # pylint: disable=unused-argument

        if not targets:
            return

        # targets are of the form [[<host:port>, <prio>, <metadata|data>], ...]
        targets_data = [i for i in targets if i[2] == "data"]

        if not targets_data:
            return

        chunk_number = self.metadata_r["chunk_number"]

        self.log.debug("Received data for file %s (chunknumber %s)",
                       self.source_file, chunk_number)

        self.log.debug("Passing multipart-message for file '%s'...",
                       self.source_file)

        try:
            chunk_payload = [
                json.dumps(self.metadata_r).encode("utf-8"), self.data_r
            ]
        except Exception:
            self.log.error("Unable to pack multipart-message for file "
                           "'%s'",
                           self.source_file,
                           exc_info=True)
            return

        # send message to data targets
        try:
            self.send_to_targets(targets=targets_data,
                                 open_connections=open_connections,
                                 metadata=None,
                                 payload=chunk_payload,
                                 chunk_number=chunk_number)
        except Exception:
            # One handler is enough: the former separate DataError handler
            # logged exactly the same message.
            self.log.error(
                "Unable to send multipart-message for file '%s' (chunk %s)",
                self.source_file,
                chunk_number,
                exc_info=True)

    def finish(self, targets, metadata, open_connections):
        """Implementation of the abstract method finish.

        Sends the metadata to all targets which requested metadata only and,
        if "store_data" is enabled in the datafetcher config, hands the
        received chunk to the transfer for storing. Failures of either step
        are logged and not propagated.

        Args:
            targets (list): The target list this file is supposed to go.
                            May be empty or None.
            metadata (dict): The dictionary with the metadata of the file
            open_connections (dict): The dictionary containing all open zmq
                                     connections.
        """

        # targets are of the form [[<host:port>, <prio>, <metadata|data>], ...]
        # "or []" tolerates targets being None, like send_data does.
        targets_metadata = [
            i for i in (targets or []) if i[2] == "metadata"
        ]

        # send message to metadata targets
        if targets_metadata:
            try:
                self.send_to_targets(targets=targets_metadata,
                                     open_connections=open_connections,
                                     metadata=metadata,
                                     payload=None,
                                     chunk_number=None,
                                     timeout=self.config["send_timeout"])
                self.log.debug(
                    "Passing metadata multipart-message for file "
                    "%s...done.", self.source_file)

            except Exception:
                self.log.error(
                    "Unable to send metadata multipart-message for file "
                    "'%s' to '%s'",
                    self.source_file,
                    targets_metadata,
                    exc_info=True)

        # store data
        if self.config_df["store_data"]:
            try:
                # TODO: save message to file using a thread (avoids blocking)
                self.transfer.store_chunk(
                    descriptors=self.f_descriptors,
                    filepath=self.target_file,
                    payload=self.data_r,
                    base_path=self.config_df["local_target"],
                    metadata=self.metadata_r)
            except Exception:
                self.log.error(
                    "Storing multipart message for file '%s' failed",
                    self.source_file,
                    exc_info=True)

    def stop(self):
        """Implementation of the abstract method stop.

        Closes the base class sockets, all file descriptors collected in
        f_descriptors and finally the transfer. Every step is attempted even
        if an earlier one raised; the exception is still propagated to the
        caller afterwards.
        """

        try:
            # Close base class zmq sockets
            self.close_socket()
        finally:
            try:
                # Close open file handler to prevent file corruption
                for target_file in list(self.f_descriptors):
                    self.f_descriptors[target_file].close()
                    del self.f_descriptors[target_file]
            finally:
                # Close zmq sockets
                if self.transfer is not None:
                    self.transfer.stop()
Beispiel #2
0
class DataReceiver(object):
    """Receives data and stores it to disc usign the hidra API.
    """
    def __init__(self):
        """Set up the receiver and run it right away.

        All attributes are pre-set to None first, then setup() fills them
        and exec_run() is entered before the constructor returns.
        """

        # configuration and logging
        self.config = None
        self.log = None

        # data transfer
        self.transfer = None
        self.timeout = None
        self.target_dir = None
        self.data_ip = None
        self.data_port = None
        self.dirs_not_to_create = None

        # whitelist checking
        self.checking_thread = None
        self.lock = None

        self.plugin_handler = None

        self.run_loop = True

        self.setup()

        self.exec_run()

    def setup(self):
        """Initializes parameters, logging and transfer object.

        Parses the arguments, changes the user, configures logging, sets the
        process title, registers the SIGTERM handler, resolves the whitelist
        (starting the checking thread if the whitelist is given as a
        string), creates the transfer object and loads the plugin.
        """

        global _whitelist

        try:
            self.config = argument_parsing()
        except Exception:
            # logging is not configured yet; a logger is attached before
            # the error is passed on
            self.log = logging.getLogger("DataReceiver")
            raise

        general = self.config["general"]
        receiver = self.config["datareceiver"]

        # change user
        user_info, user_was_changed = utils.change_user(general)

        # set up logging
        utils.check_writable(general["log_file"])
        self._setup_logging()

        # reported only now because there was no logger before
        utils.log_user_change(self.log, user_was_changed, user_info)

        # set process name
        # pylint: disable=no-member
        setproctitle.setproctitle(general["procname"])

        self.log.info("Version: %s", __version__)

        self.dirs_not_to_create = general["dirs_not_to_create"]

        # for proper clean up if kill is called
        signal.signal(signal.SIGTERM, self.signal_term_handler)

        self.timeout = 2000
        self.lock = threading.Lock()

        # both settings are optional
        try:
            ldap_retry_time = general["ldap_retry_time"]
        except KeyError:
            ldap_retry_time = 10

        try:
            check_time = general["netgroup_check_time"]
        except KeyError:
            check_time = 2

        whitelist = general["whitelist"]

        if whitelist is None:
            _whitelist = None
        else:
            self.log.debug("config_gen['whitelist']=%s", whitelist)

            with self.lock:
                _whitelist = utils.extend_whitelist(whitelist,
                                                    general["ldapuri"],
                                                    self.log)
            self.log.info("Configured whitelist: %s", _whitelist)

        # only start the thread if a netgroup was configured
        # (i.e. the whitelist is a string; None is not a string either)
        if isinstance(whitelist, str):
            self.log.debug("Starting checking thread")
            try:
                self.checking_thread = CheckNetgroup(whitelist,
                                                     self.lock,
                                                     general["ldapuri"],
                                                     ldap_retry_time,
                                                     check_time)
                self.checking_thread.start()
            except Exception:
                self.log.error("Could not start checking thread",
                               exc_info=True)
        else:
            self.log.debug("Checking thread not started: %s", whitelist)

        self.target_dir = os.path.normpath(receiver["target_dir"])
        self.data_ip = receiver["data_stream_ip"]
        self.data_port = receiver["data_stream_port"]

        self.log.info("Writing to directory '%s'", self.target_dir)

        self.transfer = Transfer(connection_type="STREAM",
                                 use_log=True,
                                 dirs_not_to_create=self.dirs_not_to_create)

        self._load_plugin()

    def _setup_logging(self):
        """Attach the configured handlers to the root logger.

        The root logger is set to DEBUG, receives the handler(s) returned by
        utils.get_log_handlers and self.log becomes the "DataReceiver"
        logger.
        """
        general = self.config["general"]

        # enable logging
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

        log_handlers = utils.get_log_handlers(general["log_file"],
                                              general["log_size"],
                                              general["verbose"],
                                              general["onscreen"])

        # either a tuple of handlers or a single handler is returned
        if not isinstance(log_handlers, tuple):
            log_handlers = (log_handlers,)

        for handler in log_handlers:
            root_logger.addHandler(handler)

        self.log = logging.getLogger("DataReceiver")

    def _load_plugin(self):
        """Create the plugin handler if a plugin is configured.

        Nothing happens (apart from a debug message) when the receiver
        config has no "plugin" entry or when there is no config section
        named after the plugin.
        """
        try:
            plugin_name = self.config["datareceiver"]["plugin"]
            plugin_config = self.config[plugin_name]
        except KeyError:
            self.log.debug("No plugin specified")
        else:
            self.plugin_handler = PluginHandler(plugin_name, plugin_config,
                                                self.target_dir, self.log)

    def exec_run(self):
        """Run the receiver and guarantee the clean up afterwards.

        A KeyboardInterrupt ends the run silently. Any other exception is
        logged and raised again. stop() is called in every case.
        """

        try:
            self.run()
        except KeyboardInterrupt:
            # interrupting is a regular way to end the receiver
            pass
        except Exception:
            self.log.error("Stopping due to unknown error condition",
                           exc_info=True)
            raise
        finally:
            self.stop()

    def run(self):
        """Start the transfer and store the data.

        Starts the plugin (if one is loaded) and the transfer, then keeps
        storing incoming messages below the target directory until run_loop
        is cleared or a KeyboardInterrupt arrives. If a plugin is loaded,
        every stored message for which the transfer returned a value is
        handed to it; a failing plugin is logged and does not end the loop.

        Raises:
            Exception: Whatever starting the transfer or storing a message
                raised; the error is logged before it is passed on.
        """

        global _whitelist  # pylint: disable=global-variable-not-assigned
        global _changed_netgroup

        if self.plugin_handler is not None:
            plugin_type = self.plugin_handler.get_data_type()
            self.plugin_handler.start()
        else:
            plugin_type = None

        try:
            self.transfer.start([self.data_ip, self.data_port], _whitelist)
        except Exception:
            self.log.error("Could not initiate stream", exc_info=True)
            # nothing can have been received yet, so skip the final storing
            self.stop(store=False)
            raise

        # enable status check requests from any sender
        self.transfer.setopt("status_check")
        # enable confirmation reply if this is requested in a received data
        # packet
        self.transfer.setopt("confirmation")

        self.log.debug("Waiting for new messages...")
        self.run_loop = True
        # run loop, and wait for incoming messages
        while self.run_loop:
            if _changed_netgroup:
                self.log.debug("Re-registering whitelist")
                self.transfer.register(_whitelist)

                # reset flag
                with self.lock:
                    _changed_netgroup = False

            try:
                ret_val = self.transfer.store(target_base_path=self.target_dir,
                                              timeout=self.timeout,
                                              return_type=plugin_type)

            except KeyboardInterrupt:
                break
            except Exception:
                self.log.error("Storing data...failed.", exc_info=True)
                raise

            if self.plugin_handler is None or ret_val is None:
                continue

            try:
                self.plugin_handler.put(ret_val)
                # ret_val might have been mutated by the plugin and therefore
                # should only be reused if this is acceptable
            except Exception:
                # keep the traceback, otherwise the reason of the plugin
                # failure is lost
                self.log.error("Cannot submit message to plugin",
                               exc_info=True)

    def stop(self, store=True):
        """Stop threads, close sockets and cleans up.

        Safe to call repeatedly: the transfer, the plugin handler and the
        checking thread are each stopped once and then dropped.

        Args:
            store (optional, bool): Run a little longer to store remaining
                                    data.
        """

        self.run_loop = False

        if self.transfer is not None:
            self.transfer.status = [b"ERROR", "receiver is shutting down"]

            if store:
                # Drain window in seconds, the same unit time.time() uses.
                # (It used to be compared against milliseconds, which made
                # the window 0.5 ms long.)
                stop_timeout = 0.5
                deadline = time.time() + stop_timeout
                self.log.debug("Storing remaining data.")
                while time.time() < deadline:
                    try:
                        self.log.debug("Storing remaining data...")
                        self.transfer.store(self.target_dir, self.timeout)
                    except Exception:
                        self.log.error("Storing data...failed.", exc_info=True)
                        # do not retry in a tight loop for the rest of the
                        # window
                        break

            self.log.info("Shutting down receiver...")
            self.transfer.stop()
            self.transfer = None

        if self.plugin_handler is not None:
            self.plugin_handler.stop()
            # dropped like the transfer, so a later stop() call (exec_run,
            # signal handler, __del__) does not stop the plugin again
            self.plugin_handler = None

        if self.checking_thread is not None:
            self.checking_thread.stop()
            self.checking_thread.join()
            self.log.debug("checking_thread stopped")
            self.checking_thread = None

    # pylint: disable=unused-argument
    def signal_term_handler(self, signal_to_react, frame):
        """Shut the receiver down when a SIGTERM arrives.

        Args:
            signal_to_react: The signal number handed over by the signal
                module (unused).
            frame: The stack frame handed over by the signal module
                (unused).
        """

        self.log.debug("got SIGTERM")
        self.stop()

    def __exit__(self, exception_type, exception_value, traceback):
        """Stop the receiver; the exception details are ignored.

        Nothing is returned, so an exception is never suppressed.

        NOTE(review): no matching __enter__ is visible in this class -
        confirm whether the receiver is meant to be used in a with
        statement at all.
        """
        self.stop()

    def __del__(self):
        """Stop the receiver when the object is finalised."""
        self.stop()