Example #1
    def __init__(self,
                 config,
                 db_engine,
                 *args,
                 n_processes=1,
                 n_db_processes=1,
                 **kwargs):
        # Fail fast: the engine must be picklable to reach worker processes.
        try:
            ForkingPickler.dumps(db_engine)
        except Exception as exc:
            raise TypeError(
                "multiprocessing is unable to pickle passed SQLAlchemy engine. "
                "use triage.create_engine instead when running MultiCoreExperiment: "
                "(e.g. from triage import create_engine)") from exc

        super(MultiCoreExperiment, self).__init__(config, db_engine, *args,
                                                  **kwargs)
        if n_processes < 1:
            raise ValueError("n_processes must be 1 or greater")
        if n_db_processes < 1:
            raise ValueError("n_db_processes must be 1 or greater")
        if n_db_processes == 1 and n_processes == 1:
            logging.warning(
                "Both n_processes and n_db_processes were set to 1. "
                "If you only wish to use one process to run the experiment, "
                "consider using the SingleThreadedExperiment class instead")
        self.n_processes = n_processes
        self.n_db_processes = n_db_processes
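
The constructor above fails fast by attempting to pickle the engine before any workers are spawned. The same pre-flight check works for any object you intend to hand to multiprocessing; a minimal, self-contained sketch (the helper name is illustrative, not part of the snippet):

from multiprocessing.reduction import ForkingPickler

def assert_picklable(obj, hint=""):
    """Raise early if obj cannot be shipped to a worker process."""
    try:
        ForkingPickler.dumps(obj)
    except Exception as exc:
        raise TypeError(f"object cannot be pickled for multiprocessing. {hint}") from exc

assert_picklable({"ok": 1})    # passes silently; unpicklable objects raise TypeError
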
Example #2
    def test_pickle_categorical(self):
        from multiprocessing.reduction import ForkingPickler

        from lale.lib.rasl import Project

        c = categorical(5)
        p = Project(columns=None, drop_columns=categorical(10))
        _ = ForkingPickler.dumps(c)
        _ = ForkingPickler.dumps(p)
Example #3
    def __init__(self,
                 config,
                 db_engine,
                 *args,
                 n_processes=1,
                 n_bigtrain_processes=1,
                 n_db_processes=1,
                 **kwargs):
        """
        Args:
            config (dict)
            db_engine (sqlalchemy engine)
            n_processes (int) How many parallel processes to use for most CPU-bound tasks.
                Logistic regression and decision trees fall under this category.
                Usually good to set to the # of cores on the machine.
            n_bigtrain_processes (int) How many parallel processes to use for memory-intensive tasks
                Random forests and extra trees fall under this category.
                Usually good to start at 1, but can be increased if you have available memory.
            n_db_processes (int) How many parallel processes to use for database IO-intensive tasks.
                Cohort creation, label creation, and feature creation fall under this category. 
        """
        try:
            ForkingPickler.dumps(db_engine)
        except Exception as exc:
            raise TypeError(
                "multiprocessing is unable to pickle passed SQLAlchemy engine. "
                "use triage.create_engine instead when running MultiCoreExperiment: "
                "(e.g. from triage import create_engine)") from exc

        super(MultiCoreExperiment, self).__init__(config, db_engine, *args,
                                                  **kwargs)
        if n_processes < 1:
            raise ValueError("n_processes must be 1 or greater")
        if n_db_processes < 1:
            raise ValueError("n_db_processes must be 1 or greater")
        if n_bigtrain_processes < 1:
            raise ValueError("n_bigtrain_processes must be 1 or greater")
        if n_db_processes == 1 and n_processes == 1 and n_bigtrain_processes == 1:
            logger.notice(
                "n_processes, n_db_processes, and n_bigtrain_processes were all "
                "set to 1. If you only wish to use one process to run the "
                "experiment, consider using the SingleThreadedExperiment class "
                "instead")
        self.n_processes = n_processes
        self.n_db_processes = n_db_processes
        self.n_bigtrain_processes = n_bigtrain_processes
        self.n_processes_lookup = {
            BatchKey.QUICKTRAIN: self.n_processes,
            BatchKey.BIGTRAIN: self.n_bigtrain_processes,
            BatchKey.MAYBETRAIN: self.n_processes
        }
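
Based on the docstring above, a hypothetical instantiation could look like the sketch below. The import paths follow triage's documented usage, but the URL and config dict are placeholders rather than values taken from the snippet:

from triage import create_engine                       # picklable engine, per the error message above
from triage.experiments import MultiCoreExperiment     # import path may differ by version

db_engine = create_engine("postgresql://user:pass@localhost:5432/triage")  # placeholder URL
experiment_config = {}   # placeholder; a real triage experiment config dict is required

experiment = MultiCoreExperiment(
    config=experiment_config,
    db_engine=db_engine,
    n_processes=8,             # CPU-bound training (logistic regression, decision trees)
    n_bigtrain_processes=2,    # memory-hungry training (random forests, extra trees)
    n_db_processes=4,          # database IO-bound steps (cohorts, labels, features)
)
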
Example #4
 def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe,
           queue_sem):
     while True:
         try:
             with notempty:
                 if not buffer:
                     notempty.wait()
             try:
                 while True:
                     obj = buffer.popleft()
                     if obj is _sentinel:
                         debug('feeder thread got sentinel -- exiting')
                         close()
                         return
                     obj = ForkingPickler.dumps(obj)
                     if sys.platform == 'win32':
                         send_bytes(obj)
                     else:
                         with writelock:
                             send_bytes(obj)
             except IndexError:  # popleft raises IndexError once the buffer is empty
                 pass
         except Exception as e:
             if ignore_epipe and getattr(e, 'errno', 0) == errno.EPIPE:
                 return
             # Since this runs in a daemon thread, the resources it uses may
             # become unusable while the process is cleaning up. We ignore
             # errors which happen after the process has started to clean up.
             if is_exiting():
                 info('error in queue thread: %s', e)
                 return
             else:
                 # Since the object has not been sent in the queue, we need to decrease the size of the queue.
                 # The error acts as if the object had been silently removed from the queue and this step is necessary to have a properly working queue.
                 queue_sem.release()
                 traceback.print_exc()
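
The feeder loop above waits on a condition variable, drains a deque, and exits when it sees a sentinel object. That shutdown pattern can be exercised on its own; a small single-process sketch (names like feed and out are illustrative):

import threading
from collections import deque
from multiprocessing.reduction import ForkingPickler

_sentinel = object()
buffer, out = deque(), []
notempty = threading.Condition()

def feed():
    while True:
        with notempty:
            while not buffer:
                notempty.wait()        # sleep until the producer adds items
            obj = buffer.popleft()
        if obj is _sentinel:           # sentinel means "no more data, shut down"
            return
        out.append(ForkingPickler.dumps(obj))

t = threading.Thread(target=feed, daemon=True)
t.start()
with notempty:
    buffer.extend(["a", "b", _sentinel])
    notempty.notify()
t.join()
print(len(out))  # -> 2 pickled payloads
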
Example #5
 def put(self, obj):
     obj = ForkingPickler.dumps(obj)
     if self._wlock is None:
         # writes to a message oriented win32 pipe are atomic
         self._writer.send_bytes(obj)
     else:
         with self._wlock:
             self._writer.send_bytes(obj)
Example #6
    def _validate_nodes(cls, nodes: Iterable[Node]):
        """Ensure all tasks are serializable."""
        unserializable = []
        for node in nodes:
            try:
                ForkingPickler.dumps(node)
            except (AttributeError, PicklingError):
                unserializable.append(node)

        if unserializable:
            raise AttributeError(
                "The following nodes cannot be serialized: {}\nIn order to "
                "utilize multiprocessing you need to make sure all nodes are "
                "serializable, i.e. nodes should not include lambda "
                "functions, nested functions, closures, etc.\nIf you "
                "are using custom decorators ensure they are correctly using "
                "functools.wraps().".format(sorted(unserializable)))
Example #7
 def put(self, obj):
     # serialize the data before acquiring the lock
     obj = LokyPickler.dumps(obj)
     if self._wlock is None:
         # writes to a message oriented win32 pipe are atomic
         self._writer.send_bytes(obj)
     else:
         with self._wlock:
             self._writer.send_bytes(obj)
Example #8
    def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe):
        debug('starting thread to feed data to pipe')
        nacquire = notempty.acquire
        nrelease = notempty.release
        nwait = notempty.wait
        bpopleft = buffer.popleft
        sentinel = _sentinel
        if sys.platform != 'win32':
            wacquire = writelock.acquire
            wrelease = writelock.release
        else:
            wacquire = None

        try:
            while 1:
                nacquire()
                try:
                    if not buffer:
                        nwait()
                finally:
                    nrelease()
                try:
                    while 1:
                        obj = bpopleft()
                        if obj is sentinel:
                            debug('feeder thread got sentinel -- exiting')
                            close()
                            return

                        # serialize the data before acquiring the lock
                        obj = LokyPickler.dumps(obj)
                        if wacquire is None:
                            send_bytes(obj)
                        else:
                            wacquire()
                            try:
                                send_bytes(obj)
                            finally:
                                wrelease()
                except IndexError:
                    pass
        except Exception as e:
            if ignore_epipe and getattr(e, 'errno', 0) == errno.EPIPE:
                return
            # Since this runs in a daemon thread the resources it uses
            # may become unusable while the process is cleaning up.
            # We ignore errors which happen after the process has
            # started to cleanup.
            try:
                if is_exiting():
                    info('error in queue thread: %s', e)
                else:
                    import traceback
                    traceback.print_exc()
            except Exception:
                pass
Example #9
def test_stars_pickleable():
    """
    Verify that EPSFStars can be successfully
    pickled/unpickled for use with multiprocessing
    """
    from multiprocessing.reduction import ForkingPickler
    # Doesn't need to actually contain anything useful
    stars = EPSFStars([1])
    # This should not blow up
    ForkingPickler.loads(ForkingPickler.dumps(stars))
Example #10
    def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline):
        """Ensure that all data sets are serializable and that we do not have
        any non proxied memory data sets being used as outputs as their content
        will not be synchronized across threads.
        """

        data_sets = catalog._data_sets  # pylint: disable=protected-access

        unserializable = []
        for name, data_set in data_sets.items():
            if getattr(data_set, "_SINGLE_PROCESS", False):  # SKIP_IF_NO_SPARK
                unserializable.append(name)
                continue
            try:
                ForkingPickler.dumps(data_set)
            except (AttributeError, PicklingError):
                unserializable.append(name)

        if unserializable:
            raise AttributeError(
                "The following data sets cannot be used with multiprocessing: "
                "{}\nIn order to utilize multiprocessing you need to make sure "
                "all data sets are serializable, i.e. data sets should not make "
                "use of lambda functions, nested functions, closures etc.\nIf you "
                "are using custom decorators ensure they are correctly using "
                "functools.wraps().".format(sorted(unserializable))
            )

        memory_data_sets = []
        for name, data_set in data_sets.items():
            if (
                name in pipeline.all_outputs()
                and isinstance(data_set, MemoryDataSet)
                and not isinstance(data_set, BaseProxy)
            ):
                memory_data_sets.append(name)

        if memory_data_sets:
            raise AttributeError(
                "The following data sets are memory data sets: {}\n"
                "ParallelRunner does not support output to externally created "
                "MemoryDataSets".format(sorted(memory_data_sets))
            )
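
The collect-then-report pattern used by this validator is easy to reuse outside Kedro: attempt to pickle each named object, gather the failures, and report them all at once rather than stopping at the first one. A minimal sketch (the function name is illustrative):

from multiprocessing.reduction import ForkingPickler
from pickle import PicklingError

def unpicklable_names(objects):
    """Return the names of objects that cannot cross a process boundary."""
    bad = []
    for name, obj in objects.items():
        try:
            ForkingPickler.dumps(obj)
        except (AttributeError, PicklingError):
            bad.append(name)
    return sorted(bad)

print(unpicklable_names({"numbers": [1, 2, 3], "callback": lambda x: x}))  # -> ['callback']
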
Example #11
    def test_objective_pickle(self):
        # can you pickle the objective function?
        pkl = pickle.dumps(self.objective)
        pickle.loads(pkl)

        # check the ForkingPickler as well.
        if hasattr(ForkingPickler, 'dumps'):
            pkl = ForkingPickler.dumps(self.objective)
            pickle.loads(pkl)

        # can you pickle with an extra function present?
        self.objective.lnprob_extra = lnprob_extra
        pkl = pickle.dumps(self.objective)
        pickle.loads(pkl)

        # check the ForkingPickler as well.
        if hasattr(ForkingPickler, 'dumps'):
            pkl = ForkingPickler.dumps(self.objective)
            pickle.loads(pkl)
Example #12
    def put(self, *args: T) -> None:
        """
        Put zero or more objects into the queue, contiguously.

        Raises BrokenPipeError if the receiving half has hung up.
        """
        if args:
            send = [ForkingPickler.dumps(arg) for arg in args]
            with self._write_lock:
                while send:
                    self._writer.send_bytes(send.pop(0))
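
Each argument is pickled first, and only the resulting bytes are written while the lock is held, which keeps the critical section short. The core of that pattern with a plain Pipe, as a self-contained sketch:

from multiprocessing import Pipe
from multiprocessing.reduction import ForkingPickler
import pickle

reader, writer = Pipe(duplex=False)

payload = {"answer": 42}
data = ForkingPickler.dumps(payload)          # serialize before touching any lock
writer.send_bytes(data)                       # ship the raw pickled bytes
print(pickle.loads(reader.recv_bytes()))      # -> {'answer': 42}
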
Example #13
    async def _start(self, loop):
        import inspect

        # Everything the child process needs: the module search path, this
        # object pickled with ForkingPickler, the parent's __main__ file, and
        # the multiprocessing authentication key.
        data = {
            'path': sys.path.copy(),
            'impl': bytes(ForkingPickler.dumps(self)),
            'main': inspect.getfile(sys.modules['__main__']),
            'authkey': bytes(current_process().authkey)
        }

        self.process = await asyncio.create_subprocess_exec(
            sys.executable,
            SUBPROCESS,
            stdin=asyncio.subprocess.PIPE,
            loop=loop)
        await self.process.communicate(pickle.dumps(data))
Example #14
import photutils
from multiprocessing.reduction import ForkingPickler

stars = photutils.psf.EPSFStars([1])

foo = ForkingPickler.loads(ForkingPickler.dumps(stars))

Example #15
 def test_serializable(self, hdf_data_set):
     ForkingPickler.dumps(hdf_data_set)
Example #16
 def dumps(cls, obj, protocol=4):
     return ForkingPickler.dumps(obj, protocol)
Example #17
def test_pickle():
    data_set = MLflowDataSet(dataset="pkl")
    ForkingPickler.dumps(data_set)