Example #1
0
    def validate(self, x, y, classifier_kwargs=None, extra=None, parallel=True):
        '''
        Runs crossvalidation on the provided data.  The length of the :py:obj:`x` array should be identical to :py:obj:`y`
        and will be used to partition the lists by index.

        :param x: observations, needs to implement __len__ and __getitem__ aka len(x), x[i]
        :param y: expected output, needs to implement __getitem__, aka y[i]
        :param classifier_kwargs: a dictionary of parameters to pass to the classifier; its entries
            are layered on top of a deep copy of ``self.classifier_kwargs`` for this call only
        :param extra: optional extra information to pull out of the classifier, given as a string,
            a function or a method; a method is replaced by its ``__name__``, which must be an
            attribute of ``self.classifier_cls``
        :param parallel: when true, ``self`` is passed to ``farmout`` as ``pickletest``, otherwise
            ``False`` is passed (presumably this toggles parallel execution -- confirm against
            ``farmout``)
        :returns: a ``ValidationResult`` built from three values: the list of non-``None`` first
            elements of the per-fold results (``None`` if there were none), the scorer instance
            after every fold has been appended to it, and the list of non-``None`` second elements
            of the per-fold results (``None`` if there were none)
        :raises ValueError: if :py:obj:`extra` is neither ``None``, a string, a function nor a method
        '''
        if extra is not None and not isinstance(
                extra, (str, types.MethodType, types.FunctionType)):
            raise ValueError('the `extra\' argument takes either a string or a method.')

        if isinstance(extra, types.MethodType):
            # only the name travels with the job; the classifier class must provide it
            extra = extra.__name__
            assert hasattr(self.classifier_cls, extra)

        log = logging.getLogger(PYXVAL_LOGGER)
        # lazy %-args: the message is only formatted when debug logging is enabled
        log.debug('beginning %d-fold crossvalidation', self.folds)

        partition = CrossValidator.__partition(len(x), self.folds)

        # never mutate the instance-level defaults: work on a private copy
        kwargs = deepcopy(self.classifier_kwargs)
        if classifier_kwargs is not None:
            kwargs.update(classifier_kwargs)

        results = farmout(
            num=self.folds,
            # a fresh classifier instance is constructed for every fold
            setup=lambda f: (_folder, f, partition, x, y, self.classifier_cls(**kwargs), extra),
            worker=farmworker,
            # a fold result is recognised as a 5-tuple
            isresult=lambda r: isinstance(r, tuple) and len(r) == 5,
            attempts=3,
            pickletest=self if parallel else False
        )

        stats = self.scorer_cls(**self.scorer_kwargs)
        lret = []
        xtra = []

        # distinct loop names so the `x` parameter is not rebound by the unpacking
        # NOTE(review): t, p, w are presumably truth, predictions and weights -- confirm
        # against `_folder`
        for fold_l, fold_x, t, p, w in results:
            if fold_l is not None:
                lret.append(fold_l)
            if fold_x is not None:
                xtra.append(fold_x)
            stats.append(t, p, w)

        log.debug('finished %d-fold crossvalidation, performance stats: %s', self.folds, stats)

        # empty collections are reported as None
        return ValidationResult(lret or None, stats, xtra or None)
Example #2
0
    def map(self, argslist, globalvars=None, quiet=True):
        '''
        Runs the batch file once for every entry of :py:obj:`argslist` and collects the results.

        When ``self._mpi`` is true, a master script is generated and run through
        ``_runhyphympi``. The script sends jobs to idle MPI nodes, stores each returned value
        under its job index and finally prints the values as one bracketed, comma-separated
        list; that output is split into one string per top-level element. Otherwise the jobs
        are dispatched through ``farmout`` with ``_jobdispatch``.

        :param argslist: per-job arguments, needs to implement __len__ and __getitem__
        :param globalvars: values handed to ``_globalvars``; defaults to an empty dictionary
        :param quiet: forwarded to ``_jobdispatch`` in the multiprocessing branch; not used in
            the MPI branch
        :returns: in the MPI branch, a list with one string per top-level element of the printed
            array; otherwise a single list chaining the per-job result lists
        :raises RuntimeError: in the MPI branch, if anything was reported on the error stream
            or if no output was produced at all
        '''
        if globalvars is None:
            globalvars = {}
        numjobs = len(argslist)
        if self._mpi:
            # message passing interface
            # WARNING: the quoting inside this template is depth-sensitive. Every `"{n}`
            # marker is expanded below into a quote preceded by 2**n - 1 backslashes, one
            # level per nesting of string-inside-string. Do not alter the markers.
            # NOTE(review): nothing in the script sets `_nodestates[ _node ][ 0 ]` back to 0
            # after a result is received, so it looks like each node is handed at most one
            # job -- confirm this is intended when there are more jobs than nodes.
            cmds = dedent('''\
            GLOBAL_FPRINTF_REDIRECT = "/dev/null";
            _job = 0;
            %(jobopts)s
            _jobvals = {};
            _nodestates = { MPI_NODE_COUNT-1, 2 };
            _received = 0;
            while ( _received < %(numjobs)d ) {
                _recvjob = 1;
                if ( _job < %(numjobs)d ) {
                    for ( _node = 0; _node < MPI_NODE_COUNT-1; _node += 1 ) {
                        if ( _nodestates[ _node ][ 0 ] == 0 ) {
                            _recvjob = 0;
                            break;
                        }
                    }
                    if ( ! _recvjob ) {
                        _mpicmds = "";
                        _mpicmds * 256;
                        _mpicmds * ( "%(globalvars)s" );
                        _mpicmds * ( "MESSAGE_LOGGING = 0;" );
                        _mpicmds * ( "_options = " + _jobopts[ _job ] + ";" );
                        _mpicmds * ( "ExecuteAFile( "{1}%(batchfile)s"{1}, _options );" );
                        _mpicmds * ( "_retstr = "{1}"{1};" );
                        _mpicmds * ( "_retstr * 128;" );
                        _mpicmds * ( "_retstr * ( "{1}_retjob = "{1} + " + _job + " + "{1};"{1} );" );
                        _mpicmds * ( "_retstr * ( "{1}_retval = "{2}"{1} + ( %(retvar)s ^ {{ "{1}"{2}"{1}, "{1}"{3}"{1} }} ) + "{1}"{2};"{1} );" );
                        _mpicmds * ( "_retstr * 0;" );
                        _mpicmds * ( "return _retstr;" );
                        _mpicmds * 0;
                        MPISend( _node + 1, _mpicmds );
                        _nodestates[ _node ][ 0 ] = 1;
                        _nodestates[ _node ][ 1 ] = Time(0);
                        _job += 1;
                    }
                }
                if ( _recvjob ) {
                    MPIReceive( -1, _null, _retstr );
                    ExecuteCommands( _retstr );
                    _jobvals[ _retjob ] = _retval;
                    _received += 1;
                }
            }
            GLOBAL_FPRINTF_REDIRECT = "";
            fprintf( stdout, "[" + _jobvals[ 0 ] );
            for ( _job = 1; _job < %(numjobs)d; _job += 1 ) {
                fprintf ( stdout, "," + _jobvals[ _job ] );
            }
            fprintf( stdout, "]" );
            ''') % {
                'batchfile': self._batchfile,
                # the global definitions are embedded in a string literal of the script
                'globalvars': _globalvars(globalvars).replace('\n', '\\n').replace('"', '\\"'),
                'numjobs': numjobs,
                'retvar': self._retvar,
                'jobopts': _jobopts(argslist),
            }

            # this facility helps us write code that doesn't break
            # by allowing us to escape quotations appropriately:
            # by depth
            depths = {int(m.group(1)) for m in finditer(r'"{(\d+)}', cmds)}
            for depth in depths:
                cmds = cmds.replace('"{%d}' % depth, '\\' * (2 ** depth - 1) + '"')

            # the return code is not inspected; failures are detected via the error stream
            _, pout, perr = _runhyphympi(cmds)

            if perr != '':
                raise RuntimeError(perr)

            # tolerate surrounding whitespace so the first character really is the
            # opening bracket and the last one the closing bracket
            pout = pout.strip()
            if not pout:
                # previously this case surfaced as an unbound loop variable
                raise RuntimeError('MPI run produced no output')

            # minimal splitter for the outermost json-esque array printed by the
            # script above; start at 1 because the first char is a '['
            start = 1
            nesting = 0
            retarr = []
            for pos, char in enumerate(pout):
                if char == '[':
                    nesting += 1
                elif char == ']':
                    nesting -= 1
                elif char == ',' and nesting == 1:
                    # a comma at nesting 1 separates elements of the outer list
                    retarr.append(pout[start:pos])
                    start = pos + 1

            # don't forget the final piece: with balanced brackets nesting is 0 and the
            # closing ']' is dropped; for every bracket left open the slice end moves
            # one character to the right
            retarr.append(pout[start:(pos + nesting)])

            return retarr
        else:
            # multiprocessing
            globaldefs = _globalvars(globalvars)
            results = farmout(
                num=numjobs,
                setup=lambda i: (
                    _jobdispatch,
                    self._batchfile,
                    self._retvar,
                    globaldefs,
                    (argslist[i],),
                    quiet
                ),
                worker=farmworker,
                # a job result is recognised as a list
                isresult=lambda r: isinstance(r, list),
                attempts=3
            )
            # flatten the per-job lists into a single list
            return list(chain.from_iterable(results))
Example #3
0
    def map(self, argslist, globalvars=None, quiet=True):
        '''
        Runs the batch file once for every entry of :py:obj:`argslist` and collects the results.

        When ``self._mpi`` is true, a master script is generated and run through
        ``_runhyphympi``. The script sends jobs to idle MPI nodes, stores each returned value
        under its job index and finally prints the values as one bracketed, comma-separated
        list; that output is split into one string per top-level element. Otherwise the jobs
        are dispatched through ``farmout`` with ``_jobdispatch``.

        :param argslist: per-job arguments, needs to implement __len__ and __getitem__
        :param globalvars: values handed to ``_globalvars``; defaults to an empty dictionary
        :param quiet: forwarded to ``_jobdispatch`` in the multiprocessing branch; not used in
            the MPI branch
        :returns: in the MPI branch, a list with one string per top-level element of the printed
            array; otherwise a single list chaining the per-job result lists
        :raises RuntimeError: in the MPI branch, if anything was reported on the error stream
            or if no output was produced at all
        '''
        if globalvars is None:
            globalvars = {}
        numjobs = len(argslist)
        if self._mpi:
            # message passing interface
            # WARNING: the quoting inside this template is depth-sensitive. Every `"{n}`
            # marker is expanded below into a quote preceded by 2**n - 1 backslashes, one
            # level per nesting of string-inside-string. Do not alter the markers.
            # NOTE(review): nothing in the script sets `_nodestates[ _node ][ 0 ]` back to 0
            # after a result is received, so it looks like each node is handed at most one
            # job -- confirm this is intended when there are more jobs than nodes.
            cmds = dedent('''\
            GLOBAL_FPRINTF_REDIRECT = "/dev/null";
            _job = 0;
            %(jobopts)s
            _jobvals = {};
            _nodestates = { MPI_NODE_COUNT-1, 2 };
            _received = 0;
            while ( _received < %(numjobs)d ) {
                _recvjob = 1;
                if ( _job < %(numjobs)d ) {
                    for ( _node = 0; _node < MPI_NODE_COUNT-1; _node += 1 ) {
                        if ( _nodestates[ _node ][ 0 ] == 0 ) {
                            _recvjob = 0;
                            break;
                        }
                    }
                    if ( ! _recvjob ) {
                        _mpicmds = "";
                        _mpicmds * 256;
                        _mpicmds * ( "%(globalvars)s" );
                        _mpicmds * ( "MESSAGE_LOGGING = 0;" );
                        _mpicmds * ( "_options = " + _jobopts[ _job ] + ";" );
                        _mpicmds * ( "ExecuteAFile( "{1}%(batchfile)s"{1}, _options );" );
                        _mpicmds * ( "_retstr = "{1}"{1};" );
                        _mpicmds * ( "_retstr * 128;" );
                        _mpicmds * ( "_retstr * ( "{1}_retjob = "{1} + " + _job + " + "{1};"{1} );" );
                        _mpicmds * ( "_retstr * ( "{1}_retval = "{2}"{1} + ( %(retvar)s ^ {{ "{1}"{2}"{1}, "{1}"{3}"{1} }} ) + "{1}"{2};"{1} );" );
                        _mpicmds * ( "_retstr * 0;" );
                        _mpicmds * ( "return _retstr;" );
                        _mpicmds * 0;
                        MPISend( _node + 1, _mpicmds );
                        _nodestates[ _node ][ 0 ] = 1;
                        _nodestates[ _node ][ 1 ] = Time(0);
                        _job += 1;
                    }
                }
                if ( _recvjob ) {
                    MPIReceive( -1, _null, _retstr );
                    ExecuteCommands( _retstr );
                    _jobvals[ _retjob ] = _retval;
                    _received += 1;
                }
            }
            GLOBAL_FPRINTF_REDIRECT = "";
            fprintf( stdout, "[" + _jobvals[ 0 ] );
            for ( _job = 1; _job < %(numjobs)d; _job += 1 ) {
                fprintf ( stdout, "," + _jobvals[ _job ] );
            }
            fprintf( stdout, "]" );
            ''') % {
                'batchfile': self._batchfile,
                # the global definitions are embedded in a string literal of the script
                'globalvars': _globalvars(globalvars).replace('\n', '\\n').replace('"', '\\"'),
                'numjobs': numjobs,
                'retvar': self._retvar,
                'jobopts': _jobopts(argslist),
            }

            # this facility helps us write code that doesn't break
            # by allowing us to escape quotations appropriately:
            # by depth
            depths = {int(m.group(1)) for m in finditer(r'"{(\d+)}', cmds)}
            for depth in depths:
                cmds = cmds.replace('"{%d}' % depth, '\\' * (2**depth - 1) + '"')

            # the return code is not inspected; failures are detected via the error stream
            _, pout, perr = _runhyphympi(cmds)

            if perr != '':
                raise RuntimeError(perr)

            # tolerate surrounding whitespace so the first character really is the
            # opening bracket and the last one the closing bracket
            pout = pout.strip()
            if not pout:
                # previously this case surfaced as an unbound loop variable
                raise RuntimeError('MPI run produced no output')

            # minimal splitter for the outermost json-esque array printed by the
            # script above; start at 1 because the first char is a '['
            start = 1
            nesting = 0
            retarr = []
            for pos, char in enumerate(pout):
                if char == '[':
                    nesting += 1
                elif char == ']':
                    nesting -= 1
                elif char == ',' and nesting == 1:
                    # a comma at nesting 1 separates elements of the outer list
                    retarr.append(pout[start:pos])
                    start = pos + 1

            # don't forget the final piece: with balanced brackets nesting is 0 and the
            # closing ']' is dropped; for every bracket left open the slice end moves
            # one character to the right
            retarr.append(pout[start:(pos + nesting)])

            return retarr
        else:
            # multiprocessing
            globaldefs = _globalvars(globalvars)
            results = farmout(
                num=numjobs,
                setup=lambda i:
                (_jobdispatch, self._batchfile, self._retvar, globaldefs,
                 (argslist[i], ), quiet),
                worker=farmworker,
                # a job result is recognised as a list
                isresult=lambda r: isinstance(r, list),
                attempts=3)
            # flatten the per-job lists into a single list
            return list(chain.from_iterable(results))