def validate(self, x, y, classifier_kwargs=None, extra=None, parallel=True):
    '''
    Runs crossvalidation on the provided data.

    The length of the :py:obj:`x` array should be identical to :py:obj:`y`
    and will be used to partition the lists by index.

    :param x: observations, needs to implement __len__ and __getitem__
        aka len(x), x[i]
    :param y: expected output, needs to implement __getitem__, aka y[i]
    :param classifier_kwargs: a dictionary of parameters to pass to the
        classifier; it is merged over a deep copy of
        ``self.classifier_kwargs``, so the instance defaults are never
        modified
    :param extra: optional extra information to pull out of the classifier,
        given as a string, a bound method (replaced by its ``__name__``,
        which must exist on ``self.classifier_cls``) or a plain function
        (handed to the fold worker unchanged)
    :param parallel: when true, ``self`` is handed to ``farmout`` as its
        ``pickletest`` argument, otherwise ``False`` is
    :returns: a ``ValidationResult`` built from three values: the list of
        non-``None`` first elements of the per-fold result tuples (``None``
        if there are none), the scorer after every fold has been appended
        to it, and the list of non-``None`` second elements of the per-fold
        result tuples (``None`` if there are none)
    :raises ValueError: if `extra` is neither a string, a method nor a
        function
    '''
    if extra is not None:
        accepted = (str, types.MethodType, types.FunctionType)
        if not isinstance(extra, accepted):
            raise ValueError('the `extra\' argument takes either a string or a method.')
        if isinstance(extra, types.MethodType):
            # bound methods are forwarded by name so that each fold can look
            # the attribute up on its own classifier instance
            extra = extra.__name__
            assert hasattr(self.classifier_cls, extra)

    log = logging.getLogger(PYXVAL_LOGGER)
    log.debug('beginning %d-fold crossvalidation' % self.folds)

    partition = CrossValidator.__partition(len(x), self.folds)
    # log.debug('partition assignments: %s' % str(partition))

    # never touch the instance-level defaults: work on a private copy
    kwargs = deepcopy(self.classifier_kwargs)
    if classifier_kwargs is not None:
        kwargs.update(classifier_kwargs)

    results = farmout(
        num=self.folds,
        setup=lambda f: (_folder, f, partition, x, y, self.classifier_cls(**kwargs), extra),
        worker=farmworker,
        isresult=lambda r: isinstance(r, tuple) and len(r) == 5,
        attempts=3,
        pickletest=self if parallel else False
    )

    stats = self.scorer_cls(**self.scorer_kwargs)
    lret = []
    xtra = []

    # Each fold yields a 5-tuple. The loop variables deliberately do NOT
    # reuse the names `x`/`y`: the `setup` lambda above closes over them and
    # would observe a rebinding if `farmout` evaluated it lazily.
    # NOTE(review): the last three elements look like (truth, predictions,
    # weights) -- confirm against `_folder` and the scorer's `append`.
    for fold_l, fold_x, t, p, w in results:
        if fold_l is not None:
            lret.append(fold_l)
        if fold_x is not None:
            xtra.append(fold_x)
        stats.append(t, p, w)

    log.debug('finished %d-fold crossvalidation, performance stats: %s' % (self.folds, stats))

    return ValidationResult(
        lret if len(lret) else None,
        stats,
        xtra if len(xtra) else None
    )
def map(self, argslist, globalvars=None, quiet=True):
    '''
    Runs the batch file once for every entry of `argslist` and collects the
    value of the return variable of each run.

    :param argslist: sequence of per-job arguments, needs to implement
        __len__ and __getitem__; one job is run per entry
    :param globalvars: optional dictionary of global variables, rendered
        with ``_globalvars`` and made available to every job
    :param quiet: forwarded to ``_jobdispatch`` in the multiprocessing
        path; not used in the MPI path
    :returns: in the MPI path, a list of strings obtained by splitting the
        bracketed array printed by the generated script at its top-level
        commas; in the multiprocessing path, the concatenation of the lists
        returned by the individual jobs
    :raises RuntimeError: in the MPI path, if the child process wrote
        anything to stderr or produced no output at all
    '''
    if globalvars is None:
        globalvars = {}
    numjobs = len(argslist)

    if not self._mpi:
        # multiprocessing
        hyphy_globals = _globalvars(globalvars)
        results = farmout(
            num=numjobs,
            setup=lambda i: (
                _jobdispatch,
                self._batchfile,
                self._retvar,
                hyphy_globals,
                (argslist[i],),
                quiet
            ),
            worker=farmworker,
            isresult=lambda r: isinstance(r, list),
            attempts=3
        )
        return list(chain.from_iterable(results))

    # message passing interface
    # Do NOT change the number of quotation marks or backslashes below:
    # every "{N} placeholder stands for a quotation mark nested N string
    # levels deep and is expanded to the right amount of escaping further
    # down.
    # NOTE(review): the script marks a node as busy when a job is sent but
    # nothing visible here marks it idle again after MPIReceive -- confirm
    # that more jobs than worker nodes are handled as intended.
    cmds = dedent('''\
        GLOBAL_FPRINTF_REDIRECT = "/dev/null";
        _job = 0;
        %(jobopts)s
        _jobvals = {};
        _nodestates = { MPI_NODE_COUNT-1, 2 };
        _received = 0;
        while ( _received < %(numjobs)d ) {
            _recvjob = 1;
            if ( _job < %(numjobs)d ) {
                for ( _node = 0; _node < MPI_NODE_COUNT-1; _node += 1 ) {
                    if ( _nodestates[ _node ][ 0 ] == 0 ) {
                        _recvjob = 0;
                        break;
                    }
                }
                if ( ! _recvjob ) {
                    _mpicmds = "";
                    _mpicmds * 256;
                    _mpicmds * ( "%(globalvars)s" );
                    _mpicmds * ( "MESSAGE_LOGGING = 0;" );
                    _mpicmds * ( "_options = " + _jobopts[ _job ] + ";" );
                    _mpicmds * ( "ExecuteAFile( "{1}%(batchfile)s"{1}, _options );" );
                    _mpicmds * ( "_retstr = "{1}"{1};" );
                    _mpicmds * ( "_retstr * 128;" );
                    _mpicmds * ( "_retstr * ( "{1}_retjob = "{1} + " + _job + " + "{1};"{1} );" );
                    _mpicmds * ( "_retstr * ( "{1}_retval = "{2}"{1} + ( %(retvar)s ^ {{ "{1}"{2}"{1}, "{1}"{3}"{1} }} ) + "{1}"{2};"{1} );" );
                    _mpicmds * ( "_retstr * 0;" );
                    _mpicmds * ( "return _retstr;" );
                    _mpicmds * 0;
                    MPISend( _node + 1, _mpicmds );
                    _nodestates[ _node ][ 0 ] = 1;
                    _nodestates[ _node ][ 1 ] = Time(0);
                    _job += 1;
                }
            }
            if ( _recvjob ) {
                MPIReceive( -1, _null, _retstr );
                ExecuteCommands( _retstr );
                _jobvals[ _retjob ] = _retval;
                _received += 1;
            }
        }
        GLOBAL_FPRINTF_REDIRECT = "";
        fprintf( stdout, "[" + _jobvals[ 0 ] );
        for ( _job = 1; _job < %(numjobs)d; _job += 1 ) {
            fprintf ( stdout, "," + _jobvals[ _job ] );
        }
        fprintf( stdout, "]" );
        ''') % {
        'batchfile': self._batchfile,
        'globalvars': _globalvars(globalvars).replace('\n', '\\n').replace('"', '\\"'),
        'numjobs': numjobs,
        'retvar': self._retvar,
        'jobopts': _jobopts(argslist),
    }

    # this facility helps us write code that doesn't break
    # by allowing us to escape quotations appropriately:
    # by depth -- a quote at depth N is preceded by 2**N - 1 backslashes
    digits = {int(m.group(1)) for m in finditer(r'"{(\d+)}', cmds)}
    for nslash in digits:
        pwr = 2 ** nslash - 1
        cmds = cmds.replace('"{%d}' % nslash, ('\\' * pwr) + '"')

    # the return code is not inspected; failure is detected through stderr
    _, pout, perr = _runhyphympi(cmds)

    if perr != '':
        raise RuntimeError(perr)

    # without any output there is nothing to split, and the loop below
    # would leave `j` unbound
    if not pout:
        raise RuntimeError('HYPHYMPI produced no output to parse')

    # this is a hideous parser for the outermost
    # json-esque array that the script above outputs
    # start at 1 because the first char is a '['
    i = 1
    nb = 0
    retarr = []
    for j, c in enumerate(pout):
        # every time we encounter a '[', increment nb
        if c == '[':
            nb += 1
        # likewise, if we encounter a closing bracket, decrement
        if c == ']':
            nb -= 1
        # if we're at a comma and nb == 1, we're in the outer list
        # so split from the last time we split to j-1
        if c == ',' and nb == 1:
            retarr.append(pout[i:j])
            i = j + 1
    # don't forget the final piece; for balanced output nb is 0 here and
    # the slice stops just before the closing ']'
    retarr.append(pout[i:(j + nb)])
    return retarr
def map(self, argslist, globalvars=None, quiet=True):
    '''
    Runs the batch file once for every entry of `argslist` and collects the
    value of the return variable of each run.

    :param argslist: sequence of per-job arguments, needs to implement
        __len__ and __getitem__; one job is run per entry
    :param globalvars: optional dictionary of global variables, rendered
        with ``_globalvars`` and made available to every job
    :param quiet: forwarded to ``_jobdispatch`` in the multiprocessing
        path; not used in the MPI path
    :returns: in the MPI path, a list of strings obtained by splitting the
        bracketed array printed by the generated script at its top-level
        commas; in the multiprocessing path, the concatenation of the lists
        returned by the individual jobs
    :raises RuntimeError: in the MPI path, if the child process wrote
        anything to stderr or produced no output at all
    '''
    if globalvars is None:
        globalvars = {}
    numjobs = len(argslist)

    if not self._mpi:
        # multiprocessing
        hyphy_globals = _globalvars(globalvars)
        results = farmout(
            num=numjobs,
            setup=lambda i: (
                _jobdispatch,
                self._batchfile,
                self._retvar,
                hyphy_globals,
                (argslist[i],),
                quiet
            ),
            worker=farmworker,
            isresult=lambda r: isinstance(r, list),
            attempts=3
        )
        return list(chain.from_iterable(results))

    # message passing interface
    # Do NOT change the number of quotation marks or backslashes below:
    # every "{N} placeholder stands for a quotation mark nested N string
    # levels deep and is expanded to the right amount of escaping further
    # down.
    # NOTE(review): the script marks a node as busy when a job is sent but
    # nothing visible here marks it idle again after MPIReceive -- confirm
    # that more jobs than worker nodes are handled as intended.
    cmds = dedent('''\
        GLOBAL_FPRINTF_REDIRECT = "/dev/null";
        _job = 0;
        %(jobopts)s
        _jobvals = {};
        _nodestates = { MPI_NODE_COUNT-1, 2 };
        _received = 0;
        while ( _received < %(numjobs)d ) {
            _recvjob = 1;
            if ( _job < %(numjobs)d ) {
                for ( _node = 0; _node < MPI_NODE_COUNT-1; _node += 1 ) {
                    if ( _nodestates[ _node ][ 0 ] == 0 ) {
                        _recvjob = 0;
                        break;
                    }
                }
                if ( ! _recvjob ) {
                    _mpicmds = "";
                    _mpicmds * 256;
                    _mpicmds * ( "%(globalvars)s" );
                    _mpicmds * ( "MESSAGE_LOGGING = 0;" );
                    _mpicmds * ( "_options = " + _jobopts[ _job ] + ";" );
                    _mpicmds * ( "ExecuteAFile( "{1}%(batchfile)s"{1}, _options );" );
                    _mpicmds * ( "_retstr = "{1}"{1};" );
                    _mpicmds * ( "_retstr * 128;" );
                    _mpicmds * ( "_retstr * ( "{1}_retjob = "{1} + " + _job + " + "{1};"{1} );" );
                    _mpicmds * ( "_retstr * ( "{1}_retval = "{2}"{1} + ( %(retvar)s ^ {{ "{1}"{2}"{1}, "{1}"{3}"{1} }} ) + "{1}"{2};"{1} );" );
                    _mpicmds * ( "_retstr * 0;" );
                    _mpicmds * ( "return _retstr;" );
                    _mpicmds * 0;
                    MPISend( _node + 1, _mpicmds );
                    _nodestates[ _node ][ 0 ] = 1;
                    _nodestates[ _node ][ 1 ] = Time(0);
                    _job += 1;
                }
            }
            if ( _recvjob ) {
                MPIReceive( -1, _null, _retstr );
                ExecuteCommands( _retstr );
                _jobvals[ _retjob ] = _retval;
                _received += 1;
            }
        }
        GLOBAL_FPRINTF_REDIRECT = "";
        fprintf( stdout, "[" + _jobvals[ 0 ] );
        for ( _job = 1; _job < %(numjobs)d; _job += 1 ) {
            fprintf ( stdout, "," + _jobvals[ _job ] );
        }
        fprintf( stdout, "]" );
        ''') % {
        'batchfile': self._batchfile,
        'globalvars': _globalvars(globalvars).replace('\n', '\\n').replace('"', '\\"'),
        'numjobs': numjobs,
        'retvar': self._retvar,
        'jobopts': _jobopts(argslist),
    }

    # this facility helps us write code that doesn't break
    # by allowing us to escape quotations appropriately:
    # by depth -- a quote at depth N is preceded by 2**N - 1 backslashes
    digits = {int(m.group(1)) for m in finditer(r'"{(\d+)}', cmds)}
    for nslash in digits:
        pwr = 2 ** nslash - 1
        cmds = cmds.replace('"{%d}' % nslash, ('\\' * pwr) + '"')

    # the return code is not inspected; failure is detected through stderr
    _, pout, perr = _runhyphympi(cmds)

    if perr != '':
        raise RuntimeError(perr)

    # without any output there is nothing to split, and the loop below
    # would leave `j` unbound
    if not pout:
        raise RuntimeError('HYPHYMPI produced no output to parse')

    # this is a hideous parser for the outermost
    # json-esque array that the script above outputs
    # start at 1 because the first char is a '['
    i = 1
    nb = 0
    retarr = []
    for j, c in enumerate(pout):
        # every time we encounter a '[', increment nb
        if c == '[':
            nb += 1
        # likewise, if we encounter a closing bracket, decrement
        if c == ']':
            nb -= 1
        # if we're at a comma and nb == 1, we're in the outer list
        # so split from the last time we split to j-1
        if c == ',' and nb == 1:
            retarr.append(pout[i:j])
            i = j + 1
    # don't forget the final piece; for balanced output nb is 0 here and
    # the slice stops just before the closing ']'
    retarr.append(pout[i:(j + nb)])
    return retarr