def __init__(self, func, makedict=1, nthreads=5, inq=None, outq=None, callback=None):
    """Sets the function you want to use for extraction and the number of threads.
    The function should take a PIL Image as input and return an array of feature
    values. You can then directly add filenames to be feature-extracted to self.inq.

    If 'makedict' is true (the default), then there will be an instance variable
    called 'featdict', which is populated with the feature values as they are
    computed. This is a dict mapping fnames (from the inq) to feature values. If
    you want to later use the getmultiple() function, 'makedict' needs to be true.
    If it is false, then you can access the output queue yourself by calling
    self.outq.get(). Note that if using multiple threads, outputs are not
    guaranteed to be in the same order as the inputs.

    You can optionally pass in the input and output queues; if you don't, they
    are created. You can optionally pass in a callback, which is called when a
    feature is extracted for an image, with args (fname, feats).
    """
    from Queue import Queue
    # set params
    self.func = func
    self.nthreads = nthreads
    if not inq:
        inq = Queue()
    if not outq:
        outq = Queue()
    self.inq, self.outq = inq, outq
    self.callback = callback
    # start the extraction threads, which read fnames from inq and write to outq
    self.featxs = spawnWorkers(self.nthreads, featxloop, args=(self.func, self.inq, self.outq, self.callback), interval=0.1)
    if makedict:
        # a separate thread drains outq into featdict as results arrive
        self.featdict = {}
        self.q2dictthread = spawnWorkers(1, q2dict, args=(self.outq, self.featdict))
    else:
        self.featdict = None
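# A minimal usage sketch for the class above (assumed here to be the
# FeatureExtractor used in gisttest() below); 'myfeats' and the filenames are
# illustrative, and featxloop is assumed to open each fname as a PIL Image:
def myfeats(im):
    im = im.convert('L').resize((8, 8))
    return list(im.getdata())

fx = FeatureExtractor(myfeats, makedict=1, nthreads=4)
for fname in ('a.jpg', 'b.jpg'):
    fx.inq.put(fname)
# once the worker threads have drained inq:
# print fx.featdict.get('a.jpg')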
def __init__(self, args, num=0, **kw):
    """Starts a set of processes, defined by num:
        if num is an int:
            > 0: that many procs
            <= 0: getNumCPUs() - num
        elif num is a float:
            that percentage of the cpus on this system
    Any additional kw args are passed to the initializer for Process().
    (These are the same as the inputs to subprocess.Popen().)
    """
    from threading import Lock
    from copy import deepcopy
    from Queue import Queue, LifoQueue
    from nkthreadutils import spawnWorkers
    from nkutils import parseNProcs
    self.nprocs = nprocs = parseNProcs(num)
    self.args = deepcopy(args)  # in case we alter them
    # spawn processes and associated locks and working queues
    self.procs = [Process(args, **kw) for i in range(nprocs)]
    self.proclocks = [Lock() for p in self.procs]
    self.working = [LifoQueue() for p in self.procs]
    # instance vars to track inputs and results
    self.inq = Queue()
    self.results = {}
    self.resultlock = Lock()
    # spawn worker threads to feed inputs and collect outputs
    self.inloop = spawnWorkers(1, self.inputloop)[0]
    self.outloop = spawnWorkers(1, self.outputloop)[0]
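# The num semantics above come from nkutils.parseNProcs; a plausible sketch per
# the docstring (the real implementation may differ; getNumCPUs() is assumed to
# be available from the same utils):
def _parseNProcs_sketch(num):
    ncpus = getNumCPUs()
    if isinstance(num, float):
        return max(1, int(ncpus * num))  # a float is a fraction of the CPUs
    if num > 0:
        return num                       # a positive int is an explicit count
    return ncpus - num                   # <= 0: relative to the CPU count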
def gisttest(modelfname, maxsize=None):
    """Runs gist classification on all fnames passed in through stdin."""
    from nktrainutils import readSVMModelAndParams, bulkclassify
    from nkutils import stdmainloop
    # read the model
    model = readSVMModelAndParams(modelfname)
    assert model.scales
    fx = FeatureExtractor(lambda im: gistfeatures(im, maxsize=maxsize), makedict=0, nthreads=11)
    # set up the classification, in other threads
    def classifyfunc(fx=fx, model=model, timeout=1):
        from nkthreadutils import feastOnQueue
        while 1:
            t = time.time()
            els = [(fname, feats) for fname, feats in feastOnQueue(fx.outq, timeout) if feats is not None]
            if els:
                fnames, feats = zip(*els)
                scores = [l*v for l, v in bulkclassify(model, feats)]
                for score, fname in zip(scores, fnames):
                    print '%s\t%s' % (score, fname)
            elapsed = time.time() - t
            if els:
                print >>sys.stderr, 'Classified %d els in %0.3fs, each of len %d' % (len(feats), elapsed, len(feats[0]))
            time.sleep(max(0, timeout - elapsed))
    clsworkers = spawnWorkers(5, classifyfunc)
    # start reading filenames from stdin
    stdmainloop(lambda fname: fx.inq.put(fname))
    time.sleep(10)  # give the classification threads time to drain the queues
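# classifyfunc above leans on nkthreadutils.feastOnQueue; a plausible sketch of
# its behavior, assuming it blocks up to 'timeout' for a first element and then
# drains whatever else is already queued (the real implementation may differ):
def _feastOnQueue_sketch(q, timeout):
    from Queue import Empty
    els = []
    try:
        els.append(q.get(timeout=timeout))
        while 1:
            els.append(q.get_nowait())
    except Empty:
        pass
    return els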
def ipca_threaded(means, datacallback, keep=-1, dtype=float64, init=None, sonly=0, sparse=0):
    """Iterative PCA, which builds up the scatter matrix iteratively.
    This is a threaded version, which is faster if the datacallback takes some time.
    This iterates over datacallback repeatedly to get chunks of data. Each chunk C
    should be of size ndims x chunk_size (where chunk_size can vary between chunks).
    A scatter matrix is computed by first subtracting means and then taking C * C.T.
    This is added to a running total scatter matrix S. Finally, when the
    datacallback is exhausted (e.g., end of generator or list), PCA is run on S
    and the eigenvalues are sorted. Returns (eigvals, eigvecs).

    'keep' determines how many dimensions to keep:
        <= 0: all dimensions (default)
        0.0 < keep <= 1.0: given percentage of total variance (must be a float)
        >= 1: given number of dimensions (must be an int)
    In all cases, at most len(means) dimensions will be kept.

    You can also pass in an S matrix as initialization (otherwise zeros() is used).
    If sonly is true, then only the complete S matrix is returned.
    """
    t1 = time.time()
    ndims = max(means.shape)
    means = means.reshape((ndims, 1))
    if init is not None:  # 'if init:' is ambiguous for numpy arrays
        assert init.shape[0] == ndims == init.shape[1], 'Given init must have shape (%d, %d) but had shape %s' % (ndims, ndims, init.shape)
        s = init
    else:
        s = zeros((ndims, ndims), dtype=dtype)
    t2 = time.time()
    log(' Done initializing S matrix of shape %s in %0.3f secs...' % (s.shape, t2-t1))
    tot = 0
    num = 0
    from Queue import Queue
    qsize = 5
    q = Queue(qsize)
    from nkthreadutils import spawnWorkers
    def loadq(q, callback):
        # fetch and mean-subtract chunks in a separate thread, so data loading
        # overlaps with the scatter-matrix updates below
        for d in callback:
            d -= means
            q.put(d)
        q.put(None)  # sentinel: no more data
    datathread = spawnWorkers(1, loadq, args=(q, datacallback))[0]
    i = 0
    while 1:
        d = q.get()
        i += 1
        if d is None: break
        num += d.shape[1]
        t3 = time.time()
        s += dot(d, d.T)
        tot += time.time()-t3
        #log(' On iteration %d of datacallback and got %d total elements, added in %0.3f secs' % (i, num, tot))
    if sonly:
        return s
    if sparse:
        assert isinstance(keep, int), 'sparse eigendecomposition needs an integer keep'
        values, vecs = sparseeigs(s, keep)
    else:
        values, vecs = sortedeigs(s, dtype=dtype)
    return prunedims(values, vecs, keep)
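# Usage sketch for ipca_threaded: chunk shapes follow the docstring
# (ndims x chunk_size). The random data and sizes are purely illustrative; in
# practice 'means' would be the per-dimension means of the real data.
from numpy import random

def _chunk_gen(ndims=16, nchunks=10, chunksize=100):
    for i in range(nchunks):
        yield random.randn(ndims, chunksize)

_means = random.randn(16, 1)
_eigvals, _eigvecs = ipca_threaded(_means, _chunk_gen(), keep=0.95)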
    The q should contain elements like (func, args, kw, timestamp). If a
    callback is given, then it's executed with the same args when we're about
    to add."""
    while 1:
        try:
            el = dbdelayq.get()
            func, args, kw, timestamp = el[:4]
            # track how far behind we're running
            delay = time.time() - timestamp
            delays.append(delay)
            if len(delays) > MAX_DELAYS:
                delays.pop(0)
            if callback:
                callback(func, args, kw, timestamp)
            retrysql(func, nretries, *args, **kw)
        except Exception, e:
            pass  # swallow errors so the exec loop never dies

spawnWorkers(nworkers, dbdelayexec)

def addDelayedSQL(meth, *args, **kw):
    """Adds a sql statement to execute delayed.
    If you want it to execute immediately (synchronously), you can send
    immediate=1. Note that for debugging, we can easily turn all delayed
    entries into immediate ones."""
    #TODO perhaps add a backoff sleep() if the lagtime becomes too long
    if debug or ('immediate' in kw and kw['immediate']):
        if 'immediate' in kw:
            del kw['immediate']
        retrysql(meth, nretries, *args, **kw)
    else:
        dbdelayq.put((meth, args, kw, time.time()))
        if penalty > 0:
            d = delays[-1]
            p = penalty * d
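# Usage sketch (assumes the enclosing setup bound dbdelayq, retrysql, debug,
# and penalty as used above, and a DB-API-style cur.execute; the SQL is made up):
#   addDelayedSQL(cur.execute, 'INSERT INTO log (msg) VALUES (%s)', ('hi',))
#   # force synchronous execution (also what happens whenever debug is set):
#   addDelayedSQL(cur.execute, 'DELETE FROM cache WHERE id=%s', (42,), immediate=1)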