def readresults(self, out):
    """Parses a complete set of results"""
    c = self.config
    p = self.progress
    params = out['params']
    # first find the relevant "done" struct, if it exists, else add one
    found = 0
    for d in p['done']:
        if d['params'] == params:
            found = 1
            break
    if not found:
        d = dict(params=params, scores={}, elapsed=0, best={})
        p['done'].append(d)
    # now copy info from it
    d['elapsed'] += out['elapsed']
    p['elapsed'] += out['elapsed']
    try:
        self.parseresult(out, d)
        d['end'] = time.time()
        log('Read results for %d params %s, with best %s, and global best %s' % (len(d['params']), self.paramstr(d['params']), d['best']['score'], p['best'].get('score', 0)))
    except Exception, e:
        log('Had an error trying to read results for %d params %s: %s' % (len(d['params']), self.paramstr(d['params']), e))
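# A rough sketch of the result dict readresults() consumes, inferred only from the
# fields read above (out['params'], out['elapsed']); whatever parseresult() needs
# beyond that is not shown in this snippet, so this is an assumption, not a schema:
#
#   example_out = dict(
#       params=['featA', 'featB'],   # hypothetical feature-param names
#       elapsed=12.3,                # seconds spent on this job
#       # ... plus whatever parseresult() reads to fill in d['scores'] and d['best']
#   )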
def run(self):
    """Runs feature selection, picking up where we left off.
    Note that since we inherit from Thread, you can call start() to run this in a new Thread."""
    #TODO what's the advantage to running this in a thread?
    import Queue
    p = self.progress
    inq = self.qname('inq', self.jobid)
    outq = self.qname('outq', self.jobid)
    status = self.qname('status', self.jobid)
    last = time.time()
    while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
        if time.time() - last > self.config['status_interval']:
            log('STATUS (%s): %d done (best score %s), %d todo' % (self.config['name'], len(p['done']), p['best'].get('score', 0), len(p['todo'])))
            last = time.time()
        #print 'At top of loop'
        self.readdone()
        # see if we need to add more to the input q
        while self.db.llen(inq) < self.config['min_inq_size']:
            c = self.newcombo()
            if c:
                self.submitcombos([c])
            else:
                break
        self.readdone()
        time.sleep(0.5)
    # at this point, we've ended
    p['end'] = time.time()
    log('All done, with best combo %s!' % (pprint.pformat(p['best']),))
def search(self, fvecs):
    """Runs a nn search for the given feature vectors.
    Uses the existing parameters for metric, stype, k, and r.
    Any of those can be changed at any time.
    """
    from nkpylib.nkutils import simplenn, filternnresults, nkgrouper, getTimeDiffs
    from nkpylib.nkthreadutils import spawnWorkers
    from Queue import Queue
    #return self.searchUsingProcs(fvecs)
    self.sort = 0 if self.stype == 'unsorted' else 1
    start = time.time()
    if self.nprocs > 1:
        inq, outq = Queue(), Queue()

        def inproc():
            while 1:
                idx, fvec = inq.get()
                t1 = time.time()
                dists = simplenn(self.data, fvec, metric=self.metric, normalize=None)
                t2 = time.time()
                out = filternnresults(dists, k=self.k, r=self.r, sort=self.sort)
                t3 = time.time()
                #log('Got times: %s' % (getTimeDiffs([t1,t2,t3])))
                #log('Got outs: %s' % (out,))
                outq.put((idx, out))

        # spawn procs
        procs = spawnWorkers(self.nprocs, inproc, interval=0)
        # add to inq
        for i, fvec in enumerate(fvecs):
            inq.put((i, fvec))
        #log('Added %d fvecs to inq' % (len(fvecs)))
        # read from outq
        outputs = [0] * len(fvecs)
        todo = set(range(len(fvecs)))
        while todo:
            if len(todo) % 10 == 0:
                log('%d left in todo, %0.3fs elapsed' % (len(todo), time.time() - start))
                #log('Outputs: %s' % (outputs,))
            idx, out = outq.get()
            todo.remove(idx)
            outputs[idx] = out
    else:
        alldists = (simplenn(self.data, fvec, metric=self.metric, normalize=None) for fvec in fvecs)
        outputs = [filternnresults(dists, k=self.k, r=self.r, sort=self.sort) for dists in alldists]
    return outputs
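# A hedged usage sketch for search(): it assumes an already-constructed searcher
# object (the class and its constructor are not shown in this snippet) whose
# data/metric/stype/k/r/nprocs attributes were set beforehand. The metric name
# below is hypothetical and must be one that simplenn() accepts.
#
#   searcher.metric = 'l2'
#   searcher.stype = 'k'                # or 'radius', 'radius-k', 'unsorted'
#   searcher.k, searcher.r = 10, -1
#   searcher.nprocs = 4                 # >1 takes the threaded path above
#   results = searcher.search(fvecs)    # one filternnresults() output per input fvec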
def nnmainloop(data, l):
    """Runs a single query with given input line."""
    from nkutils import simplenn, filternnresults
    try:
        # parse line
        els = l.strip().split()
        metric = els.pop(0)
        assert metric in METRICS
        stype = els.pop(0)
        assert stype in SEARCH_TYPES
        # set params
        k = -1
        r = -1
        sort = 1
        if stype == 'k':
            k = int(els.pop(0))
        elif stype == 'radius':
            r = float(els.pop(0))
        elif stype == 'radius-k':
            r = float(els.pop(0))
            k = int(els.pop(0))
        elif stype == 'unsorted':
            sort = 0
        # parse data
        fvec = np.array(map(float, els))
        assert len(fvec) == data.shape[1]
        # run the actual search
        dists = simplenn(data, fvec, metric=metric, normalize=None)
        ret = filternnresults(dists, k=k, r=r, sort=sort)
        print ret
    except Exception, e:
        log('Exception of type %s: %s' % (type(e), e))
        print 0
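# Input line format accepted by nnmainloop(), as parsed above:
#   <metric> <stype> [<extra params>] <f0> <f1> ... <fN-1>
# where <stype> is one of SEARCH_TYPES and decides the extra params:
#   'k'         -> one int (k)
#   'radius'    -> one float (r)
#   'radius-k'  -> a float (r) then an int (k)
#   'unsorted'  -> none (results come back unsorted)
# e.g. (with a hypothetical metric name): "l2 k 5 0.1 0.2 0.3"
# The feature vector must have exactly data.shape[1] dimensions.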
def submitcombos(self, combos):
    """Adds a new combination of params to work on, using an rqs q"""
    c = self.config
    p = self.progress
    inq = self.qname('inq', self.jobid)
    allinq = self.qname('inq')

    def inqcallback(id, item, qname, rqs):
        """Sets the status to inq"""
        rqs.setstatusmsg(qname.replace(':inq', ':status'), id, 'inq')

    for params in combos:
        # break into subjobs by sets of svmstrs
        for svmstrs in nkgrouper(c['maxsvmstrs'], c['svmstrs']):
            realid = time.time()
            # create the job structure
            job = self.createjob(params, svmstrs=svmstrs[:], id=realid, jobid=self.jobid, submitted=time.time(), **c['default_job_kw'])
            # submit it
            log('Adding job with %d params %s, with %d svmstrs, %d pos, %d neg' % (len(params), self.paramstr(params), len(svmstrs), len(self.pos), len(self.neg)))
            item = (realid, job)
            self.rqs.put(realid, item, inq, callback=inqcallback)
            #TODO figure this out
            if 1:
                # this seems bad if one set of jobs are much slower than the other
                toadd = {self.jobid: 1000}
                self.rqs.redis.zadd(allinq, **toadd)
            else:
                self.rqs.redis.zincrby(allinq, self.jobid, 1)
            # add to list of todos
            p['todo'].append(dict(params=params, svmstrs=svmstrs, submitted=time.time(), realid=realid))
    # write our progress to disk
    self.writeprogress()
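# Note on the queued item shape, inferred from how __init__() below reads entries
# back: each queue element json-decodes to something whose [1][1] is the job dict,
# which matches the (realid, job) pair put here if RedisQueueService serializes
# roughly as [<queue id>, (realid, job)]. That wrapper format is an inference about
# the rqs library, not something this snippet confirms.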
def submitjobs(self):
    """Submits jobs"""
    p = self.progress
    inq = self.qname('inq', self.jobid)
    allinq = self.qname('inq')

    def inqcallback(id, item, qname, rqs):
        """Sets the status to inq"""
        rqs.setstatusmsg(qname.replace(':inq', ':status'), id, 'inq')

    groups = list(nkgrouper(GROUPSIZE, self.inputs))
    print ' Got %d groups of groupsize %d' % (len(groups), GROUPSIZE)
    for inputs in groups:
        realid = time.time()
        # create the job structure
        job = self.jobfunc(inputs, id=realid, jobid=self.jobid, submitted=time.time(), **self.jobkw)
        # submit it
        log('Adding job with %d inputs' % (len(inputs)))
        item = (realid, job)
        self.rqs.put(realid, item, inq, callback=inqcallback)
        # make sure the overall list of featx inputs is valid for this jobid
        toadd = {self.jobid: 1000}
        self.rqs.redis.zadd(allinq, **toadd)
        # add to list of todos
        p['todo'].append(dict(inputs=inputs, submitted=time.time(), realid=realid))
def run(self):
    """Runs feature selection, picking up where we left off.
    Note that since we inherit from Thread, you can call start() to run this in a new Thread."""
    #TODO what's the advantage to running this in a thread?
    import Queue
    # figure out what's been already submitted from our todos and submit the rest
    inq = self.qname('inq', self.jobid)
    outq = self.qname('outq', self.jobid)
    all = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
    inprog = {}
    for el in all:
        try:
            j = json.loads(el)[1][1]
            params = j['params']
            svmstrs = j['svmstrs']
            inprog.setdefault(keyize(params), set()).update(svmstrs)
        except Exception:
            pass
    todo = self.progress['todo']
    tosub = []
    for t in todo:
        sub = inprog.get(keyize(t['params']), set())
        left = set(t['svmstrs']) - sub
        if not left:
            continue  # there was nothing left to submit for this job
        tosub.append(t['params'])
    log('Had %d in progress, %d todo, got %d to submit' % (len(inprog), len(todo), len(tosub)))
    self.submitjobs(tosub)
    p = self.progress
    inq = self.qname('inq', self.jobid)
    outq = self.qname('outq', self.jobid)
    status = self.qname('status', self.jobid)
    last = time.time()
    while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
        if time.time() - last > self.config['status_interval']:
            log('STATUS (%s): %d done (best score %s), %d todo' % (self.config['name'], len(p['done']), p['best'].get('score', 0), len(p['todo'])))
            last = time.time()
        #print 'At top of loop'
        self.readdone()
        # see if we need to add more to the input q
        while self.db.llen(inq) < self.config['min_inq_size']:
            c = self.newcombo()
            if c:
                self.submitjobs([c])
            else:
                break
        self.readdone()
        time.sleep(0.5)
    # at this point, we've ended
    p['end'] = time.time()
    log('All done, with best combo %s!' % (pprint.pformat(p['best']),))
def __init__(self, configfname, progressfname):
    #RQS_CONFIG = dict(host='arnold.cs.washington.edu', port=10001, password='******')
    """Initializes this with the config and progress filenames"""
    Thread.__init__(self)
    # set vars
    self.configfname = configfname
    self.progressfname = progressfname
    # parse data files
    c = self.config = json.load(open(configfname))
    for k in REQUIRED:
        assert k in c, 'Field %s must be in config!' % (k)
    # fill in defaults if missing
    for k, v in DEFAULT_CFG.items():
        if k == 'rqs_cfg':
            # for rqs, go down into the corresponding dict
            crqs = c[k]
            for rk, rv in v.iteritems():
                if rk not in crqs:
                    crqs[rk] = rv
        else:
            # for everything else, just add it if not given
            if k not in c:
                c[k] = v
    self.jobid = c['jobid']
    # initialize rqs
    self.rqs = RedisQueueService(**c['rqs_cfg'])
    self.db = self.rqs.redis
    self.qname = makeqname(c['qbase'])
    init = 0
    try:
        self.progress = json.load(open(progressfname))
    except IOError:
        init = 1
        self.progress = self.newprogress()
    # set our pos and neg
    self.pos = [c['pos'][p] for p in self.progress['trainpos']]
    self.neg = [c['neg'][n] for n in self.progress['trainneg']]
    if init:
        # initialize all single params, and submit them all
        combos = self.createcombos()
        self.submitcombos(combos)
    else:
        # figure out what's been already submitted from our todos and submit the rest
        #TODO for now, we just go through the inq and outq, rather than anything fancier
        inq = self.qname('inq', self.jobid)
        outq = self.qname('outq', self.jobid)
        all = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
        inprog = {}
        for el in all:
            try:
                j = json.loads(el)[1][1]
                params = j['params']
                svmstrs = j['svmstrs']
                inprog.setdefault(keyize(params), set()).update(svmstrs)
            except Exception:
                pass
        #print 'Got %d inprog: %s' % (len(inprog), pprint.pformat(inprog))
        todo = self.progress['todo']
        #print 'Had %d todo: %s' % (len(todo), pprint.pformat(todo))
        tosub = []
        for t in todo:
            sub = inprog.get(keyize(t['params']), set())
            left = set(t['svmstrs']) - sub
            if not left:
                continue  # there was nothing left to submit for this job
            # if we're here, then there is something to submit, so add this
            tosub.append(t['params'])
        log('Had %d in progress, %d todo, got %d to submit' % (len(inprog), len(todo), len(tosub)))
        self.submitcombos(tosub)
    # write our progress to disk, in case we had a new one
    self.writeprogress()
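# A rough sketch of the config JSON __init__() expects, assembled only from the keys
# read in this snippet (REQUIRED and DEFAULT_CFG are defined elsewhere), so the exact
# field set and value shapes are assumptions rather than a complete schema:
#
#   {
#     "name": "myjob", "jobid": "myjob-001", "qbase": "featsel",
#     "rqs_cfg": {"host": "localhost", "port": 6379, "password": null},
#     "pos": {...}, "neg": {...},        # looked up by the ids in progress['trainpos']/['trainneg']
#     "maxcombos": 100, "status_interval": 30, "min_inq_size": 5,
#     "maxsvmstrs": 10, "svmstrs": [...], "default_job_kw": {}
#   }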
    print 'Took %0.3fs to run %d queries: %s' % (t2 - t1, len(ret), ret[:2])
    #print ret
else:
    # NN proc main loop
    from nkutils import stdmainloop
    # to read raw data, args are just (fname,)
    # to read mmap, args are (fname, nrows, ncols, [dtype='float64'])
    # read data
    t1 = time.time()
    if len(sys.argv) == 2:
        # raw data
        fname = sys.argv[1]
        if fname == '-':
            f = sys.stdin
        else:
            f = open(fname)
        data = readData(f=f)
    elif len(sys.argv) >= 4:
        fname, nrows, ncols = sys.argv[1:4]
        mmapshape = (int(nrows), int(ncols))
        try:
            dtype = sys.argv[4]
        except Exception:
            dtype = 'float64'
        data = np.memmap(fname, dtype=dtype, mode='r', shape=mmapshape)
    t2 = time.time()
    log('Read %d data vecs with %d dims each in %0.3fs' % (data.shape[0], data.shape[1], t2 - t1))
    stdmainloop(lambda l: nnmainloop(data, l))
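# Invocation sketch for the NN-proc mode above, based on the argv handling (the
# script filename is hypothetical):
#   python nnproc.py vectors.txt                       # raw data file; '-' reads the data from stdin
#   python nnproc.py vectors.mmap 10000 128            # float64 memmap of shape (nrows, ncols)
#   python nnproc.py vectors.mmap 10000 128 float32    # optional dtype as the 4th argument
# After loading the data it presumably reads one query per stdin line via
# stdmainloop(), handing each to nnmainloop() (see its line format above).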