Example #1
0
 def readresults(self, out):
     """Merges one set of results into the progress record for its params.

     Finds the entry in ``self.progress['done']`` whose ``'params'`` equal
     ``out['params']`` (appending a fresh entry if there is none), adds
     ``out['elapsed']`` to both that entry and the overall progress, and then
     passes ``out`` and the entry to ``self.parseresult``.

     Any exception raised while parsing or while building the summary log
     line is logged and swallowed; the elapsed-time bookkeeping has already
     been applied by then.
     """
     p = self.progress
     params = out['params']
     # first find the relevant "done" struct, if it exists, else add one
     for d in p['done']:
         if d['params'] == params:
             break
     else:
         d = dict(params=params, scores={}, elapsed=0, best={})
         p['done'].append(d)
     # account for the time spent whether or not parsing succeeds below
     d['elapsed'] += out['elapsed']
     p['elapsed'] += out['elapsed']
     try:
         self.parseresult(out, d)
         d['end'] = time.time()
         log('Read results for %d params %s, with best %s, and global best %s'
             % (len(d['params']), self.paramstr(d['params']),
                d['best']['score'], p['best'].get('score', 0)))
     except Exception as e:
         # best effort: a bad result must not take down the reader
         log('Had an error trying to read results for %d params %s: %s' %
             (len(d['params']), self.paramstr(d['params']), e))
Example #2
0
 def run(self):
     """Runs feature selection, picking up where we left off.

     Loops while fewer than ``config['maxcombos']`` entries are done and the
     todo list is non-empty.  Each pass reads finished results, tops up the
     input queue with new combos until it holds ``config['min_inq_size']``
     items (or no new combo is available), then sleeps for half a second.
     A STATUS line is logged every ``config['status_interval']`` seconds.

     Note that since we inherit from Thread, you can call
     start() to run this in a new Thread.
     """
     #TODO what's the advantage to running this in a thread?
     p = self.progress
     inq = self.qname('inq', self.jobid)
     last = time.time()
     while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
         if time.time() - last > self.config['status_interval']:
             log('STATUS (%s): %d done (best score %s), %d todo' %
                 (self.config['name'], len(p['done']), p['best'].get(
                     'score', 0), len(p['todo'])))
             last = time.time()
         self.readdone()
         # see if we need to add more to the input q
         while self.db.llen(inq) < self.config['min_inq_size']:
             c = self.newcombo()
             if not c:
                 break  # nothing new to try right now
             self.submitcombos([c])
             self.readdone()
         time.sleep(0.5)
     # at this point, we've ended
     p['end'] = time.time()
     log('All done, with best combo %s!' % (pprint.pformat(p['best']), ))
Example #3
0
    def search(self, fvecs):
        """Runs a nn search for the given feature vectors.

        Uses the existing parameters for metric, stype, k, and r.
        Any of those can be changed at any time.

        When ``self.nprocs > 1`` the queries are handed to workers started
        with ``spawnWorkers`` and the results are stored back by input index
        (``fvecs`` must support ``len()`` in that case).  Otherwise each
        vector is searched in turn in the calling thread.

        Returns a list with one ``filternnresults`` output per input vector,
        in the same order as ``fvecs``.
        """
        from nkpylib.nkutils import simplenn, filternnresults
        from nkpylib.nkthreadutils import spawnWorkers
        from Queue import Queue
        #return self.searchUsingProcs(fvecs)
        self.sort = 0 if self.stype == 'unsorted' else 1
        start = time.time()

        def query(fvec):
            """Returns the filtered neighbors of a single feature vector."""
            dists = simplenn(self.data,
                             fvec,
                             metric=self.metric,
                             normalize=None)
            return filternnresults(dists, k=self.k, r=self.r, sort=self.sort)

        if self.nprocs <= 1:
            return [query(fvec) for fvec in fvecs]

        nworkers = self.nprocs  # read once so the shutdown count matches
        inq, outq = Queue(), Queue()

        def inproc():
            """Worker loop: answers queued queries until a None sentinel."""
            while 1:
                item = inq.get()
                if item is None:
                    break
                idx, fvec = item
                outq.put((idx, query(fvec)))

        # spawn workers
        # NOTE(review): assumes spawnWorkers runs inproc once per worker
        # thread -- confirm against nkthreadutils.
        spawnWorkers(nworkers, inproc, interval=0)
        # add to inq
        for i, fvec in enumerate(fvecs):
            inq.put((i, fvec))
        # read from outq
        outputs = [0] * len(fvecs)
        todo = set(range(len(fvecs)))
        try:
            while todo:
                if len(todo) % 10 == 0:
                    log('%d left in todo, %0.3fs elapsed' %
                        (len(todo), time.time() - start))
                idx, out = outq.get()
                todo.remove(idx)
                outputs[idx] = out
        finally:
            # tell every worker to exit so they don't block on inq forever
            # and accumulate across calls
            for _ in range(nworkers):
                inq.put(None)
        return outputs
Example #4
0
def nnmainloop(data, l):
    """Runs a single query with given input line.

    The line is whitespace-separated: ``metric stype [args...] v1 v2 ...``.
    The metric must be in METRICS and the stype in SEARCH_TYPES.  Extra
    arguments depend on stype: 'k' takes an int k, 'radius' takes a float r,
    'radius-k' takes r then k, and 'unsorted' takes none and turns sorting
    off.  All remaining fields are parsed as floats and form the query
    vector, which must have ``data.shape[1]`` elements.

    Prints the filtered results on success.  On any error while parsing or
    searching, the exception is logged and ``0`` is printed instead.
    """
    from nkutils import simplenn
    try:
        # parse line
        els = l.strip().split()
        metric = els.pop(0)
        # explicit checks rather than asserts, so they survive python -O
        if metric not in METRICS:
            raise ValueError('Unknown metric %r' % (metric, ))
        stype = els.pop(0)
        if stype not in SEARCH_TYPES:
            raise ValueError('Unknown search type %r' % (stype, ))
        # set params (-1 is the "not given" value passed to filternnresults)
        k = -1
        r = -1
        sort = 1
        if stype == 'k':
            k = int(els.pop(0))
        elif stype == 'radius':
            r = float(els.pop(0))
        elif stype == 'radius-k':
            r = float(els.pop(0))
            k = int(els.pop(0))
        elif stype == 'unsorted':
            sort = 0
        # parse data
        fvec = np.array([float(x) for x in els])
        if len(fvec) != data.shape[1]:
            raise ValueError('Query has %d dims, but data has %d' %
                             (len(fvec), data.shape[1]))
        # run the actual search
        dists = simplenn(data, fvec, metric=metric, normalize=None)
        ret = filternnresults(dists, k=k, r=r, sort=sort)
        print(ret)
    except Exception as e:
        log('Exception of type %s: %s' % (type(e), e))
        print(0)
Example #5
0
    def submitcombos(self, combos):
        """Queues jobs for each param combination in `combos` via rqs.

        Every combination is split into sub-jobs, one per group of svmstrs
        produced by ``nkgrouper(config['maxsvmstrs'], config['svmstrs'])``.
        Each sub-job is built with ``self.createjob``, put on this job's
        input queue, and recorded in ``progress['todo']``.  Progress is
        written to disk once at the end.
        """
        cfg = self.config
        progress = self.progress
        inq = self.qname('inq', self.jobid)
        allinq = self.qname('inq')

        def inqcallback(id, item, qname, rqs):
            """Sets the status to inq"""
            statusq = qname.replace(':inq', ':status')
            rqs.setstatusmsg(statusq, id, 'inq')

        for params in combos:
            # one sub-job per group of svmstrs
            for svmstrs in nkgrouper(cfg['maxsvmstrs'], cfg['svmstrs']):
                # the submission timestamp doubles as the job's id
                realid = time.time()
                job = self.createjob(params,
                                     svmstrs=svmstrs[:],
                                     id=realid,
                                     jobid=self.jobid,
                                     submitted=time.time(),
                                     **cfg['default_job_kw'])
                log('Adding job with %d params %s, with %d svmstrs, %d pos, %d neg'
                    % (len(params), self.paramstr(params), len(svmstrs),
                       len(self.pos), len(self.neg)))
                self.rqs.put(realid, (realid, job), inq, callback=inqcallback)
                #TODO figure this out
                # Sets this jobid's score in the overall inq zset to a fixed
                # 1000; this seems bad if one set of jobs are much slower
                # than the other.  The alternative considered was
                # zincrby(allinq, self.jobid, 1).
                self.rqs.redis.zadd(allinq, **{self.jobid: 1000})
                # add to list of todos
                entry = dict(params=params,
                             svmstrs=svmstrs,
                             submitted=time.time(),
                             realid=realid)
                progress['todo'].append(entry)
        # write our progress to disk
        self.writeprogress()
Example #6
0
 def run(self):
     """Runs feature selection, picking up where we left off.

     Loops while fewer than ``config['maxcombos']`` entries are done and the
     todo list is non-empty.  Each pass reads finished results, tops up the
     input queue with new combos until it holds ``config['min_inq_size']``
     items (or no new combo is available), then sleeps for half a second.
     A STATUS line is logged every ``config['status_interval']`` seconds.

     Note that since we inherit from Thread, you can call
     start() to run this in a new Thread.
     """
     #TODO what's the advantage to running this in a thread?
     p = self.progress
     inq = self.qname('inq', self.jobid)
     last = time.time()
     while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
         if time.time() - last > self.config['status_interval']:
             log('STATUS (%s): %d done (best score %s), %d todo' %
                 (self.config['name'], len(p['done']),
                  p['best'].get('score', 0), len(p['todo'])))
             last = time.time()
         self.readdone()
         # see if we need to add more to the input q
         while self.db.llen(inq) < self.config['min_inq_size']:
             c = self.newcombo()
             if not c:
                 break  # nothing new to try right now
             self.submitcombos([c])
             self.readdone()
         time.sleep(0.5)
     # at this point, we've ended
     p['end'] = time.time()
     log('All done, with best combo %s!' % (pprint.pformat(p['best']),))
Example #7
0
    def submitjobs(self):
        """Submits one job per group of GROUPSIZE inputs.

        ``self.inputs`` is split with ``nkgrouper(GROUPSIZE, ...)``.  For each
        group a job is built with ``self.jobfunc``, put on this job's input
        queue, and recorded in the todo list.
        """
        inq = self.qname('inq', self.jobid)
        allinq = self.qname('inq')
        # NOTE(review): the original appended to an undefined `p['todo']`
        # using undefined `params`/`svmstrs`, which raised NameError after
        # the first job was queued.  This assumes the todo list lives in
        # self.progress['todo'] and that each entry should record the
        # submitted inputs -- confirm against whatever reads the todo list.
        todo = self.progress['todo']

        def inqcallback(id, item, qname, rqs):
            """Sets the status to inq"""
            rqs.setstatusmsg(qname.replace(':inq', ':status'), id, 'inq')

        groups = list(nkgrouper(GROUPSIZE, self.inputs))
        print('    Got %d groups of groupsize %d' % (len(groups), GROUPSIZE))
        for inputs in groups:
            # the submission timestamp doubles as the job's id
            realid = time.time()
            # create the job structure
            job = self.jobfunc(inputs,
                               id=realid,
                               jobid=self.jobid,
                               submitted=time.time(),
                               **self.jobkw)
            # submit it
            log('Adding job with %d inputs' % (len(inputs)))
            item = (realid, job)
            self.rqs.put(realid, item, inq, callback=inqcallback)
            # make sure the overall list of featx inputs is valid for this jobid
            toadd = {self.jobid: 1000}
            self.rqs.redis.zadd(allinq, **toadd)
            # add to list of todos
            todo.append(
                dict(inputs=inputs,
                     submitted=time.time(),
                     realid=realid))
Example #8
0
    def submitcombos(self, combos):
        """Queues jobs for each param combination in `combos` via rqs.

        Every combination is split into sub-jobs, one per group of svmstrs
        produced by ``nkgrouper(config['maxsvmstrs'], config['svmstrs'])``.
        Each sub-job is built with ``self.createjob``, put on this job's
        input queue, and recorded in ``progress['todo']``.  Progress is
        written to disk once at the end.
        """
        cfg = self.config
        progress = self.progress
        inq = self.qname('inq', self.jobid)
        allinq = self.qname('inq')

        def inqcallback(id, item, qname, rqs):
            """Sets the status to inq"""
            statusq = qname.replace(':inq', ':status')
            rqs.setstatusmsg(statusq, id, 'inq')

        for params in combos:
            # one sub-job per group of svmstrs
            for svmstrs in nkgrouper(cfg['maxsvmstrs'], cfg['svmstrs']):
                # the submission timestamp doubles as the job's id
                realid = time.time()
                job = self.createjob(params,
                                     svmstrs=svmstrs[:],
                                     id=realid,
                                     jobid=self.jobid,
                                     submitted=time.time(),
                                     **cfg['default_job_kw'])
                log('Adding job with %d params %s, with %d svmstrs, %d pos, %d neg'
                    % (len(params), self.paramstr(params), len(svmstrs),
                       len(self.pos), len(self.neg)))
                self.rqs.put(realid, (realid, job), inq, callback=inqcallback)
                #TODO figure this out
                # Sets this jobid's score in the overall inq zset to a fixed
                # 1000; this seems bad if one set of jobs are much slower
                # than the other.  The alternative considered was
                # zincrby(allinq, self.jobid, 1).
                self.rqs.redis.zadd(allinq, **{self.jobid: 1000})
                # add to list of todos
                entry = dict(params=params,
                             svmstrs=svmstrs,
                             submitted=time.time(),
                             realid=realid)
                progress['todo'].append(entry)
        # write our progress to disk
        self.writeprogress()
Example #9
0
 def readresults(self, out):
     """Merges one set of results into the progress record for its params.

     Finds the entry in ``self.progress['done']`` whose ``'params'`` equal
     ``out['params']`` (appending a fresh entry if there is none), adds
     ``out['elapsed']`` to both that entry and the overall progress, and then
     passes ``out`` and the entry to ``self.parseresult``.

     Any exception raised while parsing or while building the summary log
     line is logged and swallowed; the elapsed-time bookkeeping has already
     been applied by then.
     """
     p = self.progress
     params = out['params']
     # first find the relevant "done" struct, if it exists, else add one
     for d in p['done']:
         if d['params'] == params:
             break
     else:
         d = dict(params=params, scores={}, elapsed=0, best={})
         p['done'].append(d)
     # account for the time spent whether or not parsing succeeds below
     d['elapsed'] += out['elapsed']
     p['elapsed'] += out['elapsed']
     try:
         self.parseresult(out, d)
         d['end'] = time.time()
         log('Read results for %d params %s, with best %s, and global best %s' % (len(d['params']), self.paramstr(d['params']), d['best']['score'], p['best'].get('score', 0)))
     except Exception as e:
         # best effort: a bad result must not take down the reader
         log('Had an error trying to read results for %d params %s: %s' % (len(d['params']), self.paramstr(d['params']), e))
Example #10
0
    def submitjobs(self):
        """Submits one job per group of GROUPSIZE inputs.

        ``self.inputs`` is split with ``nkgrouper(GROUPSIZE, ...)``.  For each
        group a job is built with ``self.jobfunc``, put on this job's input
        queue, and recorded in the todo list.
        """
        inq = self.qname('inq', self.jobid)
        allinq = self.qname('inq')
        # NOTE(review): the original appended to an undefined `p['todo']`
        # using undefined `params`/`svmstrs`, which raised NameError after
        # the first job was queued.  This assumes the todo list lives in
        # self.progress['todo'] and that each entry should record the
        # submitted inputs -- confirm against whatever reads the todo list.
        todo = self.progress['todo']

        def inqcallback(id, item, qname, rqs):
            """Sets the status to inq"""
            rqs.setstatusmsg(qname.replace(':inq', ':status'), id, 'inq')

        groups = list(nkgrouper(GROUPSIZE, self.inputs))
        print('    Got %d groups of groupsize %d' % (len(groups), GROUPSIZE))
        for inputs in groups:
            # the submission timestamp doubles as the job's id
            realid = time.time()
            # create the job structure
            job = self.jobfunc(inputs, id=realid, jobid=self.jobid, submitted=time.time(), **self.jobkw)
            # submit it
            log('Adding job with %d inputs' % (len(inputs)))
            item = (realid, job)
            self.rqs.put(realid, item, inq, callback=inqcallback)
            # make sure the overall list of featx inputs is valid for this jobid
            toadd = {self.jobid: 1000}
            self.rqs.redis.zadd(allinq, **toadd)
            # add to list of todos
            todo.append(dict(inputs=inputs, submitted=time.time(), realid=realid))
Example #11
0
    def run(self):
        """Runs feature selection, picking up where we left off.

        First scans this job's input and output queues to see what is already
        in progress, then resubmits the params of every entry in
        ``progress['todo']``.  After that it loops while fewer than
        ``config['maxcombos']`` entries are done and the todo list is
        non-empty: read finished results, top up the input queue to
        ``config['min_inq_size']`` with new combos, sleep half a second.

        Note that since we inherit from Thread, you can call
        start() to run this in a new Thread.
        """
        #TODO what's the advantage to running this in a thread?
        p = self.progress
        todo = p['todo']  # was never bound before: NameError on every call
        inq = self.qname('inq', self.jobid)
        outq = self.qname('outq', self.jobid)

        # figure out what's been already submitted from our todos and submit the rest
        queued = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
        inprog = {}
        for el in queued:
            try:
                j = json.loads(el)[1][1]
                seen = inprog.setdefault(keyize(j['params']), set())
                # `svmstrs` was an undefined name here; read it from the job
                seen.update(j['svmstrs'])
            except Exception:
                # best effort: skip queue entries that aren't well-formed jobs
                continue
        # NOTE(review): inprog is only used for the count in the log line;
        # every todo entry is resubmitted.  Confirm whether entries already
        # covered by in-progress jobs should be skipped instead.
        tosub = [t['params'] for t in todo]
        log('Had %d in progress, %d todo, got %d to submit' %
            (len(inprog), len(todo), len(tosub)))
        self.submitjobs(tosub)

        last = time.time()
        while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
            if time.time() - last > self.config['status_interval']:
                log('STATUS (%s): %d done (best score %s), %d todo' %
                    (self.config['name'], len(p['done']), p['best'].get(
                        'score', 0), len(p['todo'])))
                last = time.time()
            self.readdone()
            # see if we need to add more to the input q
            while self.db.llen(inq) < self.config['min_inq_size']:
                c = self.newcombo()
                if not c:
                    break  # nothing new to try right now
                self.submitjobs([c])
                self.readdone()
            time.sleep(0.5)
        # at this point, we've ended
        p['end'] = time.time()
        log('All done, with best combo %s!' % (pprint.pformat(p['best']), ))
Example #12
0
    def run(self):
        """Runs feature selection, picking up where we left off.

        First scans this job's input and output queues to see what is already
        in progress, then resubmits the params of every entry in
        ``progress['todo']``.  After that it loops while fewer than
        ``config['maxcombos']`` entries are done and the todo list is
        non-empty: read finished results, top up the input queue to
        ``config['min_inq_size']`` with new combos, sleep half a second.

        Note that since we inherit from Thread, you can call
        start() to run this in a new Thread.
        """
        #TODO what's the advantage to running this in a thread?
        p = self.progress
        todo = p['todo']  # was never bound before: NameError on every call
        inq = self.qname('inq', self.jobid)
        outq = self.qname('outq', self.jobid)

        # figure out what's been already submitted from our todos and submit the rest
        queued = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
        inprog = {}
        for el in queued:
            try:
                j = json.loads(el)[1][1]
                seen = inprog.setdefault(keyize(j['params']), set())
                # `svmstrs` was an undefined name here; read it from the job
                seen.update(j['svmstrs'])
            except Exception:
                # best effort: skip queue entries that aren't well-formed jobs
                continue
        # NOTE(review): inprog is only used for the count in the log line;
        # every todo entry is resubmitted.  Confirm whether entries already
        # covered by in-progress jobs should be skipped instead.
        tosub = [t['params'] for t in todo]
        log('Had %d in progress, %d todo, got %d to submit' % (len(inprog), len(todo), len(tosub)))
        self.submitjobs(tosub)

        last = time.time()
        while len(p['done']) < self.config['maxcombos'] and len(p['todo']) > 0:
            if time.time() - last > self.config['status_interval']:
                log('STATUS (%s): %d done (best score %s), %d todo' % (self.config['name'], len(p['done']), p['best'].get('score', 0), len(p['todo'])))
                last = time.time()
            self.readdone()
            # see if we need to add more to the input q
            while self.db.llen(inq) < self.config['min_inq_size']:
                c = self.newcombo()
                if not c:
                    break  # nothing new to try right now
                self.submitjobs([c])
                self.readdone()
            time.sleep(0.5)
        # at this point, we've ended
        p['end'] = time.time()
        log('All done, with best combo %s!' % (pprint.pformat(p['best']),))
Example #13
0
 def __init__(self, configfname, progressfname):
     """Initializes this with the config and progress filenames.

     Loads the JSON config from `configfname`, checks that every REQUIRED
     field is present, and fills in anything missing from DEFAULT_CFG
     ('rqs_cfg' is merged key by key; every other default is added only if
     the field is absent).  Then creates the RedisQueueService and loads
     progress from `progressfname`.

     If the progress file cannot be opened, a new progress record is made
     and all initial combos are submitted.  Otherwise the input and output
     queues are scanned, and the params of each todo entry that still has
     svmstrs not found in those queues are resubmitted.  Progress is written
     to disk at the end in both cases.

     Raises AssertionError if a REQUIRED field is missing from the config.
     """
     Thread.__init__(self)
     # set vars
     self.configfname = configfname
     self.progressfname = progressfname
     # parse data files (close the handle instead of leaking it)
     with open(configfname) as f:
         c = self.config = json.load(f)
     for k in REQUIRED:
         if k not in c:
             # explicit raise instead of `assert` so the check survives -O;
             # the exception type is unchanged for existing callers
             raise AssertionError('Field %s must be in config!' % (k))
     # fill in defaults if missing
     for k, v in DEFAULT_CFG.items():
         if k == 'rqs_cfg':
             # for rqs, go down into the corresponding dict; a config with
             # no 'rqs_cfg' at all simply gets every default
             crqs = c.setdefault(k, {})
             for rk, rv in v.items():
                 crqs.setdefault(rk, rv)
         else:
             # for everything else, just add it if not given
             c.setdefault(k, v)
     self.jobid = c['jobid']
     # initialize rqs
     self.rqs = RedisQueueService(**c['rqs_cfg'])
     self.db = self.rqs.redis
     self.qname = makeqname(c['qbase'])
     try:
         with open(progressfname) as f:
             self.progress = json.load(f)
         init = False
     except IOError:
         # no readable progress file: start from scratch
         init = True
         self.progress = self.newprogress()
     # set our pos and neg
     self.pos = [c['pos'][p] for p in self.progress['trainpos']]
     self.neg = [c['neg'][n] for n in self.progress['trainneg']]
     if init:
         # initialize all single params, and submit them all
         combos = self.createcombos()
         self.submitcombos(combos)
     else:
         # figure out what's been already submitted from our todos and submit the rest
         #TODO for now, we just go through the inq and outq, rather than anything fancier
         inq = self.qname('inq', self.jobid)
         outq = self.qname('outq', self.jobid)
         queued = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
         inprog = {}
         for el in queued:
             try:
                 j = json.loads(el)[1][1]
                 params = j['params']
                 svmstrs = j['svmstrs']
                 inprog.setdefault(keyize(params), set()).update(svmstrs)
             except Exception:
                 # best effort: skip queue entries that aren't well-formed jobs
                 continue
         todo = self.progress['todo']
         tosub = []
         for t in todo:
             sub = inprog.get(keyize(t['params']), set())
             if set(t['svmstrs']) - sub:
                 # some svmstrs of this entry are not queued anywhere yet
                 tosub.append(t['params'])
         log('Had %d in progress, %d todo, got %d to submit' %
             (len(inprog), len(todo), len(tosub)))
         self.submitcombos(tosub)
     # write our progress to disk, in case we had a new one
     self.writeprogress()
Example #14
0
        print 'Took %0.3fs to run %d queries: %s' % (t2 - t1, len(ret),
                                                     ret[:2])
        #print ret
    else:
        # NN proc main loop: load the data once, then hand every input line
        # to nnmainloop via stdmainloop
        from nkutils import stdmainloop
        # to read raw data, args are just (fname,)
        # to read mmap, args are (fname, nrows, ncols, [dtype='float64'])
        # read data
        # NOTE(review): with exactly 1 or 3 entries in sys.argv neither branch
        # below runs, so `data` is unbound when it is used in the log call.
        t1 = time.time()
        if len(sys.argv) == 2:
            # raw data; a filename of '-' means read from stdin
            fname = sys.argv[1]
            if fname == '-':
                f = sys.stdin
            else:
                # NOTE(review): this handle is not closed within this span
                f = open(fname)
            data = readData(f=f)
        elif len(sys.argv) >= 4:
            # memory-mapped data with an explicit (nrows, ncols) shape
            fname, nrows, ncols = sys.argv[1:4]
            mmapshape = (int(nrows), int(ncols))
            try:
                dtype = sys.argv[4]
            except Exception:
                # no dtype argument given, so fall back to the default
                dtype = 'float64'
            data = np.memmap(fname, dtype=dtype, mode='r', shape=mmapshape)
        t2 = time.time()
        log('Read %d data vecs with %d dims each in %0.3fs' %
            (data.shape[0], data.shape[1], t2 - t1))
        stdmainloop(lambda l: nnmainloop(data, l))
Example #15
0
 def __init__(self, configfname, progressfname):
     """Initializes this with the config and progress filenames.

     Loads the JSON config from `configfname`, checks that every REQUIRED
     field is present, and fills in anything missing from DEFAULT_CFG
     ('rqs_cfg' is merged key by key; every other default is added only if
     the field is absent).  Then creates the RedisQueueService and loads
     progress from `progressfname`.

     If the progress file cannot be opened, a new progress record is made
     and all initial combos are submitted.  Otherwise the input and output
     queues are scanned, and the params of each todo entry that still has
     svmstrs not found in those queues are resubmitted.  Progress is written
     to disk at the end in both cases.

     Raises AssertionError if a REQUIRED field is missing from the config.
     """
     Thread.__init__(self)
     # set vars
     self.configfname = configfname
     self.progressfname = progressfname
     # parse data files (close the handle instead of leaking it)
     with open(configfname) as f:
         c = self.config = json.load(f)
     for k in REQUIRED:
         if k not in c:
             # explicit raise instead of `assert` so the check survives -O;
             # the exception type is unchanged for existing callers
             raise AssertionError('Field %s must be in config!' % (k))
     # fill in defaults if missing
     for k, v in DEFAULT_CFG.items():
         if k == 'rqs_cfg':
             # for rqs, go down into the corresponding dict; a config with
             # no 'rqs_cfg' at all simply gets every default
             crqs = c.setdefault(k, {})
             for rk, rv in v.items():
                 crqs.setdefault(rk, rv)
         else:
             # for everything else, just add it if not given
             c.setdefault(k, v)
     self.jobid = c['jobid']
     # initialize rqs
     self.rqs = RedisQueueService(**c['rqs_cfg'])
     self.db = self.rqs.redis
     self.qname = makeqname(c['qbase'])
     try:
         with open(progressfname) as f:
             self.progress = json.load(f)
         init = False
     except IOError:
         # no readable progress file: start from scratch
         init = True
         self.progress = self.newprogress()
     # set our pos and neg
     self.pos = [c['pos'][p] for p in self.progress['trainpos']]
     self.neg = [c['neg'][n] for n in self.progress['trainneg']]
     if init:
         # initialize all single params, and submit them all
         combos = self.createcombos()
         self.submitcombos(combos)
     else:
         # figure out what's been already submitted from our todos and submit the rest
         #TODO for now, we just go through the inq and outq, rather than anything fancier
         inq = self.qname('inq', self.jobid)
         outq = self.qname('outq', self.jobid)
         queued = self.db.lrange(inq, 0, -1) + self.db.lrange(outq, 0, -1)
         inprog = {}
         for el in queued:
             try:
                 j = json.loads(el)[1][1]
                 params = j['params']
                 svmstrs = j['svmstrs']
                 inprog.setdefault(keyize(params), set()).update(svmstrs)
             except Exception:
                 # best effort: skip queue entries that aren't well-formed jobs
                 continue
         todo = self.progress['todo']
         tosub = []
         for t in todo:
             sub = inprog.get(keyize(t['params']), set())
             if set(t['svmstrs']) - sub:
                 # some svmstrs of this entry are not queued anywhere yet
                 tosub.append(t['params'])
         log('Had %d in progress, %d todo, got %d to submit' % (len(inprog), len(todo), len(tosub)))
         self.submitcombos(tosub)
     # write our progress to disk, in case we had a new one
     self.writeprogress()