def __init__(self,nactions,input_ranges,nelemns=[],k=1,alpha=0.3):
    """Set up the prototype points, per-prototype tables and the kd-tree.

    The prototypes come from ``self.ndlinspace(input_ranges, nelemns)``,
    are rescaled with ``self.RescaleInputs`` once the bounds are known,
    and are finally indexed with ``kdtree``.

    nactions     -- size of the action axis of the ``ps`` / ``ps_exp`` tables
    input_ranges -- iterable of ranges; element 0 of each entry is stored as
                    the lower bound and element 1 as the upper bound
    nelemns      -- forwarded unchanged to ``self.ndlinspace``
    k            -- stored as ``self.k``
    alpha        -- stored as ``self.alpha``
    """
    # Prototype points, still in the original input coordinates.
    self.cl = self.ndlinspace(input_ranges, nelemns)

    self.k = k
    self.alpha = alpha
    self.nactions = nactions
    self.shape = self.cl.shape
    n_points, n_dims = self.shape[0], self.shape[1]

    # Per-prototype tables.
    self.ps = zeros((n_points, nactions, n_dims))
    self.ps_exp = zeros((n_points, nactions))
    self.ac = []
    self.knn = []

    self.last_state = zeros((1, n_dims)) + 0.0
    self.next_state = array(self.last_state)

    # Collect the bounds in a single pass over input_ranges.
    lows, highs = [], []
    for bounds in input_ranges:
        lows.append(bounds[0])
        highs.append(bounds[1])
    self.lbounds = array(lows)
    self.ubounds = array(highs)

    # RescaleInputs runs only after the bounds above are in place.
    self.cl = array(self.RescaleInputs(self.cl))
    self.knntree = kdtree(self.cl)
Example #2
0
def predictedGroup(p, tr, nn = 3, e='s'):
  #Compares the points (p) to your tree (tr), providing its predicted 
  # category for each point based off of its (nn) nearest neighbors
  #Explaination can be short e[s] or long e[l]
  k = ann.kdtree(tr[:,:tr.shape[1]-1])
  l = k.knn(p[:,:p.shape[1]-1],3)
  #   dist = distGroups(tr)
  #   pr = []
  print "l[0]"; print l[0], "\n"; 
  #   print dist; 
  #   print "tr[0][-1] \n",tr[0][-1]
  if e == 's':
    for i in l[0]:
      pass
  else:
    print tr,"\n"; print p,"\n";
    ll = np.zeros((l[0].shape[0],p.shape[1]))
    kk = 0
    for i in l[0]:
      ii = 0
      for j in i:
        ll[kk][ii] = tr[j][-1]
        ii += 1
      kk += 1
    print "Groups \n", ll; print "Modes \n",stat.mode(ll,1)
  pred = assignGroup(p,stat.mode(ll,1)[0])
  print pred
  return pred
Example #3
0
    def nearest(self, location, count = 1):
        """Return ``(item, distance)`` pairs for the neighbours of *location*.

        The kd-tree over ``self.as_array()`` is built on first use and kept
        in ``self._scikitA``.  Only the first row of the ``knn`` answer is
        used, and each index in it is resolved through ``self[...]``.
        """
        if not self._scikitA:
            self._scikitA = kdtree(self.as_array())

        neighbour_ids, neighbour_dists = self._scikitA.knn(location, count)
        row_ids = neighbour_ids[0]
        row_dists = neighbour_dists[0]

        return [(self[row_ids[pos]], row_dists[pos])
                for pos in range(len(row_ids))]
Example #4
0
def nearest_distances(X, k=1):
    '''
    Squared distance from every point of X to its k-th nearest neighbour.

    X = array(N,M)
    N = number of points
    M = number of dimensions

    The tree is queried with the points it was built from, so each point
    is reported as its own first neighbour; k + 1 neighbours are requested
    and the last column of the distance table is returned.
    '''
    tree = ann.kdtree(X)
    neighbour_table = tree.knn(X, k + 1)
    _, d = neighbour_table
    kth_column = d[:, -1]
    return kth_column
Example #5
0
def nearest_distances(X, k=1):
    '''
    For each row of X, return the squared distance to its k-th nearest
    neighbour among the other rows.

    X = array(N,M)
    N = number of points
    M = number of dimensions
    '''
    # Neighbour 0 of every query is the query point itself, hence k + 1.
    wanted = k + 1
    index = ann.kdtree(X)
    _, d = index.knn(X, wanted)
    # Last column == k-th real neighbour.
    return d[:, -1]
Example #6
0
 def load(self, data_file):
     """Read labelled points from *data_file* and index them in a kd-tree.

     Each non-blank line is split on tabs: the first field is the label,
     the remaining fields are parsed as floats and form one point.

     Sets ``self.labels`` (one label per point, in file order),
     ``self.label_set`` (distinct labels), ``self.kdtree`` (tree over all
     points), ``self.n`` (number of points) and ``self.k`` (fixed at 25).

     Raises whatever ``open`` raises for an unreadable file and
     ``ValueError`` when a coordinate field is not a number.
     """
     self.labels = []
     self.label_set = set()
     points = []
     with open(data_file) as fp:
         for line in fp:
             # Drop the line terminator so a label-only line does not keep
             # it, and skip blank lines (typically a trailing one), which
             # would otherwise add a bogus label and an empty point.
             line = line.rstrip('\r\n')
             if not line.strip():
                 continue
             fields = line.split('\t')
             self.labels.append(fields[0])
             self.label_set.add(fields[0])
             points.append([float(f) for f in fields[1:]])
     self.kdtree = ann.kdtree(np.array(points))
     self.n = len(points)
     self.k = 25
Example #7
0
def find_n_nearest_neighbors_fast(embed,n=10):
    """Searches the n nearest neighbors in state-space.
    Takes an array of TD-Vectors as returned by make_time_delay_embedding
    where first index is tp, second index is component.
    Returns an array of shape (embed.shape[0],n) containing the indices 
    of the nearest neighbors for each tp.

    scikits.ann is used when it can be imported and queried; on any failure
    the search falls back to scipy.spatial.KDTree."""
    assert len(embed.shape) == 2, "Only 2d-arrays (one array of TD-Vectors) are supported at the moment."
    n = int(round(n))
    try:
        import scikits.ann as ann
        k = ann.kdtree(embed)
        # Every point is its own nearest neighbour: ask for n+1, drop column 0.
        rv_ar = k.knn(embed,n+1)[0][:,1:]
    except Exception:
        from scipy.spatial import KDTree
        kdt = KDTree(embed)
        # Same n+1 / drop-self scheme as above; querying only n neighbours
        # would leave n-1 columns after the self column is removed.
        rv_ar = kdt.query(kdt.data,k=n+1)[1][:,1:]
    return rv_ar
Example #8
0
    def getSuccessorDistribution(self, state):
        """ Return the successor distribution for the given *state*. 
        
        Returns an iterator that yields pairs of states and
        their probabilities of being the successor of the given *state*. 

        The (at most ``self.k``) nearest stored states are looked up in a
        kd-tree over ``self.states``, which is rebuilt when
        ``self.rebuildSucc`` is set.  For each neighbor, the stored
        transition ``successor - neighbor`` is applied to *state* and
        weighted with ``gaussian(distance, self.b_Sa) / denominator``.

        Raises ModelNotInitialized (on first iteration) if no states are
        stored yet.
        """
        import warnings

        # "is None": comparing an ndarray with == is element-wise.
        if self.states is None:
            raise ModelNotInitialized()

        k = min(self.states.shape[0], self.k)

        if self.rebuildSucc:
            self.succKDTree = ann.kdtree(self.states)
            self.rebuildSucc = False

        indices, distances = self.succKDTree.knn(state, k)
        neighborIndices = indices[0]
        neighborDistances = distances[0]

        denominator = numpy.sum(
            numpy.exp(-neighborDistances / (self.b_Sa**2)))

        # If the distances become too large, then all values can become zero
        # In this situation, we simply return the closest state and probability 1.
        if denominator == 0 or numpy.isnan(denominator):
            warnings.warn(
                "Too large distances, returning only closest example")
            # Keep exactly one neighbor.  Assigning a one-element list into
            # the ndarray row would broadcast and repeat it k times.
            neighborIndices = neighborIndices[:1]
            neighborDistances = [0.0]
            denominator = numpy.exp(0.0 / (self.b_Sa**2))

        for index, distance in zip(neighborIndices, neighborDistances):
            neighbor = State(
                self.states[index],
                state.dimensions)  # TODO: not use state.dimensions
            succState, reward = self.successorSamples[neighbor]

            delta = succState - neighbor
            predictedSuccState = State(state + delta, state.dimensions)

            probability = gaussian(distance, self.b_Sa) / denominator
            if not 0 <= probability <= 1:
                import sys
                warnings.warn("Invalid distances in KNN Model!")
                print(distances)
                # NOTE(review): terminates the whole process with status 0;
                # kept as-is because callers may rely on it.
                sys.exit(0)

            yield predictedSuccState, probability
 def __init__(self, codewords):
     """
     Constructor.
     Needs an initialized codebook, one code per line.
     """
     self._codebook = copy.deepcopy(codewords)
     self._nCodes = codewords.shape[0]
     self._codesize = codewords.shape[1]
     self._dist = euclidean_dist
     self._codebounds = np.ones([self._nCodes, self._nCodes]) * np.inf
     self._init_bounds()
     # test kd-tree
     self._kdtree = scipy.spatial.KDTree(self._codebook, leafsize=100)
     self._ckdtree = scipy.spatial.cKDTree(self._codebook, leafsize=100)
     # test ann kdtree
     tstart = time.time()
     self._ann = ann.kdtree(self._codebook)
     print 'time to build ann tree:', time.time() - tstart, 'seconds.'
Example #10
0
File: knn.py Project: mekruthi/mmlf
    def train(self, trainingSet):
        """ Trains the KNN function approximator with the given *trainingSet*. 
        
        The *trainingSet* (a mapping from (state, action) pairs to targets)
        replaces the stored examples.  For every action in ``self.actions``
        that has at least one example, its states are stacked into one
        array and indexed by a fresh kd-tree; actions without examples get
        ``None`` as their tree.
        """
        self.qValues = trainingSet

        # Group the example states by the action they were recorded for.
        statesByAction = defaultdict(list)
        for key, _target in trainingSet.iteritems():
            observedState, takenAction = key
            statesByAction[takenAction].append(observedState)

        for action in self.actions:
            samples = statesByAction[action]
            if not samples:
                self.actionsKDTree[action] = None
                continue
            self.states[action] = numpy.vstack(samples)
            self.actionsKDTree[action] = ann.kdtree(self.states[action])
 def __init__(self,codewords):
     """
     Constructor.
     Needs an initialized codebook, one code per line.
     """
     self._codebook = copy.deepcopy(codewords)
     self._nCodes = codewords.shape[0]
     self._codesize = codewords.shape[1]
     self._dist = euclidean_dist
     self._codebounds = np.ones([self._nCodes,self._nCodes]) * np.inf
     self._init_bounds()
     # test kd-tree
     self._kdtree = scipy.spatial.KDTree(self._codebook,leafsize=100)
     self._ckdtree = scipy.spatial.cKDTree(self._codebook,leafsize=100)
     # test ann kdtree
     tstart = time.time()
     self._ann = ann.kdtree(self._codebook)
     print 'time to build ann tree:',time.time()-tstart,'seconds.'
Example #12
0
    def getExplorationValue(self, state):
        """ Return the exploratory value of the given state *state*
        
        The exploratory value of a state under this model is defined simply as
        the sum of the activations of its k nearest neighbors, where the
        activation of a neighbor is ``exp(-distance / b_Sa**2)``.

        Returns 0.0 while no states are stored.
        """
        # "is None": with an ndarray, "== None" compares element-wise and the
        # result cannot be used as a single truth value.
        if self.states is None:
            return 0.0

        k = min(self.states.shape[0], self.k)

        # Rebuild the tree lazily after the stored states changed.
        if self.rebuildSucc:
            self.succKDTree = ann.kdtree(self.states)
            self.rebuildSucc = False

        indices, distances = self.succKDTree.knn(state, k)

        return numpy.sum(numpy.exp(-distances[0] / (self.b_Sa**2)))
Example #13
0
def find_n_nearest_neighbors_fast(embed, n=10):
    """Searches the n nearest neighbors in state-space.
    Takes an array of TD-Vectors as returned by make_time_delay_embedding
    where first index is tp, second index is component.
    Returns an array of shape (embed.shape[0],n) containing the indices 
    of the nearest neighbors for each tp.

    Prefers scikits.ann; if importing or querying it fails for any reason,
    scipy.spatial.KDTree is used instead."""
    assert len(
        embed.shape
    ) == 2, "Only 2d-arrays (one array of TD-Vectors) are supported at the moment."
    n = int(round(n))
    # A point is always its own nearest neighbour, so n + 1 neighbours are
    # requested and the first column is discarded.
    n_query = n + 1
    try:
        import scikits.ann as ann
        k = ann.kdtree(embed)
        rv_ar = k.knn(embed, n_query)[0][:, 1:]
    except Exception:
        from scipy.spatial import KDTree
        kdt = KDTree(embed)
        # Must also ask for n + 1 here; with k=n only n - 1 columns remain.
        rv_ar = kdt.query(kdt.data, k=n_query)[1][:, 1:]
    return rv_ar
Example #14
0
    def predicts(self, feats):
        """
        Returns two arrays: the best code per pattern (one entry per row
        of feats) and the squared distance to that code divided by the
        number of feature dimensions (feats.shape[1]).

        Note on method used:
        Uses ann if we have many features. Reason:
        during training, when we might look at one new sample
        at a time then update the codebook, building a kdtree is
        useless. If the model is trained and we want to predict on
        a large database, it's worth having the kdtree.
        ann is used when feats has more than 50 rows and ann was imported.
        """
        assert feats.shape[1] > 0, 'empty feats???'
        n_feats, feat_dim = feats.shape[0], feats.shape[1]
        use_ann = n_feats > 50 and _ann_imported
        if use_ann:
            kdtree = ann.kdtree(self._codebook)
            best_code_per_p, dists = self._closest_code_ann(feats, kdtree)
            best_code_per_p = np.array(best_code_per_p)
            # note that dists is already squared euclidean distance;
            # plain array division does not depend on map() returning a list
            avg_dists = np.asarray(dists, dtype=float) / feat_dim
            if np.isnan(dists).any():
                # sometimes ann has numerical errors, redo wrong ones
                nan_idx = np.where(np.isnan(avg_dists))[0]
                for idx in nan_idx:
                    code, dist = self._closest_code_batch(feats[idx])
                    best_code_per_p[idx] = int(code)
                    # the batch search returns a plain distance: square it
                    avg_dists[idx] = dist * dist * 1. / feat_dim
                assert not np.isnan(avg_dists).any(), 'NaN with ann not fixed'
        else:
            # prepare result
            best_code_per_p = np.zeros(n_feats)
            avg_dists = np.zeros(n_feats)
            # iterate over features
            for idx, f in enumerate(feats):
                code, dist = self._closest_code_batch(f)
                best_code_per_p[idx] = int(code)
                avg_dists[idx] = dist * dist * 1. / feat_dim
            assert not np.isnan(avg_dists).any(), 'NaN with regular code'
        # done, return two arrays
        return best_code_per_p, avg_dists
Example #15
0
    def predicts(self,feats):
        """
        Returns two arrays, best_code_per_pattern
        and average squared distance (squared distance to the best code
        divided by feats.shape[1]), one entry per row of feats.

        Note on method used:
        Uses ann if we have many features. Reason:
        during training, when we might look at one new sample
        at a time then update the codebook, building a kdtree is
        useless. If the model is trained and we want to predict on
        a large database, it's worth having the kdtree.
        The ann path is taken for more than 50 rows, provided ann was
        imported.
        """
        assert feats.shape[1] > 0,'empty feats???'
        n_rows = feats.shape[0]
        n_dims = feats.shape[1]
        use_ann = n_rows > 50 and _ann_imported
        if use_ann:
            kdtree = ann.kdtree(self._codebook)
            best_code_per_p, dists = self._closest_code_ann(feats,kdtree)
            best_code_per_p = np.array(best_code_per_p)
            # note that dists is already squared euclidean distance
            # (vectorized division; no reliance on map() returning a list)
            avg_dists = np.asarray(dists, dtype=float) / n_dims
            if np.isnan(dists).any():
                # sometimes ann has numerical errors, redo wrong ones
                for idx in np.where(np.isnan(avg_dists))[0]:
                    code,dist = self._closest_code_batch(feats[idx])
                    best_code_per_p[idx] = int(code)
                    # batch search gives a plain distance, hence the square
                    avg_dists[idx] = dist * dist * 1. / n_dims
                assert not np.isnan(avg_dists).any(),'NaN with ann not fixed'
        else:
            # prepare result
            best_code_per_p = np.zeros(n_rows)
            avg_dists = np.zeros(n_rows)
            # iterate over features
            for idx, f in enumerate(feats):
                code,dist = self._closest_code_batch(f)
                best_code_per_p[idx] = int(code)
                avg_dists[idx] = dist * dist * 1. / n_dims
            assert not np.isnan(avg_dists).any(),'NaN with regular code'
        # done, return two arrays
        return best_code_per_p, avg_dists
Example #16
0
    def __init__(self,
                 nactions,
                 input_ranges,
                 nelemns=[],
                 npoints=0,
                 k=1,
                 alpha=0.3,
                 lm=0.90):
        """Create the prototype points, the Q/trace tables and the kd-tree.

        Exactly one of *nelemns* and *npoints* must be given:

        nelemns -- forwarded to ``self.ndlinspace(input_ranges, nelemns)``
        npoints -- forwarded to ``self.CreateRandomSpace(input_ranges, npoints)``

        nactions     -- size of the action axis of ``Q``, ``e`` and ``ps``
        input_ranges -- iterable of ranges; element 0 of each entry is kept
                        as lower bound, element 1 as upper bound
        k, alpha, lm -- stored unchanged on the instance

        Raises ValueError when both or neither of nelemns / npoints is given.
        """
        # Plain truthiness on both sides.  The former "== False" test never
        # held for a list ([] == False is False), which made the check
        # reject every npoints-only call and accept a call with neither.
        if bool(nelemns) == bool(npoints):
            raise ValueError('Plese indicate either: [nelemns] Xor [npoints]')

        if nelemns:
            #self.cl = self.CreateFullspace(input_ranges,nelemns)
            self.cl = self.ndlinspace(input_ranges, nelemns)
        else:
            self.cl = self.CreateRandomSpace(input_ranges, npoints)

        self.lbounds = []
        self.ubounds = []

        self.k = k
        self.shape = self.cl.shape
        self.nactions = nactions
        # One row per prototype point, one column per action.
        self.Q = zeros((self.shape[0], nactions))
        self.ps = zeros((self.shape[0], nactions, self.shape[1]))
        self.e = zeros((self.shape[0], nactions)) + 0.0
        self.ac = []
        self.knn = []
        self.alpha = alpha
        self.lm = lm
        self.last_state = zeros((1, self.shape[1])) + 0.0
        self.next_state = array(self.last_state)

        for r in input_ranges:
            self.lbounds.append(r[0])
            self.ubounds.append(r[1])

        self.lbounds = array(self.lbounds)
        self.ubounds = array(self.ubounds)
        # RescaleInputs runs only once the bounds are available as arrays.
        self.cl = array(self.RescaleInputs(self.cl))
        self.knntree = kdtree(self.cl)
Example #17
0
    def getPredecessorDistribution(self, state):
        """ Return the predecessor distribution for the given *state*.
        
        Returns an iterator that yields pairs of possible predecessor states
        of *state* and their weights.  The (at most ``self.k``) nearest
        stored successor states are looked up in a kd-tree over
        ``self.succStates``, rebuilt when ``self.rebuildPred`` is set; for
        each one, the stored transition ``predecessor - neighbor`` is applied
        to *state* and weighted with
        ``gaussian(distance, self.b_Sa) / denominator``.

        Raises ModelNotInitialized (on first iteration) if no successor
        states are stored yet.
        """
        # "is None": comparing an ndarray with == is element-wise.
        if self.succStates is None:
            raise ModelNotInitialized()

        # Bound k by the array the tree is actually built from.
        k = min(self.succStates.shape[0], self.k)

        if self.rebuildPred:
            self.predKDTree = ann.kdtree(self.succStates)
            self.rebuildPred = False

        indices, distances = self.predKDTree.knn(state, k)
        neighborIndices = indices[0]
        neighborDistances = distances[0]

        denominator = numpy.sum(
            numpy.exp(-neighborDistances / (self.b_Sa**2)))

        # If the distances become too large, then all values can become zero
        # In this situation, we simply return the closest state and probability 1.
        # NaN is treated the same way, as in getSuccessorDistribution.
        if denominator == 0 or numpy.isnan(denominator):
            import warnings
            warnings.warn("Too large distances, returing only closest example")
            # Keep exactly one neighbor.  Assigning a one-element list into
            # the ndarray row would broadcast and repeat it k times.
            neighborIndices = neighborIndices[:1]
            neighborDistances = [0.0]
            denominator = numpy.exp(0.0 / (self.b_Sa**2))

        for index, distance in zip(neighborIndices, neighborDistances):
            neighbor = State(
                self.succStates[index],
                state.dimensions)  # TODO: not use state.dimensions
            predState, reward = self.predecessorSamples[neighbor]

            delta = predState - neighbor
            predictedPredState = State(state + delta, state.dimensions)

            yield predictedPredState, gaussian(distance,
                                               self.b_Sa) / denominator
Example #18
0
    def getExpectedReward(self, state):
        """ Returns the expected reward for the given state 

        The reward stored for each of the (at most ``self.k``) nearest
        states is weighted with ``gaussian(distance, self.b_Sa) /
        denominator`` and summed.  Returns 0.0 while no states are stored.
        """
        # "is None": comparing an ndarray with == is element-wise.
        if self.states is None:
            return 0.0

        k = min(self.states.shape[0], self.k)

        if self.rebuildSucc:
            self.succKDTree = ann.kdtree(self.states)
            self.rebuildSucc = False

        indices, distances = self.succKDTree.knn(state, k)
        neighborIndices = indices[0]
        neighborDistances = distances[0]

        denominator = numpy.sum(
            numpy.exp(-neighborDistances / (self.b_Sa**2)))

        # If the distances become too large, then all values can become zero
        # In this situation, we simply return the closest state and probability 1.
        # NaN is treated the same way, as in getSuccessorDistribution.
        if denominator == 0 or numpy.isnan(denominator):
            import warnings
            warnings.warn(
                "Too large distances, returning only closest example")
            # Keep exactly one neighbor.  Assigning a one-element list into
            # the ndarray row would broadcast, and the closest reward would
            # then be counted k times.
            neighborIndices = neighborIndices[:1]
            neighborDistances = [0.0]
            denominator = numpy.exp(0.0 / (self.b_Sa**2))

        expectedReward = 0.0
        for index, distance in zip(neighborIndices, neighborDistances):
            neighbor = State(
                self.states[index],
                state.dimensions)  # TODO: not use state.dimensions

            succState, reward = self.successorSamples[neighbor]

            weight = gaussian(distance, self.b_Sa) / denominator
            expectedReward += reward * weight

        return expectedReward
Example #19
0
def _obj_det_match(cells,
                   db,
                   obj_tabname,
                   det_tabname,
                   o2d_tabname,
                   radius,
                   explist=None,
                   _rematching=False):
    """
    Cross-match the detections of one spatial cell against its objects.

    This is a generator: it yields one 6-tuple per detection cell (see
    "Yields" below), appending new objects to the object table and
    object-to-detection links to the join table as it goes.

    Parameters:
       cells        -- tuple (obj_cell, det_cells): the object cell and the
                       detection cells falling under it. det_cells is
                       sorted in place and must not be empty.
       db           -- database handle; table(name) and query(str) are used.
       obj_tabname  -- name of the object table.
       det_tabname  -- name of the detection table.
       o2d_tabname  -- name of the object-to-detection join table.
       radius       -- match radius: a detection whose gc_dist() to its
                       nearest object is >= radius is treated as unmatched
                       (units are those of gc_dist -- TODO confirm).
       explist      -- optional iterable of exposure IDs; when given, only
                       detections whose _EXP is in it are processed.
       _rematching  -- when true, an assertion fails if any new object would
                       be appended to the object table.

    Yields:
       (None, None, None, None, None, None) for a detection cell that is
       skipped, otherwise
       (number of unique exposures, number of objects before processing the
       cell, number of detections processed, number of newly added objects,
       number of join rows stored, number of non-cached detections).

    This kernel assumes:
       a) det_table and obj_table have equal partitioning (equally
          sized/enumerated spatial cells)
       b) both det_table and obj_table have up-to-date neighbor caches
       c) temporal det_table cells within this spatial cell are stored
          local to this process (relevant for shared-nothing setups)
       d) exposures don't stretch across temporal cells

    Algorithm:
       - fetch all existing static sky objects, including the cached ones (*)
       - project them to tangent plane around the center of the cell
         (we assume the cell is small enough for the distortions not to matter)
       - construct a kD tree in (x, y) tangent space
       - for each temporal cell, in sorted order (++):
            1.) Fetch the detections, including the cached ones (+)
            2.) Project to tangent plane

            3.) for each exposure, in sorted order (++):
                a.) Match against the kD tree of objects
                b.) Add those that didn't match to the list of objects

            4.) For newly added objects: store to disk only those that
                fall within this cell (the others will be matched and
                stored in their parent cells)

            5.) For matched detections: Drop detections matched to cached
                objects (these will be matched and stored in the objects'
                parent cell). Store the rest.


       (+) It is allowed (and necessary to allow) for a cached detection
                to be matched against an object within our cell.  This
                correctly matches cases when the object is right inside
                the cell boundary, but the detection is just to the
                outside.

       (++) Having cells and detections sorted ensures that objects in overlapping
            (cached) regions are seen by kernels in different cells in the same
            order, thus resulting in the same matches. Note: this may fail in
            extremely crowded regions, but as of now it's not clear how big of
            a problem (if any!) this will pose.

       (*) Cached objects must be loaded and matched against to guard against
           the case where an object is just outside the edge, while a detection
           is just inside. If the cached object was not loaded, the detection
           would not match and be proclaimed to be a new object. However, in the
           cached object's parent cell, the detection would match to the object
           and be stored there as well.

           The algorithm above ensures that such a detection will be matched to
           the cached object in this cell (and be dropped in step 5), preventing
           it from being promoted into a new object.

       TODO: The above algorithm ensures no detection is assigned to more than
            one object. It also ensures that each detection links to an object.
            Implement a consistency check to verify that.
    """

    from scikits.ann import kdtree

    # Input is a tuple of obj_cell, and det_cells falling under that obj_cell
    obj_cell, det_cells = cells
    det_cells.sort()
    assert len(det_cells)

    # Fetch the frequently used bits
    obj_table = db.table(obj_tabname)
    det_table = db.table(det_tabname)
    o2d_table = db.table(o2d_tabname)
    pix = obj_table.pix

    # locate cell center (for gnomonic projection)
    (bounds, tbounds) = pix.cell_bounds(obj_cell)
    (clon, clat) = bhpix.deproj_bhealpix(*bounds.center())

    # fetch existing static sky, convert to gnomonic
    objs = db.query('_ID, _LON, _LAT FROM %s' % obj_tabname).fetch_cell(
        obj_cell, include_cached=True)
    xyobj = np.column_stack(gnomonic(objs['_LON'], objs['_LAT'], clon, clat))
    nobj = len(objs)  # Total number of static sky objects
    # The kD-tree is built lazily and rebuilt whenever the number of
    # projected objects differs from nobj_old (see the exposure loop).
    tree = None
    nobj_old = 0

    # for sanity checks/debugging (see below)
    expseen = set()

    ## TODO: Debugging, remove when happy
    assert (np.unique(sorted(det_cells)) == sorted(det_cells)).all()
    ##print "Det cells: ", det_cells

    # Loop, xmatch, and store
    if explist is not None:
        explist = np.asarray(list(explist),
                             dtype=np.uint64)  # Ensure explist is a ndarray
    det_query = db.query('_ID, _LON, _LAT, _EXP, _CACHED FROM %s' %
                         det_tabname)
    for det_cell in sorted(det_cells):
        # fetch detections in this cell, convert to gnomonic coordinates
        # keep only detections with _EXP in explist, unless explist is None
        detections = det_query.fetch_cell(det_cell, include_cached=True)

        # if there are no preexisting static sky objects, and all detections in this cell are cached,
        # there's no way we'll get a match that will be kept in the end. Just continue to the
        # next one if this is the case.
        cachedonly = len(objs) == 0 and detections._CACHED.all()
        if cachedonly:
            # print "Skipping cached-only", len(cached)
            yield (
                None, None, None, None, None, None
            )  # Yield just to have the progress counter properly incremented
            continue

        if explist is not None:
            keep = np.in1d(detections._EXP, explist)
            if not np.all(keep):
                detections = detections[keep]
            if len(detections) == 0:
                yield (
                    None, None, None, None, None, None
                )  # Yield just to have the progress counter properly incremented
                continue
        _, ra2, dec2, exposures, cached = detections.as_columns()
        detections.add_column('xy',
                              np.column_stack(gnomonic(ra2, dec2, clon, clat)))

        # prep join table
        join = ColGroup(dtype=o2d_table.dtype_for(
            ['_ID', '_M1', '_M2', '_DIST', '_LON', '_LAT']))
        njoin = 0
        nobj0 = nobj  # object count before this detection cell is processed

        ##print "Cell", det_cell, " - Unique exposures: ", set(exposures)

        # Process detections exposure-by-exposure, as detections from
        # different exposures within a same temporal cell are allowed
        # to belong to the same object
        uexposures = set(exposures)
        for exposure in sorted(uexposures):
            # Sanity check: a consistent table cannot have two
            # exposures stretching over more than one cell
            assert exposure not in expseen
            expseen.add(exposure)

            # Extract objects belonging to this exposure only
            detections2 = detections[exposures == exposure]
            id2, ra2, dec2, _, _, xydet = detections2.as_columns()
            ndet = len(xydet)

            if len(xyobj) != 0:
                # Construct kD-tree and find the object nearest to each
                # detection from this cell
                if tree is None or nobj_old != len(xyobj):
                    del tree
                    nobj_old = len(xyobj)
                    tree = kdtree(xyobj)
                match_idx, match_d2 = tree.knn(xydet, 1)
                match_idx = match_idx[:, 0]  # First neighbor only

                ####
                #if np.uint64(13828114484734072082) in id2:
                #	np.savetxt('bla.%d.static=%d.txt' % (det_cell, pix.static_cell_for_cell(det_cell)), objs.as_ndarray(), fmt='%s')

                # Compute accurate distances, and select detections not matched to existing objects
                dist = gc_dist(objs['_LON'][match_idx],
                               objs['_LAT'][match_idx], ra2, dec2)
                unmatched = dist >= radius
            else:
                # All detections will become new objects (and therefore, dist=0)
                dist = np.zeros(ndet, dtype='f4')
                unmatched = np.ones(ndet, dtype=bool)
                match_idx = np.empty(ndet, dtype='i4')

            # x, y, t = pix._xyt_from_cell_id(det_cell)
            # print "det_cell %s, MJD %s, Exposure %s  ==  %d detections, %d objects, %d matched, %d unmatched" % (det_cell, t, exposure, len(detections2), nobj, len(unmatched)-unmatched.sum(), unmatched.sum())

            # Promote unmatched detections to new objects
            _, newra, newdec, _, _, newxy = detections2[unmatched].as_columns()
            nunmatched = unmatched.sum()
            reserve_space(objs, nobj + nunmatched)
            objs['_LON'][nobj:nobj + nunmatched] = newra
            objs['_LAT'][nobj:nobj + nunmatched] = newdec
            dist[unmatched] = 0.
            match_idx[unmatched] = np.arange(
                nobj, nobj + nunmatched, dtype='i4'
            )  # Set the indices of unmatched detections to newly created objects

            # Join objects to their detections
            reserve_space(join, njoin + ndet)
            join['_M1'][njoin:njoin + ndet] = match_idx
            join['_M2'][njoin:njoin + ndet] = id2
            join['_DIST'][njoin:njoin + ndet] = dist
            # TODO: For debugging; remove when happy
            join['_LON'][njoin:njoin + ndet] = ra2
            join['_LAT'][njoin:njoin + ndet] = dec2
            njoin += ndet

            # Prep for next loop: the new objects take part in the matching
            # of the following exposures
            nobj += nunmatched
            xyobj = np.append(xyobj, newxy, axis=0)

            # TODO: Debugging: Final consistency check (remove when happy with the code)
            dist = gc_dist(objs['_LON'][join['_M1'][njoin - ndet:njoin]],
                           objs['_LAT'][join['_M1'][njoin - ndet:njoin]], ra2,
                           dec2)
            assert (dist < radius).all()

        # Truncate output tables to their actual number of elements
        objs = objs[0:nobj]
        join = join[0:njoin]
        assert len(objs) >= nobj0

        # Find the objects that fall outside of cell boundaries. These will
        # be processed and stored by their parent cells. Also leave out the objects
        # that are already stored in the database
        (x, y) = bhpix.proj_bhealpix(objs['_LON'], objs['_LAT'])
        in_ = bounds.isInsideV(x, y)
        innew = in_.copy()
        innew[:nobj0] = False  # New objects in cell selector

        ids = objs['_ID']
        nobjadded = innew.sum()
        if nobjadded:
            # Append the new objects to the object table, obtaining their IDs.
            assert not _rematching, 'cell_id=%s, nnew=%s\n%s' % (
                det_cell, nobjadded, objs[innew])
            ids[innew] = obj_table.append(objs[('_LON', '_LAT')][innew])

        # Set the indices of objects not in this cell to zero (== a value
        # no valid object in the database can have). Therefore, all
        # out-of-bounds links will have _M1 == 0 (#1), and will be removed
        # by the np.in1d call (#2)
        ids[~in_] = 0

        # 1) Change the relative index to true obj_id in the join table
        join['_M1'] = ids[join['_M1']]

        # 2) Keep only the joins to objects inside the cell
        join = join[np.in1d(join['_M1'], ids[in_])]

        # Append to the join table, in *dec_cell* of obj_table (!important!)
        if len(join) != 0:
            # compute the cell_id part of the join table's
            # IDs. While this is unimportant now (as we could
            # just set all of them equal to cell_id part of
            # cell_id), if we ever decide to change the
            # pixelation of the table later on, this will
            # allow us to correctly split up the join table as
            # well.
            #_, _, t    = pix._xyt_from_cell_id(det_cell)	# This row points to a detection in the temporal cell ...
            #x, y, _, _ = pix._xyti_from_id(join['_M1'])	# ... but at the spatial location given by the object table.
            #join['_ID'][:] = pix._id_from_xyti(x, y, t, 0) # This will make the new IDs have zeros in the object part (so Table.append will autogen them)
            join['_ID'][:] = det_cell

            o2d_table.append(join)

        assert not cachedonly or (nobjadded == 0 and len(join) == 0)

        # return: Number of exposures, number of objects before processing this cell, number of detections processed (incl. cached),
        #         number of newly added objects, number of detections xmatched, number of detection processed that weren't cached
        # Note: some of the xmatches may be to newly added objects (e.g., if there are two
        #       overlapping exposures within a cell; first one will add new objects, second one will match against them)
        yield (len(uexposures), nobj0, len(detections), nobjadded, len(join),
               (cached == False).sum())
Example #20
0
if __name__ == '__main__':
    # Benchmark: compare KnnFinder against the scikits.ann kd-tree on random
    # data, printing both run times and the disagreement between the two.

    import scikits.ann as ann
    #np.set_printoptions(threshold = np.nan)
    k = 200
    points = np.random.rand(500000, 64).astype(np.float32)
    queries = np.random.rand(500, 64).astype(np.float32)

    t0 = time.time()
    gpu_knn = KnnFinder(points)
    our_indexies, our_distances = gpu_knn.get_knn(queries, k)

    t1 = time.time()
    print(t1 - t0)

    truth_tree = ann.kdtree(points)
    truth_points, truth_distances = truth_tree.knn(queries, k)
    # the square root is applied before comparing with KnnFinder's distances
    truth_distances = np.sqrt(truth_distances)

    t2 = time.time()

    print(t2 - t1)
    #print truth_points

    error = np.abs(our_distances - truth_distances) / truth_distances

    index_error = our_indexies - truth_points
    print("----")
    # mean relative distance error (NaN entries are ignored by nansum)
    print(np.nansum(error) / error.size)
    # fraction of neighbour slots whose index differs; float() keeps the
    # ratio from being truncated to 0 by integer division
    print(np.sum(np.abs(index_error) > 0) / float(index_error.size))
    
    def makePredictions((mu, sigma, vt, a_to_u, smat, unsmat)):
        sys.stderr.write("Analysis Completed, Beginning Making Predictions\n")
        def getvData():
            vtfile = open(sys.argv[2], 'r')
            vvfile = open(sys.argv[3], 'r')
            d_i, d_j, d_v = getData(vtfile)
            sys.stderr.write("Reading Validation Validation Data\n")
            v_i, v_j, v_v = getData(vvfile)
            return (d_i, d_j, d_v), (v_i, v_j, v_v)

        def getProjectedLocs(smat, vt):
            return smat * vt.T

        def actuallyMakePredictions(((d_i, d_j, d_v), (v_i, v_j, v_v), vt, mudict, sigmadict, smat, unsmat)):
            sys.stderr.write("Making " + str(len(v_i)) + " Predictions, Analyzing Performance, and Outputting Errors\n")
            u_to_index = {key : [] for key in set(d_i)}
            for i in xrange(len(d_i)):
                u_to_index[d_i[i]].append(i)
            plocs = getProjectedLocs(smat, vt) # plocs is a map from users to projected coordinates

            sys.stderr.write("Creating KDTrees\n")
            a_to_kdtree = {}
            a_to_kdtreemap = {}

            for test_no in xrange(len(v_v)):
                sys.stderr.write("Executing Test Number " + str(test_no) + " out of " + str(len(v_v)) + "\n")
    #default prediction
                prediction = 5
                writeflag = True
                if v_j[test_no] in u_to_index:
                    if test_no in u_to_index:
            #normalization and centering
                        for index in u_to_index[test_no]:
                            try: #if data[index][1] in aset:
                                d_v[index] -= mudict[d_j[index]]
                                if not sigmadict[d_j[index]] == 0:
                                    d_v[index] /= sigmadict[d_j[index]]
                            except KeyError:
                                u_to_index[test_no].remove(index)
                        coordinates = []
                        for cur_eig in vt:
                            coordinates.append(sum(d_v[x]*cur_eig[d_j[x]] for x in u_to_index[test_no]))
                        coordinates = np.array(coordinates)

                        k = int(sys.argv[4])
                        rel_users = list(a_to_u[v_j[test_no]])
                        sys.stderr.write("Found " + str(len(rel_users)) + " Relevant Users\n")
                        k = max(1, min(k, len(rel_users) / 2))
                        #manual method
#                        knn = []
#                        mindist = [k*100] * k #fragile
#                        knn = [-1] * k
#                        for user in rel_users:
#                            d = np.linalg.norm(coordinates - plocs[user])
#                            for i in xrange(k):
#                                if d < mindist[i]:
#                                    mindist.insert(i, d)
#                                    mindist.pop()
#                                    knn.insert(i, user)
#                                    knn.pop()
#                                    break
                        
                        if not v_j[test_no] in a_to_kdtree:
                            sys.stderr.write("Building KDTree\n")
                            a_to_kdtreemap[v_j[test_no]] = rel_users
                            a_to_kdtree[v_j[test_no]] = ann.kdtree(np.array([plocs[u].tolist() for u in rel_users]))
                        knn = a_to_kdtree[v_j[test_no]].knn(coordinates, k)[0][0]
                        knn_ratings = [unsmat[a_to_kdtreemap[v_j[test_no]][x]][v_j[test_no]] for x in knn]
                        mmm = sys.argv[5]
                        if mmm == "mean":
                            prediction = np.mean(knn_ratings)
                        elif mmm == "median":
                            prediction = np.median(knn_ratings)
                        else:
                            prediction = np.argmax(np.bincount(knn_ratings))

                    else:
                        writeflag = False
                        prediction = mudict[v_j[test_no]]
                sys.stderr.write("Printing Prediction\n")
                sys.stderr.write("Predicted: " + str(prediction) + " Actual: " + str(v_v[test_no]) +"\n")
                print prediction - v_v[test_no]
Example #22
0
def process_filelist_test(filelist=None,model=None,tmpfilename=None,K=1):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs) and store the real and predicted artist of each.
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and artist_id for all train songs
       tmpfilename  - where to save the real / predicted artist ids
       K            - K-nn parameter (default=1)
    Prints an error and returns without doing anything if tmpfilename
    already exists or model does not exist.
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None,'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model',model,'does not exist.'
        return
    # dimension of the stored feature vectors
    finaldim = 90
    h5model = tables.openFile(model, mode='r')
    output = None
    kd = None
    try:
        # create kdtree
        assert h5model.root.data.feats.shape[1]==finaldim,'inconsistency in final dim'
        kd = ANN.kdtree(h5model.root.data.feats)
        # create outputfile
        output = tables.openFile(tmpfilename, mode='a')
        group = output.createGroup("/",'data','TMP FILE FOR ARTIST RECOGNITION')
        output.createEArray(group,'artist_id_real',tables.StringAtom(18,shape=()),(0,),'',
                            expectedrows=len(filelist))
        output.createEArray(group,'artist_id_pred',tables.StringAtom(18,shape=()),(0,),'',
                            expectedrows=len(filelist))
        # iterate over files
        for cnt_f, f in enumerate(filelist, 1):
            # verbose
            if cnt_f % 50000 == 0:
                print 'testing... checking file #',cnt_f
            # check what file/song is this
            h5 = GETTERS.open_h5_file_read(f)
            artist_id = GETTERS.get_artist_id(h5)
            track_id = GETTERS.get_track_id(h5)
            # NOTE(review): this filter and its message look copied from the
            # training routine -- confirm that skipping tracks that ARE in
            # testsongs is what the test pass wants.
            if track_id in testsongs: # just in case, but should not be necessary
                print 'Found test track_id during training? weird.',track_id
                h5.close()
                continue
            # extract features, then close file
            processed_feats = compute_features(h5)
            h5.close()
            if processed_feats is None:
                continue
            # do prediction
            artist_id_pred = do_prediction(processed_feats,kd,h5model,K)
            # save real and predicted artist to tmp file
            output.root.data.artist_id_real.append( np.array( [artist_id] ) )
            output.root.data.artist_id_pred.append( np.array( [artist_id_pred] ) )
    finally:
        # Drop the tree before closing the model it was built from, then
        # close both files even when an error interrupted the run (the model
        # file used to be left open on every path).
        del kd
        h5model.close()
        if output is not None:
            output.close()
    return
Example #23
0
def process_filelist_test(filelist=None,
                          model=None,
                          tmpfilename=None,
                          npicks=None,
                          winsize=None,
                          finaldim=None,
                          K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list and store, for every song
    with a known year, the real year and the K-nn predicted year.
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save the real / predicted years
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff', 'cov' or 'avgcov'
                      must be the same as in training
    Prints an error and returns without doing anything if tmpfilename
    already exists or model does not exist.
    Raises AssertionError on an unknown typecompress or when the model's
    feature dimension differs from finaldim.
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    ndim = 12  # fixed in this dataset
    h5model = tables.openFile(model, mode='r')
    output = None
    kd = None
    try:
        # create kdtree
        assert h5model.root.data.feats.shape[
            1] == finaldim, 'inconsistency in final dim'
        kd = ANN.kdtree(h5model.root.data.feats)
        # create outputfile
        output = tables.openFile(tmpfilename, mode='a')
        group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
        output.createEArray(group,
                            'year_real',
                            tables.IntAtom(shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        output.createEArray(group,
                            'year_pred',
                            tables.Float64Atom(shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        # random projection, plus the compressor applied to the raw timbre
        # matrix (None for 'picks', which goes through get_bttimbre instead)
        if typecompress == 'picks':
            randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            compress_timbres = None
        elif typecompress == 'corrcoeff':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress_timbres = CBTF.corr_and_compress
        elif typecompress == 'cov':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress_timbres = CBTF.cov_and_compress
        elif typecompress == 'avgcov':
            randproj = RANDPROJ.proj_point5(90, finaldim)
            compress_timbres = CBTF.avgcov_and_compress
        else:
            # explicit raise so the check survives `python -O`
            raise AssertionError('Unknown type of compression: ' + str(typecompress))
        # go through files
        for cnt_f, f in enumerate(filelist, 1):
            if cnt_f % 5000 == 0:
                print 'TESTING FILE #' + str(cnt_f)
            # Read everything needed from the song file in a single open;
            # the timbre modes used to reopen the same file a second time.
            timbres = None
            h5 = GETTERS.open_h5_file_read(f)
            try:
                year = GETTERS.get_year(h5)
                no_year = year <= 0
                if not no_year and compress_timbres is not None:
                    timbres = GETTERS.get_segments_timbre(h5).T
            finally:
                h5.close()
            if no_year:  # probably useless but...
                continue
            if compress_timbres is None:
                bttimbre = get_bttimbre(f)
                if bttimbre is None:
                    continue
                processed_feats = CBTF.extract_and_compress(bttimbre,
                                                            npicks,
                                                            winsize,
                                                            finaldim,
                                                            randproj=randproj)
            else:
                processed_feats = compress_timbres(timbres,
                                                   finaldim,
                                                   randproj=randproj)
            if processed_feats is None:
                continue
            if processed_feats.shape[0] == 0:
                continue
            # do prediction
            year_pred = do_prediction(processed_feats, kd, h5model, K)
            # add pred and ground truth to output
            if not year_pred is None:
                output.root.data.year_real.append([year])
                output.root.data.year_pred.append([year_pred])
    finally:
        # close output and model, also when an error interrupted the run
        del kd
        h5model.close()
        if output is not None:
            output.close()
    # done
    return
        timekd += time.time() - tstart
        # ckdtree
        tstart = time.time()
        idx4 = cb.closest_code_ckdtree(sample)
        timeckd += time.time() - tstart
        # ann
        tstart = time.time()
        idx5 = cb.closest_code_ann(sample)
        timeann += time.time() - tstart
        # batch
        tstart = time.time()
        idx6 = cb.closest_code_batch(sample)
        timebatch += time.time() - tstart
        # ann redoing codebook
        tstart = time.time()
        cb._ann = ann.kdtree(cb._codebook)
        idx7 = cb.closest_code_ann(sample)
        timeann2 += time.time() - tstart
        # checking
        assert idx2 == idx3 or (cb[idx2] == cb[idx3]).all()
        assert idx2 == idx4 or (cb[idx2] == cb[idx4]).all()
        assert idx2 == idx5 or (cb[idx2] == cb[idx5]).all()
        assert idx2 == idx6 or (cb[idx2] == cb[idx6]).all()
        assert idx2 == idx7 or (cb[idx2] == cb[idx7]).all()
        assert idx2 == idxes[sampleidx] or (cb[idx2] == cb[idxes[sampleidx]]).all()

    #print 'time for fast algo:',timefast,'seconds.'
    print 'time for slow algo:        ',timeslow,'seconds.'
    print 'time for kd algo:          ',timekd,'seconds.'
    print 'time for ckd algo:         ',timeckd,'seconds.'
    print 'time for ann algo:         ',timeann,'seconds.'
Example #25
0
File: tasks.py Project: banados/lsd
def _xmatch_mapper(qresult, tabname_to, radius, tabname_xm, n_neighbors):
	"""
	Cross-match mapper, run once per cell of `qresult`.

	For every cell:
		- fetch the rows of `tabname_to` for that cell (cached
		  neighbors included) and build an ANN kD-tree over them
		- look up, for each row of the cell, its nearest
		  min(n_neighbors, number of fetched rows) entries in the tree
		- keep the pairs whose _DIST is below `radius` and append them
		  to the `tabname_xm` index table

	Yields one 3-tuple per cell: (len(id1), len(id2), len(join)) when
	pairs were stored, (len(rows), 0, 0) otherwise.
	"""
	from scikits.ann import kdtree

	pix      = qresult.pix
	db       = qresult.db
	xm_table = db.table(tabname_xm)
	to_query = '_ID, _LON, _LAT FROM %s' % tabname_to

	for rows in qresult:
		cell_id = rows.info.cell_id

		links = ColGroup(dtype=[('_M1', 'u8'), ('_M2', 'u8'), ('_DIST', 'f4'), ('_NR', 'u1'), ('_LON', 'f8'), ('_LAT', 'f8')])

		id1, ra1, dec1 = rows.as_columns()
		id2, ra2, dec2 = db.query(to_query).fetch_cell(cell_id, include_cached=True).as_columns()

		if len(id2) > 0:
			# Work in the tangent plane around the cell center; the cell is
			# assumed small enough that distortions are negligible and
			# Euclidean distances can be used for the neighbor search.
			bounds, _  = pix.cell_bounds(cell_id)
			clon, clat = bhpix.deproj_bhealpix(*bounds.center())
			xy1 = np.column_stack(gnomonic(ra1, dec1, clon, clat))
			xy2 = np.column_stack(gnomonic(ra2, dec2, clon, clat))

			# For every row of the cell, find its nearest neighbors among
			# the rows fetched from tabname_to
			tree = kdtree(xy2)
			match_idxs, match_d2 = tree.knn(xy1, min(n_neighbors, len(xy2)))
			del tree

			# Fill the index table: the k-th neighbor of each row goes to
			# every nn-th slot, starting at offset k
			links.resize(match_idxs.size)
			nn = match_idxs.shape[1]
			for k in xrange(nn):
				picked = match_idxs[:,k]
				slots  = slice(k, None, nn)
				links['_M1'][slots]   = id1
				links['_M2'][slots]   = id2[picked]
				links['_DIST'][slots] = gc_dist(ra1, dec1, ra2[picked], dec2[picked])
				links['_LON'][slots]  = ra2[picked]
				links['_LAT'][slots]  = dec2[picked]
				links['_NR'][slots]   = k

			# Drop the pairs that are not closer than the xmatch radius
			links = links[links['_DIST'] < radius]

		if len(links) == 0:
			yield len(rows), 0, 0
			continue

		# Derive the cell part of the index table IDs from the matched
		# row's own ID. Today this equals cell_id for every row, but doing
		# it this way keeps the index table re-pixelizable if the table's
		# pixelation ever changes.
		#x, y, t, _  = pix._xyti_from_id(join['_M1'])	# ... but at the spatial location given by the object table.
		#join['_ID'] = pix._id_from_xyti(x, y, t, 0)     # This will make the new IDs have zeros in the object part (so Table.append will autogen them)

		# TODO: Allow the stuff above (in Table.append)
		links['_ID'] = pix.cell_for_id(links['_M1'])

		# TODO: Debugging, remove when happy
		seen_cells = np.unique(pix.cell_for_id(links['_ID']))
		assert len(seen_cells) == 1, len(seen_cells)
		assert seen_cells[0] == cell_id, '%s %s' % (seen_cells[0], cell_id)
		####

		xm_table.append(links)

		yield len(id1), len(id2), len(links)
Example #26
0
File: smf.py Project: banados/lsd
def _obj_det_match(cells, db, obj_tabname, det_tabname, o2d_tabname, radius, explist=None, _rematching=False):
	"""
	Match the detections of one spatial cell to static-sky objects,
	promoting unmatched detections to new objects.

	Parameters:
	   cells        - tuple (obj_cell, det_cells): an object-table cell
	                  and the detection-table cells falling under it
	   db           - database the three tables are looked up in
	   obj_tabname  - name of the object table (new objects are appended)
	   det_tabname  - name of the detection table
	   o2d_tabname  - name of the object-to-detection table (links are
	                  appended)
	   radius       - a detection matches its nearest object only if their
	                  gc_dist separation is below this value
	   explist      - optional collection of _EXP values; when given, only
	                  detections from those exposures are processed
	   _rematching  - when true, having to add a new object trips an
	                  assertion

	Yields, once per detection cell, the tuple
	   (number of exposures, number of objects before processing the cell,
	    number of detections processed, number of newly added objects,
	    number of links stored, number of non-cached detections)
	or a tuple of six None values for cells that were skipped.

	This kernel assumes:
	   a) det_table and obj_table have equal partitioning (equally
	      sized/enumerated spatial cells)
	   b) both det_table and obj_table have up-to-date neighbor caches
	   c) temporal det_table cells within this spatial cell are stored
	      local to this process (relevant for shared-nothing setups)
	   d) exposures don't stretch across temporal cells

	Algorithm:
	   - fetch all existing static sky objects, including the cached ones (*)
	   - project them to tangent plane around the center of the cell
	     (we assume the cell is small enough for the distortions not to matter)
	   - construct a kD tree in (x, y) tangent space
	   - for each temporal cell, in sorted order (++):
	   	1.) Fetch the detections, including the cached ones (+)
	   	2.) Project to tangent plane

	   	3.) for each exposure, in sorted order (++):
		    a.) Match against the kD tree of objects
		    b.) Add those that didn't match to the list of objects

		4.) For newly added objects: store to disk only those that
		    fall within this cell (the others will be matched and
		    stored in their parent cells)

		5.) For matched detections: Drop detections matched to cached
		    objects (these will be matched and stored in the objects'
		    parent cell). Store the rest.


	   (+) It is allowed (and necessary to allow) for a cached detection
		    to be matched against an object within our cell.  This
		    correctly matches cases when the object is right inside
		    the cell boundary, but the detection is just to the
		    outside.

	   (++) Having cells and detections sorted ensures that objects in overlapping
	        (cached) regions are seen by kernels in different cells in the same
	        order, thus resulting in the same matches. Note: this may fail in
	        extremely crowded regions, but as of now it's not clear how big of
	        a problem (if any!) this will pose.

	   (*) Cached objects must be loaded and matched against to guard against
	       the case where an object is just outside the edge, while a detection
	       is just inside. If the cached object was not loaded, the detection
	       would not match and be proclaimed to be a new object. However, in the
	       cached object's parent cell, the detection would match to the object
	       and be stored there as well.

	       The algorithm above ensures that such a detection will be matched to
	       the cached object in this cell (and be dropped in step 5), preventing
	       it from being promoted into a new object.

	   TODO: The above algorithm ensures no detection is assigned to more than
	   	one object. It also ensures that each detection links to an object.
	   	Implement a consistency check to verify that.
	"""

	from scikits.ann import kdtree

	# Input is a tuple of obj_cell, and det_cells falling under that obj_cell
	obj_cell, det_cells = cells
	det_cells.sort()	# NOTE: sorts the caller's list in place
	assert len(det_cells)

	# Fetch the frequently used bits
	obj_table = db.table(obj_tabname)
	det_table = db.table(det_tabname)
	o2d_table = db.table(o2d_tabname)
	pix = obj_table.pix

	# locate cell center (for gnomonic projection)
	(bounds, tbounds)  = pix.cell_bounds(obj_cell)
	(clon, clat) = bhpix.deproj_bhealpix(*bounds.center())

	# fetch existing static sky, convert to gnomonic
	objs  = db.query('_ID, _LON, _LAT FROM %s' % obj_tabname).fetch_cell(obj_cell, include_cached=True)
	xyobj = np.column_stack(gnomonic(objs['_LON'], objs['_LAT'], clon, clat))
	nobj  = len(objs)	# Total number of static sky objects
	tree  = None		# kD-tree over xyobj, built lazily in the exposure loop
	nobj_old = 0		# len(xyobj) at the time `tree` was last built

	# for sanity checks/debugging (see below)
	expseen = set()

	## TODO: Debugging, remove when happy
	assert (np.unique(sorted(det_cells)) == sorted(det_cells)).all()
	##print "Det cells: ", det_cells

	# Loop, xmatch, and store
	if explist is not None:
		explist = np.asarray(list(explist), dtype=np.uint64)	# Ensure explist is a ndarray
	det_query = db.query('_ID, _LON, _LAT, _EXP, _CACHED FROM %s' % det_tabname)
	for det_cell in sorted(det_cells):
		# fetch detections in this cell, convert to gnomonic coordinates
		# keep only detections with _EXP in explist, unless explist is None
		detections = det_query.fetch_cell(det_cell, include_cached=True)

		# if there are no preexisting static sky objects, and all detections in this cell are cached,
		# there's no way we'll get a match that will be kept in the end. Just continue to the
		# next one if this is the case.
		cachedonly = len(objs) == 0 and detections._CACHED.all()
		if cachedonly:
#			print "Skipping cached-only", len(cached)
			yield (None, None, None, None, None, None) # Yield just to have the progress counter properly incremented
			continue;

		if explist is not None:
			keep = np.in1d(detections._EXP, explist)
			if not np.all(keep):
				detections = detections[keep]
			if len(detections) == 0:
				yield (None, None, None, None, None, None) # Yield just to have the progress counter properly incremented
				continue
		_, ra2, dec2, exposures, cached = detections.as_columns()
		# The projected positions are stored as an extra 'xy' column so they
		# get sliced together with the other columns below
		detections.add_column('xy', np.column_stack(gnomonic(ra2, dec2, clon, clat)))

		# prep join table
		join  = ColGroup(dtype=o2d_table.dtype_for(['_ID', '_M1', '_M2', '_DIST', '_LON', '_LAT']))
		njoin = 0;	# number of join rows filled so far
		nobj0 = nobj;	# number of objects before this cell was processed

		##print "Cell", det_cell, " - Unique exposures: ", set(exposures)

		# Process detections exposure-by-exposure, as detections from
		# different exposures within a same temporal cell are allowed
		# to belong to the same object
		uexposures = set(exposures)
		for exposure in sorted(uexposures):
			# Sanity check: a consistent table cannot have two
			# exposures stretching over more than one cell
			assert exposure not in expseen
			expseen.add(exposure);

			# Extract objects belonging to this exposure only
			detections2 = detections[exposures == exposure]
			id2, ra2, dec2, _, _, xydet = detections2.as_columns()
			ndet = len(xydet)

			if len(xyobj) != 0:
				# Construct kD-tree and find the object nearest to each
				# detection from this cell
				# (the tree is rebuilt only when objects were added since
				# it was last built)
				if tree is None or nobj_old != len(xyobj):
					del tree
					nobj_old = len(xyobj)
					tree = kdtree(xyobj)
				match_idx, match_d2 = tree.knn(xydet, 1)
				match_idx = match_idx[:,0]		# First neighbor only

				####
				#if np.uint64(13828114484734072082) in id2:
				#	np.savetxt('bla.%d.static=%d.txt' % (det_cell, pix.static_cell_for_cell(det_cell)), objs.as_ndarray(), fmt='%s')

				# Compute accurate distances, and select detections not matched to existing objects
				dist       = gc_dist(objs['_LON'][match_idx], objs['_LAT'][match_idx], ra2, dec2)
				unmatched  = dist >= radius
			else:
				# All detections will become new objects (and therefore, dist=0)
				dist       = np.zeros(ndet, dtype='f4')
				unmatched  = np.ones(ndet, dtype=bool)
				match_idx  = np.empty(ndet, dtype='i4')	# every entry is overwritten below

#			x, y, t = pix._xyt_from_cell_id(det_cell)
#			print "det_cell %s, MJD %s, Exposure %s  ==  %d detections, %d objects, %d matched, %d unmatched" % (det_cell, t, exposure, len(detections2), nobj, len(unmatched)-unmatched.sum(), unmatched.sum())

			# Promote unmatched detections to new objects
			_, newra, newdec, _, _, newxy = detections2[unmatched].as_columns()
			nunmatched = unmatched.sum()
			reserve_space(objs, nobj+nunmatched)
			objs['_LON'][nobj:nobj+nunmatched] = newra
			objs['_LAT'][nobj:nobj+nunmatched] = newdec
			dist[unmatched]                    = 0.
			match_idx[unmatched] = np.arange(nobj, nobj+nunmatched, dtype='i4')	# Set the indices of unmatched detections to newly created objects

			# Join objects to their detections
			# (_M1 holds an index into objs for now; it is converted to a
			# real object ID after the exposure loop)
			reserve_space(join, njoin+ndet)
			join['_M1'][njoin:njoin+ndet]   = match_idx
			join['_M2'][njoin:njoin+ndet]   =       id2
			join['_DIST'][njoin:njoin+ndet] =      dist
			# TODO: For debugging; remove when happy
			join['_LON'][njoin:njoin+ndet]  =       ra2
			join['_LAT'][njoin:njoin+ndet]  =      dec2
			njoin += ndet

			# Prep for next loop
			nobj  += nunmatched
			xyobj  = np.append(xyobj, newxy, axis=0)

			# TODO: Debugging: Final consistency check (remove when happy with the code)
			dist = gc_dist( objs['_LON'][  join['_M1'][njoin-ndet:njoin]  ],
					objs['_LAT'][  join['_M1'][njoin-ndet:njoin]  ], ra2, dec2)
			assert (dist < radius).all()

		# Truncate output tables to their actual number of elements
		objs = objs[0:nobj]
		join = join[0:njoin]
		assert len(objs) >= nobj0

		# Find the objects that fall outside of cell boundaries. These will
		# be processed and stored by their parent cells. Also leave out the objects
		# that are already stored in the database
		(x, y) = bhpix.proj_bhealpix(objs['_LON'], objs['_LAT'])
		in_    = bounds.isInsideV(x, y)
		innew  = in_.copy();
		innew[:nobj0] = False				# New objects in cell selector

		ids = objs['_ID']
		nobjadded = innew.sum()
		if nobjadded:
			# Append the new objects to the object table, obtaining their IDs.
			assert not _rematching, 'cell_id=%s, nnew=%s\n%s' % (det_cell, nobjadded, objs[innew])
			ids[innew] = obj_table.append(objs[('_LON', '_LAT')][innew])

		# Set the indices of objects not in this cell to zero (== a value
		# no valid object in the database can have). Therefore, all
		# out-of-bounds links will have _M1 == 0 (#1), and will be removed
		# by the np.in1d call (#2)
		ids[~in_] = 0

		# 1) Change the relative index to true obj_id in the join table
		join['_M1'] = ids[join['_M1']]

		# 2) Keep only the joins to objects inside the cell
		join = join[ np.in1d(join['_M1'], ids[in_]) ]

		# Append to the join table, in *det_cell* of obj_table (!important!)
		if len(join) != 0:
			# compute the cell_id part of the join table's
			# IDs. While this is unimportant now (as we could
			# just set all of them equal to cell_id part of
			# cell_id), if we ever decide to change the
			# pixelation of the table later on, this will
			# allow us to correctly split up the join table as
			# well.
			#_, _, t    = pix._xyt_from_cell_id(det_cell)	# This row points to a detection in the temporal cell ...
			#x, y, _, _ = pix._xyti_from_id(join['_M1'])	# ... but at the spatial location given by the object table.
			#join['_ID'][:] = pix._id_from_xyti(x, y, t, 0) # This will make the new IDs have zeros in the object part (so Table.append will autogen them)
			join['_ID'][:] = det_cell

			o2d_table.append(join)

		# NOTE(review): cachedonly is always False here (the cached-only
		# case `continue`s above), so this assertion cannot fail
		assert not cachedonly or (nobjadded == 0 and len(join) == 0)

		# return: Number of exposures, number of objects before processing this cell, number of detections processed (incl. cached),
		#         number of newly added objects, number of detections xmatched, number of detection processed that weren't cached
		# Note: some of the xmatches may be to newly added objects (e.g., if there are two 
		#       overlapping exposures within a cell; first one will add new objects, second one will match against them)
		yield (len(uexposures), nobj0, len(detections), nobjadded, len(join), (cached == False).sum())
Example #27
0
    def addTransition(self, state, succState, reward):
        """ Add the given transition (state, succState, reward) to the example set.

        While the set is not full (or no KD tree is available to look up
        neighbors) the transition is simply appended.  Once the set is full,
        25 randomly picked stored examples are inspected and the one lying
        closest to its own nearest neighbor is replaced, provided it is
        closer to that neighbor than the new state is to its nearest
        neighbor; otherwise the new transition is ignored.

        Return which state transition (if any) has been removed.
        """
        # Lazy initialization when dimensionality is known.
        # `is None`, not `== None`: once self.states is an array, `==`
        # compares element-wise and cannot be used as a condition.
        if self.states is None:
            self.states = numpy.zeros((len(state), 0))
            self.succStates = numpy.zeros((len(state), 0))
            self.rewards = numpy.zeros((1, 0))
            self.stateDimensions = state.dimensions
        else:
            assert (self.stateDimensions == state.dimensions)

        removedState = None

        # The attribute set at the end of this method is "kdTree"; the
        # previous check for "kdtree" never matched, so the replacement
        # branch below was unreachable and the set grew without bound.
        if not self.isFull() or not hasattr(self, "kdTree"):
            # No way to remove states: add sample to internal memory
            # (one column per example)
            self.states = numpy.hstack((self.states, numpy.array([state]).T))
            self.succStates = numpy.hstack(
                (self.succStates, numpy.array([succState]).T))
            self.rewards = numpy.hstack(
                (self.rewards, numpy.array([[reward]])))
        else:
            # The example set is full, we remove the nearest neighbor of the
            # added state.
            # Determine the distance of the current example to its
            # nearest neighbor
            minDist = self.kdTree.knn(state, 1)[1][0, 0]

            # Since it is too expensive to compute the closest pair of the
            # whole example set, we randomly pick some old examples, compute
            # their distance to their respective nearest neighbors and
            # replace the example with the minimal distance.
            replaceIndex = None
            for _ in range(25):
                rndIndex = random.randint(0, self.states.shape[1] - 1)
                # Two neighbors are requested and the second distance is
                # used (presumably because a stored example is its own
                # nearest neighbor -- TODO confirm)
                dist = self.kdTree.knn(self.states.T[rndIndex], 2)[1][0, 1]
                if dist < minDist:
                    minDist = dist
                    replaceIndex = rndIndex

            # If all old example have a distance larger than the new example,
            # we ignore the new example
            if replaceIndex is None:
                return None

            # Remember which state transition has been removed and return that
            # at the end of the method
            removedState = copy.copy(self.states.T[replaceIndex])

            # Replace the selected example by the current transition
            # (.T is a view, so this writes into the stored arrays)
            self.states.T[replaceIndex] = state
            self.succStates.T[replaceIndex] = succState
            self.rewards.T[replaceIndex] = reward

        try:
            # Update the KD Tree used for nearest neighbor search
            self.kdTree = ann.kdtree(self.states.T)
        except NameError:
            # Deliberate best effort: the name `ann` is not defined (the
            # optional ANN module is unavailable), so no tree is kept and
            # examples are only ever appended.
            pass

        # Return which state transition has been removed
        return removedState
    def makePredictions((mu, sigma, vt, a_to_u, smat, unsmat)):
        sys.stderr.write("Analysis Completed, Beginning Making Predictions\n")
        def getvData():
            vtfile = open(sys.argv[2], 'r')
            d_i, d_j, d_v = getData(vtfile)
            return (d_i, d_j, d_v),

        def getProjectedLocs(smat, vt):
            return smat * vt.T

        def actuallyMakePredictions(((d_i, d_j, d_v), vt, mudict, sigmadict, smat, unsmat)):
            sys.stderr.write("Making Predictions")

            min_users = int(sys.argv[3])
            n_recommendations = int(sys.argv[6])

            u_to_index = {key : [] for key in set(d_i)}
            for i in xrange(len(d_i)):
                u_to_index[d_i[i]].append(i)

            for index in u_to_index[0]:
                try: #if data[index][1] in aset:
                    d_v[index] -= mudict[d_j[index]]
                    if not sigmadict[d_j[index]] == 0:
                        d_v[index] /= sigmadict[d_j[index]]
                except KeyError:
                    u_to_index[test_no].remove(index)
            coordinates = []
            for cur_eig in vt:
                coordinates.append(sum(d_v[x]*cur_eig[d_j[x]] for x in u_to_index[0]))
            coordinates = np.array(coordinates)

            plocs = getProjectedLocs(smat, vt) # plocs is a map from users to projected coordinates

            sys.stderr.write("Creating KDTrees\n")
            a_to_kdtree = {}
            a_to_kdtreemap = {}

            a_to_predicted_rating = {}

            for a in a_to_u:
                sys.stderr.write("Estimating Rating for " + str(a) + "\n")
    #default prediction
                prediction = 5
                writeflag = True



                k = int(sys.argv[4])
                rel_users = list(a_to_u[a])
                if len(rel_users) < min_users:
                    sys.stderr.write("Inadequate Data\n")
                    continue
                sys.stderr.write("Found " + str(len(rel_users)) + " Relevant Users\n")
                k = max(1, min(k, len(rel_users) / 2))
                        
                sys.stderr.write("Building KDTree\n")
                a_to_kdtreemap[a] = rel_users
                ploclist = []
                for i in xrange(len(rel_users)):
                    ploclist.append(plocs[rel_users[i]].tolist())
                a_to_kdtree[a] = ann.kdtree(np.array(ploclist))

                knn = a_to_kdtree[a].knn(coordinates, k)[0][0]
                sys.stderr.write("Using K = " + str(k) + "\n")
                knn_ratings = [unsmat[a_to_kdtreemap[a][x]][a] for x in knn]
                mmm = sys.argv[5]
                if mmm == "mean":
                    prediction = np.mean(knn_ratings)
                elif mmm == "median":
                    prediction = np.median(knn_ratings)
                else:
                    prediction = np.argmax(np.bincount(knn_ratings))

                sys.stderr.write("Predicted: " + str(prediction) + "\n")
                a_to_predicted_rating[a] = prediction
            sys.stderr.write("Predictions Complete!\n")
            
            cannot_rec = set([ d_j[index] for index in u_to_index[0] ])

            i = 0
            while i < n_recommendations:
                maxvalue = 0
                maxindex = 0
                for key, value in a_to_predicted_rating.iteritems():
                    if value > maxvalue:
                        maxindex = key
                        maxvalue = value
                top = maxindex
                if not top in cannot_rec:
                    cannot_rec.add(top)
                    print top
                    i += 1
                del a_to_predicted_rating[top]
        timekd += time.time() - tstart
        # ckdtree
        tstart = time.time()
        idx4 = cb.closest_code_ckdtree(sample)
        timeckd += time.time() - tstart
        # ann
        tstart = time.time()
        idx5 = cb.closest_code_ann(sample)
        timeann += time.time() - tstart
        # batch
        tstart = time.time()
        idx6 = cb.closest_code_batch(sample)
        timebatch += time.time() - tstart
        # ann redoing codebook
        tstart = time.time()
        cb._ann = ann.kdtree(cb._codebook)
        idx7 = cb.closest_code_ann(sample)
        timeann2 += time.time() - tstart
        # checking
        assert idx2 == idx3 or (cb[idx2] == cb[idx3]).all()
        assert idx2 == idx4 or (cb[idx2] == cb[idx4]).all()
        assert idx2 == idx5 or (cb[idx2] == cb[idx5]).all()
        assert idx2 == idx6 or (cb[idx2] == cb[idx6]).all()
        assert idx2 == idx7 or (cb[idx2] == cb[idx7]).all()
        assert idx2 == idxes[sampleidx] or (cb[idx2]
                                            == cb[idxes[sampleidx]]).all()

    #print 'time for fast algo:',timefast,'seconds.'
    print 'time for slow algo:        ', timeslow, 'seconds.'
    print 'time for kd algo:          ', timekd, 'seconds.'
    print 'time for ckd algo:         ', timeckd, 'seconds.'
Example #30
0
def _xmatch_mapper(qresult, tabname_to, radius, tabname_xm, n_neighbors):
    """
    Mapper: cross-match every cell of `qresult` against table `tabname_to`.

    For each cell:
        - fetch the `tabname_to` objects of that cell (cached rows
          included) and build an ANN kd-tree from their projected positions
        - query the tree for up to `n_neighbors` nearest neighbors of each
          object in the cell, and drop the pairs whose distance is not
          below `radius`
        - append the remaining pairs to the `tabname_xm` table

    Yields, per cell, a tuple (rows in the cell, rows fetched from
    `tabname_to`, matches stored); the last two are 0 when nothing matched.
    """
    from scikits.ann import kdtree

    db = qresult.db
    pix = qresult.pix
    table_xm = db.table(tabname_xm)

    for rows in qresult:
        cell_id = rows.info.cell_id

        # One row per (object, neighbor) pair; filled in below
        join = ColGroup(dtype=[
            ('_M1', 'u8'), ('_M2', 'u8'), ('_DIST', 'f4'),
            ('_NR', 'u1'), ('_LON', 'f8'), ('_LAT', 'f8'),
        ])

        (id1, ra1, dec1) = rows.as_columns()
        to_query = db.query('_ID, _LON, _LAT FROM %s' % tabname_to)
        to_rows = to_query.fetch_cell(cell_id, include_cached=True)
        (id2, ra2, dec2) = to_rows.as_columns()

        if len(id2) != 0:
            # Work in the tangent plane centered on the cell: the cell is
            # assumed small enough that distortions are negligible and
            # Euclidean distances can be used for the neighbor search
            bounds, _ = pix.cell_bounds(cell_id)
            (clon, clat) = bhpix.deproj_bhealpix(*bounds.center())
            xy1 = np.column_stack(gnomonic(ra1, dec1, clon, clat))
            xy2 = np.column_stack(gnomonic(ra2, dec2, clon, clat))

            # kD-tree over the table_to objects, queried once for every
            # object of table_from
            tree = kdtree(xy2)
            match_idxs, match_d2 = tree.knn(xy1, min(n_neighbors, len(xy2)))
            del tree

            # Fill the index table; the k-th neighbors of all objects
            # occupy rows k, k + stride, k + 2*stride, ...
            join.resize(match_idxs.size)
            stride = match_idxs.shape[1]
            for k in xrange(stride):
                nb = match_idxs[:, k]
                dest = slice(k, None, stride)
                nb_ra = ra2[nb]
                nb_dec = dec2[nb]
                join['_M1'][dest] = id1
                join['_M2'][dest] = id2[nb]
                join['_DIST'][dest] = gc_dist(ra1, dec1, nb_ra, nb_dec)
                join['_LON'][dest] = nb_ra
                join['_LAT'][dest] = nb_dec
                join['_NR'][dest] = k

            # Keep only the matches within the xmatch radius
            within = join['_DIST'] < radius
            join = join[within]

        if not len(join):
            yield len(rows), 0, 0
            continue

        # Derive the join table IDs from the cell part of the _M1 IDs.
        # Right now this equals cell_id for every row, but keeping it tied
        # to _M1 lets the join table be repixelized correctly should the
        # table's pixelation ever change.
        # TODO: Allow building the IDs at the spatial location given by
        #       the object table, with zeros in the object part, so that
        #       Table.append autogenerates them.
        join['_ID'] = pix.cell_for_id(join['_M1'])

        # TODO: Debugging, remove when happy
        cid = np.unique(pix.cell_for_id(join['_ID']))
        assert len(cid) == 1, len(cid)
        assert cid[0] == cell_id, '%s %s' % (cid[0], cell_id)

        table_xm.append(join)

        yield len(id1), len(id2), len(join)
# typecompress value -> name of the CBTF function that compresses the raw
# segments timbre matrix (looked up lazily, only for the mode in use)
_TIMBRE_COMPRESSORS = {'corrcoeff': 'corr_and_compress',
                       'cov': 'cov_and_compress',
                       'avgcov': 'avgcov_and_compress'}


def _read_song_h5(path, getter):
    """
    Open the song file 'path', return getter(h5) and close the file,
    even when the getter raises.
    """
    h5 = GETTERS.open_h5_file_read(path)
    try:
        return getter(h5)
    finally:
        h5.close()


def _compress_test_song(f, typecompress, npicks, winsize, finaldim, randproj):
    """
    Compute the compressed features of song file 'f' for the given
    'typecompress' mode. Returns whatever the CBTF routine returns, or
    None when 'picks' is requested and get_bttimbre gives nothing.
    """
    if typecompress == 'picks':
        bttimbre = get_bttimbre(f)
        if bttimbre is None:
            return None
        return CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim,
                                         randproj=randproj)
    assert typecompress in _TIMBRE_COMPRESSORS,'Unknown type of compression: '+str(typecompress)
    # transpose while the file is still open, as the data comes from it
    timbres = _read_song_h5(f, lambda h5: GETTERS.get_segments_timbre(h5).T)
    compress = getattr(CBTF, _TIMBRE_COMPRESSORS[typecompress])
    return compress(timbres,finaldim,randproj=randproj)


def process_filelist_test(filelist=None,model=None,tmpfilename=None,
                          npicks=None,winsize=None,finaldim=None,K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list: compute the features
    of each song, predict its year from the kd-tree built on the model,
    and save real and predicted years in 'tmpfilename'.
    Songs whose year is <= 0, or for which no features can be computed,
    are skipped.
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
                      (must not exist yet)
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff', 'cov' or 'avgcov'
                      must be the same as in training
    RETURN
       nothing; an error message is printed and nothing is done if
       'tmpfilename' already exists or 'model' does not exist.
       The model and output files are closed even if processing fails.
    """
    # sanity check (must stay first: locals() only holds the arguments here)
    for arg in locals().values():
        assert not arg is None,'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model',model,'does not exist.'
        return
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    try:
        assert h5model.root.data.feats.shape[1]==finaldim,'inconsistency in final dim'
        kd = ANN.kdtree(h5model.root.data.feats)
        # create outputfile
        output = tables.openFile(tmpfilename, mode='a')
        try:
            group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION')
            output.createEArray(group,'year_real',tables.IntAtom(shape=()),(0,),'',
                                expectedrows=len(filelist))
            output.createEArray(group,'year_pred',tables.Float64Atom(shape=()),(0,),'',
                                expectedrows=len(filelist))
            # random projection
            ndim = 12 # fixed in this dataset
            if typecompress == 'picks':
                randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            elif typecompress == 'corrcoeff' or typecompress=='cov':
                randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            elif typecompress == 'avgcov':
                randproj = RANDPROJ.proj_point5(90, finaldim)
            else:
                assert False,'Unknown type of compression: '+str(typecompress)
            # go through files
            for cnt_f, f in enumerate(filelist, 1):
                if cnt_f % 5000 == 0:
                    print 'TESTING FILE #'+str(cnt_f)
                # check file, we need a ground truth year
                year = _read_song_h5(f, GETTERS.get_year)
                if year <= 0: # probably useless but...
                    continue
                processed_feats = _compress_test_song(f,typecompress,npicks,
                                                      winsize,finaldim,randproj)
                if processed_feats is None:
                    continue
                if processed_feats.shape[0] == 0:
                    continue
                # do prediction
                year_pred = do_prediction(processed_feats,kd,h5model,K)
                # add pred and ground truth to output
                if not year_pred is None:
                    output.root.data.year_real.append( [year] )
                    output.root.data.year_pred.append( [year_pred] )
        finally:
            # close output, even on error, so the handle never leaks
            output.close()
        # release the tree before closing the model it was built from
        del kd
    finally:
        h5model.close()
    # done
    return
Example #32
0
File: knn.py Project: mekruthi/mmlf
 def __setstate__(self, dict):
     """
     Restore the instance from the attribute dictionary 'dict' (this is
     the hook Python's pickle/copy machinery calls when re-creating an
     object) and build a fresh ann kd-tree for every action.

     NOTE: the parameter name 'dict' shadows the builtin inside this method.
     """
     # take over the saved attributes wholesale
     self.__dict__ = dict
     # rebuild the kd-tree of each action from the states stored for it
     # NOTE(review): presumably the trees are not part of the saved state
     # because they cannot be serialized - confirm against __getstate__.
     # Assumes self.actionsKDTree is available after the assignment above.
     for action in self.actions:
         self.actionsKDTree[action] = ann.kdtree(self.states[action])