def __init__(self, nactions, input_ranges, nelemns=[], k=1, alpha=0.3):
    self.cl = self.ndlinspace(input_ranges, nelemns)
    self.lbounds = []
    self.ubounds = []
    self.k = k
    self.shape = self.cl.shape
    self.nactions = nactions
    self.ps = zeros((self.shape[0], nactions, self.shape[1]))
    self.ps_exp = zeros((self.shape[0], nactions))
    self.ac = []
    self.knn = []
    self.alpha = alpha
    self.last_state = zeros((1, self.shape[1])) + 0.0
    self.next_state = array(self.last_state)
    for r in input_ranges:
        self.lbounds.append(r[0])
        self.ubounds.append(r[1])
    self.lbounds = array(self.lbounds)
    self.ubounds = array(self.ubounds)
    self.cl = array(self.RescaleInputs(self.cl))
    self.knntree = kdtree(self.cl)
def predictedGroup(p, tr, nn=3, e='s'):
    # Compares the points (p) to your tree (tr), providing the predicted
    # category for each point based on its (nn) nearest neighbors.
    # Explanation can be short (e='s') or long (e='l').
    # The last column of p and tr is assumed to hold the group label.
    k = ann.kdtree(tr[:, :tr.shape[1] - 1])
    l = k.knn(p[:, :p.shape[1] - 1], nn)
    # dist = distGroups(tr)
    # pr = []
    print "l[0]"
    print l[0], "\n"
    # print dist
    # print "tr[0][-1] \n", tr[0][-1]
    if e == 'l':
        print tr, "\n"
        print p, "\n"
    # look up the group label of each neighbor
    ll = np.zeros(l[0].shape)
    kk = 0
    for i in l[0]:
        ii = 0
        for j in i:
            ll[kk][ii] = tr[j][-1]
            ii += 1
        kk += 1
    print "Groups \n", ll
    print "Modes \n", stat.mode(ll, 1)
    # assign each point the most common group among its neighbors
    pred = assignGroup(p, stat.mode(ll, 1)[0])
    print pred
    return pred
def nearest(self, location, count=1):
    if not self._scikitA:
        self._scikitA = kdtree(self.as_array())
    (index_array, distance_array) = self._scikitA.knn(location, count)
    count = len(index_array[0])
    results = []
    for x in range(count):
        results.append((self[index_array[0][x]], distance_array[0][x]))
    return results
def nearest_distances(X, k=1):
    '''
    X = array(N,M)
    N = number of points
    M = number of dimensions

    Returns the squared distance to the kth nearest neighbor for every point in X.
    '''
    ktree = ann.kdtree(X)
    _, d = ktree.knn(X, k + 1)  # the first nearest neighbor of each point is the point itself
    return d[:, -1]  # squared distance to the kth nearest neighbor
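# A minimal usage sketch (an illustration, not part of the original code), assuming
# numpy and scikits.ann are installed and nearest_distances is in scope. The squared
# kth-neighbor distances returned here are, e.g., the quantity needed by
# nearest-neighbor entropy estimators.
import numpy as np
import scikits.ann as ann

X = np.random.randn(1000, 3)      # 1000 samples in 3 dimensions
d2 = nearest_distances(X, k=4)    # squared distance to the 4th nearest neighbor
print d2.shape                    # (1000,)
print np.sqrt(d2[:5])             # Euclidean distances for the first 5 points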
def load(self, data_file):
    self.labels = []
    self.label_set = set()
    points = []
    with open(data_file) as fp:
        for line in fp:
            fields = line.split('\t')
            self.labels.append(fields[0])
            self.label_set.add(fields[0])
            points.append([float(f) for f in fields[1:]])
    self.kdtree = ann.kdtree(np.array(points))
    self.n = len(points)
    self.k = 25
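# The class owning load() is not shown in these snippets. A plausible companion method
# (a hedged sketch, not the original code) would classify a query vector by majority
# vote over its self.k stored neighbors, using the kd-tree and labels built above.
import numpy as np

def classify(self, point):
    # hypothetical helper: majority vote among the k nearest stored points
    k = min(self.k, self.n)
    idx, _ = self.kdtree.knn(np.array([point]), k)
    votes = {}
    for i in idx[0]:
        label = self.labels[i]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)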
def find_n_nearest_neighbors_fast(embed, n=10):
    """Searches the n nearest neighbors in state-space.

    Takes an array of TD-Vectors as returned by make_time_delay_embedding,
    where the first index is tp and the second index is component.
    Returns an array of shape (embed.shape[0], n) containing the indices of
    the nearest neighbors for each tp."""
    assert len(embed.shape) == 2, "Only 2d-arrays (one array of TD-Vectors) are supported at the moment."
    n = int(round(n))
    try:
        import scikits.ann as ann
        k = ann.kdtree(embed)
        # drop the first column: each point is its own nearest neighbor
        rv_ar = k.knn(embed, n + 1)[0][:, 1:]
    except Exception, e:
        from scipy.spatial import KDTree
        kdt = KDTree(embed)
        # query n+1 neighbors here as well, so that dropping the self-match
        # still leaves n neighbors per point
        rv_ar = kdt.query(kdt.data, k=n + 1)[1][:, 1:]
    return rv_ar
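# A short usage sketch (an illustration, not part of the original code), assuming numpy
# plus either scikits.ann or scipy are available. make_time_delay_embedding is referenced
# by the docstring but not shown here, so the embedding is built inline.
import numpy as np

x = np.sin(np.linspace(0, 20 * np.pi, 2000))          # toy scalar time series
embed = np.column_stack([x[:-2], x[1:-1], x[2:]])     # 3-dimensional embedding, lag 1
neighbors = find_n_nearest_neighbors_fast(embed, n=5)
print neighbors.shape                                 # (1998, 5)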
def getSuccessorDistribution(self, state):
    """ Return the successor distribution for the given *state*.

    Returns an iterator that yields pairs of states and their probabilities
    of being the successor of the given *state*.
    """
    if self.states is None:
        raise ModelNotInitialized()
    k = min(self.states.shape[0], self.k)
    if self.rebuildSucc:
        self.succKDTree = ann.kdtree(self.states)
        self.rebuildSucc = False
    indices, distances = self.succKDTree.knn(state, k)
    denominator = numpy.sum(numpy.exp(-distances[0] / (self.b_Sa**2)))
    # If the distances become too large, all weights can underflow to zero.
    # In this situation, we simply return the closest state with probability 1.
    if denominator == 0 or numpy.isnan(denominator):
        import warnings
        warnings.warn("Too large distances, returning only closest example")
        indices[0] = [indices[0][0]]
        distances[0] = [0.0]
        denominator = numpy.exp(0.0 / (self.b_Sa**2))
    for index, distance in zip(indices[0], distances[0]):
        neighbor = State(self.states[index], state.dimensions)  # TODO: not use state.dimensions
        succState, reward = self.successorSamples[neighbor]
        delta = succState - neighbor
        predictedSuccState = State(state + delta, state.dimensions)
        if not 0 <= gaussian(distance, self.b_Sa) / denominator <= 1:
            import warnings
            import sys
            warnings.warn("Invalid distances in KNN Model!")
            print distances
            sys.exit(0)
        yield predictedSuccState, gaussian(distance, self.b_Sa) / denominator
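# The gaussian() helper used by this model (here and in getExpectedReward,
# getPredecessorDistribution, getExplorationValue) is not included in these snippets.
# A definition consistent with the exponential normalizer above, and with the fact that
# ann's knn() returns squared distances, would look roughly like this -- an assumption,
# not the original source:
import numpy

def gaussian(squared_distance, b):
    # unnormalized Gaussian kernel on the squared neighbor distance; dividing by the
    # sum of these values over the k neighbors yields the weights used above
    return numpy.exp(-squared_distance / (b ** 2))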
def __init__(self, codewords):
    """
    Constructor.
    Needs an initialized codebook, one code per line.
    """
    self._codebook = copy.deepcopy(codewords)
    self._nCodes = codewords.shape[0]
    self._codesize = codewords.shape[1]
    self._dist = euclidean_dist
    self._codebounds = np.ones([self._nCodes, self._nCodes]) * np.inf
    self._init_bounds()
    # test kd-tree
    self._kdtree = scipy.spatial.KDTree(self._codebook, leafsize=100)
    self._ckdtree = scipy.spatial.cKDTree(self._codebook, leafsize=100)
    # test ann kdtree
    tstart = time.time()
    self._ann = ann.kdtree(self._codebook)
    print 'time to build ann tree:', time.time() - tstart, 'seconds.'
def train(self, trainingSet):
    """
    Trains the KNN function approximator with the given *trainingSet*.

    Stores the examples contained in the *trainingSet* and overwrites
    the old examples.
    """
    self.qValues = trainingSet
    states = defaultdict(list)
    for (state, action), target in trainingSet.iteritems():
        states[action].append(state)
    for action in self.actions:
        if len(states[action]) > 0:
            self.states[action] = numpy.vstack(states[action])
            self.actionsKDTree[action] = ann.kdtree(self.states[action])
        else:
            self.actionsKDTree[action] = None
def getExplorationValue(self, state):
    """ Return the exploratory value of the given state *state*.

    The exploratory value of a state under this model is defined simply
    as the sum of the activations of its k nearest neighbors.
    """
    if self.states is None:
        return 0.0
    k = min(self.states.shape[0], self.k)
    if self.rebuildSucc:
        self.succKDTree = ann.kdtree(self.states)
        self.rebuildSucc = False
    indices, distances = self.succKDTree.knn(state, k)
    return numpy.sum(numpy.exp(-distances[0] / (self.b_Sa**2)))
def predicts(self, feats):
    """
    Returns two lists, best_code_per_pattern and average squared distance.
    Note on method used:
    Uses ann if we have many features. Reason: during training, when we might
    look at one new sample at a time and then update the codebook, building a
    kdtree is useless. If the model is trained and we want to predict on a
    large database, it's worth having the kdtree.
    Threshold set at 50 features (see use_ann below).
    """
    assert feats.shape[1] > 0, 'empty feats???'
    # ann
    use_ann = feats.shape[0] > 50 and _ann_imported
    if use_ann:
        kdtree = ann.kdtree(self._codebook)
        best_code_per_p, dists = self._closest_code_ann(feats, kdtree)
        best_code_per_p = np.array(best_code_per_p)
        # note that dists is already squared euclidean distance
        avg_dists = np.array(map(lambda x: x * 1. / feats.shape[1], dists))
        if np.isnan(dists).any():
            # sometimes ann has numerical errors, redo wrong ones
            nan_idx = np.where(np.isnan(avg_dists))[0]
            for idx in nan_idx:
                code, dist = self._closest_code_batch(feats[idx])
                best_code_per_p[idx] = int(code)
                avg_dists[idx] = dist * dist * 1. / feats.shape[1]
            assert not np.isnan(avg_dists).any(), 'NaN with ann not fixed'
    if not use_ann:
        # prepare result
        best_code_per_p = np.zeros(feats.shape[0])
        avg_dists = np.zeros(feats.shape[0])
        idx = -1
        # iterate over features
        for f in feats:
            idx += 1
            code, dist = self._closest_code_batch(f)
            best_code_per_p[idx] = int(code)
            avg_dists[idx] = dist * dist * 1. / feats.shape[1]
        assert not np.isnan(avg_dists).any(), 'NaN with regular code'
    # done, return the two lists
    return best_code_per_p, avg_dists
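# _closest_code_ann() is not among these snippets. Given that the kd-tree passed in is
# built on self._codebook and that the returned dists are already squared Euclidean
# distances, a compatible sketch (an assumption, not the original implementation) is:
def _closest_code_ann(self, feats, kdtree):
    # nearest codeword index and squared distance for every feature row
    idx, dists = kdtree.knn(feats, 1)
    return idx[:, 0], dists[:, 0]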
def __init__(self, nactions, input_ranges, nelemns=[], npoints=0, k=1, alpha=0.3, lm=0.90):
    if not (nelemns == False) ^ (npoints == False):
        raise ValueError('Please indicate either [nelemns] xor [npoints]')
    if nelemns:
        #self.cl = self.CreateFullspace(input_ranges, nelemns)
        self.cl = self.ndlinspace(input_ranges, nelemns)
    else:
        self.cl = self.CreateRandomSpace(input_ranges, npoints)
    self.lbounds = []
    self.ubounds = []
    self.k = k
    self.shape = self.cl.shape
    self.nactions = nactions
    self.Q = zeros((self.shape[0], nactions))
    self.ps = zeros((self.shape[0], nactions, self.shape[1]))
    #self.Q = uniform(-1, 0, (self.shape[0], nactions)) + 0.0
    self.e = zeros((self.shape[0], nactions)) + 0.0
    #self.ac = zeros((self.shape[0])) + 0.0  # classifiers activation
    self.ac = []
    self.knn = []
    self.alpha = alpha
    self.lm = lm
    self.last_state = zeros((1, self.shape[1])) + 0.0
    self.next_state = array(self.last_state)
    for r in input_ranges:
        self.lbounds.append(r[0])
        self.ubounds.append(r[1])
    self.lbounds = array(self.lbounds)
    self.ubounds = array(self.ubounds)
    self.cl = array(self.RescaleInputs(self.cl))
    self.knntree = kdtree(self.cl)
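# ndlinspace() and RescaleInputs() are referenced above but not included in these
# snippets. Since the per-dimension bounds are collected into self.lbounds/self.ubounds
# and the rescaled prototypes are handed to the kd-tree, a plausible reconstruction of
# the rescaling (a hedged sketch, not the original code) maps each dimension onto [0, 1]:
from numpy import array

def RescaleInputs(self, s):
    # hypothetical sketch: normalize each input dimension by its [lbound, ubound] range
    s = array(s, dtype=float)
    return (s - self.lbounds) / (self.ubounds - self.lbounds)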
def getPredecessorDistribution(self, state):
    """ Return a state drawn from *state*'s predecessor distribution.

    Returns a possible predecessor state of *state* drawn from the predecessor
    state distribution according to its probability mass function.
    """
    if self.succStates is None:
        raise ModelNotInitialized()
    k = min(self.states.shape[0], self.k)
    if self.rebuildPred:
        self.predKDTree = ann.kdtree(self.succStates)
        self.rebuildPred = False
    indices, distances = self.predKDTree.knn(state, k)
    denominator = numpy.sum(numpy.exp(-distances[0] / (self.b_Sa**2)))
    # If the distances become too large, all weights can underflow to zero.
    # In this situation, we simply return the closest state with probability 1.
    if denominator == 0:
        import warnings
        warnings.warn("Too large distances, returning only closest example")
        indices[0] = [indices[0][0]]
        distances[0] = [0.0]
        denominator = numpy.exp(0.0 / (self.b_Sa**2))
    for index, distance in zip(indices[0], distances[0]):
        neighbor = State(self.succStates[index], state.dimensions)  # TODO: not use state.dimensions
        predState, reward = self.predecessorSamples[neighbor]
        delta = predState - neighbor
        predictedPredState = State(state + delta, state.dimensions)
        yield predictedPredState, gaussian(distance, self.b_Sa) / denominator
def getExpectedReward(self, state):
    """ Returns the expected reward for the given state. """
    if self.states is None:
        return 0.0
    k = min(self.states.shape[0], self.k)
    if self.rebuildSucc:
        self.succKDTree = ann.kdtree(self.states)
        self.rebuildSucc = False
    indices, distances = self.succKDTree.knn(state, k)
    denominator = numpy.sum(numpy.exp(-distances[0] / (self.b_Sa**2)))
    # If the distances become too large, all weights can underflow to zero.
    # In this situation, we simply use the closest state with weight 1.
    if denominator == 0:
        import warnings
        warnings.warn("Too large distances, returning only closest example")
        indices[0] = [indices[0][0]]
        distances[0] = [0.0]
        denominator = numpy.exp(0.0 / (self.b_Sa**2))
    expectedReward = 0.0
    for index, distance in zip(indices[0], distances[0]):
        neighbor = State(self.states[index], state.dimensions)  # TODO: not use state.dimensions
        succState, reward = self.successorSamples[neighbor]
        weight = gaussian(distance, self.b_Sa) / denominator
        expectedReward += reward * weight
    return expectedReward
def _obj_det_match(cells, db, obj_tabname, det_tabname, o2d_tabname, radius, explist=None, _rematching=False):
    """
    This kernel assumes:
       a) det_table and obj_table have equal partitioning (equally
          sized/enumerated spatial cells)
       b) both det_table and obj_table have up-to-date neighbor caches
       c) temporal det_table cells within this spatial cell are stored
          local to this process (relevant for shared-nothing setups)
       d) exposures don't stretch across temporal cells

    Algorithm:
    - fetch all existing static sky objects, including the cached ones (*)
    - project them to tangent plane around the center of the cell
      (we assume the cell is small enough for the distortions not to matter)
    - construct a kD tree in (x, y) tangent space
    - for each temporal cell, in sorted order (++):
      1.) Fetch the detections, including the cached ones (+)
      2.) Project to tangent plane
      3.) for each exposure, in sorted order (++):
          a.) Match against the kD tree of objects
          b.) Add those that didn't match to the list of objects
      4.) For newly added objects: store to disk only those that fall
          within this cell (the others will be matched and stored in their
          parent cells)
      5.) For matched detections: Drop detections matched to cached objects
          (these will be matched and stored in the objects' parent cell).
          Store the rest.

    (+) It is allowed (and necessary to allow) for a cached detection to be
        matched against an object within our cell. This correctly matches
        cases when the object is right inside the cell boundary, but the
        detection is just to the outside.

    (++) Having cells and detections sorted ensures that objects in
         overlapping (cached) regions are seen by kernels in different cells
         in the same order, thus resulting in the same matches. Note: this
         may fail in extremely crowded regions, but as of now it's not clear
         how big of a problem (if any!) this will pose.

    (*) Cached objects must be loaded and matched against to guard against
        the case where an object is just outside the edge, while a detection
        is just inside. If the cached object was not loaded, the detection
        would not match and would be proclaimed to be a new object. However,
        in the cached object's parent cell, the detection would match to the
        object and be stored there as well. The algorithm above ensures that
        such a detection will be matched to the cached object in this cell
        (and be dropped in step 5), preventing it from being promoted into a
        new object.

    TODO: The above algorithm ensures no detection is assigned to more than
          one object. It also ensures that each detection links to an object.
          Implement a consistency check to verify that.
    """
    from scikits.ann import kdtree

    # Input is a tuple of obj_cell, and det_cells falling under that obj_cell
    obj_cell, det_cells = cells
    det_cells.sort()
    assert len(det_cells)

    # Fetch the frequently used bits
    obj_table = db.table(obj_tabname)
    det_table = db.table(det_tabname)
    o2d_table = db.table(o2d_tabname)
    pix = obj_table.pix

    # locate cell center (for gnomonic projection)
    (bounds, tbounds) = pix.cell_bounds(obj_cell)
    (clon, clat) = bhpix.deproj_bhealpix(*bounds.center())

    # fetch existing static sky, convert to gnomonic
    objs = db.query('_ID, _LON, _LAT FROM %s' % obj_tabname).fetch_cell(obj_cell, include_cached=True)
    xyobj = np.column_stack(gnomonic(objs['_LON'], objs['_LAT'], clon, clat))
    nobj = len(objs)  # Total number of static sky objects
    tree = None
    nobj_old = 0

    # for sanity checks/debugging (see below)
    expseen = set()

    ## TODO: Debugging, remove when happy
    assert (np.unique(sorted(det_cells)) == sorted(det_cells)).all()
    ##print "Det cells: ", det_cells

    # Loop, xmatch, and store
    if explist is not None:
        explist = np.asarray(list(explist), dtype=np.uint64)  # Ensure explist is a ndarray

    det_query = db.query('_ID, _LON, _LAT, _EXP, _CACHED FROM %s' % det_tabname)
    for det_cell in sorted(det_cells):
        # fetch detections in this cell, convert to gnomonic coordinates;
        # keep only detections with _EXP in explist, unless explist is None
        detections = det_query.fetch_cell(det_cell, include_cached=True)

        # if there are no preexisting static sky objects, and all detections in this cell
        # are cached, there's no way we'll get a match that will be kept in the end.
        # Just continue to the next one if this is the case.
        cachedonly = len(objs) == 0 and detections._CACHED.all()
        if cachedonly:
            # print "Skipping cached-only", len(cached)
            yield (None, None, None, None, None, None)  # Yield just to have the progress counter properly incremented
            continue

        if explist is not None:
            keep = np.in1d(detections._EXP, explist)
            if not np.all(keep):
                detections = detections[keep]

        if len(detections) == 0:
            yield (None, None, None, None, None, None)  # Yield just to have the progress counter properly incremented
            continue

        _, ra2, dec2, exposures, cached = detections.as_columns()
        detections.add_column('xy', np.column_stack(gnomonic(ra2, dec2, clon, clat)))

        # prep join table
        join = ColGroup(dtype=o2d_table.dtype_for(['_ID', '_M1', '_M2', '_DIST', '_LON', '_LAT']))
        njoin = 0
        nobj0 = nobj

        ##print "Cell", det_cell, " - Unique exposures: ", set(exposures)

        # Process detections exposure-by-exposure, as detections from
        # different exposures within a same temporal cell are allowed
        # to belong to the same object
        uexposures = set(exposures)
        for exposure in sorted(uexposures):
            # Sanity check: a consistent table cannot have two
            # exposures stretching over more than one cell
            assert exposure not in expseen
            expseen.add(exposure)

            # Extract objects belonging to this exposure only
            detections2 = detections[exposures == exposure]
            id2, ra2, dec2, _, _, xydet = detections2.as_columns()
            ndet = len(xydet)

            if len(xyobj) != 0:
                # Construct kD-tree and find the object nearest to each
                # detection from this cell
                if tree is None or nobj_old != len(xyobj):
                    del tree
                    nobj_old = len(xyobj)
                    tree = kdtree(xyobj)
                match_idx, match_d2 = tree.knn(xydet, 1)
                match_idx = match_idx[:, 0]  # First neighbor only

                ####
                #if np.uint64(13828114484734072082) in id2:
                #    np.savetxt('bla.%d.static=%d.txt' % (det_cell, pix.static_cell_for_cell(det_cell)), objs.as_ndarray(), fmt='%s')

                # Compute accurate distances, and select detections not matched to existing objects
                dist = gc_dist(objs['_LON'][match_idx], objs['_LAT'][match_idx], ra2, dec2)
                unmatched = dist >= radius
            else:
                # All detections will become new objects (and therefore, dist=0)
                dist = np.zeros(ndet, dtype='f4')
                unmatched = np.ones(ndet, dtype=bool)
                match_idx = np.empty(ndet, dtype='i4')

            # x, y, t = pix._xyt_from_cell_id(det_cell)
            # print "det_cell %s, MJD %s, Exposure %s == %d detections, %d objects, %d matched, %d unmatched" % (det_cell, t, exposure, len(detections2), nobj, len(unmatched)-unmatched.sum(), unmatched.sum())

            # Promote unmatched detections to new objects
            _, newra, newdec, _, _, newxy = detections2[unmatched].as_columns()
            nunmatched = unmatched.sum()
            reserve_space(objs, nobj + nunmatched)
            objs['_LON'][nobj:nobj + nunmatched] = newra
            objs['_LAT'][nobj:nobj + nunmatched] = newdec
            dist[unmatched] = 0.
            match_idx[unmatched] = np.arange(nobj, nobj + nunmatched, dtype='i4')  # Set the indices of unmatched detections to newly created objects

            # Join objects to their detections
            reserve_space(join, njoin + ndet)
            join['_M1'][njoin:njoin + ndet] = match_idx
            join['_M2'][njoin:njoin + ndet] = id2
            join['_DIST'][njoin:njoin + ndet] = dist
            # TODO: For debugging; remove when happy
            join['_LON'][njoin:njoin + ndet] = ra2
            join['_LAT'][njoin:njoin + ndet] = dec2
            njoin += ndet

            # Prep for next loop
            nobj += nunmatched
            xyobj = np.append(xyobj, newxy, axis=0)

            # TODO: Debugging: Final consistency check (remove when happy with the code)
            dist = gc_dist(objs['_LON'][join['_M1'][njoin - ndet:njoin]],
                           objs['_LAT'][join['_M1'][njoin - ndet:njoin]], ra2, dec2)
            assert (dist < radius).all()

        # Truncate output tables to their actual number of elements
        objs = objs[0:nobj]
        join = join[0:njoin]
        assert len(objs) >= nobj0

        # Find the objects that fall outside of cell boundaries. These will
        # be processed and stored by their parent cells. Also leave out the
        # objects that are already stored in the database
        (x, y) = bhpix.proj_bhealpix(objs['_LON'], objs['_LAT'])
        in_ = bounds.isInsideV(x, y)
        innew = in_.copy()
        innew[:nobj0] = False  # New objects in cell selector

        ids = objs['_ID']
        nobjadded = innew.sum()
        if nobjadded:
            # Append the new objects to the object table, obtaining their IDs.
            assert not _rematching, 'cell_id=%s, nnew=%s\n%s' % (det_cell, nobjadded, objs[innew])
            ids[innew] = obj_table.append(objs[('_LON', '_LAT')][innew])

        # Set the indices of objects not in this cell to zero (== a value
        # no valid object in the database can have). Therefore, all
        # out-of-bounds links will have _M1 == 0 (#1), and will be removed
        # by the np.in1d call (#2)
        ids[~in_] = 0

        # 1) Change the relative index to true obj_id in the join table
        join['_M1'] = ids[join['_M1']]

        # 2) Keep only the joins to objects inside the cell
        join = join[np.in1d(join['_M1'], ids[in_])]

        # Append to the join table, in *det_cell* of obj_table (!important!)
        if len(join) != 0:
            # compute the cell_id part of the join table's IDs. While this is
            # unimportant now (as we could just set all of them equal to the
            # cell_id part of cell_id), if we ever decide to change the
            # pixelation of the table later on, this will allow us to
            # correctly split up the join table as well.
            #_, _, t = pix._xyt_from_cell_id(det_cell)        # This row points to a detection in the temporal cell ...
            #x, y, _, _ = pix._xyti_from_id(join['_M1'])      # ... but at the spatial location given by the object table.
            #join['_ID'][:] = pix._id_from_xyti(x, y, t, 0)   # This will make the new IDs have zeros in the object part (so Table.append will autogen them)
            join['_ID'][:] = det_cell

            o2d_table.append(join)

        assert not cachedonly or (nobjadded == 0 and len(join) == 0)

        # return: Number of exposures, number of objects before processing this cell,
        #         number of detections processed (incl. cached), number of newly added
        #         objects, number of detections xmatched, number of detections processed
        #         that weren't cached.
        # Note: some of the xmatches may be to newly added objects (e.g., if there are two
        #       overlapping exposures within a cell; the first one will add new objects,
        #       the second one will match against them)
        yield (len(uexposures), nobj0, len(detections), nobjadded, len(join), (cached == False).sum())
if __name__ == '__main__':
    import scikits.ann as ann
    #np.set_printoptions(threshold = np.nan)
    k = 200
    points = np.random.rand(500000, 64).astype(np.float32)
    queries = np.random.rand(500, 64).astype(np.float32)

    t0 = time.time()
    gpu_knn = KnnFinder(points)
    our_indexies, our_distances = gpu_knn.get_knn(queries, k)
    t1 = time.time()
    print t1 - t0

    truth_tree = ann.kdtree(points)
    truth_points, truth_distances = truth_tree.knn(queries, k)
    truth_distances = np.sqrt(truth_distances)
    t2 = time.time()
    print t2 - t1

    #print truth_points
    error = np.abs(our_distances - truth_distances) / truth_distances
    index_error = our_indexies - truth_points
    print "----"
    print np.nansum(error) / error.size
    # fraction of mismatched indices (use float() to avoid integer division under Python 2)
    print np.sum(np.abs(index_error) > 0) / float(index_error.size)
def makePredictions((mu, sigma, vt, a_to_u, smat, unsmat)):
    sys.stderr.write("Analysis Completed, Beginning Making Predictions\n")

    def getvData():
        vtfile = open(sys.argv[2], 'r')
        vvfile = open(sys.argv[3], 'r')
        d_i, d_j, d_v = getData(vtfile)
        sys.stderr.write("Reading Validation Validation Data\n")
        v_i, v_j, v_v = getData(vvfile)
        return (d_i, d_j, d_v), (v_i, v_j, v_v)

    def getProjectedLocs(smat, vt):
        return smat * vt.T

    def actuallyMakePredictions(((d_i, d_j, d_v), (v_i, v_j, v_v), vt, mudict, sigmadict, smat, unsmat)):
        sys.stderr.write("Making " + str(len(v_i)) + " Predictions, Analyzing Performance, and Outputting Errors\n")
        u_to_index = {key: [] for key in set(d_i)}
        for i in xrange(len(d_i)):
            u_to_index[d_i[i]].append(i)
        plocs = getProjectedLocs(smat, vt)  # plocs is a map from users to projected coordinates
        sys.stderr.write("Creating KDTrees\n")
        a_to_kdtree = {}
        a_to_kdtreemap = {}
        for test_no in xrange(len(v_v)):
            sys.stderr.write("Executing Test Number " + str(test_no) + " out of " + str(len(v_v)) + "\n")
            # default prediction
            prediction = 5
            writeflag = True
            if v_j[test_no] in u_to_index:
                if test_no in u_to_index:
                    # normalization and centering
                    for index in u_to_index[test_no]:
                        try:
                            #if data[index][1] in aset:
                            d_v[index] -= mudict[d_j[index]]
                            if not sigmadict[d_j[index]] == 0:
                                d_v[index] /= sigmadict[d_j[index]]
                        except KeyError:
                            u_to_index[test_no].remove(index)
                    coordinates = []
                    for cur_eig in vt:
                        coordinates.append(sum(d_v[x] * cur_eig[d_j[x]] for x in u_to_index[test_no]))
                    coordinates = np.array(coordinates)
                    k = int(sys.argv[4])
                    rel_users = list(a_to_u[v_j[test_no]])
                    sys.stderr.write("Found " + str(len(rel_users)) + " Relevant Users\n")
                    k = max(1, min(k, len(rel_users) / 2))
                    # manual method
                    # knn = []
                    # mindist = [k*100] * k  # fragile
                    # knn = [-1] * k
                    # for user in rel_users:
                    #     d = np.linalg.norm(coordinates - plocs[user])
                    #     for i in xrange(k):
                    #         if d < mindist[i]:
                    #             mindist.insert(i, d)
                    #             mindist.pop()
                    #             knn.insert(i, user)
                    #             knn.pop()
                    #             break
                    if not v_j[test_no] in a_to_kdtree:
                        sys.stderr.write("Building KDTree\n")
                        a_to_kdtreemap[v_j[test_no]] = rel_users
                        a_to_kdtree[v_j[test_no]] = ann.kdtree(np.array([plocs[u].tolist() for u in rel_users]))
                    knn = a_to_kdtree[v_j[test_no]].knn(coordinates, k)[0][0]
                    knn_ratings = [unsmat[a_to_kdtreemap[v_j[test_no]][x]][v_j[test_no]] for x in knn]
                    mmm = sys.argv[5]
                    if mmm == "mean":
                        prediction = np.mean(knn_ratings)
                    elif mmm == "median":
                        prediction = np.median(knn_ratings)
                    else:
                        prediction = np.argmax(np.bincount(knn_ratings))
            else:
                writeflag = False
                prediction = mudict[v_j[test_no]]
            sys.stderr.write("Printing Prediction\n")
            sys.stderr.write("Predicted: " + str(prediction) + " Actual: " + str(v_v[test_no]) + "\n")
            print prediction - v_v[test_no]
def process_filelist_test(filelist=None, model=None, tmpfilename=None, K=1):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and artist_id for all train songs
       tmpfilename  - where to save our processed features
       K            - K-nn parameter (default=1)
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # dimensions fixed (12-dimensional timbre vectors, 90-dimensional final features)
    ndim = 12
    finaldim = 90
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR ARTIST RECOGNITION')
    output.createEArray(group, 'artist_id_real', tables.StringAtom(18, shape=()), (0,), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'artist_id_pred', tables.StringAtom(18, shape=()), (0,), '',
                        expectedrows=len(filelist))
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check what file/song this is
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        track_id = GETTERS.get_track_id(h5)
        if track_id in testsongs:
            # just in case, but should not be necessary
            print 'Found test track_id during training? weird.', track_id
            h5.close()
            continue
        # extract features, then close file
        processed_feats = compute_features(h5)
        h5.close()
        if processed_feats is None:
            continue
        # do prediction
        artist_id_pred = do_prediction(processed_feats, kd, h5model, K)
        # save features to tmp file
        output.root.data.artist_id_real.append(np.array([artist_id]))
        output.root.data.artist_id_pred.append(np.array([artist_id_pred]))
    # we're done, close output
    output.close()
    return
def process_filelist_test(filelist=None, model=None, tmpfilename=None,
                          npicks=None, winsize=None, finaldim=None, K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is in testartist)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff' or 'cov'
                      must be the same as in training
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'year_real', tables.IntAtom(shape=()), (0,), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'year_pred', tables.Float64Atom(shape=()), (0,), '',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # go through files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        if cnt_f % 5000 == 0:
            print 'TESTING FILE #' + str(cnt_f)
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0:  # probably useless but...
            continue
        if typecompress == 'picks':
            # we have a train artist with a song year, we're good
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize, finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres, finaldim, randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        if processed_feats is None:
            continue
        if processed_feats.shape[0] == 0:
            continue
        # do prediction
        year_pred = do_prediction(processed_feats, kd, h5model, K)
        # add pred and ground truth to output
        if not year_pred is None:
            output.root.data.year_real.append([year])
            output.root.data.year_pred.append([year_pred])
    # close output and model
    del kd
    h5model.close()
    output.close()
    # done
    return
timekd += time.time() - tstart
# ckdtree
tstart = time.time()
idx4 = cb.closest_code_ckdtree(sample)
timeckd += time.time() - tstart
# ann
tstart = time.time()
idx5 = cb.closest_code_ann(sample)
timeann += time.time() - tstart
# batch
tstart = time.time()
idx6 = cb.closest_code_batch(sample)
timebatch += time.time() - tstart
# ann, redoing the codebook tree
tstart = time.time()
cb._ann = ann.kdtree(cb._codebook)
idx7 = cb.closest_code_ann(sample)
timeann2 += time.time() - tstart
# checking
assert idx2 == idx3 or (cb[idx2] == cb[idx3]).all()
assert idx2 == idx4 or (cb[idx2] == cb[idx4]).all()
assert idx2 == idx5 or (cb[idx2] == cb[idx5]).all()
assert idx2 == idx6 or (cb[idx2] == cb[idx6]).all()
assert idx2 == idx7 or (cb[idx2] == cb[idx7]).all()
assert idx2 == idxes[sampleidx] or (cb[idx2] == cb[idxes[sampleidx]]).all()
#print 'time for fast algo:', timefast, 'seconds.'
print 'time for slow algo: ', timeslow, 'seconds.'
print 'time for kd algo:   ', timekd, 'seconds.'
print 'time for ckd algo:  ', timeckd, 'seconds.'
print 'time for ann algo:  ', timeann, 'seconds.'
def _xmatch_mapper(qresult, tabname_to, radius, tabname_xm, n_neighbors):
    """
    Mapper:
        - given all objects in a cell, make an ANN tree
        - load all objects in tabname_to (including neighbors), make an ANN tree, find matches
        - store the output into an index table
    """
    from scikits.ann import kdtree

    db = qresult.db
    pix = qresult.pix
    table_xm = db.table(tabname_xm)

    for rows in qresult:
        cell_id = rows.info.cell_id

        join = ColGroup(dtype=[('_M1', 'u8'), ('_M2', 'u8'), ('_DIST', 'f4'),
                               ('_NR', 'u1'), ('_LON', 'f8'), ('_LAT', 'f8')])

        (id1, ra1, dec1) = rows.as_columns()
        (id2, ra2, dec2) = db.query('_ID, _LON, _LAT FROM %s' % tabname_to).fetch_cell(cell_id, include_cached=True).as_columns()

        if len(id2) != 0:
            # Project to tangent plane around the center of the cell. We
            # assume the cell is small enough for the distortions not to
            # matter and Euclidean distances apply
            bounds, _ = pix.cell_bounds(cell_id)
            (clon, clat) = bhpix.deproj_bhealpix(*bounds.center())
            xy1 = np.column_stack(gnomonic(ra1, dec1, clon, clat))
            xy2 = np.column_stack(gnomonic(ra2, dec2, clon, clat))

            # Construct kD-tree to find an object in table_to that is nearest
            # to an object in table_from, for every object in table_from
            tree = kdtree(xy2)
            match_idxs, match_d2 = tree.knn(xy1, min(n_neighbors, len(xy2)))
            del tree

            # Create the index table array
            join.resize(match_idxs.size)
            for k in xrange(match_idxs.shape[1]):
                match_idx = match_idxs[:, k]
                join['_M1'][k::match_idxs.shape[1]] = id1
                join['_M2'][k::match_idxs.shape[1]] = id2[match_idx]
                join['_DIST'][k::match_idxs.shape[1]] = gc_dist(ra1, dec1, ra2[match_idx], dec2[match_idx])
                join['_LON'][k::match_idxs.shape[1]] = ra2[match_idx]
                join['_LAT'][k::match_idxs.shape[1]] = dec2[match_idx]
                join['_NR'][k::match_idxs.shape[1]] = k

            # Remove matches beyond the xmatch radius
            join = join[join['_DIST'] < radius]

            if len(join):
                # compute the cell_id part of the join table's IDs. While this
                # is unimportant now (as we could just set all of them equal to
                # the cell_id part of cell_id), if we ever decide to change the
                # pixelation of the table later on, this will allow us to
                # correctly repixelize the join table as well.
                #x, y, t, _ = pix._xyti_from_id(join['_M1'])   # ... but at the spatial location given by the object table.
                #join['_ID'] = pix._id_from_xyti(x, y, t, 0)   # This will make the new IDs have zeros in the object part (so Table.append will autogen them)
                # TODO: Allow the stuff above (in Table.append)
                join['_ID'] = pix.cell_for_id(join['_M1'])

                # TODO: Debugging, remove when happy
                cid = np.unique(pix.cell_for_id(join['_ID']))
                assert len(cid) == 1, len(cid)
                assert cid[0] == cell_id, '%s %s' % (cid[0], cell_id)
                ####

                table_xm.append(join)

            yield len(id1), len(id2), len(join)
        else:
            yield len(rows), 0, 0
def addTransition(self, state, succState, reward):
    """ Add the given transition (state, succState, reward) to the example set.

    Return which state transition (if any) has been removed.
    """
    # Lazy initialization when dimensionality is known
    if self.states is None:
        self.states = numpy.zeros((len(state), 0))
        self.succStates = numpy.zeros((len(state), 0))
        self.rewards = numpy.zeros((1, 0))
        self.stateDimensions = state.dimensions
    else:
        assert (self.stateDimensions == state.dimensions)

    removedState = None
    if not self.isFull() or not hasattr(self, "kdTree"):  # No way to remove states
        # Add sample to internal memory
        self.states = numpy.hstack((self.states, numpy.array([state]).T))
        self.succStates = numpy.hstack((self.succStates, numpy.array([succState]).T))
        self.rewards = numpy.hstack((self.rewards, numpy.array([[reward]])))
    else:
        # The example set is full; we remove the nearest neighbor of the added state.
        # Determine the distance of the current example to its nearest neighbor
        minDist = self.kdTree.knn(state, 1)[1][0, 0]
        # Since it is too expensive to compute the closest pair of the
        # whole example set, we randomly pick some old examples, compute
        # their distance to their respective nearest neighbors and
        # replace the example with the minimal distance.
        replaceIndex = None
        for i in range(25):
            rndIndex = random.randint(0, self.states.shape[1] - 1)
            dist = self.kdTree.knn(self.states.T[rndIndex], 2)[1][0, 1]
            if dist < minDist:
                minDist = dist
                replaceIndex = rndIndex
        # If all old examples have a distance larger than the new example,
        # we ignore the new example
        if replaceIndex is None:
            return None
        # Remember which state transition has been removed and return that
        # at the end of the method
        removedState = copy.copy(self.states.T[replaceIndex])
        # Replace the nearest neighbor by the current state
        self.states.T[replaceIndex] = state
        self.succStates.T[replaceIndex] = succState
        self.rewards.T[replaceIndex] = reward

    try:
        # Update the KD-tree used for nearest neighbor search
        self.kdTree = ann.kdtree(self.states.T)
    except NameError:
        pass

    # Return which state transition has been removed
    return removedState
def makePredictions((mu, sigma, vt, a_to_u, smat, unsmat)):
    sys.stderr.write("Analysis Completed, Beginning Making Predictions\n")

    def getvData():
        vtfile = open(sys.argv[2], 'r')
        d_i, d_j, d_v = getData(vtfile)
        return (d_i, d_j, d_v),

    def getProjectedLocs(smat, vt):
        return smat * vt.T

    def actuallyMakePredictions(((d_i, d_j, d_v), vt, mudict, sigmadict, smat, unsmat)):
        sys.stderr.write("Making Predictions")
        min_users = int(sys.argv[3])
        n_recommendations = int(sys.argv[6])
        u_to_index = {key: [] for key in set(d_i)}
        for i in xrange(len(d_i)):
            u_to_index[d_i[i]].append(i)
        for index in u_to_index[0]:
            try:
                #if data[index][1] in aset:
                d_v[index] -= mudict[d_j[index]]
                if not sigmadict[d_j[index]] == 0:
                    d_v[index] /= sigmadict[d_j[index]]
            except KeyError:
                u_to_index[0].remove(index)
        coordinates = []
        for cur_eig in vt:
            coordinates.append(sum(d_v[x] * cur_eig[d_j[x]] for x in u_to_index[0]))
        coordinates = np.array(coordinates)
        plocs = getProjectedLocs(smat, vt)  # plocs is a map from users to projected coordinates
        sys.stderr.write("Creating KDTrees\n")
        a_to_kdtree = {}
        a_to_kdtreemap = {}
        a_to_predicted_rating = {}
        for a in a_to_u:
            sys.stderr.write("Estimating Rating for " + str(a) + "\n")
            # default prediction
            prediction = 5
            writeflag = True
            k = int(sys.argv[4])
            rel_users = list(a_to_u[a])
            if len(rel_users) < min_users:
                sys.stderr.write("Inadequate Data\n")
                continue
            sys.stderr.write("Found " + str(len(rel_users)) + " Relevant Users\n")
            k = max(1, min(k, len(rel_users) / 2))
            sys.stderr.write("Building KDTree\n")
            a_to_kdtreemap[a] = rel_users
            ploclist = []
            for i in xrange(len(rel_users)):
                ploclist.append(plocs[rel_users[i]].tolist())
            a_to_kdtree[a] = ann.kdtree(np.array(ploclist))
            knn = a_to_kdtree[a].knn(coordinates, k)[0][0]
            sys.stderr.write("Using K = " + str(k) + "\n")
            knn_ratings = [unsmat[a_to_kdtreemap[a][x]][a] for x in knn]
            mmm = sys.argv[5]
            if mmm == "mean":
                prediction = np.mean(knn_ratings)
            elif mmm == "median":
                prediction = np.median(knn_ratings)
            else:
                prediction = np.argmax(np.bincount(knn_ratings))
            sys.stderr.write("Predicted: " + str(prediction) + "\n")
            a_to_predicted_rating[a] = prediction
        sys.stderr.write("Predictions Complete!\n")
        cannot_rec = set([d_j[index] for index in u_to_index[0]])
        i = 0
        while i < n_recommendations:
            maxvalue = 0
            maxindex = 0
            for key, value in a_to_predicted_rating.iteritems():
                if value > maxvalue:
                    maxindex = key
                    maxvalue = value
            top = maxindex
            if not top in cannot_rec:
                cannot_rec.add(top)
                print top
                i += 1
            del a_to_predicted_rating[top]
def __setstate__(self, dict):
    self.__dict__ = dict
    # restore the per-action ANN kd-trees from the stored states
    for action in self.actions:
        self.actionsKDTree[action] = ann.kdtree(self.states[action])
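# A matching __getstate__ is not included in these snippets. Since __setstate__ above
# rebuilds the ANN trees from self.states, a plausible counterpart (an assumption, not
# the original code) simply drops them before pickling:
def __getstate__(self):
    state = self.__dict__.copy()
    # drop the kd-trees; they are rebuilt from self.states[action] on unpickling
    state['actionsKDTree'] = dict((action, None) for action in self.actions)
    return state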