def k_distance(dataset, k, sd_away, distance=proximity.euclidean): ''' compute the k-distance as radius for dbscan. k-distance is obtained by the min. distance immediately larger than a particular standard deviation away from mean ''' global cached if not cached: proximity.build_cache(dataset, distance) cached = True kdist = [pairs[k][1] for pairs in [proximity.cache[a] for a in dataset]] kdist.sort() size = len(kdist) mean = sum(kdist) / float(size) diff = [(d - mean)**2 for d in kdist] sd = math.sqrt(sum(diff) / float(size - 1)) anchor = mean + (sd * sd_away) print 'mean:', mean, 'sd:', sd, 'sd + mean:', anchor, last = None for d in reversed(kdist): if d > anchor: last = d if d < anchor and last is not None: print 'kdist:', last return last raise Exception('sd too far away from mean')
def k_distance(dataset, k, sd_away, distance=proximity.euclidean): ''' compute the k-distance as radius for dbscan. k-distance is obtained by the min. distance immediately larger than a particular standard deviation away from mean ''' global cached if not cached: proximity.build_cache(dataset, distance) cached = True kdist = [ pairs[k][1] for pairs in [proximity.cache[a] for a in dataset] ] kdist.sort() size = len(kdist) mean = sum(kdist) / float(size) diff = [(d - mean) ** 2 for d in kdist] sd = math.sqrt(sum(diff) / float(size - 1)) anchor = mean + (sd * sd_away) print 'mean:', mean, 'sd:', sd, 'sd + mean:', anchor, last = None for d in reversed(kdist): if d > anchor: last = d if d < anchor and last is not None: print 'kdist:', last return last raise Exception('sd too far away from mean')
def find_neighbour(instance, dataset, radius, distance):
    '''
    Collect the neighbours of instance that lie within radius.

    The entries of proximity.cache[instance] are unpacked as
    (neighbour, distance) pairs; a neighbour is kept when its distance
    is less than or equal to radius.  Neighbours are returned in the
    order the cache lists them.

    instance -- key into proximity.cache
    dataset  -- passed to proximity.build_cache if the cache is not built
    radius   -- inclusive distance threshold
    distance -- metric handed to proximity.build_cache on first use

    NOTE: the cache is built only while the module-level flag `cached`
    is false; once it is set, the dataset and distance arguments no
    longer trigger a rebuild from this function.
    '''
    global cached
    if not cached:
        proximity.build_cache(dataset, distance)
        cached = True

    within = []
    for candidate, gap in proximity.cache[instance]:
        if radius >= gap:
            within.append(candidate)
    return within