def _log_summary(header, values):
    """Log ``header`` followed by min / average / median / max of ``values``."""
    ordered = sorted(values)
    logging.info(header)
    if not ordered:
        # Nothing was recorded (e.g. zero queries): there is nothing to index.
        return
    logging.info("  {} -> min".format(ordered[0]))
    logging.info("  {} -> average".format(sum(ordered) / len(ordered)))
    # Floor division keeps the index an int on both Python 2 and Python 3.
    logging.info("  {} -> median".format(ordered[len(ordered) // 2]))
    logging.info("  {} -> max".format(ordered[-1]))


def space_partitioning():
    """ This is the main function for reading the input file, processing queries,
        and logging timing / traversal statistics. It takes a space-partitioning
        approach with two kd-trees: one over every topic and one over only the
        topics that have questions attached.

        The parsed input comes from ``read_input(sys.stdin)`` and the queries
        are executed by ``process_queries``, which is handed two lists
        (``stat_list`` and ``pass_list``) that are summarised afterwards.
    """
    # time.clock() was removed in Python 3.8; fall back to it only where
    # perf_counter does not exist (Python 2).
    timer = getattr(time, "perf_counter", None) or time.clock

    logging.info("Reading from sys.stdin...")
    data = read_input(sys.stdin)

    # Build the topics tree
    logging.info("Building a tree from {} topic points.".format(len(data['topics'])))
    t0 = timer()
    tree = kdtree.KDTree(data['topics'], ['x', 'y'])
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".
                 format(tree.number_nodes, t1 - t0))

    # Build another tree for questions queries, with empty topics excluded
    logging.info("Building a tree from {} topic points.".format(
        len(data['topics_with_questions'])))
    t0 = timer()
    # list() so the tree receives a real sequence even where values() is a view.
    pruned_tree = kdtree.KDTree(list(data['topics_with_questions'].values()),
                                ['x', 'y'])
    t1 = timer()
    # Report the size of the tree that was just built, not the first one.
    logging.info("Tree constructed, there are {} total nodes ({} s).".
                 format(pruned_tree.number_nodes, t1 - t0))

    # Actually process the queries
    query_count = len(data['queries'])
    logging.info("Starting {} queries...".format(query_count))
    stat_list = []
    pass_list = []
    t0 = timer()
    process_queries(data, tree, pruned_tree, stat_list, pass_list)
    t1 = timer()
    logging.info("Queries finished ({} s)".format(t1 - t0))

    # Pull together some analysis for debugging and optimization.
    _log_summary("In {} queries, the number passes was:".format(query_count),
                 pass_list)
    _log_summary("In {} queries, the number nodes visited was:".format(query_count),
                 stat_list)
Exemple #2
0
def main():
    """Load weather and accident records, index each in a kd-tree and dump
    the flattened trees to ``weather.json`` and ``accident.json``."""
    weather_data, accident_data = load_data('wcvaarr.db')
    print('PRIMA')

    # Both trees are keyed on the same three features; each gets its own list.
    axes = ('latitude', 'longitude', 'date')
    weather_tree = kdtree.KDTree(points=weather_data, features=list(axes))
    accident_tree = kdtree.KDTree(points=accident_data, features=list(axes))

    # Write the data out to file, weather first, then accidents.
    outputs = ((weather_tree, 'weather.json'), (accident_tree, 'accident.json'))
    for built_tree, path in outputs:
        flattened = built_tree.root.flatten()
        with open(path, 'w') as out:
            json.dump(flattened, out)
Exemple #3
0
def GetOffsets(patches, indices):
    """Build a kd-tree over ``patches`` and return the offsets computed by
    ``kdtree.get_annf_offsets``.

    The tree is configured from ``cfg.KDT_LEAF_SIZE`` and ``cfg.TAU``; the
    elapsed wall-clock time of both steps is printed to stdout.

    :param patches: patch data handed to the tree and to the offset lookup
    :param indices: forwarded unchanged to ``kdtree.get_annf_offsets``
    :return: the second element of the pair returned by ``get_annf_offsets``
    """
    start = time()
    kd = kdtree.KDTree(patches, leafsize=cfg.KDT_LEAF_SIZE, tau=cfg.TAU)
    # Only the offsets are needed; the first returned element is discarded.
    _, offsets = kdtree.get_annf_offsets(patches, indices, kd.tree, cfg.TAU)
    elapsed = time() - start
    # A single parenthesised argument prints identically on Python 2 and 3
    # (the two spaces reproduce the separator the print statement inserted).
    print("GetOffsets execution time:  {}".format(elapsed))
    return offsets
Exemple #4
0
 def __init__(self, metric, method='bruteforce'):
     """Set up the storage used by the selected nearest-neighbour method.

     Accepts 'bruteforce', 'kdtree', 'balltree' or 'se3balltree'; any other
     value just gets the plain ``nodes`` list.
     If kdtree, metric must be a weighted euclidean metric.

     When scikit-learn cannot be imported, the ball-tree methods fall back
     to brute force and ``self.method`` is rewritten accordingly.
     """
     self.metric = metric
     self.method = method
     if self.method == 'kdtree':
         self.kdtree = kdtree.KDTree(self.metric)
         if check_kdtree:
             # Debug aid: keep a second searcher to cross-check the kd-tree.
             self.checker = NearestNeighbors(self.metric)
             print("Debugging: Double checking KD-tree with nearest neighbors")
     elif self.method in ('balltree', 'se3balltree'):
         try:
             # Imported only to find out whether scikit-learn is available.
             from sklearn.neighbors import BallTree, DistanceMetric  # noqa: F401
         except ImportError:
             print("NearestNeighbors: scikit-learn is not installed, falling back to brute force")
             self.method = 'bruteforce'
             self.nodes = []
         else:
             self.points = []
             self.datas = []
             # NOTE(review): presumably marks the ball tree as needing a rebuild - confirm.
             self.dirty = True
     else:
         self.nodes = []
Exemple #5
0
def mosaic(img_set, img_target, tile_size, nearest_imgs=0, blend=0):
    """
    Return an image of img_target composed of images
    from img_set with tile_size. The images in img_set
    might be rescaled and cropped to fit tile_size.

    :param img_set: iterable of source images used as tiles
    :param img_target: image to reproduce; its ``size`` fixes the output size
    :param tile_size: ``(width, height)`` of one tile
    :param nearest_imgs: how many nearest tiles to alternate between per
        position; values below 1 are treated as 1 (always the single best
        match), because at least one candidate is needed to pick a tile
    :param blend: alpha passed to ``Image.blend`` when mixing the mosaic
        with ``img_target``
    :return: the blended mosaic image
    """
    # At least one candidate rank is required, otherwise no tile can be chosen.
    candidates = max(nearest_imgs, 1)

    # number of tiles to be used (floor division keeps the counts integral)
    tx = img_target.size[0] // tile_size[0] + 1
    ty = img_target.size[1] // tile_size[1] + 1

    # transform all images in set to tile_size
    img_set = [ImagePoint(rescale_crop(img, tile_size)) for img in img_set]

    # build mosaic image
    mosaic_img = Image.new('RGB', img_target.size)

    # rescale image into each pixel as a tile
    target_pixels = img_target.resize((tx, ty), Image.ANTIALIAS).load()

    # Create a KDTree of image set for tiles
    tree = kdtree.KDTree(img_set)

    # tiles tracking for alternation: one list of used positions per rank
    last_tile_position = [[] for _ in range(candidates)]

    # for each tile, select the best image and compose mosaic
    for x in range(tx):
        for y in range(ty):
            # selects neigh, the rank whose previous uses are farthest away
            fartest = 0
            neigh = 0
            for p in range(candidates):
                if not last_tile_position[p]:
                    # a rank that was never used wins immediately
                    neigh = p
                    break

                # distance to the closest earlier use of this rank
                closest = min(last_tile_position[p],
                              key=lambda e: distance((x, y), e))
                dist = distance((x, y), closest)
                if dist > fartest:
                    fartest = dist
                    neigh = p

            # sets current position as last
            last_tile_position[neigh].append((x, y))

            # the down-scaled target pixel stands in for the tile's mean colour
            target_mean = target_pixels[x, y][:3]

            # take the neigh-th entry of the nearest tiles to that colour
            best_tile = tree.query(target_mean, t=candidates)[neigh].image

            # apply best tile to mosaic image
            mosaic_img.paste(best_tile, (tile_size[0] * x, tile_size[1] * y))

    return Image.blend(mosaic_img, img_target, blend)
 def __init__(self, metric, method='bruteforce'):
     """Store the metric and create the container for the chosen method.

     With ``method == 'kdtree'`` a ``kdtree.KDTree`` is built from the
     metric (plus a second searcher when the module-level ``check_kdtree``
     flag is set); every other method uses a plain ``nodes`` list.
     """
     self.metric = metric
     self.method = method
     if self.method == 'kdtree':
         self.kdtree = kdtree.KDTree(self.metric)
         if check_kdtree:
             # Debug aid: keep a second searcher to cross-check the kd-tree.
             self.checker = NearestNeighbors(self.metric)
             # Single parenthesised argument: same output on Python 2 and 3.
             print("Debugging: Double checking KD-tree with nearest neighbors")
     else:
         self.nodes = []
Exemple #7
0
def do_stuff(set_array, range_bounds):
    """Helper that builds a tree from the given parameters.

    The tree is range-searched once with the two bounds and printed before
    being handed back.

    :param set_array: set of points
    :param range_bounds: search area (first two items are passed to the search)
    :return: kd-tree
    """
    kd = kdt.KDTree(set_array)
    lower = range_bounds[0]
    upper = range_bounds[1]
    kd.search_range(lower, upper)
    print(kd)
    return kd
Exemple #8
0
def rmspe(conf, point_list_brd):
    """
    Return the RMSPE error statistic generated from K-fold cross validation.

    Take the given KFoldConf object and an ordered broadcasted list of point
    objects and return the desired error statistic.

    For every fold, ``conf.m`` bags are drawn with replacement from the
    training points (each of size ``len(training) * conf.alpha``), one
    kd-tree is built per bag, and the per-tree interpolations are averaged
    before the relative error is accumulated.

    :param conf: object providing ``folds``, ``time_scale``, ``alpha``,
        ``m``, ``neighbors`` and ``power``
    :param point_list_brd: object whose ``value`` is the list of points
    :return: mean over folds of the root-mean-square percentage error
    """
    # Work on a private copy so scaling does not touch the broadcast value.
    points = copy.deepcopy(point_list_brd.value)

    # scale time dimensions
    for p in points:
        p.scale_time(conf.time_scale)

    # deal the points round-robin into conf.folds partitions
    partition = [list() for _ in range(conf.folds)]
    for index, p in enumerate(points):
        partition[index % conf.folds].append(p)

    # generate results for kfold cross validation with this err stat
    results = [0.0] * conf.folds
    for fold in range(conf.folds):

        # initialize validation_set and training_set
        validation_set = partition[fold]
        training_set = list()
        for other in range(conf.folds):
            if other != fold:
                training_set.extend(partition[other])

        # generate conf.m bags at conf.alpha by sampling with replacement.
        # The throwaway variable must not reuse the fold index: on Python 2
        # list-comprehension variables leak into the enclosing scope.
        n_prime = int(len(training_set) * conf.alpha)
        bags = [sample_with_replacement(training_set, n_prime)
                for _ in range(conf.m)]
        trees = [kdtree.KDTree(bag) for bag in bags]

        for point in validation_set:
            # compute the average estimate for pollution at point over bags
            avg_estimate = 0.0
            for tree in trees:
                nnl = tree.query(point, conf.neighbors)
                avg_estimate += point.interpolate(nnl, conf.power)
            avg_estimate /= conf.m
            # incorporate this information into the results vector
            actual = point.value()
            results[fold] += ((avg_estimate - actual) / actual) ** 2.0
        results[fold] /= len(validation_set)
        results[fold] = math.sqrt(results[fold]) * 100

    # return the average of the elements in the results vector
    return sum(results) / len(results)
def integration_test():
    """Compare thresholded density classification by ``tkde.TKDE`` against
    the exact density of a standard normal distribution.

    A density threshold is taken as the ``p`` quantile of the true pdf over
    a large random sample. A kd-tree is built over ``TRAIN_SIZE`` training
    draws, the estimator is evaluated on ``TEST_SIZE`` test draws, and the
    points on which estimate and true pdf fall on different sides of the
    threshold are printed.
    """
    k = 2
    p = 0.01
    TRAIN_SIZE = 100000
    TEST_SIZE = 1000

    mu = np.zeros(k)
    cov = np.diag(np.ones(k))
    dist = scipy.stats.multivariate_normal(mean=mu, cov=cov)
    sample_size = 1000000
    np.random.seed(0)
    threshold = np.percentile(dist.pdf(
        np.random.multivariate_normal(mean=mu, cov=cov, size=sample_size)
    ), p * 100)
    print("Threshold: {}".format(threshold))

    np.random.seed(0)
    training_data = np.random.multivariate_normal(mean=mu, cov=cov, size=TRAIN_SIZE)
    # Float literal so the exponent is -1/(k+4) even under Python 2 division.
    bw = TRAIN_SIZE ** (-1.0 / (k + 4))
    print("BW: {}".format(bw))
    kernel = Kernel(k=k, bw=bw)
    start_time = time.time()
    t = kdtree.KDTree(dim=k).build(training_data)
    print("Constructed Tree in: {}".format(time.time() - start_time))
    raw_threshold = threshold * TRAIN_SIZE
    eps = 0.01
    # Must not be called ``tkde``: binding that name locally would shadow the
    # module and make ``tkde.TKDE`` raise UnboundLocalError.
    estimator = tkde.TKDE(
        t,
        kernel,
        threshold=raw_threshold,
        epsilon=eps * raw_threshold
    )

    np.random.seed(1)
    test_data = np.random.multivariate_normal(mean=mu, cov=cov, size=TEST_SIZE)
    test_pdfs = np.array([estimator.calc(test_query)[0] for test_query in test_data])
    est_threshold = np.percentile(test_pdfs, p * 100)
    print("Est Threshold: {}".format(est_threshold))

    actual_test_pdfs = dist.pdf(test_data)
    # True where exactly one of the two values lies below the threshold.
    disagree_on = (actual_test_pdfs < threshold) != (test_pdfs < threshold)
    n_disagree = np.sum(disagree_on)
    print("disagree on: {} ".format(n_disagree))
    print(test_pdfs[disagree_on])
    print(actual_test_pdfs[disagree_on])
Exemple #10
0
def rmspe(conf, point_list_brd, radius_table_brd):
    """
    Compute the RMSPE error statistic by K-fold cross validation.

    ``conf`` is the KFoldConf object, ``point_list_brd`` the ordered
    broadcasted list of point objects, and ``radius_table_brd`` a broadcasted
    table that maps ``str(point.time_scale)`` to a distance limit used to
    filter each neighbour list. Returns the mean of the per-fold statistics.
    """
    fold_count = conf.folds

    # Scale a private copy so the broadcast value is left untouched.
    scaled = copy.deepcopy(point_list_brd.value)
    for pt in scaled:
        pt.scale_time(conf.time_scale)

    # Deal the points round-robin into the folds.
    partition = [[] for _ in range(fold_count)]
    for position, pt in enumerate(scaled):
        partition[position % fold_count].append(pt)

    results = [0.0] * fold_count
    for fold, validation_set in enumerate(partition):
        # Everything outside the current fold is training data.
        training_set = []
        for other, members in enumerate(partition):
            if other == fold:
                continue
            training_set.extend(members)

        tree = kdtree.KDTree(training_set)

        squared_error = 0.0
        for sample in validation_set:
            neighbours = tree.query(sample, conf.neighbors)
            # The modification below was made for experiment #03B
            distance_limit = radius_table_brd.value[str(sample.time_scale)]
            neighbours = exclude.exclude_nodes(neighbours, sample, distance_limit)
            relative = ((sample.interpolate(neighbours, conf.power) - sample.value()) /
                        sample.value())
            squared_error += relative**2.0

        mean_squared = squared_error / len(validation_set)
        results[fold] = math.sqrt(mean_squared) * 100

    # Average the per-fold statistics.
    return sum(results) / len(results)
Exemple #11
0
def mare(conf, point_list_brd):
    """
    Compute the MARE error statistic by K-fold cross validation.

    ``conf`` is the KFoldConf object and ``point_list_brd`` the ordered
    broadcasted list of point objects. Returns the mean of the per-fold
    statistics.
    """
    fold_count = conf.folds

    # Scale a private copy so the broadcast value is left untouched.
    scaled = copy.deepcopy(point_list_brd.value)
    for pt in scaled:
        pt.scale_time(conf.time_scale)

    # Deal the points round-robin into the folds.
    partition = [[] for _ in range(fold_count)]
    for position, pt in enumerate(scaled):
        partition[position % fold_count].append(pt)

    results = [0.0] * fold_count
    for fold, validation_set in enumerate(partition):
        # Everything outside the current fold is training data.
        training_set = []
        for other, members in enumerate(partition):
            if other == fold:
                continue
            training_set.extend(members)

        tree = kdtree.KDTree(training_set)

        absolute_error = 0.0
        for sample in validation_set:
            neighbours = tree.query(sample, conf.neighbors)
            absolute_error += (abs(sample.interpolate(neighbours, conf.power) - sample.value()) /
                               sample.value())

        results[fold] = absolute_error / len(validation_set)

    # Average the per-fold statistics.
    return sum(results) / len(results)
Exemple #12
0
def weakly_simplefy_polygon(polygon, cutouts):
    """Merge every cutout ring into ``polygon`` and return ``polygon``.

    Both arguments are modified in place: each cutout is reversed, cutouts
    are removed from ``cutouts`` as they are merged, and their vertices are
    spliced into ``polygon``.

    On each round a kd-tree is built over the polygon vertices (the last
    one is left out), every cutout vertex (again without the last one) is
    looked up with ``find_nearest``, and the cutout / vertex pair with the
    smallest reported limit is joined to the polygon at the matched vertex.
    Progress is written to stderr.

    :param polygon: list of ``(x, y)`` vertices; the last entry is skipped
        when indexing - presumably a closed ring, TODO confirm
    :param cutouts: list of vertex lists with the same layout
    :return: the same ``polygon`` list object
    """
    for c in cutouts:
        c.reverse()
    while cutouts:
        # NOTE(review): kdtree.c looks like a module-level visit counter that
        # is reset per round and reported below - confirm.
        kdtree.c = 0
        sys.stderr.write('todo:' + str(len(cutouts)) + '\n')
        tree = kdtree.KDTree([(x, y, i)
                              for i, (x, y) in enumerate(polygon[:-1])])
        c_best, best, limit, c_best_n = None, None, None, None
        for c in cutouts:
            for i, (x, y) in enumerate(c[:-1]):
                # The current best limit is passed along with each lookup.
                n_best, n_limit = tree.find_nearest((x, y, c, i), limit)
                if best is None or limit > n_limit:
                    c_best, c_best_n, best, limit = c, i, n_best, n_limit

        # Third item of the stored tuple is the polygon vertex index.
        pn = best[2]
        sys.stderr.write(
            str(kdtree.c) + ' ' + str(polygon[pn]) + ' ' +
            str(c_best[c_best_n]) + '\n')
        # Duplicate the bridge vertex, then walk the cutout starting and
        # ending at its matched vertex before continuing along the polygon.
        polygon[pn:pn] = [polygon[pn]
                          ] + c_best[c_best_n:-1] + c_best[:c_best_n + 1]
        cutouts.remove(c_best)
    return polygon
 def reset(self):
     """Replace the stored search structure with a fresh, empty one.

     Non-kd-tree methods get an empty ``nodes`` list; the kd-tree method
     gets a new ``kdtree.KDTree`` and, when ``check_kdtree`` is set, a new
     cross-checking searcher.
     """
     if self.method != 'kdtree':
         self.nodes = []
         return
     self.kdtree = kdtree.KDTree(self.metric)
     if check_kdtree:
         self.checker = NearestNeighbors(self.metric)
Exemple #14
0
def test_verbose():
    """ Processes the queries and displays output for checking accuracy, instead
      of just printing out query results. Very verbose, so running this on
      more than 25 topics or queries is a mistake.

      Input is read from stdin via ``read_input``; everything is reported
      through ``logging.info``.
    """
    # time.clock() was removed in Python 3.8; fall back to it only where
    # perf_counter does not exist (Python 2).
    timer = getattr(time, "perf_counter", None) or time.clock
    traversal_msg = (
        "  {} nodes (over {} passes) in the {}-node tree were traversed to get this result."
    )

    logging.info("Reading from sys.stdin...")
    data = read_input(sys.stdin)

    show_topics(data['topics'])
    show_queries(data['queries'])

    # Nature of the dataset
    logging.info("There are {} topics, {} questions, and {} queries.".format(
        len(data['topics']), len(data['questions']), len(data['queries'])))
    logging.info("There are {} topics that have no questions.".format(
        data['num_topics_without_questions']))
    logging.info("There are {} questions that have no topics.".format(
        data['num_questions_without_topics']))

    # Build the topic tree
    logging.info("Building tree from {} topic points...".format(
        len(data['topics'])))
    t0 = timer()
    tree = kdtree.KDTree(data['topics'], ['x', 'y'])
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        tree.number_nodes, t1 - t0))

    # Build the pruned topic tree for questions queries, with empty topics excluded
    logging.info("Building pruned tree from {} topic points...".format(
        len(data['topics_with_questions'])))
    t0 = timer()
    # list() so the tree receives a real sequence even where values() is a view.
    pruned_tree = kdtree.KDTree(list(data['topics_with_questions'].values()),
                                ['x', 'y'])
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        pruned_tree.number_nodes, t1 - t0))
    # NOTE(review): pruned_tree is built and reported but the question queries
    # below still search the full tree - confirm whether that is intended.

    # One stats dict is shared by all queries; the tree calls receive it and
    # the 'nodes' / 'passes' entries are read back after each query.
    stats = {}
    for query in data['queries']:

        # Pull out the number of results desired for the query.
        num_results = query['count']

        logging.info(
            "Query: The {query[count]} {type}'s nearest to ({query[x]:0.2f}, {query[y]:0.2f})"
            .format(query=query, type=query['type']))

        # Topic queries are straight up nearest neighbor queries.
        if query['type'] == 't':

            nearest = tree.k_nearest(query, num_results, stats)
            nearest['list'].sort(key=itemgetter('distance'))

            # Just print out the topics
            for count, result in enumerate(nearest['list']):
                logging.info(
                    "  Topic {0} - ({1[point]}), distance {1[distance]:0.2f}".
                    format(count, result))

            # And some nice info.
            logging.info(traversal_msg.format(
                stats['nodes'], stats['passes'], tree.number_nodes))

        # Otherwise search is more complicated because we care about number of
        # records associated with the nearest point(s)
        elif query['type'] == 'q':

            nearest = tree.k_nearest_linked_records(
                query, num_results, 'questions',
                data['max_possible_questions'], stats)

            nearest['questions'].sort(key=itemgetter('distance'))

            for count, result in enumerate(nearest['questions']):
                logging.info(
                    "  Question {1[id]}, distance {1[distance]:0.2f}".format(
                        count, result))

            logging.info(traversal_msg.format(
                stats['nodes'], stats['passes'], tree.number_nodes))
Exemple #15
0
def _print_summary(header, values):
    """Print ``header`` followed by min / average / median / max of ``values``."""
    ordered = sorted(values)
    print(header)
    if not ordered:
        # Nothing was recorded: there is nothing to index.
        return
    print("  {} -> min".format(ordered[0]))
    print("  {} -> average".format(sum(ordered) / len(ordered)))
    # Floor division keeps the index an int on both Python 2 and Python 3.
    print("  {} -> median".format(ordered[len(ordered) // 2]))
    print("  {} -> max".format(ordered[-1]))


def stress_test():
    """ This is a function to give an idea of the order of magnitude of running time
      on large inputs.

      It builds a tree from points sampled with ``sample_square``, runs one
      batch of nearest-neighbor queries and one batch of k-nearest-neighbor
      queries over freshly sampled test points, and prints the elapsed time
      and a summary of the per-query traversal statistics for each batch.
    """
    # time.clock() was removed in Python 3.8; fall back to it only where
    # perf_counter does not exist (Python 2).
    timer = getattr(time, "perf_counter", None) or time.clock

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    data = sample_square(origin, size, number)

    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    tree = kdtree.KDTree(data, ['x', 'y'])

    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    stats = {'nodes': 0}
    queries = 10000

    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)

    print("Test points created.")

    print("Start nearest-neighbor queries...")
    t0 = timer()
    stat_list = []
    for death in test_points:
        # Find the single nearest neighbor to the query point; only the
        # traversal count left in ``stats`` is of interest here.
        tree.root.nearest(death, stats)
        stat_list.append(stats['nodes'])
    time_elapsed = timer() - t0

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries finished ({} s).".format(time_elapsed))
    _print_summary(
        "In {} NN queries, the number nodes visited was:".format(queries),
        stat_list)

    k = 10
    print("Starting {}-nearest-neighbor queries...".format(k))
    t0 = timer()
    stat_list = []
    pass_list = []
    for death in test_points:
        # Find the k nearest neighbors to the query point
        tree.root.k_nearest(death, k, stats)
        stat_list.append(stats['nodes'])
        pass_list.append(stats['passes'])
    time_elapsed = timer() - t0

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries finished ({} s)".format(time_elapsed))
    _print_summary(
        "In {} {}NN queries, the number passes was:".format(queries, k),
        pass_list)
    _print_summary(
        "In {} {}NN queries, the number nodes visited was:".format(queries, k),
        stat_list)
Exemple #16
0
def check_tree():
    """ This is a function for testing the accuracy of results.

      A small tree is built from randomly sampled points and printed, then
      ``search``, ``nearest`` and ``k_nearest`` are run against it. The
      brute-force nearest points are printed for comparison by eye, and the
      partitions, data points and search point are written out through the
      ``*_to_file`` helpers.
    """

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 2, 'y': 1}
    side_length = 100
    number = 10
    data = sample_square(origin, side_length, number)

    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=side_length, number=number))

    # The tree only needs to be built once; it is reused for every check.
    tree = kdtree.KDTree(data, ['x', 'y'])

    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    print("Here's what the tree structure looks like: ")
    tree.root.print_tree()

    # Test searching for a point that is guaranteed to be in the tree
    print("Searching for a point guaranteed to be in the tree...")
    result = tree.root.search(data[0])
    print("Search result for ({0[x]:0.2f},{0[y]:0.2f}) is: {1}".format(
        data[0], result))

    # Dictionary to hold stats about kd-tree traversals.
    stats = {}

    # How many queries to make (and how many random test points to create)
    queries = 1
    print("Randomly generating a test point for querying the tree...")
    test_points = sample_square(origin, side_length, queries)
    target = test_points[0]

    # Test searching for a point not in the tree, to get the potential parent
    print("Searching for a point not in the tree...")
    result = tree.root.search(target)
    print("Search result for ({0[x]:0.2f},{0[y]:0.2f}) is: {1}".format(
        target, result))

    # Test nearest, which finds the single closest point to the query
    nearest = tree.root.nearest(target, stats)
    print(
        "Result of nearest for ({p[x]:0.2f},{p[y]:0.2f}) is: {result}".format(
            p=target, result=nearest['point']))

    print(
        "And {} nodes in the {}-node tree were traversed to get this result.".
        format(stats['nodes'], tree.number_nodes))

    # Test k-nearest, which finds the k nearest points to the query
    num_results = 5
    nearest = tree.root.k_nearest(target, num_results, stats)
    print("Result of {k}-nearest for ({p[x]:0.2f},{p[y]:0.2f}) is:".format(
        p=target, k=num_results))

    nearest['list'].sort(key=itemgetter('distance'))
    for count, point in enumerate(nearest['list']):
        print("  {0} - ({1[point]}), distance {1[distance]:0.2f}".format(
            count, point))

    print(
        "And {} nodes (over {} passes) in the {}-node tree were traversed to get this result."
        .format(stats['nodes'], stats['passes'], tree.number_nodes))

    # Calculate and display the actual distances of each point from the target
    print("And here are the top {} nearest points to the target: ".format(
        num_results))
    results = [{
        'x': point['x'],
        'y': point['y'],
        'distance': kdtree.KDTreeNode.distance(point, target)
    } for point in data]
    results.sort(key=itemgetter('distance'))
    for result in results[:num_results]:
        print("  ({0[x]:0.2f}, {0[y]:0.2f}) -> {0[distance]:0.2f}".format(
            result))

    # Opposite corner of the sampled square.
    partitions_to_file(tree, target, origin, {
        'x': origin['x'] + side_length,
        'y': origin['y'] + side_length
    })

    datapoints_to_file(data)

    searchpoints_to_file(test_points)
Exemple #17
0
def check_k_nearest_accuracy():
    """ This is a function for verifying the accuracy of results by
      calculating the actual nearest neighbors by brute force.
      Needless to say this is for debugging only.

      For every sampled test point the tree's k nearest neighbors are
      compared, position by position, with the k closest points found by
      scanning all data points. Mismatching queries are printed in full and
      a correct / incorrect tally is printed at the end.
    """

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    k = 10
    data = sample_square(origin, size, number)

    print("Testing the accuracy of {} nearest neighbors results...".format(k))

    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    tree = kdtree.KDTree(data, ['x', 'y'])

    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    queries = 100

    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)

    print("Test points created.")

    print("Start queries...")
    result_list = []
    stats = {}
    for test_point in test_points:

        # Find the k nearest neighbors to the query point
        k_nearest = tree.k_nearest(test_point, k, stats)
        found = k_nearest['list']
        # Order by distance, as the other callers do before using the list,
        # so it can be compared position by position with the sorted
        # brute-force result.
        found.sort(key=itemgetter('distance'))

        # Now calculate the actual distances of each point from the target
        # for the purpose of testing accuracy
        all_points = [{
            'x': point['x'],
            'y': point['y'],
            'distance': kdtree.KDTreeNode.distance(point, test_point)
        } for point in data]

        # Sort by distance from the test point and keep the k closest.
        all_points.sort(key=itemgetter('distance'))
        real_nearest = all_points[:k]

        # Mark whether the result was correct or not in the result_list. So
        # if result_list[4] is false it means the nearest calculation was wrong for
        # test_points[4]
        # The == operator should work here because the numbers are pulled/calculated
        # in exactly the same way.
        # A result of the wrong length (including an empty one) is a mismatch.
        correct = len(found) == len(real_nearest) and all(
            expected['x'] == neighbor['point'].point['x']
            and expected['y'] == neighbor['point'].point['y']
            and expected['distance'] == neighbor['distance']
            for expected, neighbor in zip(real_nearest, found))

        result_list.append(correct)

        if not correct:
            # Print results if they don't match.
            print("Bruteforce results: ")

            for index, result in enumerate(real_nearest):
                print(
                    "{0}: {1[x]:0.2f}, {1[y]:0.2f}, distance {1[distance]:0.2f}"
                    .format(index, result))

            print("Kdtree results: ")

            for index, neighbor in enumerate(found):
                print(
                    "{0}: {1[x]:0.2f}, {1[y]:0.2f}, distance {2:0.2f}".format(
                        index, neighbor['point'].point, neighbor['distance']))

    print("Queries and testing finished.")

    frequencies = Counter(result_list)
    print("In {} queries, {} were correct and {} were incorrect.".format(
        queries, frequencies[True], frequencies[False]))
Exemple #18
0
def _plot_plane(ax, node, num_dims, default_plane_width=10, num_samples=10):
    """Draw the splitting plane of ``node`` as a surface on ``ax``.

    The plane is constant at ``node.data[node.axis]`` along the node's own
    axis and spans the boundaries reported by ``helper._boundaries`` along
    the two following axes; an open boundary (``None``) is replaced by
    ``default_plane_width``. Each spanned axis is sampled ``num_samples``
    times.
    """
    axis = node.axis

    # Rotate the boundary list so the node's own axis comes first.
    bounds = helper._boundaries(node, num_dims)
    rotated = bounds[axis:] + bounds[:axis]

    first_range = _dim_range(rotated[1], default_plane_width, num_samples)
    second_range = _dim_range(rotated[2], default_plane_width, num_samples)
    first_grid, second_grid = np.meshgrid(first_range, second_range)

    # A grid holding the split value everywhere.
    split_value = node.data[axis]
    fixed = np.linspace(split_value, split_value, num_samples)
    fixed_grid = np.meshgrid(fixed, fixed)[0]

    # Rotate back so each grid lines up with its original coordinate.
    surfaces = [fixed_grid, first_grid, second_grid]
    surfaces = surfaces[-axis:] + surfaces[:-axis]
    ax.plot_surface(surfaces[0], surfaces[1], surfaces[2], alpha=0.8)

def _dim_range(boundary, default_plane_width, num_samples):
    """Return ``num_samples`` evenly spaced values across ``boundary``.

    ``boundary`` is a ``(low, high)`` pair; a ``None`` low end becomes
    ``-default_plane_width`` and a ``None`` high end ``default_plane_width``.
    """
    low = boundary[0]
    high = boundary[1]
    if low is None:
        low = -default_plane_width
    if high is None:
        high = default_plane_width
    return np.linspace(low, high, num_samples)

# Script entry point: build a 3-d tree from the bundled test data, run one
# k-nearest-neighbour query for a random point and hand the tree, the query
# point and the query result to knn().
if __name__ == "__main__":

    num_dims = 3
    tree = kdtree.KDTree(test_data.list3d_2, num_dims)

    # Query point with the same number of dimensions as the tree.
    point = test_data.rand_point(num_dims)
    k = 7  # number of neighbours requested
    result = tree.knn(point, k)

    # NOTE(review): knn() is defined outside this view; presumably it plots
    # the tree together with the query result - confirm.
    knn(tree, point, result)
Exemple #19
0
def _data_interpolation(centroid_rdd, pollutant):
    """Interpolate a pollutant at the centroid locations and save a report.

    Args:
        centroid_rdd: RDD whose elements unpack into ``(record, month)``;
            each element is turned into a ``point.QueryPoint``.
        pollutant: ``'ozone'`` selects the ozone data set; any other value
            selects the PM2.5 data set.

    Side effects:
        Writes one text line per query point, formatted as
        ``blk_id,month,year,max_est,mean_est``, with ``saveAsTextFile`` to
        ``ozone_inter_output`` or ``pm25_inter_output``.
    """

    # Set parameters unique for this interpolation task.
    if pollutant == 'ozone':
        time_scale = (0.4 + 2.0) / 2.0
        data_file = '../data/clean/monthly_ozone_1990-2015.csv'
        output_dir = 'ozone_inter_output'
    else:
        time_scale = (0.18 + 0.16) / 2.0
        data_file = '../data/clean/monthly_pm25_1990-2015.csv'
        output_dir = 'pm25_inter_output'

    # Load the observations and scale their time dimension.
    point_list = [p.scale_time(time_scale)
                  for p in point.load_point_file(data_file)]

    # Bag the point list and produce a list of trees to use for prediction.
    bag_size = int(len(point_list) * ALPHA)
    bags = [kfold.sample_with_replacement(point_list, bag_size)
            for _ in range(NUM_BAGS)]
    trees = [kdtree.KDTree(bag) for bag in bags]
    tree_tuple_brd = SC.broadcast(trees)

    # Define a mapper for interpolating each query point.
    def interpolation_mapper(query_point, tree_tuple_brd):
        """
        Set the max and mean estimates for query_point using the list
        of KDTree objects for interpolation.
        """

        # Generate one (max, mean) estimate per bagged tree.
        estimates = []
        for tree in tree_tuple_brd.value:
            nodes = tree.query(query_point, NEIGHBORS)
            estimates.append(query_point.interpolate(nodes, POWER))

        # Average the estimates from each bag and store them on the point.
        query_point.max_est = sum(est[0] for est in estimates) / len(estimates)
        query_point.mean_est = sum(est[1] for est in estimates) / len(estimates)

        return query_point

    # Transform centroid_rdd into an RDD of query points, scale the time
    # dimension, and cache the intermediate result.
    def query_point_factory(record, month):
        """ Build a QueryPoint from a CSV record and a month value. """
        result = point.QueryPoint(record)
        result.month = month
        return result
    query_point_rdd = centroid_rdd.map(lambda p: query_point_factory(*p))
    query_point_rdd = query_point_rdd.map(lambda q: q.scale_time(time_scale))
    query_point_rdd = query_point_rdd.cache()

    # Map the query_point_rdd through the interpolation mapper.
    query_point_rdd = query_point_rdd.map(lambda q: interpolation_mapper(q, tree_tuple_brd)).cache()

    # ----------------------  Aggregation  ----------------------------------

    def simple_report(query_point):
        """ Format one query point as a comma-separated report line. """
        # month is split into a 1-based calendar month and a year offset
        # from 1990. Floor division keeps the year a whole number under
        # Python 3 as well (plain "/" would yield e.g. 1990.4166...).
        # NOTE(review): presumably month is an integer month index -- confirm.
        month = (query_point.month % 12) + 1
        year = (query_point.month // 12) + 1990

        return ','.join([query_point.blk_id,
                         str(month),
                         str(year),
                         str(query_point.max_est),
                         str(query_point.mean_est)])

    # Write the output to a file.
    query_point_rdd.map(simple_report).saveAsTextFile(output_dir)
Exemple #20
0
	args = parser.parse_args()

	
	# Construct database	
	fields = ["x","y"]
	if args.quadtree:
		fields.append("quad")
	dtb =  db.Database(fields)
	field_idx = dtb.fields()
	
	# Load the data
	data_loader = dl.DataLoader()
	data_loader.load(args,dtb)

	# Create KDTree
	tree = kd.KDTree(dtb, {'max-depth' : args.max_depth, 'max-elements' : args.max_elements})

	plotter = pl.Plotter(tree,dtb,args)


	# Testing: Implementing the QuadTree
	if args.quadtree:
		quadtree = qt.QuadTree(tree.bounding_box(), args.quadtree)
		if args.quadshow:
			plotter.add_quadtree(quadtree)

	# Testing: Implementing the KDTree
	if args.closest:
		# This is for testing, to check if your closest query is correctly implemented

		# Step 1 query and fetch		
Exemple #21
0
    BLUE = (0, 0, 255)

    dd = [1000, 1000]
    game_display = pygame.display.set_mode(dd)
    pygame.display.set_caption('Quad Tree Test')
    pygame.display.update()

    game_exit = False
    clock = pygame.time.Clock()
    fps = 100
    one_pressed = False
    three_pressed = False
    state = True
    counter = 0
    boundary = kdtr.Boundary(0, 0, dd[0], dd[1])
    quadtree = kdtr.KDTree(boundary, 4)
    rectangle = None
    points = []
    for i in range(1000):
        p = kdtr.Point(random.gauss(dd[0] / 2, dd[0] / 8),
                       random.gauss(dd[1] / 2, dd[1] / 8))
        quadtree.insert(p)
        points.append(p)
    # for j in range((dd[0]/2)-10, (dd[0]/2)+10):
    #     for k in range((dd[1]/2)-10, (dd[1]/2)+10):
    #         p = kdtr.Point(j, k)
    #         points.append(p)
    #         quadtree.insert(p)

    while not game_exit:
        points_in_range = []
Exemple #22
0
def database_to_tree(c, dim=512):
    """Build a KDTree from every row of the ``responseData`` table.

    ``c`` is a database cursor. Each fetched row must unpack into
    ``(sentence, source, vector)`` and is stored as a dict with those
    keys. ``dim`` is passed through to ``kdtree.KDTree``.
    """
    c.execute("SELECT * FROM responseData")
    records = [
        {"sentence": sentence, "source": source, "vector": vector}
        for sentence, source, vector in c.fetchall()
    ]
    return kdtree.KDTree(records, dim)
Exemple #23
0
def _plot_line(node, num_dims, default_plane_width=10, num_samples=10):
    """Plot the splitting line of ``node`` on the current pyplot axes.

    The line is constant at ``node.data[node.axis]`` along the node's
    split axis. Along the other axis it spans the range returned by
    ``helper._boundaries``; a ``None`` bound is replaced by
    ``default_plane_width`` (see ``_dim_range``).
    """
    axis = node.axis

    # Rotate the per-dimension boundaries so the split axis comes first.
    bounds = helper._boundaries(node, num_dims)
    rotated_bounds = bounds[axis:] + bounds[:axis]

    # Samples along the free dimension, and the fixed split coordinate
    # repeated once per sample.
    free_samples = _dim_range(rotated_bounds[1], default_plane_width, num_samples)
    split_value = node.data[axis]
    fixed_samples = np.linspace(split_value, split_value, num_samples)

    # Rotate the coordinates so the fixed ones land at index ``axis`` again.
    coords = [fixed_samples, free_samples]
    x_values, y_values = coords[-axis:] + coords[:-axis]
    plt.plot(x_values, y_values, alpha=0.8)


def _dim_range(boundary, default_plane_width, num_samples):
    """Return ``num_samples`` evenly spaced values covering ``boundary``.

    ``boundary[0]`` and ``boundary[1]`` are the lower and upper limits. A
    limit of ``None`` is replaced by ``-default_plane_width`` (lower) or
    ``default_plane_width`` (upper).
    """
    start = boundary[0]
    stop = boundary[1]

    if start is None:
        start = -default_plane_width
    if stop is None:
        stop = default_plane_width

    return np.linspace(start, stop, num_samples)


if __name__ == "__main__":
    # Demo run: build a 2-d tree from the bundled sample list, look up the
    # k nearest neighbours of a random point and hand the result to knn().
    num_dims = 2
    k = 3

    tree = kdtree.KDTree(test_data.list2d_1, num_dims)
    point = test_data.rand_point(num_dims)
    result = tree.knn(point, k)

    knn(tree, point, result)
import math
import random
import re
import sqlite3
import string
import sys

import kdtree

app = flask.Flask(__name__)

# Load the trees once at import time; the process exits with status 1 if
# either tree cannot be built.
try:
    print('Loading trees...', end='')
    sys.stdout.flush()
    weather_tree = kdtree.KDTree(features=['latitude', 'longitude', 'date'],
                                 json_file='json/weather.json')
    accident_tree = kdtree.KDTree(features=['latitude', 'longitude', 'date'],
                                  json_file='json/accident.json')
    print('done', end='\n\n')
except Exception as err:
    # Catch Exception instead of using a bare "except:" so that
    # KeyboardInterrupt and SystemExit are not swallowed, and report the
    # underlying error so the failure can be diagnosed.
    print(
        'Unable to load trees. "json/accident.json" or "json/weather.json" may not exist.',
        file=sys.stderr)
    print('Reason: {}'.format(err), file=sys.stderr)
    sys.exit(1)


def convert_from_dms(degree, minute, second, direction):
    """
    Convert from degrees-minute-second form to decimal form and return it.
    """
    return direction * (degree + (1.0 / 60.0) * minute +
Exemple #25
0
def check_nearest_accuracy():
    """Verify kd-tree nearest-neighbor results against a brute-force search.

    Samples random points on a square, builds a 2d-tree from them, runs a
    batch of random nearest-neighbor queries, and compares each answer
    with the closest point found by scanning every data point. Prints how
    many answers were correct and summary statistics of the ``'nodes'``
    counter recorded after each query. For debugging only.
    """

    print("Testing the accuracy of nearest neighbor results...")

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    data = sample_square(origin, size, number)

    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data, dimensions)

    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    # NOTE(review): this dict is created once and never reset in this
    # function; if nearest() only increments stats['nodes'], the values
    # recorded below are cumulative rather than per-query -- confirm.
    stats = {'nodes': 0}
    queries = 100

    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)

    print("Test points created.")

    print("Start queries...")
    stat_list = []
    result_list = []
    for test_point in test_points:

        # Find the single nearest neighbor to the query point
        nearest = tree.root.nearest(test_point, stats)
        stat_list.append(stats['nodes'])

        # Brute force: compute the distance of every data point from the
        # target and keep the closest one. min() returns the first minimal
        # element, the same one a stable sort would place at index 0.
        real_nearest = min(
            ({
                'x': point['x'],
                'y': point['y'],
                'distance': kdtree.KDTreeNode.distance(point, test_point)
            } for point in data),
            key=itemgetter('distance'))

        # Mark whether the result was correct or not in the result_list. So
        # if result_list[4] is false it means the nearest calculation was wrong for
        # test_points[4]
        # The == operator should work here because the numbers are pulled/calculated
        # in exactly the same way.
        correct = (real_nearest['x'] == nearest['point'].point['x']
                   and real_nearest['y'] == nearest['point'].point['y']
                   and real_nearest['distance'] == nearest['distance'])

        result_list.append(correct)

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries and testing finished.")

    frequencies = Counter(result_list)
    print("In {} queries, {} were correct and {} were incorrect.".format(
        queries, frequencies[True], frequencies[False]))

    stat_list.sort()
    print("In {} queries on the {}-node tree, the number nodes visited was:".
          format(queries, tree.number_nodes))
    print("  {} -> min".format(stat_list[0]))
    print("  {} -> average".format(sum(stat_list) / len(stat_list)))
    # Floor division: a list index must be an int (plain "/" yields a
    # float on Python 3 and raises TypeError).
    print("  {} -> median".format(stat_list[len(stat_list) // 2]))
    print("  {} -> max".format(stat_list[-1]))