def centroid(X: np.ndarray, tree: BallTree) -> tuple:
    '''Find the centroid of the distribution given by X.'''

    # Find an appropriate radius.
    radius = determine_radius(X, tree)
    rho_max = 0
    runs = []

    # Make sure to sample the whole space.
    for init in range(20):

        # Choose a random initialization.
        points = [X[np.random.choice(X.shape[0])]]
        density = len(tree.query_radius(two_d(points[-1]), r=radius)[0])

        # Start MCMC-esque exploration procedure.
        for i in range(100):

            potential = tree.query_radius(two_d(points[-1]), r=radius)
            new_point = X[np.random.choice(potential[0])]
            new_density = len(tree.query_radius(two_d(new_point), r=radius)[0])

            if np.random.random() < (new_density / density):
                points.append(new_point)
                density = new_density

            if rho_max < density:
                rho_max = density
                best_run = init

        runs.append(points)

    return np.array(runs[best_run]), radius, densest(runs[best_run], tree,
                                                     radius)
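
The density measure driving the walk above is just the neighbour count within radius. A standalone sketch of that pattern on toy data, with a made-up radius standing in for determine_radius() and an explicit reshape standing in for two_d():

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 2))       # toy 2-D point cloud
tree = BallTree(X)
radius = 0.3                        # assumed value, stands in for determine_radius()

point = X[rng.integers(X.shape[0])]
# query_radius expects a 2-D array of query points, hence the reshape;
# [0] unpacks the index array for the single query point.
density = len(tree.query_radius(point.reshape(1, -1), r=radius)[0])
print(density)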
Example #2
def _calc_tree(xx, yy, radius):
    X = np.zeros((len(xx), 2), dtype='float')
    X[:, 0] = xx[:]
    X[:, 1] = yy[:]
    tree = BallTree(X, metric='euclidean')
    ind = tree.query_radius(X, r=radius)
    ind_sw = tree.query_radius(X, r=VARIANCE_RADIUS_SW)
    return ind, ind_sw
Example #4
    def _predict_gam(ds, conf, time, quantiles=None, size=None,
                     return_gam=False,  return_counts=False,
                     max_time_diff=200):
        # insert 0s for every timeseries in the ensemble for the reference
        # period at -35 BP (1985)

        climate = conf.climate + '_ensemble'
        age = conf.age + '_ensemble'

        x = ds[age].values.ravel()
        y = ds[climate].values.ravel()

        mask = (~np.isnan(x)) & (~np.isnan(y))
        if not mask.any():
            return
        else:
            x = x[mask]
            y = y[mask]

        gam = pygam.LinearGAM(pygam.s(0)).gridsearch(
            x[:, np.newaxis], y, progress=False)

        time = np.asarray(time)

        ret = (gam.predict(time), )

        if quantiles is not None:
            ret = ret + (gam.prediction_intervals(time, quantiles=quantiles), )
        if size is not None:

            ret = ret + (gam.sample(
                x[:, np.newaxis], y, sample_at_X=time, n_draws=size).T, )
        if return_counts:
            tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
            counts = tree.query_radius(time[:, np.newaxis], return_counts,
                                       count_only=True).astype(float)
            ret = ret + (counts, )

        # look how many samples in the ensemble fall into the `max_time_diff`
        # time interval around the predicted time
        tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
        counts = tree.query_radius(time[:, np.newaxis], max_time_diff,
                                   count_only=True)

        idx = counts < 100
        if idx.any():
            for arr in ret:
                arr[idx] = np.nan

        if return_gam:
            return ret + (gam, )
        else:
            return ret
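
The count-within-max_time_diff step above can be isolated as follows; the ages and prediction grid are toy stand-ins, and 1-D data is passed to BallTree as a column via np.newaxis:

import numpy as np
from sklearn.neighbors import BallTree

ages = np.random.uniform(0, 10000, size=2000)   # hypothetical ensemble ages
time = np.linspace(0, 10000, 51)                # prediction grid
max_time_diff = 200

tree = BallTree(ages[:, np.newaxis])
counts = tree.query_radius(time[:, np.newaxis], max_time_diff, count_only=True)
sparse = counts < 100                           # prediction times with too few samples
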
def get_nearest(infected_coordinates, uninfected_coordinates, d):
    """ This method returns the indices and distances of the uninfected users that are within a distance "d"(paramater) of the infected users.
        Input:
        ------
           @infected_coordinates: array
              Latitude and lontitude of GPS coordinates of infected users.
           @uninfected_coordinates: array
              Latitude and lontitude of GPS coordinates of uninfected users.
           @d : int
              distance parameter
        Output:
        -------
           @indices : array
              indices of the uninfected users that are within a distance "d" of the infected users.
           @distances : array
              distance fron uninfected users to infected users.
    """
    # Create tree from the GPS coordinates of uninfected users
    tree = BallTree(uninfected_coordinates, leaf_size=15, metric='haversine')
    indices, distances = tree.query_radius(infected_coordinates,
                                           r=d,
                                           return_distance=True)
    indices = indices.transpose()
    distances = distances.transpose()
    return indices, distances
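
A minimal, hypothetical usage of the same pattern with made-up coordinates and a 500 m radius. Note that with metric='haversine' BallTree expects latitude/longitude in radians and an angular radius (metres divided by the Earth's radius); the function above performs no conversion itself:

import numpy as np
from sklearn.neighbors import BallTree

EARTH_RADIUS_M = 6371000.0

# Hypothetical lat/lon pairs, converted from degrees to radians.
infected = np.radians([[48.8566, 2.3522], [45.7640, 4.8357]])
uninfected = np.radians([[48.8570, 2.3510], [43.2965, 5.3698], [48.8600, 2.3600]])

tree = BallTree(uninfected, leaf_size=15, metric='haversine')
ind, dist = tree.query_radius(infected, r=500.0 / EARTH_RADIUS_M,
                              return_distance=True)
dist_m = [d * EARTH_RADIUS_M for d in dist]   # back to metres, one array per query point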
Example #6
def getMosquitoActivity(lat, long):
    bites_df = getBitesDF()
    bites_df = bites_df.drop('image_Base64', 1)
    bites_df = bites_df.drop('image_name', 1)
    day_1_ = pd.to_datetime(int(time.time()), unit='s')
    day_0_ = day_1_ - timedelta(days=7)
    mask = ((bites_df.index < day_1_) & (bites_df.index > day_0_))

    bites_df = bites_df.loc[mask]
    rad_bites_df = pd.DataFrame()
    rad_bites_df['timestamp'] = bites_df.index
    rad_bites_df['lat_rad'] = toRad_vec(bites_df['latitude'])
    rad_bites_df['long_rad'] = toRad_vec(bites_df['longitude'])
    rad_bites_df = rad_bites_df.set_index('timestamp', drop=True)

    bt = BallTree(rad_bites_df.to_numpy(), metric='haversine')
    indices, distances = bt.query_radius(
        [[latLongtoRad(float(lat)),
          latLongtoRad(float(long))]],
        r=RADIUS_DEFAULT,
        return_distance=True)
    print(indices)
    print(distances)

    nn_list = indices[0].tolist()
    return bites_df.iloc[nn_list, :].to_dict(orient='records')
Example #7
def find_hits_for_targets(
    *,
    targets: List[Tuple[float, ...]],
    predictions: List[Tuple[float, ...]],
    radius: float,
) -> List[Tuple[int, ...]]:
    """
    Generates a list of the predicted points that are within a radius r of the
    targets. The indices are returned in sorted order, from closest to
    farthest point.

    Parameters
    ----------
    targets
        A list of target points
    predictions
        A list of predicted points
    radius
        The maximum distance that two points can be apart for them to be
        considered a hit

    Returns
    -------

    A list which has the same length as the targets list. Each element within
    this list contains another list that contains the indices of the
    predictions that are considered hits.

    """
    predictions_tree = BallTree(array(predictions))
    hits, _ = predictions_tree.query_radius(X=targets,
                                            r=radius,
                                            return_distance=True,
                                            sort_results=True)
    return hits
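
A small self-contained check of the call above on toy points; sort_results=True only works together with return_distance=True, which is why the distances are requested and then discarded:

from numpy import array
from sklearn.neighbors import BallTree

targets = [(0.0, 0.0), (5.0, 5.0)]
predictions = [(0.1, 0.0), (4.9, 5.2), (10.0, 10.0)]

tree = BallTree(array(predictions))
hits, _ = tree.query_radius(X=array(targets), r=1.0,
                            return_distance=True, sort_results=True)
print([h.tolist() for h in hits])   # [[0], [1]] for these toy points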
Example #8
def get_score_for_ideal_points(points, ideal_points, IDEAL_HEIGHT):

    model, scene, after_tps = nrr.non_rigid_registration(points, ideal_points)

    print("Model: ", model)
    print("Scene: ", scene)
    print("after_tps: ", after_tps)

    distances_array = []

    ballTree = BallTree(after_tps)

    i = 0
    for point in ideal_points:
        ind = ballTree.query_radius(np.atleast_2d(point), IDEAL_HEIGHT)
        if len(ind[0]) == 1:
            distances_array.append(np.linalg.norm(point -
                                                  after_tps[ind[0][0]]))
        else:
            i += 1
            distances_array.append(1000)

    print("SCORE: ", np.mean(distances_array))

    return np.mean(distances_array)
Example #9
class BallTreeANN:
    def __init__(self):
        """
        Constructor
        """
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        distances, indices = self.nbrs.query_radius(vector, r=radious, return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
Example #10
def query_neighbors(coords, r2, distance_metric='haversine', weighted=False):
    """Build a network from a set of points and a threshold distance.
    
    Parameters
    ----------
        coords : array-like (N, 2)
        r2 : float
            Threshold distance.
        distance_metric : str
            Either 'haversine' or None.
    
    Returns
    -------
        ind : array of arrays
            For each point, the indices of the points within r2 (the raw
            query_radius output).
        dist : array of arrays
            The matching distances; only returned when weighted is True,
            since return_distance=weighted is passed to query_radius.
    """

    # If the metric is haversine update points (to radians) and r2 accordingly.
    if distance_metric == 'haversine':
        coords = np.radians(coords)
        r2 = r2 / 6371000

    # Init tree
    tree = BallTree(coords, metric=distance_metric)

    # Query
    return tree.query_radius(coords, r=r2, return_distance=weighted)
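
To recover the nodes/edges/singletons structure described in the summary line, the raw neighbour lists can be post-processed; a sketch with made-up coordinates:

import numpy as np
from sklearn.neighbors import BallTree

coords = np.array([[55.68, 12.57], [55.6801, 12.5701], [55.70, 12.60]])  # toy GPS points
r2 = 100.0                                                               # threshold in metres

coords_rad = np.radians(coords)
tree = BallTree(coords_rad, metric='haversine')
ind = tree.query_radius(coords_rad, r=r2 / 6371000)

# Undirected edges with self-matches dropped; indices that appear in no edge are singletons.
edges = {tuple(sorted((i, j))) for i, nbrs in enumerate(ind) for j in nbrs if i != j}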
Example #11
def tract_smooth(optional_flags, tractography, var, file_output):
    from sklearn.neighbors import BallTree

    var = float(var)
    std = var**2

    points = tractography.original_tracts()

    all_points = numpy.vstack(points)
    bt = BallTree(all_points)
    N = len(all_points) / 3
    I = numpy.eye(3)[None, ...]
    for i, tract in enumerate(tractography.original_tracts()):
        # all_points = numpy.vstack(points[:i] + points[i + 1:])
        # bt = BallTree(all_points)

        diff = numpy.diff(tract, axis=0)
        diff = numpy.vstack((diff, diff[-1]))
        lengths = numpy.sqrt((diff**2).sum(1))
        # cum_lengths = numpy.cumsum(lengths)

        diff_norm = diff / lengths[:, None]
        tangent_lines = diff_norm[:, None, :] * diff_norm[:, :, None]
        normal_planes = I - tangent_lines
        #        weight_matrices = normal_planes + 1e10 * tangent_lines

        N = max(len(d) for d in bt.query_radius(tract, var * 3))

        close_point_distances, close_point_indices = bt.query(tract, N)

        close_points = all_points[close_point_indices]
        difference_vectors = close_points - tract[:, None, :]
        projected_vectors = (normal_planes[:, None, :] *
                             difference_vectors[..., None]).sum(-2)
        projected_points = projected_vectors + tract[:, None, :]
        # projected_distances2 = (projected_vectors**2).sum(-1)
        # projected_weights = numpy.exp(- .5 * projected_distances2 / std)
        # projected_weights /= projected_weights.sum(-1)[:, None]

        weights = numpy.exp(-.5 * close_point_distances**2 / std)[..., None]
        weights /= weights.sum(-2)[..., None]

        # tract += (weights * projected_vectors).sum(-2)

        #        weighted_distances = (
        #            weight_matrices[:, None, :] *
        #            difference_vectors[..., None]
        #        ).sum(-2)
        #        weighted_distances *= difference_vectors
        #        weighted_distances = weighted_distances.sum(-1) ** .5
        # weighted_points = (projected_points * weights).sum(1)

        weighted_points = (projected_points * weights).sum(1)

        tract[:] = weighted_points
        # tract /= norm_term

    return Tractography(tractography.original_tracts(),
                        tractography.original_tracts_data(),
                        **tractography.extra_args)
Example #12
    def get_nearest_neighbours(
        df: pd.DataFrame,
        target_id: int,
        dmax: Optional[int] = None,
        extent: Optional[int] = None,
    ) -> tuple:
        """
        Args:
            df: halo DataFrame
            target_id: object id for which to find NNs
            dmax: maximal distance between objects
            extent: optional factor by which dmax is scaled

        Return:
            indices and distances
        """
        pos = df[["theta1_deg", "theta2_deg"]].values
        pos_i = df[df["id"] == target_id][["theta1_deg", "theta2_deg"]].values
        if dmax is None:
            dmax = df[df["id"] == target_id]["r200_deg"].values
        if extent is not None:
            dmax *= extent
        if len(pos_i.shape) == 1:
            pos_i = pos_i[np.newaxis, :]
        btree = BallTree(pos)
        pairs = btree.query_radius(pos_i, dmax, return_distance=True,)
        return pairs[0][0], pairs[1][0]
Example #13
def faithful_downsampling(data: np.ndarray, h: float):
    """
    An implementation of faithful downsampling as described in:  Zare H, Shooshtari P, Gupta A, Brinkman R.
    Data reduction for spectral clustering to analyze high throughput flow cytometry data.
    BMC Bioinformatics 2010;11:403

    Parameters
    -----------
    data: Numpy.array
        numpy array to be down-sampled
    h: float
        radius for nearest neighbours search

    Returns
    --------
    Numpy.array
        Down-sampled array
    """
    communities = None
    registered = np.zeros(data.shape[0])
    tree = BallTree(data)
    while not all([x == 1 for x in registered]):
        i_ = np.random.choice(np.where(registered == 0)[0])
        registered[i_] = 1
        registering_idx = tree.query_radius(data[i_].reshape(1, -1), r=h)[0]
        registering_idx = [t for t in registering_idx if t != i_]
        registered[registering_idx] = 1
        if communities is None:
            communities = data[registering_idx]
        else:
            communities = np.unique(np.concatenate(
                (communities, data[registering_idx]), 0),
                                    axis=0)
    return communities
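
A quick usage sketch, assuming faithful_downsampling above is in scope together with its module-level imports (numpy as np, BallTree); the data and radius h are made up:

import numpy as np

data = np.random.normal(size=(1000, 2))     # toy events in a 2-D channel space
downsampled = faithful_downsampling(data, h=0.5)
print(data.shape, '->', downsampled.shape)  # fewer, de-duplicated rows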
Example #14
def getVoxel(seedPoint,rad,cloud):
	kdt = BallTree(cloud, leaf_size=5,metric='euclidean')
	#print('Extracting with rad %f'%rad)
	ind = kdt.query_radius(seedPoint.reshape(1,-1),r=rad)
	point_ids=np.expand_dims(ind,axis=0)[0,0].reshape(1,-1)
	print(point_ids.shape)
	#print(scene_cloud[point_ids[0,:],:].shape)
	return cloud[point_ids[0,:],:]
def is_repeated_stop(stop_coords, all_coords, distance):
    if len(all_coords) == 0: return False
    tree = BallTree(radians(all_coords), leaf_size=2, metric='haversine')
    result = tree.query_radius(radians([stop_coords]),
                               r=calculate_radius(distance),
                               count_only=True)[0]
    if result == 0: return False
    else: return True
def densest_radius(X: np.ndarray, support_idx: np.ndarray, tree: BallTree,
                   d_centroids: float) -> int:
    '''Identify the support vector with the densest radius.'''

    return np.argmax([
        len(tree.query_radius(np.atleast_2d(X[i]), r=(0.1 * d_centroids))[0])
        for i in support_idx
    ])
def ratio(X: np.ndarray, vector_idx: int, centroid: np.ndarray,
          d_centroids: float, tree: BallTree) -> float:
    '''Compute the ratio between the density at the support vector and a centroid.'''

    density_vector = len(
        tree.query_radius(np.atleast_2d(X[vector_idx]),
                          r=(0.1 * d_centroids))[0])
    density_centroid = len(
        tree.query_radius(np.atleast_2d(centroid), r=(0.1 * d_centroids))[0])

    if density_centroid == 0:
        density_centroid = 1

    print(f'Density of vector location: {density_vector}')
    print(f'Density of centroid: {density_centroid}')

    return density_vector / density_centroid
Example #18
def count_amenity(src_points, candidates, rad):
    """Find amenity being searched within the stated radius
    amenity: school, train station, police centre
    """
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Get distance and index of nearest amenity
    dist, nearest_ind = tree.query(src_points, k=1)

    dist = dist * 6371000
    # Count number of amenity within radius
    count = tree.query_radius(src_points, r=rad, count_only=True)
    # Get indexes of all the amenity within radius
    all_ind = tree.query_radius(src_points, r=rad)

    return count, dist.ravel(), nearest_ind, all_ind
class StopBallTree:
    """
    This class is a stop friendly implementation of a sklearn ball-tree
    """

    def __init__(self, stops):
        """

        :param stops: list of Stop objects
        """
        self.tree = BallTree(self.stop_2_tup(stops),metric='haversine')
        self.tree_stops = list(stops)
        self.R = 3959.87433 * 5280

    def stop_2_tup(self,stops):
        """
        This function takes a list of stops and returns their positions in radians
        :param stops: list of stop.Stop objects
        :return: np.ndarray with [lat lon]
        """
        _ = np.asarray([(s.lat,s.lon) for s in stops])
        return np.radians(_)

    def query(self, stops):
        """
        This function queries the tree for the given stops and reformats the sklearn results
        :param stops: list of stop.Stop objects
        :return: tuple(distances, matches)
        """
        dist, matches = self.tree.query(self.stop_2_tup(stops))
        dist = [self.R * x[0] for x in dist]
        matches = [x[0] for x in matches]
        return dist, matches

    def query_radius(self, stops, radius, earth=False):
        """
        Interface for stops with the sklearn query_radius function
        :param stops: list of stop.Stop objects to query around
        :param radius: search radius (radians by default; feet if earth is True)
        :return: dict mapping each query stop to the tree stops within radius
        """
        if earth:
            ind = self.tree.query_radius(self.stop_2_tup(stops), r = radius/self.R )
        else:
            ind = self.tree.query_radius(self.stop_2_tup(stops), r=radius)
        # need to convert radius into radians distance
        return {stops[i]:[self.tree_stops[j] for j in ind[i]] for i in range(len(stops))}
Example #20
def radiusUpdate(encode_record, params):
    encode_record -= np.mean(encode_record, 0)
    print(np.max(np.max(encode_record)), np.min(np.min(encode_record)))
    tree = BallTree(encode_record)
    neighbor = tree.query_radius(encode_record, 3, count_only=True) + 1
    print(np.max(neighbor), np.min(neighbor))
    weights = np.power(neighbor, params.alpha) * params.beta
    return tf.constant(weights, dtype=tf.float32)
Example #21
def queryNN(X_train, X_test, radius, leaf_size):
    """
    Method that identifies from a dataset the NN most similar cases (Nearest neighbors).
    X_train: dataset to find neighbours
    X_test: dataset to find neighbors for
    BallTree_leaf_size: leaf size of kd tree
    radius: radius in high dimensional space to search for NNs
    
    Returns:
    counts: count of NNs for each datapoint
    indices: indices of NNs from dataset X_train
    """

    tree = BallTree(X_train, leaf_size=leaf_size)
    counts = tree.query_radius(X_test, r=radius, count_only=True)
    indices = tree.query_radius(X_test, r=radius)
    return counts, indices
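
A hypothetical call, assuming queryNN above is importable; note that the counts could equally be derived from the index arrays, so count_only=True mainly avoids materialising them twice:

import numpy as np

X_train = np.random.rand(1000, 5)    # toy reference cases
X_test = np.random.rand(10, 5)       # toy query cases

counts, indices = queryNN(X_train, X_test, radius=0.4, leaf_size=40)
print(counts)        # neighbour count per test point
print(indices[0])    # indices into X_train for the first test point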
Example #22
    def spatial_expression_internal(adata_subset, x_coordinate, y_coordinate,
                                    method, radius, knn, imageid, use_raw,
                                    subset, label):

        # Create a DataFrame with the necessary information
        data = pd.DataFrame({
            'x': adata_subset.obs[x_coordinate],
            'y': adata_subset.obs[y_coordinate]
        })

        # Identify neighbourhoods based on the method used
        # a) KNN method
        if method == 'knn':
            print("Identifying the " + str(knn) +
                  " nearest neighbours for every cell")
            tree = BallTree(data, leaf_size=2)
            dist, ind = tree.query(data, k=knn, return_distance=True)

        # b) Local radius method
        if method == 'radius':
            print("Identifying neighbours within " + str(radius) +
                  " pixels of every cell")
            kdt = BallTree(data, metric='euclidean')
            ind, dist = kdt.query_radius(data, r=radius, return_distance=True)

        # Normalize range (0-1) and account for total number of cells
        d = scipy.sparse.lil_matrix((len(data), len(data)))
        for row, (columns, values) in enumerate(zip(ind, dist)):
            # Drop self-distance element.
            idx = columns != row
            columns = columns[idx]
            values = values[idx]
            if len(values) == 1:
                values = [1.0]
            elif len(values) > 1:
                # Normalize distances.
                values = (values.max() - values) / (values.max() -
                                                    values.min())
                values /= values.sum()
            # Assign row to matrix.
            d[row, columns] = values

        # convert to csr sparse matrix
        wn_matrix_sparse = d.tocsr()

        # Calculation of spatial lag
        if use_raw == True:
            spatial_lag = pd.DataFrame(wn_matrix_sparse *
                                       np.log1p(adata_subset.raw.X),
                                       columns=adata_subset.var.index,
                                       index=adata_subset.obs.index)
        else:
            spatial_lag = pd.DataFrame(wn_matrix_sparse * adata_subset.X,
                                       columns=adata_subset.var.index,
                                       index=adata_subset.obs.index)

        # return value
        return spatial_lag
Example #23
def triangles_from_keypoints(keypoints,
                             lower=TRIANGLE_LOWER,
                             upper=TRIANGLE_UPPER):
    """Get Triangles from keypoints.

    >>> from .keypoints import compute_keypoints
    >>> filename = 'fullEndToEndDemo/inputImages/cat_original.png'
    >>> img = cv2.imread(filename)
    >>> keypoints = compute_keypoints(img)
    >>> res = triangles_from_keypoints(keypoints)
    >>> len(res)
    11590
    >>> print(list(map(lambda x: x.tolist(), res[0])))
    [[162.0, 203.0], [261.0, 76.0], [131.0, 63.0]]
    >>> res2 = triangles_from_keypoints(keypoints, lower=10)
    >>> len(res2)
    14238
    >>> res3 = triangles_from_keypoints(keypoints, upper=100)
    >>> len(res3)
    315
    """
    keypoints = np.asarray(keypoints, dtype=float)

    tree = BallTree(keypoints, leaf_size=10)
    i_lower = tree.query_radius(keypoints, r=lower)
    i_upper = tree.query_radius(keypoints, r=upper)
    in_range = [set(u) - set(l) for l, u in zip(i_lower, i_upper)]

    seen = set()
    result = []

    for i, center in enumerate(keypoints):
        seen.add(i)

        in_range_of_center = in_range[i] - seen
        if not in_range_of_center:
            continue

        processed = set()

        for j in in_range_of_center:
            if j < i + 1:
                continue

            points_idx = in_range[j] & in_range_of_center - processed
            if not points_idx:
                continue

            keypoint = keypoints[j]
            points = keypoints[list(points_idx)]
            area = np.absolute(np.cross(points - center,
                                        points - keypoint)) / 2
            result += [(center, keypoint, p) for p in points[area > 1300]]

            processed.add(j)

    return result
Example #24
def __calculate_lonely_points(grid, point_cloud, distance):
    # Generate BallTree for point cloud
    ball_tree = BallTree(point_cloud.get_xy(), metric='manhattan')

    # Calculate for each of the points in the grid, the amount of neighbors in the original ground cloud
    count = ball_tree.query_radius(grid, distance - EPSILON, count_only=True)

    # Return only the points in the grid that don't have a neighbor
    return grid[count == 0]
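
The same counting step in isolation on a toy cloud and grid (the EPSILON offset is omitted for brevity):

import numpy as np
from sklearn.neighbors import BallTree

cloud_xy = np.random.rand(500, 2) * 10                     # toy ground points
gx, gy = np.meshgrid(np.arange(0, 10, 0.5), np.arange(0, 10, 0.5))
grid = np.column_stack([gx.ravel(), gy.ravel()])

tree = BallTree(cloud_xy, metric='manhattan')
count = tree.query_radius(grid, 0.5, count_only=True)
lonely = grid[count == 0]                                  # grid nodes with no neighbour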
Example #25
def spatial_coactive_sets(population,
                          spkdict,
                          time_bins,
                          trajectory,
                          return_tree=False):
    """
    Estimates spatially co-active activity ensembles from the given spike dictionary.
    """

    import sklearn
    from sklearn.neighbors import BallTree

    x, y, d, t = trajectory

    pch_x = interpolate.pchip(t, x)
    pch_y = interpolate.pchip(t, y)

    spatial_bins = np.column_stack(
        [pch_x(time_bins[:-1]), pch_y(time_bins[:-1])])

    acv_dict = {
        gid: np.histogram(np.asarray(lst), bins=time_bins)[0]
        for (gid, lst) in viewitems(spkdict[population]) if len(lst) > 1
    }
    n_features = len(time_bins) - 1
    n_samples = len(acv_dict)

    active_gid = {}
    active_bins = np.zeros((n_samples, n_features), dtype=bool)
    for i, (gid, acv) in enumerate(viewitems(acv_dict)):
        active_bins[i, :] = acv > 0
        active_gid[i] = gid

    tree = BallTree(active_bins, metric='jaccard')
    qbins = np.zeros((n_features, n_features), dtype=bool)
    for ibin in range(n_features):
        qbins[ibin, ibin] = True

    nnrs, nndists = tree.query_radius(qbins, r=1, return_distance=True)

    fnnrs = []
    fnndists = []
    for i, (nns, nndist) in enumerate(zip(nnrs, nndists)):
        inds = [
            inn for inn, nn in enumerate(nns)
            if np.any(np.logical_and(active_bins[nn, :], active_bins[i, :]))
        ]
        fnns = np.asarray([nns[inn] for inn in inds])
        fdist = np.asarray([nndist[inn] for inn in inds])
        fnnrs.append(fnns)
        fnndists.append(fdist)

    if return_tree:
        return n_samples, spatial_bins, fnnrs, fnndists, (tree, active_gid)
    else:
        return n_samples, spatial_bins, fnnrs, fnndists
Example #27
    def eval(self, X):
        """Evaluate the kernel density estimation

        Parameters
        ----------
        X : array_like
            array of points at which to evaluate the KDE.  Shape is
            (n_points, n_dim), where n_dim matches the dimension of
            the training points.

        Returns
        -------
        dens : ndarray
            array of shape (n_points,) giving the density at each point.
            The density will be normalized for metric='gaussian' or
            metric='tophat', and will be unnormalized otherwise.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X_.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.metric == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            gamma = 0.5 / self.h / self.h
            D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
            D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
            dens = D.sum(1)

        elif self.metric == 'tophat':
            # use Ball Tree to efficiently count neighbors
            bt = BallTree(self.X_)
            counts = bt.query_radius(X, self.h,
                                     count_only=True)
            dens = counts / n_volume(self.h, X.shape[1])

        elif self.metric == 'exponential':
            D = pairwise_distances(X, self.X_)
            dens = np.exp(-abs(D) / self.h)
            dens = dens.sum(1)
            dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

        elif self.metric == 'quadratic':
            D = pairwise_distances(X, self.X_)
            dens = (1 - (D / self.h) ** 2)
            dens[D > self.h] = 0
            dens = dens.sum(1)
            dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

        else:
            D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
            dens = D.sum(1)

        return dens
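
For the 'tophat' branch above, the density is just the neighbour count divided by the ball volume; a 2-D sketch in which the disc area pi * h**2 stands in for n_volume(h, 2):

import numpy as np
from sklearn.neighbors import BallTree

X_train = np.random.normal(size=(2000, 2))   # toy training sample
X_eval = np.array([[0.0, 0.0], [3.0, 3.0]])  # points at which to evaluate the KDE
h = 0.5

bt = BallTree(X_train)
counts = bt.query_radius(X_eval, h, count_only=True)
dens = counts / (np.pi * h ** 2)             # unnormalised tophat density estimate
print(dens)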
Example #28
    def _binned_mean(ds, conf, time, quantiles=None, size=None,
                     return_counts=False, max_time_diff=200):
        climate = conf.climate + '_ensemble'
        age = conf.age + '_ensemble'
        agedim = ds[conf.age].dims[0]
        ens = conf.ensemble

        def bootstrap_mean(da):
            resampler = np.random.randint(0, len(da), (len(da), size))
            return xr.DataArray(da.values[resampler].mean(axis=0),
                                dims=(ens, ))

        ds = ds.stack(**{age + ens: (agedim, ens)})

        time = pd.Index(time)
        tree = BallTree(time[:, np.newaxis])
        ind, dists = tree.query_radius(
            ds[age].values[:, np.newaxis], max_time_diff,
            return_distance=True, sort_results=True)

        miss = len(time)

        ind = np.array([t[0] if t.size else miss for t in ind])

        grouper = ds[age + ens].copy(data=np.r_[time, [np.nan]][ind])

        grouped = ds[climate].groupby(grouper)

        mask = grouped.count() > 100

        ret = (grouped.mean().where(mask), )
        if quantiles is not None:
            ret = ret + (grouped.quantile(quantiles).where(mask), )
        if size is not None:
            ret = ret + (grouped.apply(bootstrap_mean).where(mask), )
        if return_counts:
            tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
            counts = tree.query_radius(time[:, np.newaxis], return_counts,
                                       count_only=True)
            ret = ret + (xr.DataArray(counts, dims=ret[0].dims[0],
                                      coords={ret[0].dims[0]: time}), )

        return tuple(arr.reindex({age + ens: time}).values for arr in ret)
Example #29
def OrderCell(data, radius):
    tree = BallTree(data, leaf_size=2)
    Countnumber = []
    for point in range(len(data)):
        count = tree.query_radius(
            data[point].reshape(1, -1), r=radius,
            count_only=True)  # counting the number of neighbors for each point
        Countnumber.append(count)  # storing number of neighbors
    CountnumberDf = pd.DataFrame(Countnumber, columns=['neighbors'])
    return (CountnumberDf)
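
Because query_radius accepts all query points at once, the per-point loop above can also be collapsed into a single vectorised call; a sketch with toy coordinates:

import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree

data = np.random.rand(300, 2) * 100          # toy cell coordinates
tree = BallTree(data, leaf_size=2)
neighbors = tree.query_radius(data, r=30, count_only=True)
CountnumberDf = pd.DataFrame(neighbors, columns=['neighbors'])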
Example #30
    def spatial_pscore_internal (adata_subset,proximity,x_coordinate,y_coordinate,phenotype,method,radius,knn,
                                imageid,subset,label):

        # Create a DataFrame with the necessary information
        data = pd.DataFrame({'x': adata_subset.obs[x_coordinate], 'y': adata_subset.obs[y_coordinate], 'phenotype': adata_subset.obs[phenotype]})
        
        # Identify neighbourhoods based on the method used
        # a) KNN method
        if method == 'knn':
            print("Identifying the " + str(knn) + " nearest neighbours for every cell")
            tree = BallTree(data[['x','y']], leaf_size= 2)
            ind = tree.query(data[['x','y']], k=knn, return_distance= False)
            neighbours = pd.DataFrame(ind.tolist(), index = data.index) # neighbour DF
            neighbours_ind = neighbours.copy() # neighbour DF
            #neighbours.drop(0, axis=1, inplace=True) # Remove self neighbour
        
        # b) Local radius method
        if method == 'radius':
            print("Identifying neighbours within " + str(radius) + " pixels of every cell")
            kdt = BallTree(data[['x','y']], metric='euclidean') 
            ind = kdt.query_radius(data[['x','y']], r=radius, return_distance=False)
            #for i in range(0, len(ind)): ind[i] = np.delete(ind[i], np.argwhere(ind[i] == i))#remove self
            neighbours = pd.DataFrame(ind.tolist(), index = data.index) # neighbour DF
            neighbours_ind = neighbours.copy() # neighbour DF
            
        # Map phenotype
        phenomap = dict(zip(list(range(len(ind))), data['phenotype'])) # Used for mapping
        phenomap_ind = dict(zip(list(range(len(ind))), data.index)) # Used for mapping cell name
        
        # Loop through (all functionized methods were very slow)
        for i in neighbours.columns:
            neighbours[i] = neighbours[i].dropna().map(phenomap, na_action='ignore')
        # do the same index and cell name
        for i in neighbours_ind.columns:
            neighbours_ind[i] = neighbours_ind[i].dropna().map(phenomap_ind, na_action='ignore')
            
            
        # Identify all the neighbourhoods that contain the user defined proximity phenotypes
        for i in proximity:
            print (str('Finding neighbourhoods with ') + str(i))
            nn = neighbours[neighbours.isin([i])].dropna(how='all').index
            neighbours = neighbours.loc[nn]
        
        # Identify all the cells that were part of the neighbourhood in this analysis
        neighbours_ind = neighbours_ind.loc[neighbours.index]
        neighbours_ind_unique = pd.unique(neighbours_ind.values.ravel())
        
        # subset the neighbourhood cells to include only the cells in the user defined list
        cleaned_neighbours_ind_unique = [x for x in neighbours_ind_unique if str(x) != 'nan']
        d = data.loc[cleaned_neighbours_ind_unique]
        d = d[d['phenotype'].isin(proximity)].index
        
        # return neighbours for score and image_neighbours for plotting on image
        return {'neighbours': neighbours.index, 'image_neighbours': d }
Example #31
def get_mask(gaia_file):
	print(gaia_file)
	r_mask = gaia_file_2_radius[os.path.basename(gaia_file)] / 3600.
	hdu_G     = fits.open(gaia_file)
	ra_gaia, dec_gaia = hdu_G[1].data['ra'], hdu_G[1].data['dec']
	gaia_coordinates = deg_to_rad * np.transpose([dec_gaia, ra_gaia])
	print('measures distances')
	Tree_obj_Gaia = BallTree(gaia_coordinates, metric='haversine') 
	test_c = Tree_obj_Gaia.query_radius(agn_coordinates, r = r_mask, count_only = True) 
	to_be_masked = (test_c>0)
	print('N to mask:', len(to_be_masked.nonzero()[0]))
	return to_be_masked
Example #32
class BallTree():
    def __init__(self,walkers):
        self.walkers = walkers
        self.tree = BT(walkers.getWalkersLocation())

    def getEdges(self, radius):
        results = []
        for i,neighbors in enumerate(self.tree.query_radius(self.walkers.getWalkersLocation(),radius)):
            if len(neighbors) > 0:
                for n in neighbors:
                    results.append((self.walkers[i],self.walkers[n]))
        return results
def minPointsEstimate(enc, eps, imgpath):
    tree = BallTree(np.array(enc))
    allNgbr = []
    allNgbr.append(tree.query_radius(enc, eps, count_only=True))
    _, bins, _ = plt.hist(allNgbr, bins=45)
    plt.grid(axis='y', alpha=0.75)
    plt.xticks(bins, rotation=90)
    plt.title("MinPts Estimate " + encoding)
    plt.ylabel('Number of sessions')
    plt.xlabel('Number of neighbors')
    plt.tight_layout()
    plt.savefig(os.path.join(imgpath, "minptsEstimate.png"))
    plt.close()
Example #34
def lat_errors(T1, T2, thres=2):
    tree = BallTree(T1.values) 
    inds, dists = tree.query_radius(T2.values, r=thres, sort_results=True, return_distance=True)
    closest_l = []
    for i, ind in enumerate(inds):
        if len(ind) >= 2:
            closest = pd.DataFrame({'X1': [T1.iloc[ind[0]].X], 'Y1': [T1.iloc[ind[0]].Y], 'X2': [T1.iloc[ind[1]].X], 'Y2': [T1.iloc[ind[1]].Y]},
                    index=[i])
            closest_l.append(closest)
    closest_df = pd.concat(closest_l)
    f = T2.join(closest_df)
    lat_errors = abs((f.X2-f.X1)*(f.Y1-f.Y) - (f.X1-f.X)*(f.Y2-f.Y1))/np.sqrt((f.X2-f.X1)**2+(f.Y2-f.Y1)**2)
    return lat_errors
def estimate_bayes_factor(traces, logp, r=0.05, return_list=False):
    """From astroml, estimates the bayes factor using the local density of points"""
    D, N = traces.shape

    # compute volume of a D-dimensional sphere of radius r
    Vr = np.pi ** (0.5 * D) / scipy.special.gamma(0.5 * D + 1) * (r ** D)

    # use neighbor count within r as a density estimator
    bt = BallTree(traces.T)
    count = bt.query_radius(traces.T, r=r, count_only=True)

    BF = logp + np.log(N) + np.log(Vr) - np.log(count)

    if return_list:
        return BF
    else:
        p25, p50, p75 = np.percentile(BF, [25, 50, 75])
        return p50, 0.7413 * (p75 - p25)
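
A usage sketch, assuming estimate_bayes_factor above is in scope along with its module-level imports (numpy as np, scipy, BallTree); the traces and log-posterior values are synthetic:

import numpy as np

traces = np.random.normal(size=(2, 5000))        # D=2 parameters, N=5000 samples
logp = -0.5 * (traces ** 2).sum(axis=0)          # stand-in log-posterior values

p50, sig = estimate_bayes_factor(traces, logp, r=0.05)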
Example #36
def avgdigamma(data, dvec, leaf_size=16):
    """Convenience function for finding expectation value of <psi(nx)> given
    some number of neighbors in some radius in a marginal space.

    Parameters
    ----------
    data : numpy.ndarray
    dvec : array_like (n_points,)
    leaf_size : int

    Returns
    -------
    avgdigamma : float
        expectation value of <psi(nx)>
    """
    tree = BallTree(data, leaf_size=leaf_size, p=float('inf'))

    n_points = tree.query_radius(data, dvec - EPS, count_only=True)

    return digamma(n_points).mean()
Example #37
def mean_shift(X, bandwidth, seeds, kernel_update_function, max_iterations=10):
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[ball_tree.query_radius([weighted_mean], bandwidth*3)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                cluster_centers.append(weighted_mean)
                break
            completed_iterations += 1

    return cluster_centers
def mean_shift_clustering(points, bandwidth, max_iterations=500):
    stop_thresh = 1e-3 * bandwidth
    cluster_centers = []
    points_labels = []
    ball_tree = BallTree(points)

    for weighted_mean in points:
        iter = 0
        while True:
            points_within = points[ball_tree.query_radius([weighted_mean],
                                                          bandwidth*3)[0]]
            old_mean = weighted_mean
            weighted_mean = mean_shift(old_mean, points_within, bandwidth)
            converged = euclid_dist(weighted_mean, old_mean) < stop_thresh
            if converged or iter == max_iterations:
                cluster_centers, points_labels = assign_cluster(weighted_mean,
                                                                cluster_centers,
                                                                points_labels)
                break
            iter += 1

    return np.asarray(cluster_centers), np.asarray(points_labels)
xyz = np.zeros((5903, 3))
xyz[:, 0] = x[:, 0]
xyz[:, 1] = y[:, 0]
xyz[:, 2] = z[:, 0]

Xtrain = import_train["Xtrain"]
scaler = preprocessing.StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)


from sklearn.neighbors import kneighbors_graph, BallTree
from sklearn.feature_extraction.image import grid_to_graph

xyz_balltree = BallTree(xyz)
print(xyz_balltree)
print(xyz_balltree.query_radius(xyz[:1], r=0.04))


# connectivity = kneighbors_graph(xyz_balltree, 2, include_self=True,mode='connectivity')
# connectivity = grid_to_graph(n_x =x, n_y = y, n_z = z )
# agglo = cluster.FeatureAgglomeration(n_clusters = 590)
# agglo.fit(Xtrain)
# Xtrain_reduced = agglo.transform(Xtrain)


"""

#k_fold = cross_validation.KFold(len(X_train), 5)
Y_kf = Ytrain.ravel()
k_fold = StratifiedKFold(Y_kf, n_folds=10)
Example #40
def two_point(data, bins, BT_D=None, BT_R=None, method='standard',
              data_R=None, random_state=None, return_trees=False, 
              verbose=False, RR=None, return_RR=False, return_DD=False):
    #Edited by CW to allow user to supply more things and have more things
    #returned.
    """
    Two-point correlation function in Euclidean space.  Options to return
    a number of things.  What gets returned is up to the user but the order
    will always be correlation_function, data_balltree, random_balltree,
    random_random, data_data.  If the user asks for a subset of those, the
    list will be shorter but the order will be maintained.

    Parameters
    ----------
    data : array_like
        Input data, shape = [n_samples, n_features]
    bins : array_like
        Bins within which to compute the 2-point correlation.
        Shape = Nbins + 1
    BT_D : BallTree (optional)
        Ball tree created with the data positions
    BT_R : BallTree (optional)
        Ball tree created with the random positions
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'.
    data_R : array_like (optional if no BT_R)
        If specified, use this as the random comparison sample.  This must
        be included if you wish to use a pre-computed random ball tree
    random_state : integer, np.random.RandomState, or None (optional)
        Specify the random state to use for generating background.  Not
        used if the randoms are provided by the user.  Default is None
    RR : 1D array-like, shape = Nbins
        If this exact set of randoms and theta bins has been
        run, you can supply the RR counts and not calculate them again.
        You also need the data if you're running with method='landy-szalay'
    return_trees : boolean (optional)
        If True, the returns will include the data and random ball trees.
        Default is False.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this
        exact same random sample and binning (like with a bootstrap),
        you can get the RR counts returned and feed them back in the
        next time
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather
        than the w(theta) estimator, you can get this back too
    verbose: boolean (optional)
        Determines whether or not the function narrates what it's doing.
        Default is False.
 
    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts may be returned (if return_RR==True) and used
        again without recomputing if the theta bins and the random
        sample is exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True   
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        print "two_point says: generating random sample"
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if BT_D is None:
        if verbose:
            print "two_point says: computing BallTree for data"
        BT_D = BallTree(data)
    if BT_R is None:
        if verbose:
            print "two_point says: computing BallTree for random sample"
        BT_R = BallTree(data_R)

    counts_DD = np.zeros(Nbins + 1)
    counts_RR = np.zeros(Nbins + 1)

    if verbose:
        print "two_point says: working through the CF calc.  This could take a while"
    for i in range(Nbins + 1):
        counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i],
                                                count_only=True))
        if RR is None:
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i],
                                                    count_only=True))

    if verbose:
        print "two_point says: binning done!"
    DD = np.diff(counts_DD)
    if RR is None:
        RR = np.diff(counts_RR)

    # check for zero in the denominator
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor**2 * DD / RR - 1
    elif method == 'landy-szalay':
        counts_DR = np.zeros(Nbins + 1)
        for i in range(Nbins + 1):
            counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i],
                                                    count_only=True))
        DR = np.diff(counts_DR)
        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    to_return=corr
    if return_trees:
        to_return=[to_return]
        to_return.append(BT_D)
        to_return.append(BT_R)
    if return_RR:
        if not return_trees:
            to_return=[to_return]
        to_return.append(RR)
    if return_DD:
        if (not return_trees) and (not return_RR):
            to_return=[to_return]
        to_return.append(DD)        
    
    return to_return
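
A hypothetical call, assuming two_point above is in scope together with numpy as np, BallTree and check_random_state; positions and bins are toy values:

import numpy as np

data = np.random.rand(2000, 2)          # toy 2-D positions
bins = np.linspace(0.01, 0.2, 11)       # 10 radial bins

corr = two_point(data, bins, random_state=0)
corr, BT_D, BT_R = two_point(data, bins, random_state=0, return_trees=True)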
Example #41
class Learner(object):
	"""
	A class that instantiates the feature space for an individual AI,
	chooses moves, and performs learning
	"""
	def __init__(self, data_points = None, ai_history = None, threshold = THRESHOLD):
		self.state_list = []
		self.weights_list = []

		if data_points is None:
			data_points = []
		if ai_history is None:
			ai_history = []

		for state, weights in data_points:
			assert(len(state) == 32)
			self.state_list.append(state)
			self.weights_list.append(weights)

		self._threshold = threshold
		self._ai_history = cp.deepcopy(ai_history)

		#self._featureTransform()
		self.X = np.array(self.state_list)

		assert(self.X.shape == (len(data_points), 32) or len(data_points) == 0)
		#Think about different distance metrics. Manhattan or minkowski? P < 1?
		if len(data_points) > 0:
			self._tree = BallTree(self.X, metric='manhattan')
		else:
			self._tree = None

	def getNextMove(self, current_board):
		# current_board.printBoard()
		nn_move = self._getNearestNeighbors(current_board)
		if nn_move is not None:
			next_move = nn_move
		else:
			next_move = self._getMinimax(current_board)
		self._ai_history.append(next_move)
		return next_move

	def updateWeights(self, game_history, status):
		
		if status == WIN:
			factor = WIN_FACTOR
		elif status == LOSE:
			factor = LOSE_FACTOR
		elif status == TIE:
			factor = 1

		# old_board = Board()
		for _board, _move in game_history:
			assert(any(_move == mv[1] for mv in _board.getMoveList(_move.color)))

			if _move.color == AI_COLOR:
				state = _board.getArray().tolist()

				if state in self.state_list:
					i = self.state_list.index(state)
					# j = self.state_list[i].find(move)
					# print zip(*_board.getMoveList(AI_COLOR))[1]
					# print list(zip(*_board.getMoveList(AI_COLOR))[1])
					j = list(zip(*_board.getMoveList(AI_COLOR)))[1].index(_move)
					self.weights_list[i][j] *= factor

				else:
					self.state_list.append(state)
					self.weights_list.append([1] * len(_board.getMoveList(AI_COLOR)))
					# print zip(*_board.getMoveList(AI_COLOR))[1]
					j = list(zip(*_board.getMoveList(AI_COLOR)))[1].index(_move)
					self.weights_list[-1][j] *= factor

			elif _move.color == PLAYER_COLOR:
				_move = _move.getInverse()
				state = _board.getInverse().getArray().tolist()

				if state in self.state_list:
					i = self.state_list.index(state)
					# j = self.state_list[i].find(move)
					j = list(zip(*_board.getInverse().getMoveList(AI_COLOR)))[1].index(_move)
					self.weights_list[i][j] *= (1.0 / factor)

				else:
					self.state_list.append(state)
					self.weights_list.append([1] * len(_board.getInverse().getMoveList(AI_COLOR)))
					j = list(zip(*_board.getInverse().getMoveList(AI_COLOR)))[1].index(_move)
					self.weights_list[-1][j] *= (1.0 / factor)

		self.X = np.array(self.state_list)
		self._tree = BallTree(self.X, metric='manhattan')



	def getAiHistory(self):
		return cp.deepcopy(self._ai_history)

	def _getMinimax(self, current_board):
		# return random.choice([bd[1] for bd in current_board.getMoveList(AI_COLOR)])
		(bestBoard, bestVal) = minMax2(current_board, 6)
		# print("bestVal", bestVal)
		# bestBoard[0].printBoard()
		return bestBoard[1]

	def _getNearestNeighbors(self, current_board):
		#dist, ind = self._tree.query(current_board.getArray(), k=3)
		if self._tree is None:
			return None
		ind = self._tree.query_radius(current_board.getArray(), r = self._threshold).tolist()
		ind = ind[0].tolist()

		if len(ind) > 0:
			pass
			# print "neighbors found"

		#cur_moves = current_board.getMoveList(AI_COLOR)
		moves = []
		weights = []
		# print ind
		for i in ind:
			_board = Board(new_array = self.state_list[i])
			assert(len(_board.getMoveList(AI_COLOR)) == len(self.weights_list[i]))
			for j, (board, move) in enumerate(_board.getMoveList(AI_COLOR)):
				# move.printMove()
				# current_board.printBoard()
				if current_board.verifyMove(AI_COLOR, move = move):
					# print "move found"
					# move.printMove()
					if move not in moves:
						moves.append(move)
						weights.append(self.weights_list[i][j])
					else:
						weights[moves.index(move)] *= self.weights_list[i][j]
		if len(moves) == 0:
			# raise Exception()
			# print "aborted neighbors"
			return None
		else:
			assert(len(moves) == len(weights))
			zipped = zip(moves, weights)
			moves = [mv[0] for mv in zipped if mv[1] >= 1]
			weights = [mv[1] for mv in zipped if mv[1] >= 1]

			if len(moves) < 1: return None

			probs = np.asarray(weights, dtype=float)
			return np.random.choice(moves, 1, p=probs / probs.sum())[0]
		#neighbor_moves = [move for move in neighbor_moves if move in cur_moves]


	def _featureTransform(self):
		#replace weights with a Gaussian at some point
		#or come up with a better feature transform
		weights = [1, 2, 3, 4, 4, 3, 2, 1]
		transformed_list = []
		for state in self.state_list:
			assert(len(state) == 32)
			new_state = []
			for i in range(32):
				new_state.append(state[i] * weights[i // 4])
			transformed_list.append(new_state)

		self.X = np.array(transformed_list)
class SupportGrid:
    """Grid structure to support the computation of viewpoints.

    Grid structure to support the computation of viewpoints that will be used
    to detect the rho-boundary of a particle system which particle's positions
    are stored in the 'points' array.

    Attributes:
        points: A numpy array containing the position of the particles.
        rho: The value of rho, in general the h value from SPH simulations
            is a good approximation.
        dimension: The dimension of the particle system and the grid.
        cell_size: The length of the cells edges.
        aabb_min: The lower corner of the Axis Aligned Bounding Box containing
            the points.
        aabb_max: The upper corner of the Axis Aligned Bounding Box containing
            the points.
        grid_dims: The number of cells along each axis needed to compute the
            viewpoints, it includes some padding cells on each side.
        grid_min: The lower corner of the grid.
        grid_max: The upper corner of the grid.
        grid_count: A numpy array used to keep the number of points per cell.
        grid_elems: A numpy array containing lists of the indexes of the points
            inside each cell.
        tree: A KDTree structure used to simplify and speed up neighborhood queries.
        neighbor_cell_list: A numpy array with indexes in {-1, 0, 1} used to
            assist the traversal of neighboring cells in any dimension >= 1.
    """

    def __init__(self, points, rho, dimension):
        """Constructor

        Initializes the grid and helper structures using the provided points
        and rho parameter.

        Args:
            points: A numpy array containing the coordinates of the particles.
            rho: Needed to compute the rho-boundary of the system.
            dimension: The dimension of the particle system.
        """
        self.points = points
        self.rho = rho
        self.dimension = dimension
        self.cell_size = 2.0 * rho

        self.aabb_min = np.amin(points, axis=0)
        self.aabb_max = np.amax(points, axis=0)

        self.grid_dims = (self.aabb_max - self.aabb_min) / self.cell_size
        # Regarding the + 3: 1 for left side, 1 for right side, 1 for rounding
        # up
        self.grid_dims = np.trunc(self.grid_dims) + 3
        self.grid_dims = self.grid_dims.astype(int)

        self.grid_min = self.aabb_min - self.cell_size
        self.grid_max = self.grid_min + self.grid_dims * self.cell_size

        self.grid_count = np.zeros(self.grid_dims, dtype=int)
        self.grid_elems = np.empty(self.grid_dims, dtype=object)

        self.update_grid()
        self.tree = NeighborsTree(
            self.points, leaf_size=10, metric='euclidean')

        self.neighbor_cell_list = self.compute_neighbor_cell_list()

    def update_grid(self):
        """Updates the grid with the counting and indexes.

        Updates the grid with the number of particles in each cell and puts
        the index of each particle in the corresponding cell.
        """
        for i in range(self.points.shape[0]):
            pt = self.points[i]

            idx = (pt - self.grid_min) / self.cell_size
            idx = to_index_tuple(idx)
            self.grid_count[idx] += 1
            if (self.grid_elems[idx] is None):
                self.grid_elems[idx] = []
            self.grid_elems[idx].append(i)

    def compute_neighbor_cell_list(self):
        """Computes a list of offsets to the neighboring cells.

        Computes a list of offsets to the neighboring cells based on the
        dimension. This is used to simplify the traversal of neighbor cells in
        any dimension. For a 2D grid it produces:
        [[-1 -1], [-1 0], [-1 1], [0 -1], [0 0], [0 1], [1 -1], [1 0], [1 1]].
        By using this list we can visit all the 9 cells around a point or cell
        with a single loop.

        Returns:
            A numpy array containing a list of offsets to neighboring cells.
        """
        previous = np.array([[-1], [0], [1]], dtype=int)
        current = None
        current_n_rows = 3
        for c in range(1, self.dimension):
            ones = np.ones((current_n_rows, 1))
            for i in range(-1, 2):
                temp = np.hstack((ones * i, previous))
                if (current is None):
                    current = temp
                else:
                    current = np.vstack((current, temp))

            current_n_rows *= 3
            previous = current
            current = None

        return previous
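
    # Alternative sketch (an assumption, not part of the original class): the
    # same offset table can be built directly with itertools.product, which
    # reads the same way for any dimension:
    #
    #   import itertools
    #   import numpy as np
    #   offsets = np.array(list(itertools.product((-1, 0, 1), repeat=dimension)),
    #                      dtype=int)
    #
    # For dimension == 2 this yields the 9 rows listed in the docstring above.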

    def get_viewpoints(self):
        """Computes and returns the viewpoints that will be used by the instances
        of the HPR operator.

        Computes and returns the viewpoints that will be used by the instances
        of the HPR operator. Empty cells neighboring non-empty cells get a
        viewpoint at their centers; non-empty cells that have no empty neighbor
        go through an additional step to generate viewpoints in cavity cells.

        Returns:
            A numpy array containing the viewpoints.
        """
        self.viewpoints = []

        # for i in range(self.grid_dims[0]):
        #    for j in range(self.grid_dims[1]):
        #        for k in range(self.grid_dims[2]):

        for cell in range(self.grid_dims.prod()):
            idx = np.unravel_index(cell, self.grid_dims)
            if (self.grid_count[idx] == 0):
                self.process_empty_cell(idx)
            else:
                self.process_nonempty_cell(idx)

        return self.viewpoints

    def process_empty_cell(self, idx):
        """Processes an empty cell and produces a viewpoint at its center.

        Processes an empty cell and produces a viewpoint at its center.
        The viewpoint is created only if the empty cell has a non-empty neighbor
        cell.

        Args:
            idx: The index of the cell.
        """
        for i in range(self.neighbor_cell_list.shape[0]):
            n_idx = idx + self.neighbor_cell_list[i]

            # check grid limits
            if (np.any(np.less(n_idx, np.zeros([1, self.dimension]))) or
                    np.any(np.greater_equal(n_idx, self.grid_dims))):
                continue

            n_idx = to_index_tuple(n_idx)

            # If there is a nonempty neighbor, we place a viewpoint
            # at the center of the current cell
            if (self.grid_count[n_idx] != 0):
                viewpoint = self.grid_min + \
                    np.array(idx) * self.cell_size + 0.5 * self.cell_size
                self.viewpoints.append(viewpoint)
                return

    def process_nonempty_cell(self, idx):
        """Processes a non-empty cell and produces viewpoints if possible.

        Processes a non-empty cell and produces a set of viewpoints based on
        the points inside the cell and their distribution.

        Args:
            idx: The index of the cell.
        """
        # Check if there is an empty neighbor,
        # in this case the empty neighbor should be enough
        for i in range(self.neighbor_cell_list.shape[0]):
            n_idx = idx + self.neighbor_cell_list[i]

            # check grid limits
            if (np.any(np.less(n_idx, np.zeros([1, self.dimension]))) or
                    np.any(np.greater_equal(n_idx, self.grid_dims))):
                continue

            n_idx = to_index_tuple(n_idx)

            if (self.grid_count[n_idx] == 0):
                return

        # Get everyone in the cell, and define a new viewpoint candidate,
        # based on its neighborhood centroid
        for i in range(self.grid_count[idx]):
            ii = self.grid_elems[idx][i]

            pt = self.points[ii]

            neighbors = self.tree.query_radius(pt.reshape(1,-1), r=2.0 * self.rho)[0]

            centroid = np.sum(
                self.points[neighbors], axis=0) / neighbors.shape[0]

            V = pt - centroid
            V = V / np.linalg.norm(V)

            viewpoint = pt + V * self.rho

            neighbors = self.tree.query_radius(viewpoint.reshape(1,-1), r=0.95 * self.rho)[0]
            if (neighbors.size == 0):
                self.viewpoints.append(viewpoint)

    def get_candidates(self, viewpoint):
        """Gets a set of points that are candidates to be marked as boundary.

        Gets a set of points that are candidates to be marked as boundary. These
        candidates are inside the local neighborhood of a viewpoint and will be
        used on the HPR operator.

        Args:
            viewpoint: The viewpoint that will be used by the HPR operator.

        Returns:
            A numpy array containing the boundary candidates around the viewpoint.
        """
        return self.tree.query_radius(viewpoint.reshape(1,-1), r=4.0 * self.rho)[0]
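

# A minimal, self-contained sketch (an assumption, not part of the original
# module) of the viewpoint-candidate step used in process_nonempty_cell above:
# push a point away from its neighborhood centroid by rho and keep the
# candidate only if a small ball around it contains no samples.
import numpy as np
from sklearn.neighbors import BallTree

def _candidate_viewpoint(points, i, rho, tree=None):
    tree = tree if tree is not None else BallTree(points, metric='euclidean')
    pt = points[i]
    neighbors = tree.query_radius(pt.reshape(1, -1), r=2.0 * rho)[0]
    centroid = points[neighbors].mean(axis=0)
    direction = pt - centroid
    norm = np.linalg.norm(direction)
    if norm == 0:
        return None  # degenerate case: the point sits exactly on the centroid
    viewpoint = pt + direction / norm * rho
    # Accept only if no sample lies within 0.95 * rho of the candidate.
    if tree.query_radius(viewpoint.reshape(1, -1), r=0.95 * rho)[0].size == 0:
        return viewpoint
    return None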
Exemple #43
def optics(X, eps=float('inf'), min_samples=1, metric='euclidean',
           extraction='hierarchical', ext_kwargs={}):
    """
    Perform OPTICS clustering from vector array or distance matrix.

    Parameters
    ----------
    X : array [n_samples, n_samples] or [n_samples, n_features]
        Array of distances between samples, or a feature array.
        The array is treated as a feature array unless the metric is given as
        'precomputed'.

    eps : float, optional
        The generating distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples in a neighborhood for a point to be considered
        as a core point.

    metric : string or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.calculate_distance for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    extraction : string, optional
        The extraction method used to generate clusters from the ordering of
        points returned by the OPTICS algorithm.

    ext_kwargs : dict
        Keyword arguments to be supplied to the extraction function.

    Returns
    -------
    core_distances : array [n_samples]
        Core distance for each sample.

    ordering : array [n_samples]
        Indices of the samples in the order generated by OPTICS.

    reachability_distances : array [n_samples]
        Reachability distance for each sample.

    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_optics.py for an example.

    References
    ----------
    Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, and Jörg Sander.
    "OPTICS: ordering points to identify the clustering structure." ACM SIGMOD
    Record 28, no. 2 (1999): 49-60.

    """
    X = atleast2d_or_csr(X)
    n = X.shape[0]
    if min_samples > n:
        raise ValueError('min_samples must not be greater than the total '
                         'number of samples')
    ordering = []
    core_distances = np.empty(n)
    # Initiate reachability distances to infinity
    reachability_distances = float('inf') * np.ones(n)
    # Set reachability for first point
    reachability_distances[0] = 0
    # Construct spatial indexing structure
    if metric != 'precomputed':
        # TODO: Construct BallTree with the correct metric once the
        # metrics branch has been merged into master
        tree = BallTree(X, metric=metric)

    seeds = np.ones(n, dtype=bool)
    i = 0
    while True:
        # Mark current point as processed
        seeds[i] = False
        # Add current point to the ordering
        ordering.append(i)
        if not any(seeds):
            break
        # Calculate core distance
        if metric == 'precomputed':
            D = X[i]
            core_dist = np.sort(D)[min_samples]
        else:
            core_dist = tree.query(X[i:i + 1], min_samples + 1)[0][0][-1]
        core_distances[i] = core_dist

        if core_dist <= eps:
            # Get the neighbors of the current point
            if metric == 'precomputed':
                # Keep index arrays so this branch matches the tree-based
                # branch below.
                seed_idx = np.nonzero(seeds)[0]
                neighbors = seed_idx[D[seed_idx] <= eps]
                ds = D[neighbors]
            else:
                ind, dist = tree.query_radius(X[i:i + 1], eps,
                                              return_distance=True)
                si = seeds[ind[0]]
                neighbors = ind[0][si]
                ds = dist[0][si]
            cds = core_dist * np.ones(len(ds))
            # Set the new reachability distances to
            # max(core_distance, distance)
            new_reach_dists = np.maximum(cds, ds)
            reachability_distances[neighbors] = new_reach_dists
            i = np.nonzero(seeds)[0][np.argmin(reachability_distances[seeds])]
        else:
            i = np.where(seeds)[0][0]

    if type(extraction) is str:
        estr = extraction.lower()
        if estr in EXTRACTION_FUNCTIONS:
            func = EXTRACTION_FUNCTIONS[estr]
            labels = func(ordering, reachability_distances, min_samples,
                    **ext_kwargs)
        else:
            raise ValueError('Unknown Extraction Method: %s' % estr)
    else:
        raise TypeError('Extraction Method must be a string.')

    return core_distances, ordering, reachability_distances, labels
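
# Usage sketch (hedged: parameter values are illustrative and this assumes the
# module is importable as written, with 'hierarchical' present as a key of
# EXTRACTION_FUNCTIONS):
#
#   import numpy as np
#   X = np.random.RandomState(0).rand(200, 2)
#   core_d, ordering, reach_d, labels = optics(X, eps=0.5, min_samples=5,
#                                              metric='euclidean',
#                                              extraction='hierarchical')
#   # reach_d[ordering] is the reachability plot from which clusters are read.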
    # 2.4.2) sum this minimum distance to tot
    # tot is the final distance between s0 and s2
    # the s2 with minimum distance is the desired streamline

    np.random.seed(0)
    prototypes_id = np.random.permutation(dm.shape[0])[:200]
    dp = dm[:,prototypes_id] # dissimilarity projection
    
    kdt = BallTree(dp) # KDTree(dp)
    
    radius = 100
    k = 10

    sid = 9
    
    idx1 = kdt.query_radius(dp[sid:sid + 1], radius)[0]
    # idx1 = kdt.query(dp[sid], k)[1][0]
    dm_small1 = dm[idx1][:,idx1]
    e1 = dm_small1[np.triu_indices(dm_small1.shape[0],1)]

    spgk = np.zeros(dm.shape[0])
    for i in range(dm.shape[0]):
        idx2 = kdt.query_radius(dp[i:i + 1], radius)[0]
        # idx2 = kdt.query(dp[i], k)[1][0]
        dm_small2 = dm[idx2][:,idx2]
        e2 = dm_small2[np.triu_indices(dm_small2.shape[0],1)]

        spgk[i] = np.multiply.outer(np.exp(-e1), np.exp(-e2)).sum()
        print(i, spgk[i])

    
Exemple #45
def variable_bw_mean_shift(X, bandwidth_array, seeds=None, max_iterations=300):
    """Variable bandwidth mean shift with gaussian kernel

	Parameters
	----------
	X : array-like, shape=[n_samples, n_features]
		Input data.

	bandwidth_array : array[float], shape=[n_samples]
		Kernel bandwidth.

	seeds : array[float, float], shape=(n_seeds, n_features), optional
		Point used as initial kernel locations. Default is
		setting each point in input data as a seed.

	max_iterations : int, default 300
		Maximum number of iterations, per seed point before the clustering
		operation terminates (for that seed point), if has not converged yet.

	Returns
	-------
	cluster_centers : array, shape=[n_clusters, n_features]
		Coordinates of cluster centers.

	labels : array, shape=[n_samples]
		Cluster labels for each point.

	Notes
	-----
	Code adapted from scikit-learn library.

	"""

    if seeds is None:
        seeds = X

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * np.mean(bandwidth_array)  # when mean has converged
    center_intensity_dict = {}
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    def gaussian_kernel(x, points, bandwidth):
        distances = euclidean_distances(points, x)
        weights = np.exp(-1 * (distances ** 2 / bandwidth ** 2))
        return np.sum(points * weights, axis=0) / np.sum(weights)

    # For each seed, climb gradient until convergence or max_iterations
    for i, weighted_mean in enumerate(seeds):
        completed_iterations = 0
        while True:
            points_within = X[ball_tree.query_radius([weighted_mean], bandwidth_array[i])[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = gaussian_kernel(old_mean, points_within, bandwidth_array[i])
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh

            if converged or completed_iterations == max_iterations:
                if completed_iterations == max_iterations:
                    print("reached max iterations")
                cluster_centers.append(weighted_mean)
                center_intensity_dict[tuple(weighted_mean)] = len(points_within)
                break

            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    ball_tree = BallTree(sorted_centers)

    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = ball_tree.query_radius([center], np.mean(bandwidth_array))[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    labels = idxs.flatten()

    return cluster_centers, labels
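
# Usage sketch (hedged: values are illustrative only; assumes the module-level
# imports this function relies on, such as BallTree, euclidean_distances,
# extmath and NearestNeighbors, are in place as in the original source):
#
#   import numpy as np
#   X = np.random.RandomState(1).randn(300, 2)
#   bandwidths = np.full(X.shape[0], 0.8)   # one bandwidth per sample
#   centers, labels = variable_bw_mean_shift(X, bandwidths)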
     train = int(total * TRAIN_PERCENTAGE)
     test = total - train
     distances = []
     per_stops_popu = collections.defaultdict(int)
 distance = dist(lat, lon, stop_lat, stop_lon)
 if cnt < train:
     per_stops_popu[actual_stop] += 1
     distances.append(distance)
 else:
     if cnt == train:
         idx = int(len(distances) * DISTANCE_FACTOR)
         if idx >= int(len(distances)):
             idx = -1 
         per_radius = sorted(distances)[idx]
     # global
     stops = tree.query_radius([[lat, lon]], r=RADIUS)[0]
     l = [(id_to_stop_id[s][0], dist(lat, lon, id_to_stop_id[s][1], id_to_stop_id[s][2]), id_to_stop_id[s][3], id_to_stop_id[s][4]) for s in stops]
     if actual_stop in get_largest_n(l, [1], LIST_SIZE):
         global_nearest += 1
     if actual_stop in get_largest_n(l, [2], LIST_SIZE):
         global_route += 1
     if actual_stop in get_largest_n(l, [3], LIST_SIZE):
         global_popu += 1
     if distance > RADIUS:
         global_lost += 1
     # personalized
     stops = tree.query_radius([[lat, lon]], r=per_radius)[0]
     l = [(id_to_stop_id[s][0], dist(lat, lon, id_to_stop_id[s][1], id_to_stop_id[s][2]), id_to_stop_id[s][3], id_to_stop_id[s][4], per_stops_popu[s]) for s in stops]
     if actual_stop in get_largest_n(l, [1], LIST_SIZE):
         per_nearest += 1
     if actual_stop in get_largest_n(l, [2], LIST_SIZE):
Exemple #47
def two_point(data, bins, method='standard',
              data_R=None, random_state=None):
    """Two-point correlation function

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features]
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    method : string
        "standard" or "landy-szalay".
    data_R : array_like (optional)
        if specified, use this as the random comparison sample
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if sklearn_has_two_point:
        # Fast two-point correlation functions added in scikit-learn v. 0.14
        KDT_D = KDTree(data)
        KDT_R = KDTree(data_R)

        counts_DD = KDT_D.two_point_correlation(data, bins)
        counts_RR = KDT_R.two_point_correlation(data_R, bins)

    else:
        warnings.warn("Version 0.3 of astroML will require scikit-learn "
                      "version 0.14 or higher for correlation function "
                      "calculations. Upgrade to sklearn 0.14+ now for much "
                      "faster correlation function calculations.")

        BT_D = BallTree(data)
        BT_R = BallTree(data_R)

        counts_DD = np.zeros(Nbins + 1)
        counts_RR = np.zeros(Nbins + 1)

        for i in range(Nbins + 1):
            counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i],
                                                    count_only=True))
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i],
                                                    count_only=True))

    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # check for zero in the denominator
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor ** 2 * DD / RR - 1
    elif method == 'landy-szalay':
        if sklearn_has_two_point:
            counts_DR = KDT_R.two_point_correlation(data, bins)
        else:
            counts_DR = np.zeros(Nbins + 1)
            for i in range(Nbins + 1):
                counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i],
                                                        count_only=True))
        DR = np.diff(counts_DR)

        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    return corr
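
# Usage sketch (hedged: the bin edges and sample size are illustrative; this
# assumes the astroML-style module context in which two_point is defined, with
# KDTree/BallTree and check_random_state imported at module level):
#
#   import numpy as np
#   rng = np.random.RandomState(0)
#   data = rng.rand(500, 2)
#   bins = np.linspace(0.01, 0.5, 11)
#   corr_est = two_point(data, bins, method='landy-szalay', random_state=0)
#   # corr_est has len(bins) - 1 entries, one per separation bin.
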
class AngularCatalog(object):
    """
    This class is the workhorse of Py2PAC.  It manages the actual catalogs
    of objects, it creates various objects to hold information, and it
    performs the correlation function calculations on the catalogs.
    AngularCatalogs are single-bin objects, so if you want to sub-divide
    your data set, do so before you pull it into AngularCatalogs.  Future
    releases of Py2PAC will include a MultiCatalog that manages slicing a
    catalog into bins.

    Parameters
    ----------
    ra : array-like
        A list of RAs for your objects in degrees
        
    dec : array-like
        A list of declinations for your objects in degrees

    generate_randoms : bool (optional)
        If True, ``__init__`` will call the mask's random generation to
        produce a random sample of size ``len(ra) * default_oversample``.
        If False, no randoms will be generated.  Default is False.

    default_oversample : float (optional)
        The default number of randoms to make in units of the number of
        data points.  If ``default_oversample==1``, then by default the
        object will generate the same number of randoms as you have data
        points.  If ``default_oversample==2``, then by default the
        object will generate twice as many randoms as you have data points,
        etc.  Default value is 1.

    properties : dictionary (optional)
        Any additional properties that you want to carry around with the
        angular positions.  This isn't used at all by AngularCatalog, but
        makes it easier to access things.

    weight_file : string (optional)
        A path from / to a FITS weight file to be used to generate the
        ImageMask.

    image_mask : ImageMask instance (optional)
        An instance of an ImageMask object.

    Returns
    -------
    cat : AngularCatalog instance
        The AngularCatalog instance with all the properties that you gave
        it.
    """

    #------------------#
    #- Initialization -#
    #------------------#
    def __init__(self, ra, dec, generate_randoms=False, default_oversample=1.,
                 properties=None, weight_file=None, image_mask=None):
        """
        The init function for the AngularCatalog class
        """

        #Make sure we have Numpy arrays
        ra = np.array(ra)
        dec = np.array(dec)

        #Check to make sure we have sensible values for RA and Dec
        if ra.ndim != 1:
            raise ValueError('RA list must be a 1D array')
        if dec.ndim != 1:
            raise ValueError('Dec list must be a 1D array')
        if dec.size != ra.size:
            raise ValueError('RA and Dec arrays must be the same length')

        #Now store the RA and Dec information
        self._ra = ra
        self._dec = dec
        self._ra_range = np.array([ra.min(), ra.max()])
        self._ra_span = np.diff(self._ra_range)[0]
        self._dec_range = np.array([dec.min(), dec.max()])
        self._dec_span = np.diff(self._dec_range)[0]
        self._input_n_objects = ra.size
        self._n_objects=None

        #Store the info from keywords
        self._image_mask = image_mask
        self._weight_file_name = weight_file
        self._properties = properties
        self._random_oversample = default_oversample

        #Store some defaults/holders
        self._theta_bins=None
        self._cfs={}

        #Make blank things so I can ask "is None" rather than "exists"
        self._data_tree=None
        self._random_tree=None
        self._ra_random=None
        self._dec_random=None
        self._Gp=None
        self._completeness=None
        self._use=None
        self._use_random=None
        self._subregion_number=None

        #Set up the mask and generate the randoms if asked
        self.setup_mask()
        if generate_randoms:
            self.generate_random_sample()

    #------------------------------------------------------------------------------------------
    #--------------------------------------------#
    #- Class method for making a random catalog -#
    #--------------------------------------------#
    @classmethod
    def random_catalog(cls, n_randoms, image_mask = None, ra_range=None,
                       dec_range=None):
        """
        Creates an AngularCatalog populated with RAs and Decs placed
        randomly within the mask.  This can be passed either an image
        mask or an RA and Dec range

        **Syntax**

        * cat = ac_class.AngularCatalog.random_catalog(n_randoms, image_mask=ImageMask_object)
        OR
        * cat = ac_class.AngularCatalog.random_catalog(n_randoms, ra_range=[min, max], dec_range=[min, max])

        Parameters
        ----------
        n_randoms : scalar
            The number of randoms that you want in you catalog
            
        image_mask : ImageMask object (optional)
            An ImageMask object with the outline that you want for your
            randoms.  This is one option.
            
        ra_range : two-element array-like (optional)
            The minimum and maximum RA you would like your randoms to have.
            This is an alternative to the image_mask option.  This must be
            combined with the dec_range argument as well.
            
        dec_range : two-element array-like (optional)
            The minimum and maximum Dec you would like your randoms to have.
            This is an alternative to the image_mask option.  This must be
            combined with the ra_range argument.

        Returns
        -------
        cat : AngularCatalog object
            An AngularCatalog instance with n_randoms distributed over either
            the image_mask or over the RA and Dec range.
        """

        #Make an image mask from the RA and Dec ranges if we don't have an
        #image mask already
        need_image_mask = image_mask is None
        if need_image_mask:
            image_mask = imclass.ImageMask.from_ranges(ra_range, dec_range)

        #Use the ImageMask to create random RAs and Decs and make them into
        #an AngularCatalog with the corresponding mask.
        ra, dec, comp = image_mask.generate_random_sample(n_randoms)
        return AngularCatalog(ra, dec, image_mask=image_mask)
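
    # Usage sketch (hedged: ra_deg/dec_deg are hypothetical 1D coordinate
    # arrays in degrees, and the theta-bin values below are illustrative, not
    # taken from the original docstrings):
    #
    #   cat = AngularCatalog(ra_deg, dec_deg, generate_randoms=True,
    #                        default_oversample=2.)
    #   cat.set_theta_bins(10., 1000., 15, unit='a', logbins=True)
    #   cat.cf(estimator='landy-szalay', name='my_cf')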
        
    #------------------------------------------------------------------------------------------

    #----------------------------#
    #- Set the weight file name -#
    #----------------------------#            
    def set_mask_to_weight_file(self, filename):
        """
        Set the weight file name and process the file to an image mask

        Parameters
        ----------
        filename : string
            The location of the FITS file that you want to process to
            a weight mask.  The file name should be specified from /
        """
        self._weight_file_name=filename
        self.setup_mask(force_remake=True)
        return

    #------------------------------------------------------------------------------------------

    #-------------------------------------------#
    #- Make an image mask from the weight file -#
    #-------------------------------------------#
    def setup_mask(self, force_remake=False):
        #Create an image mask (from the weight file if given one)
        if (self._image_mask is None) or force_remake:
            if self._weight_file_name is not None:
                immask = imclass.ImageMask.from_FITS_weight_file(self._weight_file_name)
            else:
                immask = imclass.ImageMask.from_ranges(self._ra_range, self._dec_range)
            self._image_mask = immask
        
        #Ask the mask for the completenesses of each data object
        self._completeness = self._image_mask.return_completenesses(self._ra, self._dec)

        #Generate random numbers- this is basically for when we have non-binary completeness
        compare_to = rand.random(size=self._ra.size)

        #Use the random numbers to figure out which guys in the data to use
        self._use = compare_to < self._completeness

        #Set up the data tree now that we have a mask
        self.make_data_tree()

        #Record how many objects we're actually using
        self._n_objects=len(self._ra[self._use])

    #------------------------------------------------------------------------------------------

    #-------------#
    #- Move mask -#
    #-------------#
    def move_mask(self, delta_ra=None, delta_dec=None,
                  theta_degrees=None, preview=False):
        #Calls the image mask's translation/rotation routine.
        if preview:
            newmask=self._image_mask.move_mask_on_sky(delta_ra=delta_ra,
                                                      delta_dec=delta_dec,
                                                      theta_degrees=theta_degrees,
                                                      preview=preview)
            return newmask
        else:
            self._image_mask.move_mask_on_sky(delta_ra=delta_ra,
                                              delta_dec=delta_dec,
                                              theta_degrees=theta_degrees,
                                              preview=preview)
    
    #------------------------------------------------------------------------------------------

    #------------------------------------------------------------------------------------------
        
    #---------------------------------#
    #- Compute BallTree for the data -#
    #---------------------------------#
    def make_data_tree(self):
        #The astroML correlation function methods want a cartesian position
        #instead of the angular positions- this does the conversion
        
        print("make_datatree says: Computing the BallTree for data.")
        data = np.asarray(corr.ra_dec_to_xyz(self._ra[self._use], self._dec[self._use]), order='F').T
        self._data_tree = BallTree(data, leaf_size=2)
        
        return

    #------------------------------------------------------------------------------------------

    #------------------------------------------#
    #- Compute BallTree for the random sample -#
    #------------------------------------------#
    def make_random_tree(self):
        #Make sure we have the random data made
        if (self._ra_random is None) or (self._dec_random is None):
            print("make_random_tree says: no random sample found.  Generating one.")
            self.generate_random_sample()

        #Make the tree
        print("make_randomtree says: Computing the BallTree for the randoms.")
        random_data=np.asarray(corr.ra_dec_to_xyz(self._ra_random, self._dec_random), order='F').T
        self._random_tree = BallTree(random_data, leaf_size=2)                
                
        return          

    #------------------------------------------------------------------------------------------

    def set_theta_bins(self, min_theta, max_theta, nbins,
                       unit='a', logbins=True):
        #Make a ThetaBins class and save it.
        self._theta_bins = binclass.ThetaBins(min_theta, max_theta, nbins,
                                              unit=unit, logbins=logbins)

    #------------------------------------------------------------------------------------------

    #---------------------------------------------------------------------#
    #- Check to make sure we have all the info needed for CF calculation -#
    #---------------------------------------------------------------------#         
    def __check_cf_setup(self, need_subregions=False,
                         random_oversample=None, check_trees=True):
        #Make sure that we have all the things we need to do a
        #correlation function properly (I got tired of the redundant
        #code in the different CF calculation routines)
        
        #Check that we have the bins 
        if not isinstance(self._theta_bins, binclass.ThetaBins):
            raise ValueError("CF calculations need separation bins.  Use "
                             "catalog.set_theta_bins(min_theta, max_theta,"
                             "nbins, unit='arcsec', logbins=True)")
        
        #Change/store the random oversampling factor if it's given
        if random_oversample is not None:
            self._random_oversample=random_oversample

        #Check the existence of a random sample
        if self._ra_random is None:
            self.generate_random_sample()

        #See if we're properly oversampled.
        nR=len(self._ra_random)
        if nR != len(self._ra)*self._random_oversample:
            self.generate_random_sample()
            
        #Check to make sure we have the trees for the appropriate guys
        if check_trees:
            if self._data_tree is None:
                self.make_data_tree()
            if self._random_tree is None:
                self.make_random_tree()

        #Check to make sure that the subdivisions have happened
        #if need_subregions.  If not, throw an error because it's
        #too specific to fill it in automatically
        if need_subregions:
            if self._subregion_number is None:
                raise ValueError("Jackknife and block bootstrap require "
                                "that you subdivide the field.  Call the "
                                "catalog.subdivide_mask() routine first.")

    #------------------------------------------------------------------------------------------

    #-----------------------------------------------------#
    #- Calculate the correlation function without errors -#
    #-----------------------------------------------------# 
    def cf(self, estimator='landy-szalay', n_iter=1, clobber=False,
          random_oversample=None, save_steps_file=None, name='cf'):
        #This uses the info we have plus the astroML correlation package
        #   to compute the angular correlation function.
        #The idea is that this function will figure out what information
        #   is available and call the appropriate (most efficient) function
        #   with all the relevant information.
        #This function will store the values it calculates for missing info

        if (name in self._cfs.keys()) and not clobber:
            raise ValueError("CorrelationFunction.cf says: There's already"
                             " a CF by that name.  Please choose another or "
                             "overwrite by calling with clobber=True")

        #Make sure that we have everything we need and fix anything missing that's fixable
        self.__check_cf_setup(random_oversample=random_oversample,
                              need_subregions=False, check_trees=True)

        #Make a new CorrelationFunction instance and set the basic info
        #First make a dictionary of the arguments to pass because it's ugly
        info={'name'            : name,
             'cf_type'          : 'no_error',
             'ngals'            : self._n_objects,
             'theta_bin_object' : copy.deepcopy(self._theta_bins),
             'estimator'        : estimator
             }
        self._cfs[name] = cfclass.CorrelationFunction(**info)
        centers, edges = self._cfs[name].get_thetas(unit='degrees')
        nbins=len(centers)

        #Do the calculation
        cf=np.zeros(nbins)
        DD=np.zeros(nbins)
        print("AngularCatalog.cf says: doing a CF calculation without error estimation")
        iterations={}
        for it in np.arange(n_iter):
            this_cf, this_dd = corr.two_point_angular(self._ra[self._use], 
                                                     self._dec[self._use], 
                                                     edges,
                                                     BT_D=self._data_tree, 
                                                     BT_R=self._random_tree,
                                                     method=estimator, 
                                                     ra_R=self._ra_random,
                                                     dec_R=self._dec_random,
                                                     return_DD=True)
            iterations[it]=this_cf
            cf += this_cf
            DD = this_dd/2.
            if save_steps_file is not None:
                self._cfs[name].set_cf(cf/(it+1), np.zeros(nbins), iterations=iterations)
                self._cfs[name].set_DD(DD)
                self.save_cf(save_steps_file, cf_keys=[name])
            if n_iter >1:
                self.generate_random_sample()

        #Divide out the number of iterations
        cf/=n_iter

        #Make sure we've stored everything properly even if we're not saving
        self._cfs[name].set_cf(cf, np.zeros(nbins), iterations=iterations)

    #------------------------------------------------------------------------------------------

    #----------------------------------------------------#
    #- Find the CF and error by single-galaxy bootstrap -#
    #----------------------------------------------------#
    def cf_bootstrap(self, n_boots=10, bootstrap_oversample=1,
                     random_oversample=None, estimator='landy-szalay',
                     save_steps_file=None, name='galaxy_bootstrap',
                     clobber=False):
        #Calculate the  correlation function with single-galaxy bootstrapping

        if (name in self._cfs.keys()) and not clobber:
            raise ValueError("CorrelationFunction.cf_bootstrap says: "
                             "There's already a CF by that name.  Please "
                             "choose another or overwrite by calling with "
                             "clobber=True")
        
        #Check that everything is set up
        self.__check_cf_setup(need_subregions=False, check_trees=False,
                              random_oversample=random_oversample)

        #Make a new CorrelationFunction instance and set the basic info
        #First make a dictionary of the arguments to pass because it's ugly
        info={'name'            : name,
             'cf_type'          : 'single_galaxy_bootstrap',
             'ngals'            : self._n_objects,
             'theta_bin_object' : copy.deepcopy(self._theta_bins),
             'estimator'        : estimator
             }
        self._cfs[name] = cfclass.CorrelationFunction(**info)
        centers, edges = self._cfs[name].get_thetas(unit='degrees')
        nbins=len(centers)

        #Make an array so it's easy to average over the boots
        temp = np.zeros((n_boots, nbins))
        #This RR will keep track of the RR counts so you don't have to
        #calculate them every time.
        rr=None
        #A holder for the boots that will be passed to the
        #CorrelationFunction as the iterations
        bootstrap_boots={}
        
        print ("AngularCatalog.cf_bootstrap says: doing a bootstrap "
               "CF calculation")

        #Loop through the boots
        for i in np.arange(n_boots):
            #Give a progress report
            print("calculating boot", i)
            
            #Choose the right number of galaxies *with replacement*
            ind=np.random.randint(0, self._n_objects,
                                  bootstrap_oversample*self._n_objects)
            ra_b=self._ra[self._use][ind]
            dec_b=self._dec[self._use][ind]
            
            #Calculate this boot
            bootstrap_boots[i], rr = corr.two_point_angular(ra_b, dec_b, edges, 
                                                            BT_D=self._data_tree, 
                                                            BT_R=self._random_tree,
                                                            method=estimator, 
                                                            ra_R=self._ra_random, 
                                                            dec_R=self._dec_random, 
                                                            RR=rr, return_RR=True)
            #Store what we have
            temp[i]=bootstrap_boots[i]
            if (save_steps_file is not None):
                bootstrap_cf=np.nanmean(temp[0:i+1], axis=0)
                bootstrap_cf_err=np.nanstd(temp[0:i+1], axis=0)
                self._cfs[name].set_cf(bootstrap_cf, bootstrap_cf_err,
                                       iterations=bootstrap_boots)
                self.save_cf(save_steps_file, cf_keys=[name])
                
        #Now we're done- do the final storage.
        bootstrap_cf=np.nanmean(temp, axis=0)
        bootstrap_cf_err=np.nanstd(temp, axis=0)
        self._cfs[name].set_cf(bootstrap_cf, bootstrap_cf_err,
                               iterations=bootstrap_boots)
        self._cfs[name].set_counts(RR=rr)
        
    #------------------------------------------------------------------------------------------

    #----------------------------------------#
    #- Find the CF and error by jackknifing -#
    #----------------------------------------#
    def cf_jackknife(self, ignore_regions=[], estimator='landy-szalay',
                     random_oversample=None, save_steps_file=None,
                     name='jackknife', clobber=False):
        #This takes a divided mask and performs the correlation
        #function calculation on the field with each sub-region
        #removed in turn.

        if (name in self._cfs.keys()) and not clobber:
            raise ValueError("CorrelationFunction.cf_jackknife says: "
                             "There's already a CF by that name.  Please "
                             "choose another or overwrite by calling with "
                             "clobber=True")

        #Check to make sure we have everything we need
        self.__check_cf_setup(need_subregions=True, check_trees=False,
                              random_oversample=random_oversample)

        #Make a new CorrelationFunction instance and set the basic info
        #First make a dictionary of the arguments to pass because it's ugly
        info={'name'            : name,
             'cf_type'          : 'jackknife',
             'ngals'            : self._n_objects,
             'theta_bin_object' : copy.deepcopy(self._theta_bins),
             'estimator'        : estimator
             }
        self._cfs[name] = cfclass.CorrelationFunction(**info)
        centers, edges = self._cfs[name].get_thetas(unit='degrees')
        
        #pull out the unique subregion numbers and figure out which to use
        regions=np.asarray(list(set(self._subregion_number)))
        use_regions=[r for r in regions if (r not in ignore_regions) and (r != -1)]
        use_regions=np.array(use_regions)
        n_jacks=len(use_regions)

        #Figure out where the randoms are
        random_subregions=self._image_mask.return_subregions(self._ra_random,
                                                             self._dec_random)
        
        #Now loop through the regions that you should be using 
        #and calculate the correlation function leaving out each
        jackknife_jacks = {}
        #Make a mask that takes out all the galaxies that aren't in use_regions
        valid_subregion = ma.masked_not_equal(self._subregion_number, -1).mask
        random_valid_subregion=ma.masked_not_equal(random_subregions, -1).mask
        for bad_reg in ignore_regions:
            this_mask = ma.masked_not_equal(self._subregion_number, bad_reg).mask
            valid_subregion = valid_subregion & this_mask
            this_mask = ma.masked_not_equal(random_subregions, bad_reg).mask
            random_valid_subregion = random_valid_subregion & this_mask        

        temp = np.zeros((n_jacks, len(centers)))
        for i, r in enumerate(use_regions):
            #Make the mask for the data
            not_region_r = ma.masked_not_equal(self._subregion_number, r).mask  
            this_jackknife = valid_subregion & not_region_r & self._use  
            
            #Make the mask for the randoms
            random_not_region_r = ma.masked_not_equal(random_subregions, r).mask
            random_this_jackknife = random_not_region_r & random_valid_subregion

            #Do the calculation for this jackknife and store it
            print("calculating jackknife", i)
            jackknife_jacks[r] = corr.two_point_angular(self._ra[this_jackknife], 
                                                        self._dec[this_jackknife], 
                                                        edges, method=estimator, 
                                                        ra_R = self._ra_random[random_this_jackknife],
                                                        dec_R = self._dec_random[random_this_jackknife])
            temp[i]=jackknife_jacks[r]
            if (save_steps_file is not None):
                jackknife_cf=np.nanmean(temp[0:i+1], axis=0)
                jackknife_cf_err=np.nanstd(temp[0:i+1], axis=0)
                self._cfs[name].set_cf(jackknife_cf, jackknife_cf_err,
                                       iterations=jackknife_jacks)
                self.save_cf(save_steps_file, cf_keys=[name])
            
        #Now that we have all of the jackknifes (jackknives?), calculate the mean
        # and variance.
        jackknife_cf=np.nanmean(temp, axis=0)
        jackknife_cf_err=np.nanstd(temp, axis=0)
        self._cfs[name].set_cf(jackknife_cf, jackknife_cf_err,
                               iterations=jackknife_jacks)

    #------------------------------------------------------------------------------------------

    #--------------------------------------------#
    #- Find the CF and error by block bootstrap -#
    #--------------------------------------------#
    def cf_block_bootstrap(self, n_boots=10, ignore_regions=[],
                           estimator='landy-szalay', random_oversample=None,
                           bootstrap_oversample=1, save_steps_file=None,
                           name='block_bootstrap', clobber=False):
        #Use the subdivided mask to bootstrap on blocks rather than
        #single galaxies.

        if (name in self._cfs.keys()) and not clobber:
            raise ValueError("CorrelationFunction.cf_block_bootstrap says: "
                             "There's already a CF by that name.  Please "
                             "choose another or overwrite by calling with "
                             "clobber=True")

        #Check to make sure I have everything that I need
        self.__check_cf_setup(need_subregions=True,
                              random_oversample=random_oversample,
                              check_trees=False)

        #Make a new CorrelationFunction instance and set the basic info
        #First make a dictionary of the arguments to pass because it's ugly
        info={'name'            : name,
             'cf_type'          : 'block_bootstrap',
             'ngals'            : self._n_objects,
             'theta_bin_object' : copy.deepcopy(self._theta_bins),
             'estimator'        : estimator
             }
        self._cfs[name] = cfclass.CorrelationFunction(**info)
        centers, edges = self._cfs[name].get_thetas(unit='degrees')
        nbins = len(centers)
        
        print("block boots done with setup")

        #Figure out which subregions we should be using
        regions=np.asarray(list(set(self._subregion_number)))
        use_regions=[r for r in regions if (r not in ignore_regions) and (r != -1)]
        use_regions=np.array(use_regions)

        #Figure out where the randoms are
        random_subregions=self._image_mask.return_subregions(self._ra_random,
                                                             self._dec_random)

        #Make a dictionary of arrays containing the indices of the members of each sub-region we need
        indices={}
        random_indices={}
        for r in use_regions:
            indices[r]=np.where(self._subregion_number == r)[0]
            random_indices[r]=np.where(random_subregions == r)[0]

        #Loop through the bootstraps
        block_bootstrap_boots={}
        n_choose=len(use_regions)*bootstrap_oversample
        temp = np.zeros((n_boots, nbins))
        print("block boots looping through boots")
        for i in np.arange(n_boots):
            this_boot=rand.choice(use_regions, size=n_choose)
            this_boot_indices=np.array([], dtype=int)
            this_boot_random_indices=np.array([], dtype=int)
            
            for region in this_boot:
                this_boot_indices=np.concatenate((this_boot_indices, indices[region]))
                this_boot_random_indices=np.concatenate((this_boot_random_indices,
                                                         random_indices[region]))

            # this_boot_indices=np.array(
            print("calculating boot", i)
            temp[i] = corr.two_point_angular(self._ra[this_boot_indices], 
                                             self._dec[this_boot_indices], 
                                             edges, method=estimator, 
                                             ra_R=self._ra_random[this_boot_random_indices],
                                             dec_R=self._dec_random[this_boot_random_indices])
            block_bootstrap_boots[i] = temp[i]
            cf=np.nanmean(temp[0:i+1], axis=0)
            cf_err=np.nanstd(temp[0:i+1], axis=0)
            self._cfs[name].set_cf(cf, cf_err, iterations=block_bootstrap_boots)
            if (save_steps_file is not None):
                self.save_cf(save_steps_file, cf_keys=[name])

    #------------------------------------------------------------------------------------------

    #----------------------------------------------------------------#
    #- Generate the random-random counts required to compute the IC -#
    #----------------------------------------------------------------#
    def generate_rr(self, set_nbins=None, logbins=True, min_sep=0.01, 
                    force_n_randoms=None, save_to=None, n_chunks=1):
        #Do random-random counts over the entire field.  If set_nbins is declared,
        #generate_rr will not go looking for the correlation functions so that the
        #RR counts for the IC calculation and the CF calculation can be done in parallel.
 
        #Figure out how many randoms we need.  This was calculated by playing with
        #the number of randoms in the GOODS-S field and seeing when the RR counts converged
        #to the "way too many" curve.  27860 per 1.43e-5 steradians was what I settled on.
        #If there's a forced number, it will ignore my estimate.
        #  Amendment 4/15- this minimum number seems to be somewhat too small for fields that 
        #                  aren't as smooth as GOODS-S, so I'm multiplying it by 5.  This looks ok.
        #  Amendment 8/15- added the capability to do this in several chunks.
        
        if force_n_randoms is None:
            surface_density_required = 27860.*5./1.43e-5
            area = self._image_mask.masked_area_solid_angle()
            number_needed = surface_density_required * area
        else:
            number_needed=force_n_randoms

        #If we're doing more than one chunk, divide the number we need into n_chunks chunks
        if n_chunks > 1:
            number_needed = np.ceil(float(number_needed)/n_chunks).astype(int)
        total_number = number_needed * n_chunks
        print("total number: ",  total_number)
        print("number per iteration: ", number_needed)
        print("number of chunks: ", n_chunks)

        #Range of separations to make bins over
        min_ra = self._ra[self._use].min()
        min_dec = self._dec[self._use].min()
        max_ra = self._ra[self._use].max()
        max_dec = self._dec[self._use].max()
        max_sep=misc.ang_sep(min_ra, min_dec, max_ra, max_dec,
                           radians_in=False, radians_out=False)

        #Choose how many bins
        if set_nbins is None:
            #Get our theta bin info from the CF if we can.  Error if we can't
            if self._theta_bins is None:
                raise ValueError("AngularCatalog.generate_rr says: I need"
                                " either a set number of bins (set_nbins=N)"
                                " or thetas from a CF to extrapolate. "
                                " You have given me neither.")
            centers, edges = self._theta_bins.get_thetas(unit='degrees')
            nbins = int(np.ceil(len(centers) * 2. * max_sep/edges.max()))
        else:
            nbins=set_nbins

        #Make the bins
        rr_theta_bins = binclass.ThetaBins(min_sep, max_sep, nbins,
                                           unit='d', logbins=logbins)
        use_centers, use_theta_bins = rr_theta_bins.get_thetas(unit='degrees')

        #Do the loop
        G_p=np.zeros(nbins)
        rr_counts=np.zeros(nbins)
        for n_i in np.arange(n_chunks):
            print("doing chunk #", n_i)
            #Remake the random sample so we're sure we have the right oversample factor            
            self.generate_random_sample(masked=True, make_exactly=number_needed)
        
            #Code snippet shamelessly copied from astroML.correlations
            xyz_data = corr.ra_dec_to_xyz(self._ra_random,
                                         self._dec_random)
            data_R = np.asarray(xyz_data, order='F').T
            bins = corr.angular_dist_to_euclidean_dist(use_theta_bins)
            Nbins = len(bins) - 1
            counts_RR = np.zeros(Nbins + 1)
            for i in range(Nbins + 1):
                counts_RR[i] = np.sum(self._random_tree.query_radius(data_R, bins[i],
                                                                            count_only=True))
            rr = np.diff(counts_RR)
            #Landy and Szalay define G_p(theta) as <N_p(theta)>/(n(n-1)/2)
            G_p += rr/(number_needed*(number_needed-1)) 
            rr_counts += rr

        print("Dividing out the theta bin sizes and number of chunks")
        
        #I divide out the bin width because just using the method
        #that L&S detail gives you a {G_p,i} with the property that
        #Sum[G_p,i]=1.  This is not equivalent to Integral[G_p d(theta)]=1,
        #which is what they assume everywhere else.
        #Dividing out the bin width gives you that and lets you pretend
        #G_p is a continuous but chunky-looking function.
        G_p /= np.diff(use_theta_bins)                    
        G_p /= n_chunks                                   
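        # Sanity-check sketch (an assumption following the comment above, not
        # part of the original code): after dividing by the bin widths, G_p
        # should behave like a density over theta, i.e.
        #   np.sum(G_p * np.diff(use_theta_bins)) should be close to 1.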
        self._rr_ngals=[total_number, n_chunks]
        self._Gp = gpclass.Gp(min_sep, max_sep, nbins, G_p, total_number,
                              n_chunks, logbins=logbins, unit='d',
                              RR=rr_counts)

        if save_to is not None:
            self.save_gp(save_to)
        
    #------------------------------------------------------------------------------------------

    #-------------------------------------#
    #- Read in previously calculated CFs -#
    #-------------------------------------#
    def load_cf(self, filen, overwrite_existing=False, name_prefix=''):
        #Load in a CF from a file or set of files

        #First, what files start with filen?
        file_list = misc.files_starting_with(filen)
        nfiles = len(file_list)

        #Generate the names
        names = copy.copy(file_list)
        for i, n in enumerate(file_list):
            names[i] = name_prefix + n[len(filen):]
        
    #------------------------------------------------------------------------------------------

    #--------------------------------------------#
    #- Save the correlation functions to a file -#
    #--------------------------------------------#
    def save_cf(self, file_base, cf_keys=None):
        #Takes all the CF information we have and saves to a file
        #per CF

        #If they didn't say which ones specifically, save all
        if cf_keys is None:
            cf_keys=self._cfs.keys()

        for k in cf_keys:
            filen = file_base + k
            self._cfs[k].save(filen)
        
    #------------------------------------------------------------------------------------------

    #-----------------------------------------------------------------#
    #- Read in previously calculated random-random counts for the IC -#
    #-----------------------------------------------------------------#
    def load_gp(self, filename, overwrite_existing=False):
        #Take the ASCII files with the normed random-random counts calculated and read it in

        if (self._Gp is None) or overwrite_existing:
            self._Gp = gpclass.Gp.from_file(filename)
        else:
            print("angular_catalog.load_gp says: You've asked me not "
                  "to overwrite the existing RR counts and there's "
                  "already Gp information.")

    #------------------------------------------------------------------------------------------

    #--------------------------------------------#
    #- Save the random-random counts for the IC -#
    #--------------------------------------------#
    def save_gp(self, filename):
        #If we have done the random-random counts for the integral
        #constraint, save to a file
        self._Gp.save(filename)
Exemple #49
def tract_smooth(tractography, var, file_output):
    from sklearn.neighbors import BallTree

    var = float(var)
    std = var ** 2

    points = tractography.original_tracts()

    all_points = numpy.vstack(points)
    bt = BallTree(all_points)
    N = len(all_points) / 3
    I = numpy.eye(3)[None, ...]
    for i, tract in enumerate(tractography.original_tracts()):
        # all_points = numpy.vstack(points[:i] + points[i + 1:])
        # bt = BallTree(all_points)

        diff = numpy.diff(tract, axis=0)
        diff = numpy.vstack((diff, diff[-1]))
        lengths = numpy.sqrt((diff ** 2).sum(1))
        # cum_lengths = numpy.cumsum(lengths)

        diff_norm = diff / lengths[:, None]
        tangent_lines = diff_norm[:, None, :] * diff_norm[:, :, None]
        normal_planes = I - tangent_lines
#        weight_matrices = normal_planes + 1e10 * tangent_lines

        N = max(len(d) for d in bt.query_radius(tract, var * 3))

        close_point_distances, close_point_indices = bt.query(
            tract, N
        )

        close_points = all_points[close_point_indices]
        difference_vectors = close_points - tract[:, None, :]
        projected_vectors = (
            normal_planes[:, None, :] *
            difference_vectors[..., None]
        ).sum(-2)
        projected_points = projected_vectors + tract[:, None, :]
        # projected_distances2 = (projected_vectors**2).sum(-1)
        # projected_weights = numpy.exp(- .5 * projected_distances2 / std)
        # projected_weights /= projected_weights.sum(-1)[:, None]

        weights = numpy.exp(
            -.5 * close_point_distances ** 2 / std
        )[..., None]
        weights /= weights.sum(-2)[..., None]

        # tract += (weights * projected_vectors).sum(-2)

#        weighted_distances = (
#            weight_matrices[:, None, :] *
#            difference_vectors[..., None]
#        ).sum(-2)
#        weighted_distances *= difference_vectors
#        weighted_distances = weighted_distances.sum(-1) ** .5
        # weighted_points = (projected_points * weights).sum(1)

        weighted_points = (projected_points * weights).sum(1)

        tract[:] = weighted_points
        # tract /= norm_term

    return Tractography(
        tractography.original_tracts(),
        tractography.original_tracts_data(),
        **tractography.extra_args
    )
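
The core BallTree pattern in tract_smooth is to size a fixed-k query from the largest query_radius neighbourhood, so every point sees all neighbours within about three length scales, and then to take a Gaussian-weighted average of those neighbours. Below is a minimal, self-contained sketch of that pattern on synthetic data; it skips the tangent-plane projection step of tract_smooth, and all names here (points, queries, sigma) are placeholders, not part of the original project.

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.default_rng(0)
points = rng.normal(size=(500, 3))    # stand-in for the stacked tract points
queries = rng.normal(size=(20, 3))    # stand-in for the points of one tract
sigma = 0.5                           # length scale, analogous to `var` above

tree = BallTree(points)
# k = size of the largest 3-sigma neighbourhood among the query points
k = max(len(idx) for idx in tree.query_radius(queries, sigma * 3))
dist, ind = tree.query(queries, k)    # dist, ind have shape (20, k)

# Gaussian weights from the distances, normalised per query point
weights = np.exp(-0.5 * dist ** 2 / sigma ** 2)
weights /= weights.sum(axis=1, keepdims=True)
smoothed = (points[ind] * weights[..., None]).sum(axis=1)  # (20, 3) weighted averages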
Exemple #50
0
def mean_shift(X, bandwidth=None, seeds=None, kernel="flat",
               max_cluster_radius=-1., max_iterations=300):
    """Perform MeanShift Clustering of data using the specified kernel

    Parameters
    ----------

    X : array [n_samples, n_features]
        Input points to be clustered

    bandwidth : float,
        Kernel bandwidth

    seeds: array [n_seeds, n_features], optional
        Points used as initial kernel locations
        If not set, then use every point as a seed (which may
        be very slow; consider using the `get_bin_seeds` function
        to create a reduced set of seeds).

    max_cluster_radius: float, default -1.
        Used only in post-processing.
        If negative, then each point is clustered into its nearest cluster.
        If positive, then those points that are not within `max_cluster_radius`
        of any cluster center are said to be 'orphans' that do not belong to
        any cluster. Orphans are given cluster label -1.

    Returns
    -------

    cluster_centers : array [n_clusters, n_features]
        Coordinates of cluster centers

    labels : array [n_samples]
        cluster labels for each point

    Notes
    -----
    See examples/plot_meanshift.py for an example.

    """

    if seeds is None:
        seeds = X
    elif len(seeds) == 0:
        raise ValueError, "If a list of seeds is provided it cannot be empty."

    if kernel not in KERNELS:
        valid_kernels = " ".join(KERNELS)
        raise ValueError("Kernel %s is not valid. Valid kernel choices are: %s"
                         % (kernel, valid_kernels))

    # Set maximum neighbor query distance based on kernel
    if kernel in ["flat"]:
        query_distance = bandwidth
        kernel_update_function = flat_kernel_update
        print "Using flat kernel update"
    elif kernel in ["gaussian"]:
        query_distance = bandwidth * 3 # A bit arbitrary
        kernel_update_function = gaussian_kernel_update
        print "Using gaussian kernel update"
    else:
        raise ValueError, "Kernel %s not implemented correctly" % kernel

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_intensity_dict = {}
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            # Find mean of points within bandwidth
            points_within = X[ball_tree.query_radius([weighted_mean], query_distance)[0]]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
            # If converged or at max_iterations, add the cluster
            if np.linalg.norm(weighted_mean - old_mean) < stop_thresh or \
                   completed_iterations == max_iterations:
                center_intensity_dict[tuple(weighted_mean)] = len(points_within)
                break
            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    print "%d clusters before removing duplicates " % len(center_intensity_dict)
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    cc_tree = BallTree(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = cc_tree.query_radius([center], bandwidth)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print "%d clusters after removing duplicates " % len(cluster_centers)

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    centers_tree = BallTree(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = centers_tree.query(X, 1)
    if max_cluster_radius < 0:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= max_cluster_radius
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
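
For readers who only need the clustering rather than the internals, the same flat-kernel procedure is available as a ready-made estimator in scikit-learn. A minimal usage sketch on synthetic two-blob data follows; the bandwidth and blob locations are illustrative only.

import numpy as np
from sklearn.cluster import MeanShift

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=(0.0, 0.0), scale=0.3, size=(100, 2)),
               rng.normal(loc=(3.0, 3.0), scale=0.3, size=(100, 2))])

ms = MeanShift(bandwidth=1.0).fit(X)
print(ms.cluster_centers_)   # two centers, near (0, 0) and (3, 3)
print(ms.labels_[:5])        # cluster label assigned to each input point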
Exemple #51
0
def evaluate_embeddings(embeddings, edges, cda=True, greedy_routing=False, cda_max_vertices=1000, gr_max_pairs=10000):
    "evaluate quality of embeddings compared with real elge set"
    report = []

    # get connected component
    true_graph = nx.Graph()
    true_graph.add_edges_from(edges)
    largest_cc = max(nx.connected_components(true_graph), key=len)
    true_graph = true_graph.subgraph(largest_cc).copy()

    # use BallTree for efficient graph construction
    print "construct BallTree"
    vertices = list(true_graph.nodes())
    n = len(vertices)
    embeddings_array = np.array([embeddings[v] for v in vertices])
    bt = BallTree(embeddings_array, metric=distance)

    degrees = defaultdict(int)
    print "compute number of correct directed arcs"
    for v1, v2 in edges:
        degrees[v1] += 1
        degrees[v2] += 1

    # compute number of correct DIRECTED arcs assuming that degrees are known
    if cda:
        all_correct_arcs = set()
        cda_vertices = vertices[:]
        if len(cda_vertices) > cda_max_vertices:
            np.random.shuffle(cda_vertices)
            cda_vertices = cda_vertices[:cda_max_vertices]
        for v_i, v in enumerate(cda_vertices):
            start = time.time()
            degree = degrees[v]
            # one of the neighbours is the vertex itself; query expects a 2D array
            dist, ind = bt.query([embeddings[v]], k=degree+1)
            neigh = [vertices[i] for i in ind[0].tolist() if vertices[i] != v]
            for ne in neigh:
                if make_edge(v, ne) in edges:
                    all_correct_arcs.add((v, ne))
            finish = time.time()
            #print "DEBUG: {} / {}, time={}s".format(v_i + 1, len(cda_vertices), datetime.timedelta(seconds=finish-start))
        report.append(['ratio of correct arcs for known degrees', float(len(all_correct_arcs)) / (2 * len(edges))])

    if greedy_routing:
        print "compute greedy routing efficiency"
        random_pairs = set()
        if n * (n-1) / 2 <= gr_max_pairs:
            random_pairs = set(combinations(vertices, 2))
        else:
            while(len(random_pairs) < gr_max_pairs):
                v1 = np.random.choice(vertices)
                v2 = np.random.choice(vertices)
                if v1 != v2:
                    random_pairs.add((v1, v2))

        total_distribution = defaultdict(int)
        success_distribution = defaultdict(int)
        for i, pair in enumerate(random_pairs):
            src, dst = pair
            # best path
            best_path_length = nx.shortest_path_length(true_graph, source=src, target=dst)
            total_distribution[0] += 1
            total_distribution[best_path_length] += 1

            # greedy path
            curr_src = src
            path_length = 0
            seen = set()
            while curr_src != dst:
                seen.add(curr_src)
                # find neighbor closest to destination
                unseen_neighbors = [x for x in true_graph.neighbors(curr_src) if x not in seen]
                if not unseen_neighbors:
                    # greedy algorithm stuck in 'leaf'
                    path_length = np.nan
                    break

                def curr_distance(v):
                    return distance(embeddings[dst], embeddings[v])
                closest_neigh = min(unseen_neighbors, key=curr_distance)
                path_length += 1
                curr_src = closest_neigh

            if path_length == best_path_length:
                success_distribution[0] += 1
                success_distribution[best_path_length] += 1
        all_success = success_distribution[0]
        all_total = total_distribution[0]
        all_ratio = float(all_success) / all_total * 100
        print "Total: {} / {} ({:.2f} %)".format(all_success, all_total, all_ratio)
        for pl in sorted(set(total_distribution.keys()) | set(success_distribution.keys())):
            if pl == 0:
                continue
            total = total_distribution.get(pl, 0)
            success = success_distribution.get(pl, 0)
            ratio = float(success) / total * 100
            print "Path length = {}: {} / {} ({:.2f} %)".format(pl, success, total, ratio)

    if False:
        # depends on R, bad for subgraphs -- not used
        n = len(vertices)
        R = 2 * np.log(n)
        coshR = np.cosh(R)

        predicted_edges = set()
        print "predict edges"
        for v in vertices:
            coords = embeddings[v]
            neigh_idx = bt.query_radius([coords], R)
            neigh = [vertices[i] for i in neigh_idx[0].tolist() if vertices[i] != v]
            predicted_edges.update([make_edge(v, ne) for ne in neigh])
        report.append(['total_predicted_edges', len(predicted_edges)])

        # contingency matrix
        print "compute contingency matrix"
        report.append(['true positive', len(edges & predicted_edges)])
        report.append(['false positive', len(predicted_edges - edges)])
        report.append(['false negative', len(edges - predicted_edges)])
        report.append(['true negative', n*(n-1)/2 - len(edges | predicted_edges)])

    return report
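
evaluate_embeddings relies on two helpers that are not part of this snippet: distance (passed to BallTree as a callable metric) and make_edge. The sketches below are plausible stand-ins for what they might look like in a hyperbolic-embedding setting, not the project's actual code: make_edge builds an order-independent edge key, and distance applies the hyperbolic law of cosines to points given in native (r, theta) coordinates.

import numpy as np

def make_edge(u, v):
    # Undirected edge key: order the endpoints canonically (hypothetical helper).
    return (u, v) if u <= v else (v, u)

def distance(p1, p2):
    # Hyperbolic distance between (r, theta) points via the hyperbolic law of cosines
    # (hypothetical helper; BallTree accepts a Python callable like this as its metric,
    # although callable metrics are much slower than the built-in ones).
    r1, t1 = p1
    r2, t2 = p2
    dtheta = np.pi - abs(np.pi - abs(t1 - t2))
    cosh_d = np.cosh(r1) * np.cosh(r2) - np.sinh(r1) * np.sinh(r2) * np.cos(dtheta)
    return float(np.arccosh(max(cosh_d, 1.0)))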
r_vir=0.184*(mass/1e12)**.33 * (omega_z/0.28)**0.33 # r_virial of halos from NFW formulation
candidates=((mass>2e11)).nonzero()[0] # cut for the minimum mass of the halos considered in the CS model, here m_{h,0}=2e11 M_sun
rand_ids=random.sample(list(candidates),100000) # sample a fraction of the candidates, for example choosing 100,000 halos

#making data tree
data=np.array([x,y,z]).T
data_tree=BallTree(data)

#searching the data tree finding halos in CS model with DR_0 =1.
DR0=1.
good_dr=[]
for n in np.arange(0,len(rand_ids)):
        halo=rand_ids[n]
        #searching within 10 Mpc of the halo (query_radius expects a 2D array of query points)
        halo_neighbors=data_tree.query_radius(data[[halo],:], 10., count_only=False, return_distance=False)
        concat_halos=np.concatenate(halo_neighbors)
        pruned_halos=(mass[concat_halos]>.5*mass[halo]).nonzero()[0]
        SB=concat_halos[pruned_halos]
        if (len(SB)<2): # SB always contains the halo itself, so <2 means no other halo within 10 Mpc; move to the next candidate
                continue
        d=np.ones(len(SB))*1000
        i=0
        for sample in SB:
                d[i]=( (x[sample]-x[halo])**2+(y[sample]-y[halo])**2 + (z[sample]-z[halo])**2 )**0.5
                d[i]/=r_vir[sample]
                i+=1
        DR=min(d[d>0])
        exc=((d<1)&(d>0)).nonzero()[0]
        MM=mass[SB[exc]]
        MM_ind=(MM>1e13).nonzero()[0]