def test_remove():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    labels = rng.integers(3, size=10)

    points = pept.PointData(
        np.c_[points_raw, labels, labels],
        columns=["t", "x", "y", "z", "label", "label2"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    rm = Remove("label").fit_sample(points)
    assert "label" not in rm.columns
    assert rm.points.shape[1] == 5

    rm = Remove("label*").fit_sample(points)
    assert "label" not in rm.columns
    assert "label2" not in rm.columns
    assert rm.points.shape[1] == 4

    # Testing different settings
    Remove(0).fit(points, "sequential")
    Remove(-1).fit(points, "sequential")
    Remove("label", "label2").fit(points, "sequential")
    Remove(0, "label").fit(points, "sequential")
def test_swap():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    labels = rng.integers(3, size=10)

    points = pept.PointData(
        np.c_[points_raw, labels],
        columns=["t", "x", "y", "z", "label"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    # Simple, single swap
    p2 = Swap("y, z").fit_sample(points.copy())
    assert np.all(p2["y"] == points["z"]), "Swap not done"
    assert np.all(p2["z"] == points["y"]), "Swap not done"

    # Single swap with quoted column names
    p2 = Swap("'y', 'z'").fit_sample(points.copy())
    assert np.all(p2["y"] == points["z"]), "Swap not done"
    assert np.all(p2["z"] == points["y"]), "Swap not done"

    # Single swap with quoted column indices
    p2 = Swap("'2', '3'").fit_sample(points.copy())
    assert np.all(p2["y"] == points["z"]), "Swap not done"
    assert np.all(p2["z"] == points["y"]), "Swap not done"

    # Testing different settings
    Swap("y, z").fit(points)
    Swap("label, 'z'").fit(points)
    Swap("'0', '1'", "'y', 'z'", "x, z").fit(points)
def test_condition():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    labels = rng.integers(3, size=10)

    points = pept.PointData(
        np.c_[points_raw, labels],
        columns=["t", "x", "y", "z", "label"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    cp = Condition("x < 50").fit_sample(points)
    assert np.all(
        cp.data ==
        points.data[points.points[:, points.columns.index("x")] < 50]
    )

    cp2 = Condition("'2' < 50").fit_sample(points)
    cp3 = Condition("50 > '2'").fit_sample(points)
    assert np.allclose(cp2.data, points.data[points.data[:, 2] < 50])
    assert np.allclose(cp2.data, cp3.data)

    # Testing different settings
    Condition("np.isfinite('x')").fit(points)
    Condition("'x' < 'y'").fit(points)
    Condition("x < 2, 'x' > 0, 1 > 'x'").fit(points)
    Condition(lambda arr: arr[:, 0] > 10).fit(points)
    Condition(lambda x: x[:, -1] < 50, 'x > 10').fit(points)
def test_split_all():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    labels = rng.integers(3, size=10)
    line_index = rng.integers(10, size=10)

    points = pept.PointData(
        np.c_[points_raw, labels, line_index],
        columns=["t", "x", "y", "z", "label", "line_index"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    # Check each split label
    split = SplitAll("label").fit(points)
    assert np.all(split[0].points[:, :4] == points_raw[labels == 0])
    assert np.all(split[1].points[:, :4] == points_raw[labels == 1])
    assert np.all(split[2].points[:, :4] == points_raw[labels == 2])

    # Check with empty sample
    empty_split = SplitLabels().fit_sample(points[1])
    assert len(empty_split[0].data) == 0

    # Check using numeric index
    split_str = SplitAll("label").fit(points)
    split_idx = SplitAll(4).fit(points)
    assert np.all(split_str[0].points == split_idx[0].points)
    assert np.all(split_str[1].points == split_idx[1].points)
    assert np.all(split_str[2].points == split_idx[2].points)

    # Testing different settings
    SplitAll("label").fit([points])
    SplitAll("label").fit([points, points])
    SplitAll(4).fit(points.points)
def test_stack():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    lines_raw = rng.random((10, 7)) * 500

    points = pept.PointData(points_raw, sample_size=4)
    lines = pept.LineData(lines_raw, sample_size=4)

    # Test it returns points back
    p = Stack().fit(points)
    assert p is points, "Stack did not return a single PointData back"

    # Test it returns lines back
    ls = Stack().fit(lines)
    assert ls is lines, "Stack did not return a single LineData back"

    # Test it concatenates a list of two points
    points2 = Stack().fit([points, points])
    assert np.all(points2.points[:10] == points.points[:10])

    # Test it concatenates a list of two lines
    lines2 = Stack().fit([lines, lines])
    assert np.all(lines2.lines[:10] == lines.lines[:10])

    # Test list[list] flattening
    assert Stack().fit([[1, 2, 3]]) == [1, 2, 3], "List flattening wrong"
def test_segregate():
    rng = np.random.default_rng(0)

    points_raw = rng.random((100, 4)) * 100
    points = pept.PointData(points_raw, sample_size=4)

    se = Segregate(20, cut_distance=np.inf).fit(points)
    assert np.allclose(se.points[:, -1], 0.)

    # Testing different settings
    Segregate(5, 10, 15).fit(points)
    Segregate(1, 1).fit(points)
def fit(self, points: Iterable[pept.PointData]):
    # Stack the input points into a single PointData
    if not isinstance(points, pept.PointData):
        points = pept.PointData(points)

    if len(points.points) == 0:
        return points.copy(
            data=points.points[0:0],
            columns=points.columns + ["label"],
        )

    pts = points.points

    # Sort pts based on the time column (col 0) and create a C-ordered
    # copy to send to Cython.
    pts = np.asarray(pts[pts[:, 0].argsort()], dtype=float, order="C")

    # Calculate the sparse distance matrix between reachable points. This
    # is an optimised Cython function returning a sparse CSR matrix.
    distance_matrix = distance_matrix_reachable(pts, self.window)

    # Construct the minimum spanning tree from the sparse distance matrix.
    # Note that `mst` is also a sparse CSR matrix.
    mst = minimum_spanning_tree(distance_matrix)

    # Get the minimum spanning tree edges into the [vertex 1, vertex 2,
    # edge distance] format, then sort it based on the edge distance.
    mst = mst.tocoo()
    mst_edges = np.vstack((mst.row, mst.col, mst.data)).T
    mst_edges = mst_edges[mst_edges[:, 2].argsort()]

    # Ignore deprecation warning from HDBSCAN's use of `np.bool`
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)

        # Create the single linkage tree from the minimum spanning tree
        # edges using internal hdbscan methods (because they're damn
        # fast). This should be a fairly quick step.
        linkage_tree = hdbscan._hdbscan_linkage.label(mst_edges)
        linkage_tree = hdbscan.plots.SingleLinkageTree(linkage_tree)

    # Cut the single linkage tree at `trajectory_cut_distance` and get
    # the cluster labels, setting clusters smaller than
    # `min_trajectory_size` to -1 (i.e. noise).
    labels = linkage_tree.get_clusters(
        self.cut_distance,
        self.min_trajectory_size,
    )

    # Append the labels to `pts`.
    return points.copy(
        data=np.c_[pts, labels],
        columns=points.columns + ["label"],
    )
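# Hedged usage sketch for the MST-based `fit` above (assuming it belongs
# to the `Segregate` filter exercised in `test_segregate`): points within
# `window` positions of each other in the time-sorted array may be
# connected; MST edges longer than `cut_distance` are cut, and clusters
# smaller than `min_trajectory_size` are labelled -1 (noise).
def _example_segregate():
    import numpy as np
    import pept

    rng = np.random.default_rng(0)
    points = pept.PointData(rng.random((100, 4)) * 100)

    labelled = Segregate(window=20, cut_distance=10.).fit(points)
    print(labelled.columns)     # [..., "label"]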
def test_velocity():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    points = pept.PointData(points_raw, sample_size=4)

    vs = Velocity(5).fit_sample(points)
    assert "vx" in vs.columns
    assert "vy" in vs.columns
    assert "vz" in vs.columns

    assert "v" in Velocity(5, absolute=True).fit_sample(points).columns

    # Testing different settings
    Velocity(3).fit(points, "sequential")
    Velocity(window=9, degree=5).fit(points, "sequential")
def copy(self):
    '''Create a deep copy of an instance of this class, including a new
    inner numpy array `points`.

    Returns
    -------
    pept.PointData
        A new instance of the `pept.PointData` class with the same
        attributes as this instance, deep-copied.
    '''

    return pept.PointData(
        self._points.copy(order="C"),
        sample_size=self._sample_size,
        overlap=self._overlap,
        verbose=False,
    )
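# A minimal sketch showing that the copy above is fully independent of
# the original: mutating one array does not affect the other.
def _example_copy():
    import numpy as np
    import pept

    original = pept.PointData(np.arange(20, dtype=float).reshape(5, 4))
    duplicate = original.copy()

    duplicate.points[0, 1] = -1.0           # mutate the copy only
    assert original.points[0, 1] != -1.0    # original is untouched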
def test_dynamic_probability3d():
    # Generate tracer locations
    num_particles = 10
    positions = pept.PointData(
        np.random.uniform(0, 500, (num_particles, 5)),
        columns=["t", "x", "y", "z", "v"],
    )

    # Test different uses
    voxels = DynamicProbability3D(1., "v").fit(positions)
    assert voxels.voxels.any(), "all voxels are zero!"

    DynamicProbability3D(0.1, "t", "yzx").fit(positions)
    DynamicProbability3D(0.1, 4).fit(positions)
    DynamicProbability3D(0.1, "v", xlim=[0, 500]).fit(positions)
    DynamicProbability3D(0.1, "v", resolution=[20, 20, 20]).fit(positions)
    DynamicProbability3D(0.1, 4, max_workers=1).fit(positions)
def test_residence_distribution3d():
    # Generate tracer locations
    num_particles = 10
    positions = pept.PointData(
        np.random.uniform(0, 500, (num_particles, 5)),
        columns=["t", "x", "y", "z", "v"],
    )

    # Test different uses
    voxels = ResidenceDistribution3D(1., "v").fit(positions)
    assert voxels.voxels.any(), "all voxels are zero!"

    ResidenceDistribution3D(0.1, "t", "yzx").fit(positions)
    ResidenceDistribution3D(0.1, 0).fit(positions)
    ResidenceDistribution3D(0.1, xlim=[0, 500]).fit(positions)
    ResidenceDistribution3D(0.1, resolution=[20, 20, 20]).fit(positions)
    ResidenceDistribution3D(0.1, 0, max_workers=1).fit(positions)
def test_interpolate():
    points_raw = np.arange(60).reshape(10, 6)
    points = pept.PointData(
        points_raw,
        columns=["t", "x", "y", "z", "label", "line_index"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    # Interpolate at double sampling rate
    half_interpolator = Interpolate((points_raw[1, 0] - points_raw[0, 0]) / 2)
    interp = half_interpolator.fit_sample(points)
    assert interp.points[1, 2] == (points_raw[0, 2] + points_raw[1, 2]) / 2

    # Testing different settings
    Interpolate(3., kind="cubic").fit(points, "sequential")
    Interpolate(10., kind="nearest").fit(points, "sequential")
def fit_sample(self, sample_lines):
    if not isinstance(sample_lines, pept.LineData):
        sample_lines = pept.LineData(sample_lines)

    # If cutoffs were not defined, automatically compute them
    if self.cutoffs is not None:
        cutoffs = self.cutoffs
    else:
        cutoffs = get_cutoffs(sample_lines.lines)

    # Only compute minpoints if there are at least num_lines LoRs
    if len(sample_lines.lines) >= self.num_lines:
        sample_minpoints = pept.utilities.find_minpoints(
            sample_lines.lines,
            self.num_lines,
            self.max_distance,
            cutoffs,
            append_indices=self.append_indices,
        )
    else:
        ncols = (4 + self.num_lines) if self.append_indices else 4
        sample_minpoints = np.empty((0, ncols))

    # Column names
    columns = ["t", "x", "y", "z"]
    if self.append_indices:
        columns += [f"line_index{i + 1}" for i in range(self.num_lines)]

    # Encapsulate minpoints in a PointData
    points = pept.PointData(sample_minpoints, columns=columns)

    # Add optional metadata to the points; because they have an
    # underscore, they won't be propagated when new objects are
    # constructed
    points.attrs["_num_lines"] = self.num_lines
    points.attrs["_max_distance"] = self.max_distance
    points.attrs["_cutoffs"] = cutoffs

    # If LoR indices were appended, also include the constituent LoRs
    if self.append_indices:
        points.attrs["_lines"] = sample_lines

    return points
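# Hedged usage sketch (assuming this `fit_sample` belongs to the
# `Minpoints` filter wrapping `pept.utilities.find_minpoints`):
def _example_minpoints_sample():
    import numpy as np
    import pept
    from pept.tracking import Minpoints

    sample = pept.LineData(np.random.random((30, 7)) * 100)
    pts = Minpoints(num_lines=3, max_distance=15.).fit_sample(sample)

    # Underscore-prefixed attrs carry per-sample metadata that is not
    # propagated to newly constructed objects
    print(pts.attrs["_num_lines"], pts.attrs["_max_distance"])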
def test_split_labels():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    labels = rng.integers(3, size=10)
    line_index = rng.integers(10, size=10)

    points = pept.PointData(
        np.c_[points_raw, labels, line_index],
        columns=["t", "x", "y", "z", "label", "line_index"],
    )
    points.samples_indices = [[0, 10], [5, 5], [5, 10]]

    # Check each split label
    split = SplitLabels().fit_sample(points[0])
    assert np.all(split[0].points[:, :4] == points_raw[labels == 0])
    assert np.all(split[1].points[:, :4] == points_raw[labels == 1])
    assert np.all(split[2].points[:, :4] == points_raw[labels == 2])

    # Check with empty sample
    empty_split = SplitLabels().fit_sample(points[1])
    assert len(empty_split[0].data) == 0

    # Extracting `_lines`
    lines_raw = rng.random((10, 7)) * 500
    lines = pept.LineData(lines_raw, sample_size=4)
    points.attrs["_lines"] = lines

    splines = SplitLabels().fit_sample(points[0])
    assert "_lines" in splines[0].attrs

    splines = SplitLabels(extract_lines=True).fit_sample(points[0])
    assert isinstance(splines[0], pept.LineData)

    # Test different settings
    SplitLabels().fit(points, "sequential")
    SplitLabels(remove_labels=False).fit(points, "sequential")
    SplitLabels(noise=True).fit(points, "sequential")
    SplitLabels(extract_lines=True).fit(points, "sequential")
def connect_trajectories(
    trajectories_points,
    max_time_difference,
    max_signature_difference,
    points_to_check = 50,
    signature_col = 4,
    label_col = -1,
    as_list = False
):
    '''Connect segregated trajectories based on tracer signatures.

    A pair of trajectories in `trajectories_points` will be connected if
    their ends have a timestamp difference that is smaller than
    `max_time_difference` and the difference between the signature
    averages of the closest `points_to_check` points is smaller than
    `max_signature_difference`.

    The `trajectories_points` are distinguished based on the trajectory
    indices in the data column `label_col`. This can be achieved using the
    `segregate_trajectories` function, which appends the labels to the
    data points.

    Because the tracer signature (e.g. cluster size in PEPT-ML) varies
    with the tracer position in the system, an average of
    `points_to_check` points is used for connecting pairs of trajectories.

    Parameters
    ----------
    trajectories_points : (M, N>=6) numpy.ndarray or pept.PointData
        A numpy array of points that have a timestamp, spatial
        coordinates, a tracer signature (such as cluster size in PEPT-ML)
        and a trajectory index (or label). The data columns in
        `trajectories_points` are then [time, x, y, z, ..., signature,
        ..., label, ...]. Note that the timestamps and spatial coordinates
        must be the first 4 columns, while the signature and label columns
        may be anywhere and are pointed at by `signature_col` and
        `label_col`.
    max_time_difference : float
        Only try to connect trajectories whose ends have a timestamp
        difference smaller than `max_time_difference`.
    max_signature_difference : float
        Connect two trajectories if the difference between the signature
        averages of the closest `points_to_check` is smaller than this.
    points_to_check : int, default 50
        The number of points used when computing the average tracer
        signature in one trajectory.
    signature_col : int, default 4
        The column in `trajectories_points` that contains the tracer
        signatures. The default is 4 (i.e. the signature comes right after
        the spatial coordinates).
    label_col : int, default -1
        The column in `trajectories_points` that contains the trajectory
        indices (labels). The default is -1 (i.e. the last column).
    as_list : bool, default False
        If True, return a list of arrays, where each array contains the
        points in a single trajectory. In other words, return separate,
        single trajectories in a list. If False, return a single array of
        all points (if `trajectories_points` was a `numpy.ndarray`) or a
        `pept.PointData` (if `trajectories_points` was a `pept.PointData`
        instance), but with labels changed to reflect the connected
        trajectories.

    Returns
    -------
    numpy.ndarray or pept.PointData or list of numpy.ndarray
        If `as_list` is True, return separate, single trajectories in a
        list. If `as_list` is False, return a single array of all points
        (if `trajectories_points` was a `numpy.ndarray`) or a
        `pept.PointData` (if `trajectories_points` was a `pept.PointData`
        instance), but with labels changed to reflect the connected
        trajectories.

    Raises
    ------
    ValueError
        If `trajectories_points` is a numpy array with fewer than 6
        columns.

    Note
    ----
    The labels are changed in-place to reflect the connected trajectories.
    For example, if there are 3 trajectories with labels 0, 1, 2 and the
    first two are connected, then all points which previously had the
    label 1 will be changed to label 0; the last trajectory's label
    remains unchanged, 2.

    Examples
    --------
    [TODO] - add full tutorial page on Bham PIC GitHub page for this.

    See Also
    --------
    segregate_trajectories : Segregate the intertwined points from
                             multiple trajectories into individual paths.
    PlotlyGrapher : Easy, publication-ready plotting of PEPT-oriented
                    data.
    '''

    # Check `trajectories_points` is a numpy array or pept.PointData
    if isinstance(trajectories_points, pept.PointData):
        trajs = trajectories_points.points
    else:
        trajs = np.asarray(trajectories_points, dtype = float, order = "C")

    if trajs.ndim != 2 or trajs.shape[1] < 6:
        raise ValueError((
            "\n[ERROR]: `trajectories_points` should have dimensions "
            f"(M, N), where N >= 6. Received {trajs.shape}.\n"
        ))

    # Type-check the input parameters
    max_time_difference = float(max_time_difference)
    max_signature_difference = float(max_signature_difference)
    points_to_check = int(points_to_check)
    signature_col = int(signature_col)
    label_col = int(label_col)
    as_list = bool(as_list)

    # Separate the trajs array into a list of individual trajectories
    # based on the `label_col`.
    trajectory_list = pept.utilities.group_by_column(trajs.copy(), label_col)

    trajectory_list = _connect_trajectories(
        trajectory_list,
        max_time_difference,
        max_signature_difference,
        points_to_check,
        signature_col,
        label_col
    )

    if as_list:
        return trajectory_list

    elif isinstance(trajectories_points, pept.PointData):
        trajectories_points_connected = pept.PointData(
            np.vstack(trajectory_list),
            sample_size = trajectories_points.sample_size,
            overlap = trajectories_points.overlap,
            verbose = False
        )
        return trajectories_points_connected

    return np.vstack(trajectory_list)
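# Hedged usage sketch on synthetic data (a real workflow would take the
# labelled points straight from `segregate_trajectories`):
def _example_connect_trajectories():
    import numpy as np

    # Two short trajectories; columns [t, x, y, z, signature, label]
    t = np.arange(10.)
    traj0 = np.c_[t, t, t, t, np.full(10, 30.), np.zeros(10)]
    traj1 = np.c_[t + 11., t, t, t, np.full(10, 31.), np.ones(10)]
    points = np.vstack((traj0, traj1))

    connected = connect_trajectories(
        points,
        max_time_difference = 5.,
        max_signature_difference = 2.,
        points_to_check = 5,
    )
    # The two trajectories should now share a single label
    print(np.unique(connected[:, -1]))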
def segregate_trajectories(
    point_data,
    points_window,
    trajectory_cut_distance,
    min_trajectory_size = 5,
    as_list = False,
    return_mst = False
):
    '''Segregate the intertwined points from multiple trajectories into
    individual paths.

    The points in `point_data` (a numpy array or `pept.PointData`) are
    used to construct a minimum spanning tree in which every point can
    only be connected to `points_window` points around it - this "window"
    refers to the points in the initial data array, sorted based on the
    time column; therefore, only points within a certain timeframe can be
    connected. All edges (or "connections") in the minimum spanning tree
    that are larger than `trajectory_cut_distance` are removed (or "cut")
    and the remaining connected "clusters" are deemed individual
    trajectories if they contain more than `min_trajectory_size` points.

    The trajectory indices (or labels) are appended to `point_data`. That
    is, for each data point (i.e. row) in `point_data`, a label will be
    appended starting from 0 for the corresponding trajectory; a label of
    -1 represents noise. If `point_data` is a numpy array, a new numpy
    array is returned; if it is a `pept.PointData` instance, a new
    instance is returned.

    This function uses single linkage clustering with a custom metric for
    spatio-temporal data to segregate trajectory points. The single
    linkage clustering was optimised for this use-case: points are only
    connected if they are within a certain `points_window` in the
    time-sorted input array. Sparse matrices are also used for minimising
    the memory footprint.

    Parameters
    ----------
    point_data : (M, N>=4) numpy.ndarray or pept.PointData
        The points from multiple trajectories. Each row in `point_data`
        will have a timestamp and the 3 spatial coordinates, such that
        the data columns are [time, x_coord, y_coord, z_coord]. Note that
        `point_data` can have more data columns and they will simply be
        ignored.
    points_window : int
        Two points are "reachable" (i.e. they can be connected) if and
        only if they are within `points_window` in the time-sorted input
        `point_data`. As the points from different trajectories are
        intertwined (e.g. for two tracers A and B, the `point_data` array
        might have two entries for A, followed by three entries for B,
        then one entry for A, etc.), this should optimally be the largest
        number of points in the input array between two consecutive
        points on the same trajectory. If `points_window` is too small,
        all points in the dataset will be unreachable. Naturally, a
        larger `points_window` corresponds to more pairs needing to be
        checked (and the function will take longer to complete).
    trajectory_cut_distance : float
        Once all the closest points are connected (i.e. the minimum
        spanning tree is constructed), separate all trajectories that are
        further apart than `trajectory_cut_distance`.
    min_trajectory_size : int, default 5
        After the trajectories have been cut, declare all trajectories
        with fewer points than `min_trajectory_size` as noise.
    as_list : bool, default False
        If True, return a list of arrays, where each array contains the
        points in a single trajectory. In other words, return separate,
        single trajectories in a list. If False, return a single array of
        all points (if `point_data` was a `numpy.ndarray`) or a
        `pept.PointData` (if `point_data` was a `pept.PointData`
        instance).
    return_mst : bool, default False
        If `True`, the function will also return the minimum spanning
        tree constructed using the input `point_data`. This is a numpy
        array with columns [vertex1, vertex2, edge_length], where vertex1
        and vertex2 are the indices in `point_data` of the connected
        points, and edge_length is the Euclidean distance between them.

    Returns
    -------
    points_labelled : numpy.ndarray or pept.PointData or list of numpy.ndarray
        If `as_list` is `False`, this is the `point_data` array or
        `pept.PointData` instance with an extra column for the trajectory
        index (i.e. label) - the return type is similar to the input
        type. If `as_list` is `True`, this is a list of arrays, in which
        each array contains the points in a single trajectory; these
        still include the trajectory label. A label value of `-1`
        indicates noise; the found trajectories are then labelled
        starting from 0.
    mst : numpy.ndarray, optional
        If `return_mst` is `True`, another numpy array is returned as a
        second variable containing the columns
        [vertex1, vertex2, edge_length], where vertex1 and vertex2 are
        the indices in `point_data` of the connected points, and
        edge_length is the Euclidean distance between them.

    Raises
    ------
    ValueError
        If `point_data` is a numpy array with fewer than 4 columns.
    ValueError
        If `points_window` is smaller than 1.

    Examples
    --------
    A typical workflow would involve transforming LoRs into points using
    some tracking algorithm. These points include all tracers moving
    through the system, being intertwined (e.g. for two tracers A and B,
    the `point_data` array might have two entries for A, followed by
    three entries for B, then one entry for A, etc.). They can be
    segregated based on position alone using this function; take for
    example two tracers that go downwards (below, 'x' is the position,
    and in parens is the array index at which that point is found).

    `points`, numpy.ndarray, shape (10, 4), columns [time, x, y, z]:
        x (1)                   x (2)
         x (3)                   x (4)
          x (5)                   x (7)
           x (6)                   x (9)
            x (8)                   x (10)

    >>> import pept.tracking.trajectory_separation as tsp
    >>> points_window = 10
    >>> trajectory_cut_distance = 15    # mm
    >>> segregated_trajectories = tsp.segregate_trajectories(
    >>>     points, points_window, trajectory_cut_distance
    >>> )

    `segregated_trajectories`, numpy.ndarray, shape (10, 5),
    columns [time, x, y, z, trajectory_label]:
        x (1, label = 0)        x (2, label = 1)
         x (3, label = 0)        x (4, label = 1)
          x (5, label = 0)        x (7, label = 1)
           x (6, label = 0)        x (9, label = 1)
            x (8, label = 0)        x (10, label = 1)

    See Also
    --------
    connect_trajectories : Connect segregated trajectories based on
                           tracer signatures.
    PlotlyGrapher : Easy, publication-ready plotting of PEPT-oriented
                    data.
    '''

    # Check `point_data` is a numpy array or pept.PointData
    if isinstance(point_data, pept.PointData):
        pts = point_data.points
    else:
        pts = np.asarray(point_data)

    if pts.ndim != 2 or pts.shape[1] < 4:
        raise ValueError((
            "\n[ERROR]: `point_data` should have dimensions (M, N), "
            f"where N >= 4. Received {pts.shape}.\n"
        ))

    # Sort pts based on the time column (col 0) and create a C-ordered
    # copy to send to Cython.
    pts = np.asarray(pts[pts[:, 0].argsort()], dtype = float, order = "C")

    # Type-check the input parameters
    points_window = int(points_window)
    if points_window < 1:
        raise ValueError((
            "\n[ERROR]: `points_window` should be at least 1! Received "
            f"{points_window}.\n"
        ))

    trajectory_cut_distance = float(trajectory_cut_distance)
    min_trajectory_size = int(min_trajectory_size)
    return_mst = bool(return_mst)

    # Calculate the sparse distance matrix between reachable points. This
    # is an optimised Cython function returning a sparse CSR matrix.
    distance_matrix = distance_matrix_reachable(pts, points_window)

    # Construct the minimum spanning tree from the sparse distance
    # matrix. Note that `mst` is also a sparse CSR matrix.
    mst = minimum_spanning_tree(distance_matrix)

    # Get the minimum spanning tree edges into the [vertex 1, vertex 2,
    # edge distance] format, then sort it based on the edge distance.
    mst = mst.tocoo()
    mst_edges = np.vstack((mst.row, mst.col, mst.data)).T
    mst_edges = mst_edges[mst_edges[:, 2].argsort()]

    # Create the single linkage tree from the minimum spanning tree edges
    # using internal hdbscan methods (because they're damn fast). This
    # should be a fairly quick step.
    single_linkage_tree = hdbscan._hdbscan_linkage.label(mst_edges)
    single_linkage_tree = hdbscan.plots.SingleLinkageTree(single_linkage_tree)

    # Cut the single linkage tree at `trajectory_cut_distance` and get
    # the cluster labels, setting clusters smaller than
    # `min_trajectory_size` to -1 (i.e. noise).
    labels = single_linkage_tree.get_clusters(
        trajectory_cut_distance,
        min_trajectory_size
    )

    # Append the labels to `pts`.
    pts = np.append(pts, labels[:, np.newaxis], axis = 1)

    # Returns based on as_list, return_mst and input data type
    if as_list:
        # Get a list of arrays for each trajectory
        separate_pts = pept.utilities.group_by_column(pts, -1)
        if return_mst:
            return separate_pts, mst_edges
        return separate_pts

    # If `point_data` was a `pept.PointData` instance, return a new
    # `pept.PointData` with the new label column.
    if isinstance(point_data, pept.PointData):
        point_data_labelled = pept.PointData(
            pts,
            sample_size = point_data.sample_size,
            overlap = point_data.overlap,
            verbose = False
        )
        if return_mst:
            return point_data_labelled, mst_edges
        return point_data_labelled

    if return_mst:
        return pts, mst_edges

    return pts
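# Hedged sketch of the optional returns (`as_list`, `return_mst`),
# complementing the docstring example above:
def _example_segregate_returns():
    import numpy as np

    points = np.random.random((50, 4)) * 100
    points[:, 0] = np.arange(50)        # strictly increasing timestamps

    trajs, mst = segregate_trajectories(
        points,
        points_window = 10,
        trajectory_cut_distance = 15.,
        as_list = True,
        return_mst = True,
    )
    # `trajs` is a list of (Mi, 5) arrays; `mst` has columns
    # [vertex1, vertex2, edge_length]
    print(len(trajs), mst.shape)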
def timeseries_trace(
    points,
    size=6.0,
    color=None,
    opacity=0.8,
    colorbar=True,
    colorbar_col=-1,
    colorscale="Magma",
    colorbar_title=None,
):
    '''Static method for creating a list of 3 Plotly traces of
    timeseries. See `PlotlyGrapher2D.add_timeseries` for the full
    documentation.
    '''
    if not isinstance(points, pept.PointData):
        points = pept.PointData(points)
    pts = points.points

    # No need to type-check the other parameters as Plotly will do that
    # anyway...

    # Create the dictionary of marker properties
    marker = dict(size=size, color=color, opacity=opacity)

    # Update `marker` if a colorbar is requested AND color is None.
    if colorbar and color is None:
        if isinstance(colorbar_col, str):
            color_data = points[colorbar_col]
        else:
            color_data = pts[:, colorbar_col]

        marker.update(colorscale=colorscale)
        if colorbar_title is not None:
            marker["colorbar"] = dict(title=colorbar_title)

        # Special case: if there are at most 10 unique values in the
        # colorbar column, add them as separate traces for better
        # distinction between colours.
        labels = np.unique(color_data)

        if len(labels) <= 10:
            traces = [[], [], []]

            for label in labels:
                selected = pts[color_data == label]

                for i in range(3):
                    traces[i].append(
                        go.Scatter(
                            x=selected[:, 0],
                            y=selected[:, i + 1],
                            mode="markers",
                            marker=marker,
                        )
                    )

            return traces

        # Otherwise just use a typical continuous colorbar for all the
        # values in colorbar_col.
        else:
            marker['color'] = color_data

    traces = []
    for i in range(3):
        traces.append(
            go.Scatter(
                x=pts[:, 0],
                y=pts[:, i + 1],
                mode="markers",
                marker=marker,
            )
        )

    return traces
def fit_sample(self, sample):
    '''Use the Birmingham method to track a tracer location from a numpy
    array (i.e. one sample) of LoRs.

    For the given `sample` of LoRs (a numpy.ndarray), this function
    minimises the distance between all of the LoRs, rejecting a fraction
    of lines that lie furthest away from the calculated distance. The
    process is repeated iteratively until a specified fraction (`fopt`)
    of the original subset of LoRs remains.

    Parameters
    ----------
    sample : (N, M>=7) numpy.ndarray
        The sample of LoRs that will be clustered. Each LoR is expressed
        as a timestamp and a line defined by two points; the data columns
        are then `[time, x1, y1, z1, x2, y2, z2, extra...]`.

    Returns
    -------
    locations : pept.PointData
        The tracked locations found, with columns
        `["t", "x", "y", "z", "error"]`. If the instance attribute
        `get_used` is True, the input lines are attached as the
        `"_lines"` attribute, with an extra `"used"` column - a boolean
        mask containing 1 for the rows that were used to compute the
        tracer location, and 0 otherwise (used for multi-particle
        tracking, not implemented yet).

    Raises
    ------
    ValueError
        If `sample` is not a numpy array of shape (N, M), where M >= 7.
    '''

    if not isinstance(sample, pept.LineData):
        sample = pept.LineData(sample)

    locations, used = birmingham_method(sample.lines, self.fopt)

    # Propagate any LineData attributes besides `columns`
    attrs = sample.extra_attrs()

    locations = pept.PointData(
        [locations],
        columns=["t", "x", "y", "z", "error"],
        **attrs,
    )

    # If `get_used`, also attach a `._lines` attribute with the lines used
    if self.get_used:
        locations.attrs["_lines"] = sample.copy(
            data=np.c_[sample.lines, used],
            columns=sample.columns + ["used"],
        )

    return locations
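# Hedged usage sketch (assuming this method belongs to
# `pept.tracking.BirminghamMethod`, constructed with the `fopt` fraction
# of LoRs to keep):
def _example_birmingham_fit_sample():
    import numpy as np
    import pept
    from pept.tracking import BirminghamMethod

    sample = pept.LineData(np.random.random((200, 7)) * 500)
    location = BirminghamMethod(fopt=0.5).fit_sample(sample)
    print(location.points)      # one row: [t, x, y, z, error]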
def fit_cutpoints(
    self,
    cutpoints,
    store_labels = False,
    noise = False,
    verbose = True
):
    '''Fit cutpoints (an instance of `PointData`) and return the cluster
    centres and (optionally) the labelled cutpoints.

    Parameters
    ----------
    cutpoints : an instance of `pept.PointData`
        The samples of points that will be clustered. In every sample,
        every point corresponds to a row and is formatted as
        `[time, x, y, z, etc]`. Only columns `[1, 2, 3]` are used for
        clustering.
    store_labels : bool, optional
        If set to True, the clustered cutpoints are returned along with
        the centres of the clusters. Setting it to False speeds up the
        clustering. The default is False.
    noise : bool, optional
        If set to True, the clustered cutpoints also include the points
        classified as noise. Only has an effect if `store_labels` is set
        to True. The default is False.
    verbose : bool, optional
        Provide extra information when computing the cutpoints: time the
        operation and show a progress bar. The default is True.

    Returns
    -------
    centres : pept.PointData
        The centroids of every cluster found. They are computed as the
        average of every column of `[time, x, y, z, etc]` of the
        clustered points. Another column is added to the initial data in
        `sample`, signifying the cluster size - the number of points
        included in the cluster.
    clustered_cutpoints : numpy.ndarray or pept.PointData
        The points in `sample` that fall in every cluster. A new column
        is added to the points in `sample` that signifies the label of
        the cluster that the point was associated with: all points in
        cluster number 3 will have the number 3 as the last element in
        their row. The points classified as noise have the number -1
        associated.

    Raises
    ------
    Exception
        If `cutpoints` is not an instance (or a subclass) of
        `pept.PointData`.
    '''

    if verbose:
        start = time.time()

    if not isinstance(cutpoints, pept.PointData):
        raise Exception(
            "[ERROR]: cutpoints should be an instance of pept.PointData "
            "(or any class inheriting from it)"
        )

    # Fit all samples in `cutpoints` in parallel using joblib. Collect
    # all outputs as a list. If verbose, show progress bar with tqdm.
    if verbose:
        cutpoints = tqdm(cutpoints)

    data_list = Parallel(n_jobs = -1)(
        delayed(self.fit_sample)(
            sample,
            store_labels = store_labels,
            noise = noise,
            as_array = True
        ) for sample in cutpoints
    )

    # Access joblib.Parallel output as list comprehensions
    centres = [row[0] for row in data_list if len(row[0]) != 0]
    if len(centres) != 0:
        centres = pept.PointData(
            np.vstack(centres),
            sample_size = 0,
            overlap = 0,
            verbose = False
        )

    if store_labels:
        clustered_cutpoints = [
            row[1] for row in data_list if len(row[1]) != 0
        ]
        clustered_cutpoints = pept.PointData(
            np.vstack(clustered_cutpoints),
            sample_size = 0,
            overlap = 0,
            verbose = False
        )

    if verbose:
        end = time.time()
        print("\nFitting cutpoints took {} seconds\n".format(end - start))

    if store_labels:
        return [centres, clustered_cutpoints]

    return [centres, []]
def fit(
    self,
    line_data,
    max_error=10,
    get_used=False,
    max_workers=None,
    verbose=True
):
    '''Fit lines of response (an instance of `LineData`) and return the
    tracked locations and (optionally) the LoRs that were used.

    This is a convenience function that asynchronously iterates through
    the samples in a `LineData`, finding the tracer locations. For more
    fine-grained control over the tracking, the `fit_sample` method can
    be used for individual samples.

    Parameters
    ----------
    line_data : an instance of `pept.LineData`
        The samples of lines of response (LoRs) that will be used for
        locating the tracer. Be careful to set the appropriate
        `sample_size` and `overlap` for good results. If the
        `sample_size` is too low, the tracer might not be found; if it
        is too high, temporal resolution is decreased. If the `overlap`
        is too small, the tracked points might be very "sparse".
    max_error : float, default 10
        The maximum error allowed to return a "valid" tracked location.
        All tracer locations with an error larger than `max_error` will
        be discarded.
    get_used : bool, default False
        If `True`, the function will also return a list of boolean masks
        of the LoRs used to compute the tracer location for each sample -
        that is, a vector of the same length as the sample, containing 1
        for the rows that were used, and 0 otherwise.
    max_workers : int, optional
        The maximum number of threads that will be used for
        asynchronously tracking the samples in `line_data`. If unset
        (`None`), the number of threads available on the machine (as
        returned by `os.cpu_count()`) will be used.
    verbose : bool, default True
        Provide extra information when tracking: time the operation and
        show a progress bar.

    Returns
    -------
    locations : pept.PointData
        The tracer locations found.
    used : list of numpy.ndarray
        A list of boolean masks of the LoRs used to compute the tracer
        location for each corresponding sample in `line_data` - that is,
        a vector of the same length as a sample, containing 1 for the
        rows that were used, and 0 otherwise.

    Raises
    ------
    TypeError
        If `line_data` is not an instance of `pept.LineData`.
    '''

    if verbose:
        start = time.time()

    if not isinstance(line_data, pept.LineData):
        raise TypeError(
            textwrap.fill(
                "[ERROR]: `line_data` should be an instance of "
                "`pept.LineData` (or any subclass thereof). Received "
                f"{type(line_data)}."
            )
        )

    # Users might forget to set the sample_size, leaving it to the
    # default value of 0; in that case, all lines are returned as a
    # single sample - that might not be the intended behaviour.
    if line_data.sample_size == 0:
        warnings.warn(
            textwrap.fill((
                "\n[WARNING]: The `line_data.sample_size` was left to "
                "the default value of 0, in which case all lines are "
                "returned as a single sample. For a very large number "
                "of lines, this might result in a long function "
                "execution time.\n"
            ), replace_whitespace=False),
            RuntimeWarning
        )

    get_used = bool(get_used)

    # Using ThreadPoolExecutor, asynchronously collect the locations from
    # every sample in a list of arrays. This is more efficient than using
    # ProcessPoolExecutor (or joblib) because birmingham_method is a
    # Cython function that releases the GIL for most of its computation.
    # If verbose, show progress bar using tqdm.
    if max_workers is None:
        max_workers = os.cpu_count()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for sample in line_data:
            futures.append(
                executor.submit(birmingham_method, sample, self._fopt)
            )

        if verbose:
            futures = tqdm(futures)

        data_list = [f.result() for f in futures]

    # data_list is a list of tuples, in which the first element is an
    # array of the `location`, and the second element is `used`, a
    # boolean mask representing the used LoRs.
    locations = [r[0] for r in data_list if len(r[0]) != 0]
    used = [r[1] for r in data_list if len(r[1]) != 0]

    # Remove locations with error above max_error
    locations = np.vstack(locations)
    locations = np.delete(
        locations, np.argwhere(locations[:, 4] > max_error), axis=0
    )

    if len(locations) != 0:
        locations = pept.PointData(
            locations,
            sample_size=0,
            overlap=0,
            verbose=False
        )

    if verbose:
        end = time.time()
        print("\nTracking locations took {} seconds\n".format(end - start))

    if get_used:
        # `used` is a list of the `used` arrays for the corresponding
        # sample in `line_data`.
        return locations, used

    return locations
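# Hedged usage sketch (assuming the surrounding class is the
# `BirminghamMethod` shown above, with the `fit` signature defined here):
def _example_birmingham_fit():
    import numpy as np
    import pept
    from pept.tracking import BirminghamMethod

    lines = pept.LineData(
        np.random.random((1000, 7)) * 500,
        sample_size=200,
        overlap=100,
    )
    locations = BirminghamMethod(fopt=0.5).fit(lines, max_error=20)
    print(locations)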
def find_cutpoints(
    sample_lines,
    max_distance,
    cutoffs=None,
    append_indices=False
):
    '''Find the cutpoints from a sample / array of LoRs.

    A cutpoint is the point in 3D space that minimises the distance
    between any two lines. For any two non-parallel 3D lines, this point
    corresponds to the midpoint of the unique segment that is
    perpendicular to both lines.

    This function considers every pair of lines in `sample_lines` and
    returns all the cutpoints that satisfy the following conditions:

    1. The distance between the two lines is smaller than `max_distance`.
    2. The cutpoint is within the `cutoffs`.

    Parameters
    ----------
    sample_lines : (N, M >= 7) numpy.ndarray
        A sample of LoRs, where each row is
        `[time, x1, y1, z1, x2, y2, z2]`, such that every line is defined
        by the points `[x1, y1, z1]` and `[x2, y2, z2]`.
    max_distance : float
        The maximum distance between any two lines for their cutpoint to
        be considered. A good starting value would be 0.1 mm for small
        tracers and/or clean data, or 0.2 mm for larger tracers and/or
        noisy data.
    cutoffs : list, optional
        The cutoffs for each dimension, formatted as
        `[x_min, x_max, y_min, y_max, z_min, z_max]`. If it is `None`,
        they are computed automatically by calling `get_cutoffs`. The
        default is `None`.
    append_indices : bool, optional
        If set to `True`, the indices of the individual LoRs that were
        used to compute each cutpoint are also appended to the returned
        array. Default is `False`.

    Returns
    -------
    cutpoints : (M, 4) or (M, 6) numpy.ndarray
        A numpy array of the calculated cutpoints. If `append_indices` is
        `False`, then the columns are [time, x, y, z]. If
        `append_indices` is `True`, then the columns are
        [time, x, y, z, i, j], where `i` and `j` are the LoR indices from
        `sample_lines` that were used to compute the weighted cutpoints.
        The time is the average between the timestamps of the two LoRs
        that were used to compute the cutpoint. The first column (for
        time) is sorted.

    Raises
    ------
    ValueError
        If `sample_lines` is not a numpy array with shape (N, M >= 7).
    ValueError
        If `cutoffs` is not a one-dimensional array with values
        `[min_x, max_x, min_y, max_y, min_z, max_z]`.

    See Also
    --------
    pept.tracking.peptml.Cutpoints : Compute cutpoints from
                                     `pept.LineData`.
    pept.utilities.read_csv : Fast CSV file reading into numpy arrays.
    '''

    if not isinstance(sample_lines, pept.LineData):
        sample_lines = pept.LineData(sample_lines)

    lines = sample_lines.lines
    lines = np.asarray(lines, order='C', dtype=float)

    max_distance = float(max_distance)

    # If cutoffs were not defined, automatically compute them
    if cutoffs is None:
        cutoffs = get_cutoffs(lines)
    else:
        cutoffs = np.asarray(cutoffs, order='C', dtype=float)
        if cutoffs.ndim != 1 or len(cutoffs) != 6:
            raise ValueError(
                ("\n[ERROR]: cutoffs should be a one-dimensional array "
                 "with values [min_x, max_x, min_y, max_y, min_z, max_z]."
                 f" Received {cutoffs}.\n"))

    sample_cutpoints = pept.utilities.find_cutpoints(
        lines,
        max_distance,
        cutoffs,
        append_indices=append_indices
    )

    columns = ["t", "x", "y", "z"]
    if append_indices:
        columns += ["line_index1", "line_index2"]

    points = pept.PointData(sample_cutpoints, columns=columns)

    # Add optional metadata to the points; because they have an
    # underscore, they won't be propagated when new objects are
    # constructed
    points._max_distance = max_distance
    points._cutoffs = cutoffs

    if append_indices:
        points._lines = sample_lines

    return points
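# A minimal usage sketch for `find_cutpoints` above, on synthetic LoRs
# (a large `max_distance` is used so random lines actually yield
# cutpoints; real data would use ~0.1-0.2 mm as the docstring suggests):
def _example_find_cutpoints():
    import numpy as np

    # 20 random LoRs, columns [t, x1, y1, z1, x2, y2, z2]
    lines = np.random.random((20, 7)) * 100
    cutpoints = find_cutpoints(lines, max_distance=10., append_indices=True)
    print(cutpoints.columns)    # ["t", "x", "y", "z", "line_index1", "line_index2"]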
def find_trajectories(self):
    for i, current_point in enumerate(self.centres):

        if i == 0:
            # Add the first point to trajectory 0
            self.trajectory_indices[0].add(self.max_index)
            self.centres_indices[self.max_index].append(0)
            self.max_index += 1
            continue

        # Search for the closest previous points_to_check points within a
        # given max_distance
        start_index = i - self.points_to_check
        end_index = i

        if start_index < 0:
            start_index = 0

        # Construct a KDTree from the x, y, z (1:4) of the selected
        # points. Get the indices for all the points within max_distance
        # of the current_point
        tree = cKDTree(self.centres[start_index:end_index, 1:4])
        closest_indices = tree.query_ball_point(
            current_point[1:4], self.max_distance, n_jobs=-1
        )
        closest_indices = np.array(closest_indices) + start_index

        # If no point was found, it is a new trajectory. Continue
        if len(closest_indices) == 0:
            self.trajectory_indices[i].add(self.max_index)
            self.centres_indices.append([i])
            self.max_index += 1
            continue

        # For every close point found, search for all the trajectory
        # indices:
        #   - If all trajectory indices sets are equal and of a single
        #     value, then current_point is part of the same trajectory
        #   - If all trajectory indices sets are equal, but of more
        #     values, then current_point diverged from an intersection of
        #     trajectories and is part of a single trajectory
        #     => separate it
        #
        #   - If every pair of trajectory indices sets is not disjoint,
        #     then current_point is only one of them
        #   - If there exists a pair of trajectory indices sets that is
        #     disjoint, then current_point is part of all of them

        # Select the trajectories of all the points that were found to be
        # the closest
        closest_trajectories = self.trajectory_indices[closest_indices]

        # If all the closest points are part of the same trajectory (just
        # one!), then the current_point is part of it too
        if (np.all(closest_trajectories == closest_trajectories[0]) and
                len(closest_trajectories[0]) == 1):
            self.trajectory_indices[i] = closest_trajectories[0]
            self.centres_indices[next(iter(
                closest_trajectories[0]
            ))].append(i)
            continue

        # Otherwise, check the points based on their cluster size
        else:
            # Create a list of all the trajectories that were found to
            # intersect
            closest_traj_indices = list(set().union(*closest_trajectories))

            # For each close trajectory, calculate the mean cluster size
            # of the last points_cluster_size points. Keep track of the
            # mean cluster size that is the closest to the
            # current_point's cluster size
            current_cluster_size = current_point[4]
            closest_traj_index = -1
            cluster_size_diff = self.max_cluster_diff

            for traj_index in closest_traj_indices:
                traj_centres = self.centres[
                    self.centres_indices[traj_index]
                ]
                mean_cluster_size = traj_centres[
                    -self.points_cluster_size:
                ][:, 4].mean()

                if np.abs(
                    current_cluster_size - mean_cluster_size
                ) < cluster_size_diff:
                    closest_traj_index = traj_index
                    cluster_size_diff = np.abs(
                        current_cluster_size - mean_cluster_size
                    )

            if closest_traj_index == -1:
                print("\n**** -1 ****\n")
                break
            else:
                self.trajectory_indices[i] = set([closest_traj_index])
                self.centres_indices[closest_traj_index].append(i)

    individual_trajectories = []
    for traj_centres in self.centres_indices:
        individual_traj = pept.PointData(
            self.centres[traj_centres],
            sample_size=0,
            overlap=0,
            verbose=False
        )
        individual_trajectories.append(individual_traj)

    return individual_trajectories
def fit(self, points):
    points = pept.tracking.Stack().fit(points)
    if not isinstance(points, pept.PointData):
        points = pept.PointData(points)

    # Columns corresponding to the signatures
    sig_cols = [points.columns.index(sn) for sn in self.signatures.keys()]

    trajs = pept.tracking.SplitAll(self.column).fit(points)
    trajs.sort(key=lambda traj: traj["t"][0])

    # List of connections to do, list[tuple[int, int]]
    connections = []

    # Try to forward-connect the end of trajs[i] to the start of trajs[j]
    start_times = np.array([t["t"][0] for t in trajs])

    for i in range(len(trajs)):
        # Select all future trajectories whose start time is within tmax
        cur_traj = trajs[i]
        indices = np.argwhere(
            (start_times > cur_traj["t"][-1]) &
            (start_times - cur_traj["t"][-1] < self.tmax)
        ).flatten()

        # If no feasible times were found, carry on
        if len(indices) == 0:
            continue

        # Compute connection costs between trajectory ends
        costs = []
        for j in indices:
            e2 = trajs[i].points[-self.num_points:].mean(axis=0)
            e1 = trajs[j].points[:self.num_points].mean(axis=0)

            # The first cost is the distance between traj ends; the rest
            # are the signature differences
            cost = [np.linalg.norm(e2[1:4] - e1[1:4])]
            for sc in sig_cols:
                cost.append(np.abs(e2[sc] - e1[sc]))

            costs.append(cost)

        # Keep track of trajectory indices and associated costs
        costs = np.c_[indices, np.array(costs)]

        # Remove candidate connections whose costs exceed the thresholds
        selection = costs[:, 1] < self.dmax
        for isig, sthresh in enumerate(self.signatures.values()):
            selection = selection & (costs[:, 2 + isig] < sthresh)
        costs = costs[selection]

        # If no feasible connection was found, carry on
        if not len(costs):
            continue

        # Otherwise, establish connection with minimum overall cost
        best = costs[:, 1:].mean(axis=1).argmin()
        connection_index = int(costs[best, 0])
        connections.append((i, connection_index))

    # Set connected labels
    if isinstance(self.column, str):
        label_col = points.columns.index(self.column)
    else:
        label_col = self.column

    for i1, i2 in connections:
        trajs[i2].points[:, label_col] = trajs[i1].points[0, label_col]

    # Stack trajectories and map labels from [0, 2, 2, 3, 0] to
    # [0, 1, 1, 2, 0]
    trajs = pept.tracking.Stack().fit(trajs)

    labels = trajs.points[:, label_col]
    _, ordered = np.unique(labels, return_inverse=True)
    trajs.points[:, label_col] = ordered

    return trajs
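# Hedged usage sketch (assuming this `fit` belongs to a trajectory
# reconnection filter akin to `pept.tracking.Reconnect(tmax, dmax)`; the
# "label" column would normally come from `Segregate`):
def _example_reconnect():
    import numpy as np
    import pept
    from pept.tracking import Reconnect

    raw = np.random.random((100, 5)) * 100
    raw[:, 0] = np.arange(100)                  # increasing timestamps
    raw[:, -1] = np.repeat([0, 1, 2, 3], 25)    # four trajectory labels

    points = pept.PointData(raw, columns=["t", "x", "y", "z", "label"])
    reconnected = Reconnect(tmax=10., dmax=5.).fit(points)
    print(np.unique(reconnected["label"]))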
def test_centroids():
    rng = np.random.default_rng(0)

    points_raw = rng.random((10, 4)) * 100
    points = pept.PointData(points_raw, sample_size=4)

    f1 = pept.tracking.Centroids()
    print(f1)

    # Test `fit_sample`
    s1 = f1.fit_sample(points_raw).points
    s2 = points_raw.mean(axis=0)
    assert (s1 == s2).all(), "Single sample geometric centroid"

    s1 = f1.fit_sample(points[0]).points
    s2 = points[0].points.mean(axis=0)
    assert (s1 == s2).all(), "Single sample geometric centroid"

    # Test `fit`
    traversed = f1.fit(points)
    manual = [p.points.mean(axis=0) for p in points]
    assert all([(t.points == m).all() for t, m in zip(traversed, manual)]), \
        "Full `fit` traversal"

    # Test `fit_sample`
    s1 = f1.fit_sample(points[0]).points
    s2 = points_raw[:4].mean(axis=0)
    assert (s2[:4] == s1[:, :4]).all(), "Single sample geometric centroid"

    # Test `fit`
    traversed = f1.fit(points)

    # Test different settings
    Centroids(error=True).fit_sample(points[0])
    Centroids(error=True, cluster_size=True).fit_sample(points[0])

    # Test weighted centroid computation
    points_raw = np.arange(50).reshape(10, 5)   # Last column is "weight"
    points_raw[:, -1] = 1                       # Start with equal weights

    points = pept.PointData(
        points_raw,
        columns=["t", "x", "y", "z", "weight"],
        sample_size=4,
    )

    # Test `fit_sample`
    s1 = f1.fit_sample(points_raw).points
    s2 = points_raw.mean(axis=0)
    assert np.allclose(s1[:, :4], s2[:4]), "Single sample weighted centroid"

    s1 = f1.fit_sample(points[0]).points
    s2 = points[0].points.mean(axis=0)
    assert np.allclose(s1[:, :4], s2[:4]), "Single sample weighted centroid"

    # Ensure "weight" is removed
    assert "weight" not in f1.fit_sample(points).columns

    # Test `fit`
    traversed = f1.fit(points)

    # Test different settings
    Centroids(error=True).fit_sample(points[0])
    Centroids(error=True, cluster_size=True).fit_sample(points[0])
def fit_sample(
    self,
    sample,
    get_labels=False,
    as_array=True,
    verbose=False,
    _set_labels=True
):
    '''Fit one sample of cutpoints and return the cluster centres and
    (optionally) the labelled cutpoints.

    Parameters
    ----------
    sample : (N, M >= 4) numpy.ndarray
        The sample of points that will be clustered. The expected columns
        are `[time, x, y, z, etc]`. Only columns `[1, 2, 3]` are used for
        clustering.
    get_labels : bool, default False
        If set to True, the input `sample` is also returned with an extra
        column representing the label of the cluster that each point is
        associated with. This label is an `int`, numbering clusters
        starting from 0; noise is represented with the value -1.
    as_array : bool, default True
        If set to True, the centres of the clusters and the labelled
        cutpoints are returned as numpy arrays. If set to False, they are
        returned inside instances of `pept.PointData`.
    verbose : bool, default False
        Provide extra information when computing the cutpoints: time the
        operation and show a progress bar.
    _set_labels : bool, default True
        This is an internal setting that an end-user should not normally
        care about. If `True`, the class property `labels` will be set
        after fitting. Setting this to `False` is helpful for
        multithreaded contexts - when calling `fit_sample` in parallel,
        it makes sure no internal attributes are mutated at the same
        time.

    Returns
    -------
    centres : numpy.ndarray or pept.PointData
        The centroids of every cluster found with columns
        `[time, x, y, z, ..., cluster_size]`. They are computed as the
        column-wise average of the points included in each cluster (i.e.
        for each label). Another column is added to the initial data in
        `sample`, signifying the cluster size - that is, the number of
        points included in the cluster. If `as_array` is set to True, it
        is a numpy array, otherwise the centres are stored in a
        `pept.PointData` instance.
    sample_labelled : optional, numpy.ndarray or pept.PointData
        Returned if `get_labels` is `True`. It is the input `sample` with
        an appended column representing the label of the cluster that the
        point was associated with. The labels are integers starting from
        0. The points classified as noise have the number -1 associated.
        If `as_array` is set to True, it is a numpy array, otherwise the
        labelled points are stored in a `pept.PointData` instance.

    Raises
    ------
    ValueError
        If `sample` is not a numpy array of shape (N, M), where M >= 4.

    Note
    ----
    If no clusters were found (i.e. all labels are -1), the returned
    values are empty numpy arrays.
    '''

    if verbose:
        start = time.time()

    # sample columns: [time, x, y, z, ...]
    sample = np.asarray(sample, dtype=float, order="C")
    if sample.ndim != 2 or sample.shape[1] < 4:
        raise ValueError((
            "\n[ERROR]: `sample` should have two dimensions (M, N), "
            f"where N >= 4. Received {sample.shape}.\n"))

    # Only cluster based on [x, y, z]. Make a C-contiguous copy to
    # improve cache-locality, then delete it.
    sample_xyz = np.asarray(sample[:, 1:4], dtype=float, order="C")

    labels = self.clusterer.fit_predict(sample_xyz)
    max_label = labels.max()

    # If `allow_single_cluster` is "auto", check if no clusters were
    # found and try again using the hdbscan option
    # `allow_single_cluster = True`.
    if max_label == -1 and self._allow_single_cluster == "auto":
        labels = self.clusterer_single.fit_predict(sample_xyz)
        max_label = labels.max()

    del sample_xyz

    if _set_labels:
        self._labels = labels

    # The centre of a cluster is the average of the time, x, y, z columns
    # + the number of points in that cluster (i.e. cluster size)
    # centres columns: [time, x, y, z, ..etc.., cluster_size]
    centres = []
    for i in range(0, max_label + 1):
        # Average time, x, y, z of cluster of label i
        centres_row = np.mean(sample[labels == i], axis=0)

        # Append the number of points of label i => cluster_size
        centres_row = np.append(centres_row, (labels == i).sum())
        centres.append(centres_row)

    centres = np.array(centres)

    if not as_array and len(centres) != 0:
        centres = pept.PointData(
            centres,
            sample_size=0,
            overlap=0,
            verbose=False
        )

    if verbose:
        end = time.time()
        print("Fitting one sample took {} seconds".format(end - start))

    # If labels are requested, also return the initial sample with
    # appended labels. Labels go from 0 to max_label; -1 represents
    # noise.
    if get_labels:
        sample_labelled = np.append(sample, labels[:, np.newaxis], axis=1)

        if not as_array and len(sample_labelled) != 0:
            sample_labelled = pept.PointData(
                sample_labelled,
                sample_size=0,
                overlap=0,
                verbose=False
            )
        return centres, sample_labelled

    # Otherwise just return the found centres
    return centres
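# Hedged usage sketch (assuming this method belongs to an HDBSCAN-based
# clusterer like `pept.tracking.peptml.HDBSCANClusterer`):
def _example_cluster_one_sample():
    import numpy as np
    from pept.tracking import peptml

    # Two tight blobs of synthetic cutpoints, columns [t, x, y, z]
    blob1 = np.random.normal([0, 10, 10, 10], 0.5, (50, 4))
    blob2 = np.random.normal([1, 50, 50, 50], 0.5, (50, 4))
    sample = np.vstack((blob1, blob2))

    clusterer = peptml.HDBSCANClusterer(min_cluster_size=20)
    centres, labelled = clusterer.fit_sample(sample, get_labels=True)
    print(centres)      # one row per cluster: [t, x, y, z, cluster_size]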
def find_minpoints(
    sample_lines,
    num_lines,
    max_distance,
    cutoffs=None,
    append_indices=False
):
    '''Compute the minimum distance points (MDPs) from all combinations
    of `num_lines` lines given in an array of lines `sample_lines`.

    Given a sample of lines, this function computes the minimum distance
    points (MDPs) for every possible combination of `num_lines` lines.
    The returned numpy array contains all MDPs that satisfy the
    following:

    1. Are within the `cutoffs`.
    2. Are closer to all the constituent LoRs than `max_distance`.

    Parameters
    ----------
    sample_lines : (M, N) numpy.ndarray
        A 2D array of lines, where each line is defined by two points
        such that every row is formatted as
        `[t, x1, y1, z1, x2, y2, z2, etc.]`. It *must* have at least 2
        lines and the combination size `num_lines` *must* be smaller or
        equal to the number of lines. Put differently:
        2 <= num_lines <= len(sample_lines).
    num_lines : int
        The number of lines in each combination of LoRs used to compute
        the MDP. This function considers every combination of `num_lines`
        from the input `sample_lines`. It must be smaller or equal to the
        number of input lines `sample_lines`.
    max_distance : float
        The maximum allowed distance between an MDP and its constituent
        lines. If any distance from the MDP to one of its lines is larger
        than `max_distance`, the MDP is thrown away.
    cutoffs : (6,) numpy.ndarray, optional
        An array of spatial cutoff coordinates with *exactly 6 elements*
        as [x_min, x_max, y_min, y_max, z_min, z_max]. If any MDP lies
        outside this region, it is thrown away. If it is `None`, they are
        computed automatically by calling `get_cutoffs`. The default is
        `None`.
    append_indices : bool, default False
        A boolean specifying whether to include the indices of the lines
        used to compute each MDP. If `False`, the output array will only
        contain the [time, x, y, z] of the MDPs. If `True`, the output
        array will have extra columns
        [time, x, y, z, line_idx(1), ..., line_idx(n)] where
        n = `num_lines`.

    Returns
    -------
    minpoints : (M, N) numpy.ndarray
        A 2D array of `float`s containing the time and coordinates of the
        MDPs [time, x, y, z]. The time is computed as the average of the
        constituent lines. If `append_indices` is `True`, then
        `num_lines` indices of the constituent lines are appended as
        extra columns: [time, x, y, z, line_idx1, line_idx2, ..]. The
        first column (for time) is sorted.

    Raises
    ------
    ValueError
        If `sample_lines` is not a numpy array with shape (N, M >= 7).
    ValueError
        If 2 <= num_lines <= len(sample_lines) is not satisfied.
    ValueError
        If `cutoffs` is not a one-dimensional array with values
        `[min_x, max_x, min_y, max_y, min_z, max_z]`.

    See Also
    --------
    pept.tracking.peptml.Minpoints : Compute minpoints from
                                     `pept.LineData`.
    pept.utilities.read_csv : Fast CSV file reading into numpy arrays.
    '''

    if not isinstance(sample_lines, pept.LineData):
        sample_lines = pept.LineData(sample_lines)

    lines = sample_lines.lines
    lines = np.asarray(lines, order='C', dtype=float)

    num_lines = int(num_lines)
    max_distance = float(max_distance)

    # If cutoffs were not defined, automatically compute them
    if cutoffs is None:
        cutoffs = get_cutoffs(lines)
    else:
        cutoffs = np.asarray(cutoffs, order='C', dtype=float)
        if cutoffs.ndim != 1 or len(cutoffs) != 6:
            raise ValueError(
                ("\n[ERROR]: cutoffs should be a one-dimensional array "
                 "with values [min_x, max_x, min_y, max_y, min_z, max_z]."
                 f" Received {cutoffs}.\n"))

    sample_minpoints = pept.utilities.find_minpoints(
        lines,
        num_lines,
        max_distance,
        cutoffs,
        append_indices=append_indices
    )

    columns = ["t", "x", "y", "z"]
    if append_indices:
        columns += [f"line_index{i + 1}" for i in range(num_lines)]

    points = pept.PointData(sample_minpoints, columns=columns)

    # Add optional metadata to the points; because they have an
    # underscore, they won't be propagated when new objects are
    # constructed
    points._max_distance = max_distance
    points._cutoffs = cutoffs
    points._num_lines = num_lines

    if append_indices:
        points._lines = sample_lines

    return points
def fit(self, cutpoints, get_labels=False, max_workers=None, verbose=True):
    '''Fit cutpoints (an instance of `PointData`) and return the cluster
    centres and (optionally) the labelled cutpoints.

    This is a convenience function that clusters each sample in an instance
    of `pept.PointData` *in parallel*, using joblib. For more fine-grained
    control over the clustering, the `fit_sample` method can be used for
    each individual sample.

    Parameters
    ----------
    cutpoints : an instance of `pept.PointData`
        The samples of points that will be clustered. Be careful to set the
        appropriate `sample_size` and `overlap` for good results. If the
        `sample_size` is too low, the less radioactive tracers might not be
        found; if it is too high, temporal resolution is decreased. If the
        `overlap` is too small, the tracked points might be very "sparse".
        Note: when transforming LoRs into cutpoints using the `Cutpoints`
        class, the `sample_size` is automatically set based on the average
        number of cutpoints found per sample of LoRs.

    get_labels : bool, default False
        If set to True, the labelled cutpoints are returned along with the
        centres of the clusters. The labelled cutpoints are a list of
        `pept.PointData` for each sample of cutpoints, with an appended
        column representing the cluster labels (starting from 0; noise is
        encoded as -1).

    max_workers : int, optional
        The maximum number of threads that will be used for asynchronously
        clustering the samples in `cutpoints`. If unset (`None`), the
        number of threads available on the machine (as returned by
        `os.cpu_count()`) will be used.

    verbose : bool, default True
        Provide extra information when clustering the samples: time the
        operation and show a progress bar.

    Returns
    -------
    centres : pept.PointData
        The centroids of every cluster found, with columns
        `[time, x, y, z, ..., cluster_size]`. They are computed as the
        column-wise average of the points included in each cluster (i.e.
        for each label). Another column is added to the initial data in
        `sample`, signifying the cluster size - that is, the number of
        points included in the cluster.

    labelled_cutpoints : pept.PointData, optional
        Returned if `get_labels` is `True`. It is a `pept.PointData`
        instance in which every sample is the corresponding sample in
        `cutpoints`, but with an appended column representing the label of
        the cluster that the point was associated with. The labels are
        integers starting from 0; the points classified as noise have the
        number -1 associated. Note that the labels are only consistent
        within the same sample; that is, for tracers A and B, if in one
        sample A gets the label 0 and B the label 1, in another sample
        their order might be reversed. The trajectory separation module
        can be used to separate them out.

    Raises
    ------
    TypeError
        If `cutpoints` is not an instance (or a subclass) of
        `pept.PointData`.

    Note
    ----
    If no clusters were found (i.e. all labels are -1), the returned values
    are empty numpy arrays.
    '''

    if verbose:
        start = time.time()

    if not isinstance(cutpoints, pept.PointData):
        raise TypeError((
            "\n[ERROR]: cutpoints should be an instance of "
            "`pept.PointData` (or any class inheriting from it). Received "
            f"{type(cutpoints)}.\n"
        ))

    # Users might forget to set the sample_size, leaving it to the default
    # value of 0; in that case, all points are returned as a single sample;
    # that might not be the intended behaviour.
    if cutpoints.sample_size == 0:
        warnings.warn(
            textwrap.fill((
                "\n[WARNING]: The `cutpoints.sample_size` was left to the "
                "default value of 0, in which case all points are returned "
                "as a single sample. For a very large number of points, "
                "this might result in a long function execution time.\n"
            ), replace_whitespace=False),
            RuntimeWarning
        )

    get_labels = bool(get_labels)

    # Fit all samples in `cutpoints` in parallel using joblib. Collect all
    # outputs as a list. If verbose, show a progress bar with tqdm.
    if verbose:
        cutpoints = tqdm(cutpoints)

    if max_workers is None:
        max_workers = os.cpu_count()

    data_list = Parallel(n_jobs=max_workers)(
        delayed(self.fit_sample)(
            sample,
            get_labels=get_labels,
            as_array=True,
            verbose=False,
            _set_labels=False,
        ) for sample in cutpoints
    )

    if not get_labels:
        # data_list is a list of arrays. Only choose the arrays with at
        # least one row.
        centres = np.array([r for r in data_list if len(r) != 0])
    else:
        # data_list is a list of tuples, in which the first element is an
        # array of the centres, and the second element is an array of the
        # labelled cutpoints.
        centres = np.array([r[0] for r in data_list if len(r[0]) != 0])

    if len(centres) != 0:
        centres = pept.PointData(
            np.vstack(centres),
            sample_size=0,
            overlap=0,
            verbose=False,
        )

    if verbose:
        end = time.time()
        print("\nFitting cutpoints took {} seconds.\n".format(end - start))

    if get_labels:
        # data_list is a list of tuples, in which the first element is an
        # array of the centres, and the second element is an array of the
        # labelled cutpoints.
        labelled_cutpoints = [r[1] for r in data_list if len(r[1]) != 0]
        if len(labelled_cutpoints) != 0:
            # Encapsulate `labelled_cutpoints` in a `pept.PointData`
            # instance in which every sample is the corresponding sample in
            # `cutpoints`, but with an appended column representing the
            # labels. Therefore, the `sample_size` is the same as for
            # `cutpoints`, which is equal to the length of every array in
            # `labelled_cutpoints`.
            labelled_cutpoints = pept.PointData(
                np.vstack(np.array(labelled_cutpoints)),
                sample_size=len(labelled_cutpoints[0]),
                overlap=0,
                verbose=False,
            )

            # Set the attribute `labels` to the stacked labels of all the
            # labelled cutpoints; that is, the last column in the
            # labelled_cutpoints internal data:
            self._labels = labelled_cutpoints.points[:, -1]

        return centres, labelled_cutpoints

    return centres
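# Hedged usage sketch, not from the original source: cluster every sample of
# cutpoints in parallel. The `clusterer` argument is assumed to be an
# instance of the class owning `fit` above (e.g. peptml's HDBSCAN-based
# clusterer); the two synthetic "tracers" are only for illustration.
def example_fit(clusterer):
    rng = np.random.default_rng(0)

    # Two tight blobs of points, mimicking two tracer positions
    xyz = np.vstack([
        rng.normal([20, 20, 20], 1, (200, 3)),
        rng.normal([70, 70, 70], 1, (200, 3)),
    ])
    cutpoints = pept.PointData(np.c_[np.arange(400.), xyz], sample_size=100)

    centres, labelled = clusterer.fit(cutpoints, get_labels=True,
                                      verbose=False)

    # `centres` stacks one [t, x, y, z, ..., cluster_size] row per cluster
    # per sample; `labelled` appends a cluster-label column (-1 for noise)
    return centres, labelled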
def fit_sample(self, sample, store_labels=False, noise=False, as_array=True,
               verbose=False):
    '''Fit one sample of cutpoints and return the cluster centres and
    (optionally) the labelled cutpoints.

    Parameters
    ----------
    sample : (N, M >= 4) numpy.ndarray
        The sample of points that will be clustered. Every point
        corresponds to a row and is formatted as `[time, x, y, z, etc.]`.
        Only columns `[1, 2, 3]` are used for clustering.

    store_labels : bool, default False
        If set to True, the clustered cutpoints are returned along with the
        centres of the clusters. Setting it to False speeds up the
        clustering.

    noise : bool, default False
        If set to True, the clustered cutpoints also include the points
        classified as noise. Only has an effect if `store_labels` is set
        to True.

    as_array : bool, default True
        If set to True, the centres of the clusters and the clustered
        cutpoints are returned as numpy arrays. If set to False, they are
        returned inside instances of `pept.PointData`.

    verbose : bool, default False
        Provide extra information when clustering the cutpoints: time the
        operation.

    Returns
    -------
    centres : numpy.ndarray or pept.PointData
        The centroids of every cluster found. They are computed as the
        average of every column of `[time, x, y, z, etc.]` of the clustered
        points. Another column is added to the initial data in `sample`,
        signifying the cluster size - the number of points included in the
        cluster. If `as_array` is set to True, it is a numpy array;
        otherwise the centres are stored in a `pept.PointData` instance.

    clustered_cutpoints : numpy.ndarray or pept.PointData
        The points in `sample` that fall in every cluster. A new column is
        added to the points in `sample` that signifies the label of the
        cluster that the point was associated with: all points in cluster
        number 3 will have the number 3 as the last element in their row.
        The points classified as noise have the number -1 associated. If
        `as_array` is set to True, it is a numpy array; otherwise the
        clustered cutpoints are stored in a `pept.PointData` instance.

    Raises
    ------
    TypeError
        If `sample` is not a numpy array of shape (N, M), where M >= 4.
    '''

    if verbose:
        start = time.time()

    # sample row: [time, x, y, z]
    if sample.ndim != 2 or sample.shape[1] < 4:
        raise TypeError((
            "\n[ERROR]: sample should have two dimensions (M, N), where "
            f"N >= 4. Received {sample.shape}.\n"
        ))

    # Only cluster based on [x, y, z]
    labels = self.clusterer.fit_predict(sample[:, 1:4])
    max_label = labels.max()

    centres = []
    clustered_cutpoints = []

    # The centre of a cluster is the average of the time, x, y, z columns
    # and the number of points of that cluster.
    # centres row: [time, x, y, z, ..etc.., cluster_size]
    for i in range(0, max_label + 1):
        # Average time, x, y, z of cluster of label i
        centres_row = np.mean(sample[labels == i], axis=0)

        # Append the number of points of label i => cluster_size
        centres_row = np.append(centres_row, (labels == i).sum())
        centres.append(centres_row)

    centres = np.array(centres)

    if not as_array:
        centres = pept.PointData(
            centres,
            sample_size=0,
            overlap=0,
            verbose=False,
        )

    # Return all cutpoints as a list of numpy arrays for every label, where
    # the last column of an array is the label
    if store_labels:
        # Create a list of numpy arrays with rows:
        # [t, x, y, z, ..etc.., label]
        if noise:
            cutpoints = sample[labels == -1]
            cutpoints = np.insert(cutpoints, cutpoints.shape[1], -1, axis=1)
            clustered_cutpoints.append(cutpoints)

        for i in range(0, max_label + 1):
            cutpoints = sample[labels == i]
            cutpoints = np.insert(cutpoints, cutpoints.shape[1], i, axis=1)
            clustered_cutpoints.append(cutpoints)

        clustered_cutpoints = np.vstack(np.array(clustered_cutpoints))

        if not as_array:
            clustered_cutpoints = pept.PointData(
                clustered_cutpoints,
                sample_size=0,
                overlap=0,
                verbose=False,
            )

    if verbose:
        end = time.time()
        print("Fitting one sample took {} seconds".format(end - start))

    return [centres, clustered_cutpoints]
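# Hedged usage sketch, not from the original source: cluster a single
# (N, M >= 4) numpy array of cutpoints directly. `clusterer` is assumed to
# be an instance of the class owning `fit_sample` above.
def example_fit_sample(clusterer, sample):
    # Keep the labelled cutpoints, drop the noise, and wrap the results in
    # `pept.PointData` instances
    centres, clustered = clusterer.fit_sample(
        sample, store_labels=True, noise=False, as_array=False,
    )

    # `centres` rows are [t, x, y, z, cluster_size]; `clustered` carries an
    # appended label column, with labels starting from 0 for each sample
    return centres, clustered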
def fit_sample(self, voxels: pept.Voxels):
    '''Use the FPI algorithm to locate a tracer from a single voxellised
    space (i.e. from one sample of LoRs).

    A sample of LoRs can be voxellised using the `pept.Voxels.from_lines`
    method before calling this function.

    Parameters
    ----------
    voxels : pept.Voxels
        A single voxellised space (i.e. from a single sample of LoRs) for
        which the tracers' locations will be found using the FPI method.

    Returns
    -------
    locations : pept.PointData
        The tracked locations found, with columns
        [t, x, y, z, error_x, error_y, error_z, error]. The timestamp is
        inferred from the lines used to create the voxel space (stored in
        its `_lines` attribute); if they are not available, the time is set
        to NaN.
    '''

    positions = fpi_ext(
        np.asarray(voxels.voxels, dtype=float, order="C"),
        self.w,
        self.r,
        self.lld_counts,
    )

    # Translate the coordinates from the voxel space to the physical space
    positions[:, :3] *= voxels.voxel_size
    positions[:, :3] += [voxels.xlim[0], voxels.ylim[0], voxels.zlim[0]]

    # Convert the errors to the physical space too
    positions[:, 3:] *= voxels.voxel_size

    # Create points array to store [t, x, y, z, xerr, yerr, zerr, err]
    points = np.full((len(positions), 8), np.nan)
    points[:, 1:7] = positions
    points[:, 7] = np.linalg.norm(positions[:, 3:6], axis=1)

    # Set the timestamp if the `_lines` attribute exists
    if "_lines" in voxels.attrs:
        points[:, 0] = voxels.attrs["_lines"].lines[:, 0].mean()
    else:
        warnings.warn(
            ("The input `Voxels` did not have a '_lines' attribute, so no "
             "timestamp can be inferred. The time was set to NaN."),
            RuntimeWarning
        )

    return pept.PointData(
        points,
        columns=["t", "x", "y", "z",
                 "error_x", "error_y", "error_z", "error"],
    )
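# Hedged usage sketch, not from the original source: voxellise one sample of
# LoRs with `pept.Voxels.from_lines` (mentioned in the docstring above) and
# run FPI on it. `fpi` is assumed to be an instance of the class owning this
# `fit_sample`, with `w`, `r` and `lld_counts` already set; the voxel grid
# resolution is an illustrative choice.
def example_fpi_fit_sample(fpi, sample_lines):
    # Voxellise a single (N, 7) sample of LoRs
    voxels = pept.Voxels.from_lines(sample_lines, (50, 50, 50))

    positions = fpi.fit_sample(voxels)

    # One row per tracer found: [t, x, y, z, error_x, error_y, error_z,
    # error]; the time is NaN unless the voxels carry a `_lines` attribute
    return positions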
def fit_sample(self, sample, get_used=False, as_array=True, verbose=False):
    '''Use the Birmingham method to track a tracer location from a numpy
    array (i.e. one sample) of LoRs.

    For the given `sample` of LoRs (a numpy.ndarray), this function
    minimises the distance between all of the LoRs, rejecting a fraction of
    the lines that lie furthest away from the calculated tracer location.
    The process is repeated iteratively until a specified fraction (`fopt`)
    of the original subset of LoRs remains.

    Parameters
    ----------
    sample : (N, M >= 7) numpy.ndarray
        The sample of LoRs from which one tracer location will be computed.
        Each LoR is expressed as a timestamp and a line defined by two
        points; the data columns are then
        `[time, x1, y1, z1, x2, y2, z2, extra...]`.

    get_used : bool, default False
        If `True`, the function will also return a boolean mask of the LoRs
        used to compute the tracer location - that is, a vector of the same
        length as `sample`, containing 1 for the rows that were used, and 0
        otherwise.

    as_array : bool, default True
        If set to True, the tracked locations are returned as numpy arrays.
        If set to False, they are returned inside an instance of
        `pept.PointData` for ease of iteration and plotting.

    verbose : bool, default False
        Provide extra information when tracking a location: time the
        operation.

    Returns
    -------
    locations : numpy.ndarray or pept.PointData
        The tracked locations found.

    used : numpy.ndarray, optional
        If `get_used` is true, also return a boolean mask of the LoRs used
        to compute the tracer location - that is, a vector of the same
        length as `sample`, containing 1 for the rows that were used, and 0
        otherwise (used for multi-particle tracking, not implemented yet).

    Raises
    ------
    ValueError
        If `sample` is not a numpy array of shape (N, M), where M >= 7.
    '''

    if verbose:
        start = time.time()

    # Type-check input parameters.
    # sample cols: [time, x1, y1, z1, x2, y2, z2, etc.]
    sample = np.asarray(sample, dtype=float, order="C")
    if sample.ndim != 2 or sample.shape[1] < 7:
        raise ValueError(textwrap.fill(
            "[ERROR]: `sample` should have two dimensions (M, N), where "
            f"N >= 7. Received {sample.shape}."
        ))

    locations, used = birmingham_method(sample, self._fopt)

    if not as_array:
        locations = pept.PointData(
            locations,
            sample_size=0,
            overlap=0,
            verbose=False,
        )

    if verbose:
        end = time.time()
        print("Tracking one location with %i LoRs took %.3f seconds" % (
            sample.shape[0], end - start))

    if get_used:
        return locations, used

    return locations
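# Hedged usage sketch, not from the original source: track one location from
# a single (N, 7) sample of LoRs. `tracker` is assumed to be an instance of
# the Birmingham-method class owning `fit_sample` above, with `fopt` already
# set (e.g. to 0.5, keeping half of the LoRs).
def example_birmingham_fit_sample(tracker, sample):
    # Also retrieve the mask of LoRs kept by the final iteration
    location, used = tracker.fit_sample(sample, get_used=True,
                                        as_array=False, verbose=False)

    # `location` is a `pept.PointData` with the single tracked point;
    # `sample[used.astype(bool)]` would recover the LoRs actually used
    return location, used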