def nearest_neighbor_hist(self, n_rand_pts=5000, n_bins=200, figsize=(8, 6),
                          metric='euclidean'):
    """
    Plots a histogram of the distance to the nearest neighbor for a selected
    number of random points

    :param n_rand_pts: Number of random points used to generate the histogram
    :param n_bins: Number of bins used to generate the histogram
    :param figsize: Size of the plot to return
    :return: Histogram of distances to nearest neighbors
    """
    if n_rand_pts > self._n_pts:
        n_rand_pts = self._n_pts

    r_inds = np.random.choice(range(self._n_pts), size=n_rand_pts)
    # NOTE: the distance computation uses self._metric; the `metric`
    # argument is currently unused
    dists = _pairwise_distances(self._data[r_inds, :], self._data,
                                metric=self._metric)
    # column 0 of the row-sorted matrix is each point's zero distance to
    # itself, so column 1 is the distance to the nearest neighbor
    dists_sort = np.sort(dists, axis=1)
    # plotting configurations
    fig = _plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.set_xlabel('Distance to Nearest Neighbor')
    ax.set_ylabel('Number of Datapoints')
    ax.hist(dists_sort[:, 1], bins=n_bins)
    # plot a line at mean + 3 standard deviations, which can be used as a
    # starting radius in downsampling
    best_guess = np.mean(dists_sort[:, 1]) + 3 * np.std(dists_sort[:, 1])
    ax.axvline(best_guess, color='r')
    print("3rd STD (best guess starting radius) = {}".format(best_guess))
    return
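# Hedged illustration (not part of the original class; names prefixed with
# ``_demo`` are hypothetical): a standalone sketch of the distance logic
# above, showing why ``dists_sort[:, 1]`` is the nearest-neighbor distance.
# Assumes numpy and scikit-learn are available.
def _demo_nearest_neighbor_distances(n_pts=100, n_dims=3):
    import numpy as np
    from sklearn.metrics import pairwise_distances

    data = np.random.rand(n_pts, n_dims)
    dists = pairwise_distances(data, data, metric='euclidean')
    dists_sort = np.sort(dists, axis=1)
    # column 0 holds each point's zero distance to itself
    assert np.allclose(dists_sort[:, 0], 0)
    # column 1 is therefore the distance to the true nearest neighbor
    return dists_sort[:, 1]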
def get_density(self, radius, chunk_size=5000):
    """
    Calculates the density of each datapoint

    :param radius: Radius around each datapoint used for density calculations
    :param chunk_size: Number of cells to consider during each iteration due
        to memory restrictions
    :return: Calculated densities for all datapoints
    """
    # Due to memory restrictions, density assignments have to be performed
    # in chunks
    all_chunks = get_chunks(range(self._n_pts), chunk_size)
    # create an array to hold all densities
    density = np.empty(self._n_pts, dtype=int)
    # create a nested array of indices for each cell within the radius
    neighbors = np.empty(self._n_pts, dtype=object)

    for chunk in all_chunks:
        chunk_dist = _pairwise_distances(self._data[chunk, :], self._data,
                                         n_jobs=1, metric=self._metric)
        print("calculating densities for datapoints: {0} -> {1}".format(
            chunk[0], chunk[-1]))
        for chunk_ind, ind in enumerate(chunk):
            # indices of all points within the radius, excluding the
            # point itself
            neighbors[ind] = np.setdiff1d(
                np.argwhere(chunk_dist[chunk_ind] <= radius).ravel(), ind)
            density[ind] = len(neighbors[ind])

    print("****Always check density overlay for radius fit****")
    self.density = density
    self.neighbors = neighbors
    return density
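# Hedged cross-check (illustrative only; ``_demo`` names are hypothetical):
# for data that fits in memory, the chunked density above should match
# scikit-learn's ``NearestNeighbors.radius_neighbors``. Each query point
# matches itself at distance 0, so subtracting 1 mirrors the
# ``np.setdiff1d(..., ind)`` exclusion in get_density.
def _demo_density_cross_check(radius=0.2, n_pts=200):
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    data = np.random.rand(n_pts, 2)
    nn = NearestNeighbors(radius=radius, metric='euclidean').fit(data)
    neighbor_lists = nn.radius_neighbors(data, return_distance=False)
    density = np.array([len(n) - 1 for n in neighbor_lists])
    return density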
def radius_best_guess(self, n_rand_pts=5000, metric='euclidean'):
    """
    Returns a best guess for the radius based on a selected number of
    random points

    :param n_rand_pts: Number of random points used to generate the estimate
    :return: Float, best guess for the radius
    """
    if n_rand_pts > self._n_pts:
        n_rand_pts = self._n_pts

    r_inds = np.random.choice(range(self._n_pts), size=n_rand_pts)
    dists = _pairwise_distances(self._data[r_inds, :], self._data,
                                metric=self._metric)
    dists_sort = np.sort(dists, axis=1)
    # median of the 20 largest nearest-neighbor distances
    best_guess = np.median(np.sort(dists_sort[:, 1])[-20:])
    return best_guess
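# Hedged illustration of the heuristic (``_demo`` name is hypothetical):
# taking the median of the 20 largest nearest-neighbor distances picks a
# radius large enough that most of the most isolated sampled points still
# have at least one neighbor inside it.
def _demo_radius_best_guess(n_pts=500, n_dims=3):
    import numpy as np
    from sklearn.metrics import pairwise_distances

    data = np.random.rand(n_pts, n_dims)
    dists_sort = np.sort(pairwise_distances(data, data), axis=1)
    nearest = dists_sort[:, 1]  # skip the zero self-distance in column 0
    return np.median(np.sort(nearest)[-20:])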
def get_density(self, radius, chunk_size=5000, mute=False):
    """
    Calculates the density of each datapoint

    :param radius: Radius around each datapoint used for density calculations
    :param chunk_size: Number of cells to consider during each iteration due
        to memory restrictions
    :param mute: Boolean flag to suppress print statements
    :return: Calculated densities for all datapoints
    """
    # Save sys.stdout so print output can be restored if muted
    old_stdout = sys.stdout
    # Mute print statements if requested
    if mute:
        sys.stdout = open(os.devnull, 'w')

    # Due to memory restrictions, density assignments have to be performed
    # in chunks
    all_chunks = get_chunks(range(self._n_pts), chunk_size)
    # create an array to hold all densities
    density = np.empty(self._n_pts, dtype=int)
    # create a nested array of indices for each cell within the radius
    neighbors = np.empty(self._n_pts, dtype=object)

    for chunk in all_chunks:
        chunk_dist = _pairwise_distances(self._data[chunk, :], self._data,
                                         n_jobs=1, metric=self._metric)
        print("calculating densities for datapoints: {0} -> {1}".format(
            chunk[0], chunk[-1]))
        for chunk_ind, ind in enumerate(chunk):
            # indices of all points within the radius, excluding the
            # point itself
            neighbors[ind] = np.setdiff1d(
                np.argwhere(chunk_dist[chunk_ind] <= radius).ravel(), ind)
            density[ind] = len(neighbors[ind])

    print("****Always check density overlay for radius fit****")
    self.density = density
    self.neighbors = neighbors

    # restore normal treatment of print statements
    sys.stdout = old_stdout
    return density
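# A possible alternative to the manual sys.stdout swap above (a sketch, not
# the project's API; the ``_demo`` name is hypothetical):
# ``contextlib.redirect_stdout`` restores stdout even if the wrapped call
# raises, and the ``with`` block closes the devnull handle, which the
# manual approach leaves open.
def _demo_muted_call(fn, *args, mute=False, **kwargs):
    import contextlib
    import os

    if not mute:
        return fn(*args, **kwargs)
    with open(os.devnull, 'w') as devnull, \
            contextlib.redirect_stdout(devnull):
        return fn(*args, **kwargs)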
def pairwise_distance_matrix_from_embeddings_and_annotations(
        query_embeddings_path: str,
        reference_embeddings_path: str,
        metric: str = "euclidean",
        n_jobs: int = 1) -> PairwiseDistanceMatrixResult:
    """
    :param query_embeddings_path: A string defining a path to an h5 file
    :param reference_embeddings_path: A string defining a path to an h5 file
    :param metric: Metric to use (string!), see scikit-learn documentation
    :param n_jobs: int, see scikit-learn documentation
    :return: A tuple containing:
        - pairwise_matrix: the pairwise distances between queries and references
        - queries: A list of strings defining the queries
        - references: A list of strings defining the references
    """
    references: List[str]
    queries: List[str]
    reference_embeddings = list()
    query_embeddings = list()

    with h5py.File(reference_embeddings_path, 'r') as reference_embeddings_file, \
            h5py.File(query_embeddings_path, 'r') as query_embeddings_file:
        references = list(reference_embeddings_file.keys())
        queries = list(query_embeddings_file.keys())

        for reference_identifier in references:
            reference_embeddings.append(
                np.array(reference_embeddings_file[reference_identifier]))
        for query_identifier in queries:
            query_embeddings.append(
                np.array(query_embeddings_file[query_identifier]))

    pairwise_distances = _pairwise_distances(query_embeddings,
                                             reference_embeddings,
                                             metric=metric,
                                             n_jobs=n_jobs)

    return PairwiseDistanceMatrixResult(pairwise_matrix=pairwise_distances,
                                        queries=queries,
                                        references=references)
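# Hedged usage sketch (file names are placeholders; assumes h5py and numpy
# are available and this module is importable): build two tiny h5 files and
# query the distance matrix. ``result.pairwise_matrix[i, j]`` is the
# distance between ``result.queries[i]`` and ``result.references[j]``.
def _demo_pairwise_distance_matrix(tmp_dir="."):
    import os
    import h5py
    import numpy as np

    query_path = os.path.join(tmp_dir, "queries.h5")
    reference_path = os.path.join(tmp_dir, "references.h5")
    with h5py.File(query_path, 'w') as query_file:
        query_file.create_dataset("q1", data=np.random.rand(1024))
    with h5py.File(reference_path, 'w') as reference_file:
        reference_file.create_dataset("r1", data=np.random.rand(1024))
        reference_file.create_dataset("r2", data=np.random.rand(1024))

    result = pairwise_distance_matrix_from_embeddings_and_annotations(
        query_path, reference_path, metric="euclidean")
    assert result.pairwise_matrix.shape == (1, 2)  # 1 query x 2 references
    return result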
def nearest_neighbor_hist(self, n_rand_pts=5000, n_bins=200, figsize=(8, 6),
                          metric='euclidean', mute=False):
    """
    Plots a histogram of the distance to the nearest neighbor for a selected
    number of random points and returns a best guess for the radius used for
    density calculations

    :param n_rand_pts: Number of random points used to generate the histogram
    :param n_bins: Number of bins used to generate the histogram
    :param figsize: Size of the plot to return
    :param mute: Boolean flag to suppress print statements
    :return: Float, best guess for the starting radius
    """
    # Save sys.stdout so print output can be restored if muted
    old_stdout = sys.stdout
    # Mute print statements if requested
    if mute:
        sys.stdout = open(os.devnull, 'w')

    if n_rand_pts > self._n_pts:
        n_rand_pts = self._n_pts

    r_inds = np.random.choice(range(self._n_pts), size=n_rand_pts)
    dists = _pairwise_distances(self._data[r_inds, :], self._data,
                                metric=self._metric)
    dists_sort = np.sort(dists, axis=1)
    # plotting configurations
    fig = _plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.set_xlabel('Distance to Nearest Neighbor')
    ax.set_ylabel('Number of Datapoints')
    ax.hist(dists_sort[:, 1], bins=n_bins)
    # plot a line at the best guess starting radius for downsampling
    best_guess = np.median(np.sort(dists_sort[:, 1])[-20:])
    ax.axvline(best_guess, color='r')
    print("best guess starting radius = {}".format(best_guess))

    # restore normal treatment of print statements
    sys.stdout = old_stdout
    return best_guess
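# Hedged end-to-end sketch (``Downsampler`` is a hypothetical stand-in for
# the class these methods belong to, which isn't shown in this excerpt):
# the best-guess radius from the histogram feeds directly into get_density.
#
#   ds = Downsampler(data)                       # hypothetical constructor
#   radius = ds.nearest_neighbor_hist(mute=True)
#   density = ds.get_density(radius, chunk_size=5000, mute=True)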
def unsupervised(**kwargs) -> Dict[str, Any]:
    check_required(kwargs, [
        'reference_embeddings_file', 'reference_annotations_file',
        'reduced_embeddings_file'
    ])

    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Try to create final files (if this fails, now is better than later)
    transferred_annotations_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'transferred_annotations_file',
        extension='.csv')

    # Read the reference annotations and reference embeddings.
    # The reference annotations file must be a CSV containing two columns
    # with headers:
    #   identifier,label
    # ** identifier doesn't need to be unique **
    reference_annotations_file = read_csv(
        result_kwargs['reference_annotations_file'])

    # If the reference annotations contain NaNs (either in label or
    # identifier), throw an error!
    # https://github.com/sacdallago/bio_embeddings/issues/58
    # https://datatofish.com/check-nan-pandas-dataframe/
    if reference_annotations_file[['identifier',
                                   'label']].isnull().values.any():
        raise InvalidAnnotationFileError(
            "Your annotation file contains NaN values in either identifier or label columns.\n"
            "Please remove these and run the pipeline again.")

    # Save a copy of the annotation file with only the necessary columns
    input_reference_annotations_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'input_reference_annotations_file',
        extension='.csv')

    reference_annotations_file.to_csv(input_reference_annotations_file_path,
                                      index=False)
    result_kwargs[
        'input_reference_annotations_file'] = input_reference_annotations_file_path

    # Starting from here, order is super important!
    reference_identifiers = reference_annotations_file['identifier'].unique()
    reference_identifiers.sort()
    reference_embeddings = list()

    # Save a copy of the reference embeddings file with only the necessary
    # embeddings
    input_reference_embeddings_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'input_reference_embeddings_file',
        extension='.h5')
    result_kwargs[
        'input_reference_embeddings_file'] = input_reference_embeddings_file_path

    # Only read in embeddings for annotated sequences! This will save
    # RAM/GPU_RAM.
    with h5py.File(result_kwargs['reference_embeddings_file'],
                   'r') as reference_embeddings_file:
        # Sanity check: all identifiers in the reference annotations file
        # must be present as embeddings
        unembedded_identifiers = set(reference_identifiers) - set(
            reference_embeddings_file.keys())

        if len(unembedded_identifiers) > 0:
            raise UnrecognizedEmbeddingError(
                "Your reference_annotations_file includes identifiers for which "
                "no embedding can be found in your reference_embeddings_file.\n"
                "We require the set of identifiers in the reference_annotations_file "
                "to be equal to or a subset of the embeddings present in the "
                "reference_embeddings_file.\n"
                "To fix this issue, you can use the "
                "bio_embeddings.utilities.remove_identifiers_from_annotations_file "
                "function (see notebooks). "
                "The faulty identifiers are:\n['" +
                "','".join(unembedded_identifiers) + "']")

        with h5py.File(result_kwargs['input_reference_embeddings_file'],
                       'w') as input_reference_embeddings_file:
            for identifier in reference_identifiers:
                current_embedding = np.array(
                    reference_embeddings_file[identifier])
                reference_embeddings.append(current_embedding)
                input_reference_embeddings_file.create_dataset(
                    identifier, data=current_embedding)

    # The mapping file will be needed to transfer annotations
    mapping_file = read_csv(result_kwargs['mapping_file'], index_col=0)
    mapping_file.index = mapping_file.index.map(str)

    # Important to have consistent ordering!
    target_identifiers = mapping_file.index.values
    target_identifiers.sort()
    target_embeddings = list()

    with h5py.File(result_kwargs['reduced_embeddings_file'],
                   'r') as reduced_embeddings_file:
        for identifier in target_identifiers:
            target_embeddings.append(
                np.array(reduced_embeddings_file[identifier]))

    result_kwargs['n_jobs'] = result_kwargs.get('n_jobs', 1)
    result_kwargs['metric'] = result_kwargs.get('metric', 'euclidean')

    pairwise_distances = _pairwise_distances(target_embeddings,
                                             reference_embeddings,
                                             metric=result_kwargs['metric'],
                                             n_jobs=result_kwargs['n_jobs'])

    result_kwargs['keep_pairwise_distances_matrix_file'] = result_kwargs.get(
        'keep_pairwise_distances_matrix_file', False)

    if result_kwargs['keep_pairwise_distances_matrix_file']:
        pairwise_distances_matrix_file_path = file_manager.create_file(
            result_kwargs.get('prefix'),
            result_kwargs.get('stage_name'),
            'pairwise_distances_matrix_file',
            extension='.csv')
        pairwise_distances_matrix_file = DataFrame(
            pairwise_distances,
            index=target_identifiers,
            columns=reference_identifiers)
        pairwise_distances_matrix_file.to_csv(
            pairwise_distances_matrix_file_path, index=True)
        result_kwargs[
            'pairwise_distances_matrix_file'] = pairwise_distances_matrix_file_path

    # transfer & store annotations
    result_kwargs['k_nearest_neighbours'] = result_kwargs.get(
        'k_nearest_neighbours', 1)

    k_nn_indices, k_nn_distances = get_k_nearest_neighbours(
        pairwise_distances, result_kwargs['k_nearest_neighbours'])

    k_nn_identifiers = list(
        map(reference_identifiers.__getitem__, k_nn_indices))
    k_nn_annotations = list()

    for row in k_nn_identifiers:
        k_nn_annotations.append([
            ";".join(reference_annotations_file[
                reference_annotations_file['identifier'] == identifier]
                ['label'].values) for identifier in row
        ])

    # At this stage we have: an nxk list of identifiers (strings), nxk
    # indices (ints), nxk distances (floats), and nxk annotations.
    # Now we need to expand the lists into a table and store the table as
    # a CSV.
    k_nn_identifiers_df = DataFrame(
        k_nn_identifiers,
        columns=[
            f"k_nn_{i+1}_identifier" for i in range(len(k_nn_identifiers[0]))
        ])
    k_nn_distances_df = DataFrame(
        k_nn_distances,
        columns=[
            f"k_nn_{i+1}_distance" for i in range(len(k_nn_distances[0]))
        ])
    k_nn_annotations_df = DataFrame(
        k_nn_annotations,
        columns=[
            f"k_nn_{i+1}_annotations" for i in range(len(k_nn_annotations[0]))
        ])

    transferred_annotations_dataframe = concatenate_dataframe(
        [k_nn_identifiers_df, k_nn_distances_df, k_nn_annotations_df], axis=1)
    transferred_annotations_dataframe.index = target_identifiers

    # At this stage we would like to aggregate all k_nn_XX_annotations into
    # one column:
    #  - A row in the k_nn_annotations matrix is a list of annotation
    #    strings (e.g. ["A;B", "A;C", "D"])
    #  - Each annotation within a string is separated by a ";"
    # Thus:
    #  1. Join all strings in a row, separating them with ";"
    #     (aka ["A;B", "A;C", "D"] --> "A;B;A;C;D")
    #  2. Split the joined string into separate annotations using split(";")
    #     (aka "A;B;A;C;D" --> ["A","B","A","C","D"])
    #  3. Take the unique set of annotations using set()
    #     (aka ["A","B","A","C","D"] --> {"A","B","C","D"})
    #  4. Join the unique set of annotations back together with ";"
    #     (aka {"A","B","C","D"} --> "A;B;C;D")
    transferred_annotations_dataframe['transferred_annotations'] = [
        ";".join(set(";".join(k_nn_row).split(";")))
        for k_nn_row in k_nn_annotations
    ]

    # Merge with the mapping file to also get the original ids!
    transferred_annotations_dataframe = mapping_file.join(
        transferred_annotations_dataframe)
    transferred_annotations_dataframe.to_csv(
        transferred_annotations_file_path, index=True)

    result_kwargs[
        'transferred_annotations_file'] = transferred_annotations_file_path

    return result_kwargs
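# Hedged usage sketch (all paths and the prefix are placeholders): the
# required keys match the check_required call above; mapping_file, prefix,
# and stage_name are also read by the function body.
#
#   result = unsupervised(
#       prefix="my_run",
#       stage_name="annotation_transfer",
#       reference_embeddings_file="reference_embeddings.h5",
#       reference_annotations_file="reference_annotations.csv",
#       reduced_embeddings_file="reduced_embeddings.h5",
#       mapping_file="mapping_file.csv",
#       metric="euclidean",
#       k_nearest_neighbours=3,
#   )
#   # result["transferred_annotations_file"] then points at the CSV with
#   # the k-NN identifiers, distances, and aggregated annotations.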