Example #1
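These snippets are excerpted from larger modules, so their imports are implied. A plausible set of imports for the class methods below, assuming _pairwise_distances and _plt are aliases for scikit-learn's pairwise_distances and matplotlib's pyplot:

import os
import sys

import numpy as np
import matplotlib.pyplot as _plt
from sklearn.metrics import pairwise_distances as _pairwise_distances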
    def nearest_neighbor_hist(self,
                              n_rand_pts=5000,
                              n_bins=200,
                              figsize=(8, 6),
                              metric='euclidean'):
        """
        Plots a histogram of distance to nearest neighbor for
        select number of random points
        :param n_rand_pts: Number of random pts to use to generate histogram
        :patam n_bins: Number of bins used to generate histogram
        :param figsize: size of plot to return
        :return: Histograom of distances to nearest neighbors
        """
        if n_rand_pts > self._n_pts:
            n_rand_pts = self._n_pts

        r_inds = np.random.choice(self._n_pts, size=n_rand_pts, replace=False)
        dists = _pairwise_distances(self._data[r_inds, :],
                                    self._data,
                                    metric=metric)
        dists_sort = np.sort(dists, axis=1)
        # plotting configurations
        fig = _plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        ax.set_xlabel('Distance to Nearest Neighbor')
        ax.set_ylabel('Number of Datapoints')
        ax.hist(dists_sort[:, 1], bins=n_bins)
        # plot a line at the 3rd standard deviation, which can be used as a starting radius in downsampling
        best_guess = np.mean(dists_sort[:, 1]) + 3 * np.std(dists_sort[:, 1])
        ax.axvline(best_guess, color='r')
        print("3rd STD (best guess starting radius) = {}".format(best_guess))
        return
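The histogram reads column 1 of the row-sorted distance matrix because every sampled point is also a member of self._data, so column 0 of each row is the point's zero distance to itself. A small self-contained check of that behavior:

import numpy as np
from sklearn.metrics import pairwise_distances

# five random 2-D points compared against themselves
data = np.random.rand(5, 2)
dists_sort = np.sort(pairwise_distances(data, data, metric='euclidean'), axis=1)

assert np.allclose(dists_sort[:, 0], 0)  # column 0: distance to itself
print(dists_sort[:, 1])                  # column 1: distance to the nearest neighbor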
Example #2
    def get_density(self, radius, chunk_size=5000):
        """
        Calculates the density of each datapoint
        :param radius: Radius around each datapoints used for density calculations
        :patam chunk_size: Number of cells to consider during each iteration due to memory restrictions
        :return: Calculated densities for all datapoints
        """
        # Due to memory restrictions, density assignments have to be performed in chunks
        all_chunks = get_chunks(range(self._n_pts), chunk_size)
        # create array to hold all densities
        density = np.empty(self._n_pts, dtype=int)
        # create a nested array of neighbor indices (cells within the radius) for each cell
        neighbors = np.empty(self._n_pts, dtype=object)

        for chunk in all_chunks:

            chunk_dist = _pairwise_distances(self._data[chunk, :],
                                             self._data,
                                             n_jobs=1,
                                             metric=self._metric)
            print("calculating densities for datapoints: {0} -> {1}".format(
                chunk[0], chunk[-1]))

            for chunk_ind, ind in enumerate(chunk):
                # indices of all points within the radius, excluding the point itself
                neighbors[ind] = np.setdiff1d(
                    np.argwhere(chunk_dist[chunk_ind] <= radius).ravel(), ind)
                density[ind] = len(neighbors[ind])
        print("****Always check density overlay for radius fit****")
        self.density = density
        self.neighbors = neighbors
        return density
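get_chunks is not shown in these examples. A minimal sketch of what it presumably does, splitting an index range into consecutive slices of at most chunk_size elements (name and signature assumed from the call above):

def get_chunks(indices, chunk_size):
    # Split an iterable of indices into consecutive lists of at most chunk_size elements
    indices = list(indices)
    return [indices[i:i + chunk_size]
            for i in range(0, len(indices), chunk_size)]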
Example #3
    def radius_best_guess(self, n_rand_pts=5000, metric='euclidean'):
        """
        Returns a best guess for the radius based on a select number of random points
        :param n_rand_pts: Number of random pts to use to generate histogram
        :return: float numeric for best guess of radius
        """
        if n_rand_pts > self._n_pts:
            n_rand_pts = self._n_pts

        r_inds = np.random.choice(self._n_pts, size=n_rand_pts, replace=False)
        dists = _pairwise_distances(self._data[r_inds, :],
                                    self._data,
                                    metric=metric)
        dists_sort = np.sort(dists, axis=1)
        # best guess: median of the 20 largest nearest-neighbor distances
        best_guess = np.median(np.sort(dists_sort[:, 1])[-20:])
        return best_guess
Example #4
    def get_density(self, radius, chunk_size=5000, mute=False):
        """
        Calculates the density of each datapoint
        :param radius: Radius around each datapoints used for density calculations
        :param chunk_size: Number of cells to consider during each iteration due to memory restrictions
        :param  mute: boolean operator to suppress print statements
        :return: Calculated densities for all datapoints
        """
        # Save sys.stdout to return print output if muted
        old_stdout = sys.stdout
        # Mute print statements if requested
        if mute:
            sys.stdout = open(os.devnull, 'w')

        # Due to memory restrictions, density assignments have to be performed in chunks
        all_chunks = get_chunks(range(self._n_pts), chunk_size)
        # create array to hold all densities
        density = np.empty(self._n_pts, dtype=int)
        # create a nested array of neighbor indices (cells within the radius) for each cell
        neighbors = np.empty(self._n_pts, dtype=object)

        for chunk in all_chunks:

            chunk_dist = _pairwise_distances(self._data[chunk, :],
                                             self._data,
                                             n_jobs=1,
                                             metric=self._metric)
            print("calculating densities for datapoints: {0} -> {1}".format(
                chunk[0], chunk[-1]))

            for chunk_ind, ind in enumerate(chunk):
                # indices of all points within the radius, excluding the point itself
                neighbors[ind] = np.setdiff1d(
                    np.argwhere(chunk_dist[chunk_ind] <= radius).ravel(), ind)
                density[ind] = len(neighbors[ind])
        print("****Always check density overlay for radius fit****")
        self.density = density
        self.neighbors = neighbors

        # return to normal treatment of print statements
        sys.stdout = old_stdout

        return density
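One caveat with the mute pattern above: if an exception is raised inside the loop, sys.stdout stays redirected and the os.devnull handle is never closed. A more robust alternative sketch (not the original library's code) wraps the call with contextlib.redirect_stdout:

import contextlib
import io

def run_muted(func, *args, mute=False, **kwargs):
    # Hypothetical helper: run func, optionally discarding anything it prints.
    if not mute:
        return func(*args, **kwargs)
    with contextlib.redirect_stdout(io.StringIO()):
        return func(*args, **kwargs)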
Example #5
def pairwise_distance_matrix_from_embeddings_and_annotations(
        query_embeddings_path: str,
        reference_embeddings_path: str,
        metric: str = "euclidean",
        n_jobs: int = 1) -> PairwiseDistanceMatrixResult:
    """

    :param n_jobs: int, see scikit-learn documentation
    :param metric: Metric to use (string!), see scikit-learn documentation
    :param query_embeddings_path: A string defining a path to an h5 file
    :param reference_embeddings_path: A string defining a path to an h5 file
    :return: A tuple containing:
        - pairwise_matrix: the pairwise distances between queries and references
        - queries: A list of strings defining the queries
        - references: A list of strings defining the references
    """
    references: List[str]
    queries: List[str]
    reference_embeddings = list()
    query_embeddings = list()

    with h5py.File(reference_embeddings_path, 'r') as reference_embeddings_file,\
          h5py.File(query_embeddings_path, 'r') as query_embeddings_file:

        references = list(reference_embeddings_file.keys())
        queries = list(query_embeddings_file.keys())

        for reference_identifier in references:
            reference_embeddings.append(
                np.array(reference_embeddings_file[reference_identifier]))

        for query_identifier in queries:
            query_embeddings.append(
                np.array(query_embeddings_file[query_identifier]))

    pairwise_distances = _pairwise_distances(query_embeddings,
                                             reference_embeddings,
                                             metric=metric,
                                             n_jobs=n_jobs)

    return PairwiseDistanceMatrixResult(pairwise_matrix=pairwise_distances,
                                        queries=queries,
                                        references=references)
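PairwiseDistanceMatrixResult is not defined in this example; given that it is constructed with keyword arguments and documented as a tuple, a compatible sketch (an assumption, not necessarily the library's actual declaration) would be:

from typing import List, NamedTuple

import numpy as np

class PairwiseDistanceMatrixResult(NamedTuple):
    # field names taken from the constructor call above; types are assumptions
    pairwise_matrix: np.ndarray
    queries: List[str]
    references: List[str]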
Example #6
    def nearest_neighbor_hist(self,
                              n_rand_pts=5000,
                              n_bins=200,
                              figsize=(8, 6),
                              metric='euclidean',
                              mute=False):
        """
        Plots a histogram of distance to nearest neighbor for select number of random points 
        and returns a best guess for the radius used for density calculations
        :param n_rand_pts: Number of random pts to use to generate histogram
        :patam n_bins: Number of bins used to generate histogram
        :param figsize: size of plot to return
        :param mute: boolean operator to suppress print statements
        :return: Histograom of distances to nearest neighbors
        """
        # Save sys.stdout to return print output if muted
        old_stdout = sys.stdout
        # Mute print statements if requested
        if mute:
            sys.stdout = open(os.devnull, 'w')

        if n_rand_pts > self._n_pts:
            n_rand_pts = self._n_pts

        r_inds = np.random.choice(self._n_pts, size=n_rand_pts, replace=False)
        dists = _pairwise_distances(self._data[r_inds, :],
                                    self._data,
                                    metric=metric)
        dists_sort = np.sort(dists, axis=1)
        # plotting configurations
        fig = _plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        ax.set_xlabel('Distance to Nearest Neighbor')
        ax.set_ylabel('Number of Datapoints')
        ax.hist(dists_sort[:, 1], bins=n_bins)
        # plot line for best guess starting radius in downsampling
        best_guess = np.median(np.sort(dists_sort[:, 1])[-20:])
        ax.axvline(best_guess, color='r')
        print("best guess starting radius = {}".format(best_guess))
        # return to normal treatment of print statements
        sys.stdout = old_stdout
        return best_guess
Example #7
def unsupervised(**kwargs) -> Dict[str, Any]:
    check_required(kwargs, [
        'reference_embeddings_file', 'reference_annotations_file',
        'reduced_embeddings_file'
    ])

    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Try to create final files (if this fails, now is better than later)
    transferred_annotations_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'transferred_annotations_file',
        extension='.csv')

    # Read the reference annotations and reference embeddings

    # The reference annotations file must be a CSV with a header and two columns:
    # identifier,label
    # ** identifier doesn't need to be unique **
    reference_annotations_file = read_csv(
        result_kwargs['reference_annotations_file'])

    # If reference annotations contain nans (either in label or identifier) throw an error!
    # https://github.com/sacdallago/bio_embeddings/issues/58
    # https://datatofish.com/check-nan-pandas-dataframe/
    if reference_annotations_file[['identifier',
                                   'label']].isnull().values.any():
        raise InvalidAnnotationFileError(
            "Your annotation file contains NaN values in either identifier or label columns.\n"
            "Please remove these and run the pipeline again.")

    # Save a copy of the annotation file with only the necessary columns
    input_reference_annotations_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'input_reference_annotations_file',
        extension='.csv')

    reference_annotations_file.to_csv(input_reference_annotations_file_path,
                                      index=False)

    result_kwargs[
        'input_reference_annotations_file'] = input_reference_annotations_file_path

    # Starting from here order is super important!
    reference_identifiers = reference_annotations_file['identifier'].unique()
    reference_identifiers.sort()
    reference_embeddings = list()

    # Save a copy of the reference embeddings file with only necessary embeddings
    input_reference_embeddings_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'input_reference_embeddings_file',
        extension='.h5')

    result_kwargs[
        'input_reference_embeddings_file'] = input_reference_embeddings_file_path

    # Only read in embeddings for annotated sequences! This will save RAM/GPU_RAM.
    with h5py.File(result_kwargs['reference_embeddings_file'],
                   'r') as reference_embeddings_file:
        # Sanity check: all identifiers in the reference_annotations_file must be present as embeddings

        unembedded_identifiers = set(reference_identifiers) - set(
            reference_embeddings_file.keys())

        if len(unembedded_identifiers) > 0:
            raise UnrecognizedEmbeddingError(
                "Your reference_annotations_file includes identifiers for which "
                "no embedding can be found in your reference_embeddings_file.\n"
                "We require the set of identifiers in the reference_annotations_file "
                "to be a equal or a subset of the embeddings present in the "
                "reference_embeddings_file.\n"
                "To fix this issue, you can use the "
                "bio_embeddings.utilities.remove_identifiers_from_annotations_file "
                "function (see notebooks). "
                "The faulty identifiers are:\n['" +
                "','".join(unembedded_identifiers) + "']")

        with h5py.File(result_kwargs['input_reference_embeddings_file'],
                       'w') as input_reference_embeddings_file:
            for identifier in reference_identifiers:
                current_embedding = np.array(
                    reference_embeddings_file[identifier])
                reference_embeddings.append(current_embedding)
                input_reference_embeddings_file.create_dataset(
                    identifier, data=current_embedding)

    # mapping file will be needed to transfer annotations
    mapping_file = read_csv(result_kwargs['mapping_file'], index_col=0)
    mapping_file.index = mapping_file.index.map(str)

    # Important to have consistent ordering!
    target_identifiers = mapping_file.index.values
    target_identifiers.sort()
    target_embeddings = list()

    with h5py.File(result_kwargs['reduced_embeddings_file'],
                   'r') as reduced_embeddings_file:
        for identifier in target_identifiers:
            target_embeddings.append(
                np.array(reduced_embeddings_file[identifier]))

    result_kwargs['n_jobs'] = result_kwargs.get('n_jobs', 1)
    result_kwargs['metric'] = result_kwargs.get('metric', 'euclidean')

    pairwise_distances = _pairwise_distances(target_embeddings,
                                             reference_embeddings,
                                             metric=result_kwargs['metric'],
                                             n_jobs=result_kwargs['n_jobs'])

    result_kwargs['keep_pairwise_distances_matrix_file'] = result_kwargs.get(
        'keep_pairwise_distances_matrix_file', False)

    if result_kwargs['keep_pairwise_distances_matrix_file']:
        pairwise_distances_matrix_file_path = file_manager.create_file(
            result_kwargs.get('prefix'),
            result_kwargs.get('stage_name'),
            'pairwise_distances_matrix_file',
            extension='.csv')
        pairwise_distances_matrix_file = DataFrame(
            pairwise_distances,
            index=target_identifiers,
            columns=reference_identifiers)
        pairwise_distances_matrix_file.to_csv(
            pairwise_distances_matrix_file_path, index=True)
        result_kwargs[
            'pairwise_distances_matrix_file'] = pairwise_distances_matrix_file_path

    # transfer & store annotations
    result_kwargs['k_nearest_neighbours'] = result_kwargs.get(
        'k_nearest_neighbours', 1)

    k_nn_indices, k_nn_distances = get_k_nearest_neighbours(
        pairwise_distances, result_kwargs['k_nearest_neighbours'])

    k_nn_identifiers = list(
        map(reference_identifiers.__getitem__, k_nn_indices))
    k_nn_annotations = list()

    for row in k_nn_identifiers:
        k_nn_annotations.append([
            ";".join(reference_annotations_file[
                reference_annotations_file['identifier'] == identifier]
                     ['label'].values) for identifier in row
        ])

    # At this stage I have: nxk list of identifiers (strings), nxk indices (ints), nxk distances (floats),
    # nxk annotations
    # Now I need to expand the lists into a table and store the table into a CSV

    k_nn_identifiers_df = DataFrame(
        k_nn_identifiers,
        columns=[
            f"k_nn_{i+1}_identifier" for i in range(len(k_nn_identifiers[0]))
        ])
    k_nn_distances_df = DataFrame(k_nn_distances,
                                  columns=[
                                      f"k_nn_{i+1}_distance"
                                      for i in range(len(k_nn_distances[0]))
                                  ])
    k_nn_annotations_df = DataFrame(
        k_nn_annotations,
        columns=[
            f"k_nn_{i+1}_annotations" for i in range(len(k_nn_annotations[0]))
        ])

    transferred_annotations_dataframe = concatenate_dataframe(
        [k_nn_identifiers_df, k_nn_distances_df, k_nn_annotations_df], axis=1)
    transferred_annotations_dataframe.index = target_identifiers

    # At this stage we would like to aggregate all k_nn_XX_annotations into one column
    # -  A row in the k_nn_annotations matrix is a list of annotation strings (e.g. ["A;B", "A;C", "D"])
    # -  Within each string, individual annotations are separated by ";"
    # Thus:
    # 1. Join all strings in a row separating them with ";" (aka ["A;B", "A;C", "D"] --> "A;B;A;C;D")
    # 2. Split the joined string into separate annotations using split(";") (aka "A;B;A;C;D" --> ["A","B","A","C","D"])
    # 3. Take the unique set of annotations using set(*) (aka ["A","B","A","C","D"] --> {"A","B","C","D"})
    # 4. Join the new unique set of annotations using ";" (aka {"A","B","C","D"} --> "A;B;C;D")
    transferred_annotations_dataframe['transferred_annotations'] = [
        ";".join(set(";".join(k_nn_row).split(";")))
        for k_nn_row in k_nn_annotations
    ]

    # Merge with mapping file! Get also original ids!
    transferred_annotations_dataframe = mapping_file.join(
        transferred_annotations_dataframe)
    transferred_annotations_dataframe.to_csv(transferred_annotations_file_path,
                                             index=True)

    result_kwargs[
        'transferred_annotations_file'] = transferred_annotations_file_path

    return result_kwargs
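get_k_nearest_neighbours is imported from elsewhere in the pipeline and is not shown above. A minimal sketch consistent with how its return values are used (per-row indices and distances of the k smallest entries), assuming this name and signature:

import numpy as np

def get_k_nearest_neighbours(pairwise_distances, k):
    # For each query row, take the indices of the k smallest distances (ascending)
    k_nn_indices = np.argsort(pairwise_distances, axis=1)[:, :k]
    # Gather the corresponding distances row by row
    k_nn_distances = np.take_along_axis(pairwise_distances, k_nn_indices, axis=1)
    return k_nn_indices, k_nn_distances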