def get_involved_complex_from_counts(multidatas_counts: pd.DataFrame, clusters_names: list,
                                     complex_expanded: pd.DataFrame, complex_composition: pd.DataFrame) -> (
        pd.DataFrame, pd.DataFrame):
    """
    Finds the complexes defined in counts and calculates the counts values

    :param multidatas_counts: counts table; its 'id_multidata' column is matched against the complex components
    :param clusters_names: cluster column names; these columns are dropped from the returned complex counts
    :param complex_expanded: complex table; its column names are forwarded to merge_complex_counts
    :param complex_composition: complex composition table; its 'protein_multidata_id' column holds the component ids
    :return: tuple (complex_counts, multidatas_counts_filtered), or two empty DataFrames when no complex is involved
    """
    proteins_in_complexes = complex_composition['protein_multidata_id']

    # Remove counts that can't be part of a complex. isin() builds the mask in a single
    # vectorised pass and always yields a boolean Series, so an empty counts table is
    # filtered by rows instead of being mistaken for a column selection.
    is_complex_component = multidatas_counts['id_multidata'].isin(proteins_in_complexes)
    multidatas_counts_filtered = multidatas_counts[is_complex_component]

    # Find complexes with all components defined in counts
    complex_composition_counts = complex_helper.get_involved_complex_from_protein(multidatas_counts_filtered,
                                                                                  complex_expanded,
                                                                                  complex_composition,
                                                                                  drop_duplicates=False)

    if complex_composition_counts.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Remove counts that are not defined in selected complexes
    multidatas_counts_filtered = filter_counts_by_genes(multidatas_counts_filtered,
                                                        complex_composition_counts['gene'].tolist())

    # Set the counts value of each complex (per the original note: the minimum value of its components)
    complex_counts = cluster_counts_helper.merge_complex_counts(clusters_names, complex_composition_counts,
                                                                list(complex_expanded.columns.values))

    # Removes empty counts
    complex_counts = cluster_counts_filter.filter_empty_cluster_counts(complex_counts, clusters_names)

    # Drop the cluster columns without mutating the filtered frame in place
    complex_counts = complex_counts.drop(clusters_names, axis=1)

    return complex_counts, multidatas_counts_filtered
# Example no. 2
# 0
    def test_filter_empty_cluster_counts(self):
        """
        Checks that filter_empty_cluster_counts applied to the generic cluster counts
        fixture produces the same data as the expected results fixture
        """
        gene_column_name = 'gene'

        input_path = '{}/cluster_counts_generic_cluster_counts.csv'.format(self.FIXTURES_SUBPATH)
        cluster_counts = pd.read_csv(input_path)

        expected_path = '{}/cluster_counts_filter_empty_cluster_results.csv'.format(self.FIXTURES_SUBPATH)
        expected_result = pd.read_csv(expected_path)

        # Every column except the gene one is passed as a cluster name
        cluster_names = cluster_counts.columns.values.tolist()
        cluster_names.remove(gene_column_name)

        filtered_counts = cluster_counts_filter.filter_empty_cluster_counts(cluster_counts, cluster_names)
        same_data = dataframe_functions.dataframes_has_same_data(filtered_counts, expected_result)

        self.assertTrue(same_data)
def get_complex_involved_in_counts(
        multidatas_counts: pd.DataFrame, clusters_names: list,
        complex_composition: pd.DataFrame,
        complex_expanded: pd.DataFrame) -> pd.DataFrame:
    """
    Builds the counts of the complexes involved in the given counts

    :param multidatas_counts: counts table handed to get_involved_complex_from_protein
    :param clusters_names: cluster column names used when merging and filtering
    :param complex_composition: complex composition table
    :param complex_expanded: complex table; its column names are forwarded to merge_complex_counts
    :return: merged complex counts after the empty-cluster filter, with a fresh 0..n-1 index
    """
    core_logger.debug('Finding Complexes')

    involved_composition = complex_helper.get_involved_complex_from_protein(
        multidatas_counts, complex_expanded, complex_composition, drop_duplicates=False)

    complex_columns = list(complex_expanded.columns.values)
    merged_counts = merge_complex_counts(clusters_names, involved_composition, complex_columns)

    non_empty_counts = filter_empty_cluster_counts(merged_counts, clusters_names)

    # Discard the index left over from the filter step
    non_empty_counts.reset_index(drop=True, inplace=True)

    return non_empty_counts