def get_involved_complex_from_counts(multidatas_counts: pd.DataFrame, clusters_names: list,
                                     complex_expanded: pd.DataFrame, complex_composition: pd.DataFrame) -> (
        pd.DataFrame, pd.DataFrame):
    """
    Finds the complexes defined in counts and calculates the counts values

    :param multidatas_counts: counts table; must contain an ``id_multidata`` column.
    :param clusters_names: cluster column names; forwarded to the merge/filter helpers and
        dropped from the returned complex table.
    :param complex_expanded: complex table; its column names are forwarded to
        ``cluster_counts_helper.merge_complex_counts``.
    :param complex_composition: composition table; must contain a ``protein_multidata_id`` column.
    :return: tuple ``(complex_counts, multidatas_counts_filtered)``. Two empty DataFrames are
        returned when no complex is found for the given counts.
    """
    # Remove counts that can't be part of a complex.
    # ``isin`` is vectorised and always yields a boolean mask (also for empty input), unlike a
    # per-row ``apply`` with a list-membership lambda.
    is_complex_component = multidatas_counts['id_multidata'].isin(
        complex_composition['protein_multidata_id'].tolist())
    multidatas_counts_filtered = multidatas_counts[is_complex_component]

    # Find complexes with all components defined in counts
    complex_composition_counts = complex_helper.get_involved_complex_from_protein(multidatas_counts_filtered,
                                                                                  complex_expanded,
                                                                                  complex_composition,
                                                                                  drop_duplicates=False)

    if complex_composition_counts.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Remove counts that are not defined in selected complexes
    multidatas_counts_filtered = filter_counts_by_genes(multidatas_counts_filtered,
                                                        complex_composition_counts['gene'].tolist())

    # Set the counts value a complex count. This is the minimum value of the cell component
    complex_counts = cluster_counts_helper.merge_complex_counts(clusters_names, complex_composition_counts,
                                                                list(complex_expanded.columns.values))

    # Removes empty counts
    complex_counts = cluster_counts_filter.filter_empty_cluster_counts(complex_counts, clusters_names)

    # Rebind instead of dropping in place so the frame handed back by the helper is not mutated.
    complex_counts = complex_counts.drop(clusters_names, axis=1)

    return complex_counts, multidatas_counts_filtered
def test_filter_empty_cluster_counts(self):
    """
    Check filter_empty_cluster_counts against the stored expected-result fixture.

    Every column of the input fixture except 'gene' is passed as a cluster name.
    """
    input_path = '{}/cluster_counts_generic_cluster_counts.csv'.format(self.FIXTURES_SUBPATH)
    expected_path = '{}/cluster_counts_filter_empty_cluster_results.csv'.format(self.FIXTURES_SUBPATH)

    cluster_counts = pd.read_csv(input_path)
    expected_result = pd.read_csv(expected_path)

    # All columns but the gene identifier are treated as clusters.
    gene_column_name = 'gene'
    cluster_names = list(cluster_counts.columns.values)
    cluster_names.remove(gene_column_name)

    result = cluster_counts_filter.filter_empty_cluster_counts(cluster_counts, cluster_names)
    same_data = dataframe_functions.dataframes_has_same_data(result, expected_result)

    self.assertTrue(same_data)
def get_complex_involved_in_counts(
        multidatas_counts: pd.DataFrame,
        clusters_names: list,
        complex_composition: pd.DataFrame,
        complex_expanded: pd.DataFrame) -> pd.DataFrame:
    """
    Gets complexes involved in counts

    Looks up the complexes for the given counts, merges their counts per cluster,
    filters out the empty cluster counts and returns the result with a fresh
    0..n-1 index.
    """
    core_logger.debug('Finding Complexes')

    composition_in_counts = complex_helper.get_involved_complex_from_protein(
        multidatas_counts,
        complex_expanded,
        complex_composition,
        drop_duplicates=False)

    complex_columns = list(complex_expanded.columns.values)
    merged_counts = merge_complex_counts(clusters_names, composition_in_counts, complex_columns)

    non_empty_counts = filter_empty_cluster_counts(merged_counts, clusters_names)

    # Discard the old index so rows are numbered consecutively after filtering.
    return non_empty_counts.reset_index(drop=True)