def get_involved_complex_from_counts(multidatas_counts: pd.DataFrame,
                                     clusters_names: list,
                                     complex_expanded: pd.DataFrame,
                                     complex_composition: pd.DataFrame) -> (
        pd.DataFrame, pd.DataFrame):
    """
    Finds the complexes defined in counts and calculates the counts values.

    :param multidatas_counts: counts table; must contain an ``id_multidata``
        column.
    :param clusters_names: names of the cluster columns. They are forwarded
        to the merge/filter helpers and dropped from the returned complex
        counts.
    :param complex_expanded: complex table handed to
        ``complex_helper.get_involved_complex_from_protein``; its column
        names are also forwarded to ``merge_complex_counts``.
    :param complex_composition: complex composition table; must contain a
        ``protein_multidata_id`` column.
    :return: ``(complex_counts, multidatas_counts_filtered)``. Two empty
        DataFrames are returned when no complex is found in the counts.
    """
    # Remove counts that can't be part of a complex.
    # ``isin`` always produces a boolean mask, even when the counts table is
    # empty, and avoids scanning a Python list once per row.
    in_any_complex = multidatas_counts['id_multidata'].isin(
        complex_composition['protein_multidata_id'])
    multidatas_counts_filtered = multidatas_counts[in_any_complex]

    # Find complexes with all components defined in counts
    complex_composition_counts = complex_helper.get_involved_complex_from_protein(
        multidatas_counts_filtered,
        complex_expanded,
        complex_composition,
        drop_duplicates=False)

    if complex_composition_counts.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Remove counts that are not defined in selected complexes
    multidatas_counts_filtered = filter_counts_by_genes(
        multidatas_counts_filtered,
        complex_composition_counts['gene'].tolist())

    # Set the counts value of each complex.
    # NOTE(review): presumably the minimum value across the complex
    # components -- confirm in cluster_counts_helper.merge_complex_counts.
    complex_counts = cluster_counts_helper.merge_complex_counts(
        clusters_names,
        complex_composition_counts,
        list(complex_expanded.columns.values))

    # Removes empty counts
    complex_counts = cluster_counts_filter.filter_empty_cluster_counts(
        complex_counts, clusters_names)

    # The cluster columns are not part of the returned complex table.
    complex_counts.drop(clusters_names, axis=1, inplace=True)

    return complex_counts, multidatas_counts_filtered
def test_get_involved_complex_from_protein_empty_result(self):
    """
    With no protein rows as input, the helper's result must hold the same
    data as an empty DataFrame.
    """
    fixtures_dir = self.FIXTURES_SUBPATH

    # Load the protein fixture, then discard every row so only the columns
    # remain.
    protein_path = '{}/helper_complex_protein.csv'.format(fixtures_dir)
    all_proteins = pd.read_csv(protein_path)
    proteins = all_proteins.drop(all_proteins.index)

    complex_path = '{}/helper_complex_complex.csv'.format(fixtures_dir)
    complexes = pd.read_csv(complex_path)

    composition_path = '{}/helper_complex_complex_composition.csv'.format(
        fixtures_dir)
    complex_composition = pd.read_csv(composition_path)

    result = complex_helper.get_involved_complex_from_protein(
        proteins, complexes, complex_composition, drop_duplicates=False)

    expected = pd.DataFrame()
    same_data = dataframe_functions.dataframes_has_same_data(result, expected)
    self.assertTrue(same_data)
def test_get_involved_complex_from_protein(self):
    """
    The helper's output for the protein/complex/composition fixtures must
    hold the same data as the stored expected-result fixture.
    """
    fixtures_dir = self.FIXTURES_SUBPATH

    # Input fixtures.
    protein_path = '{}/helper_complex_protein.csv'.format(fixtures_dir)
    proteins = pd.read_csv(protein_path)

    complex_path = '{}/helper_complex_complex.csv'.format(fixtures_dir)
    complexes = pd.read_csv(complex_path)

    composition_path = '{}/helper_complex_complex_composition.csv'.format(
        fixtures_dir)
    complex_composition = pd.read_csv(composition_path)

    # Expected output fixture.
    expected_path = '{}/helper_complex_result.csv'.format(fixtures_dir)
    result_expected = pd.read_csv(expected_path)

    result = complex_helper.get_involved_complex_from_protein(
        proteins, complexes, complex_composition, drop_duplicates=False)

    same_data = dataframe_functions.dataframes_has_same_data(
        result, result_expected)
    self.assertTrue(same_data)
def get_complex_involved_in_counts(
        multidatas_counts: pd.DataFrame,
        clusters_names: list,
        complex_composition: pd.DataFrame,
        complex_expanded: pd.DataFrame) -> pd.DataFrame:
    """
    Gets complexes involved in counts.

    :param multidatas_counts: counts table handed to
        ``complex_helper.get_involved_complex_from_protein``.
    :param clusters_names: cluster names forwarded to the merge and filter
        helpers.
    :param complex_composition: complex composition table.
    :param complex_expanded: complex table; its column names are forwarded to
        ``merge_complex_counts``.
    :return: the filtered complex counts, re-indexed from zero.
    """
    core_logger.debug('Finding Complexes')

    # Complexes found in the counts, duplicates kept.
    involved_composition = complex_helper.get_involved_complex_from_protein(
        multidatas_counts,
        complex_expanded,
        complex_composition,
        drop_duplicates=False)

    expanded_columns = list(complex_expanded.columns.values)
    merged_counts = merge_complex_counts(
        clusters_names, involved_composition, expanded_columns)

    non_empty_counts = filter_empty_cluster_counts(merged_counts, clusters_names)

    # Discard the old index so rows are numbered consecutively again.
    non_empty_counts.reset_index(drop=True, inplace=True)

    return non_empty_counts