def shuffled_analysis(iterations: int,
                      meta: pd.DataFrame,
                      counts: pd.DataFrame,
                      interactions: pd.DataFrame,
                      cluster_interactions: list,
                      base_result: pd.DataFrame,
                      threads: int,
                      suffixes: tuple = ('_1', '_2')) -> list:
    """
    Shuffles meta and calculates the means for each iteration, collecting them in a list.
    Runs in multiple processes to speed the analysis up.
    """
    core_logger.info('Running Statistical Analysis')
    with Pool(processes=threads) as pool:
        statistical_analysis_worker = partial(_statistical_analysis,
                                              base_result,
                                              cluster_interactions,
                                              counts,
                                              interactions,
                                              meta,
                                              suffixes)
        results = pool.map(statistical_analysis_worker, range(iterations))

    return results
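# --- Illustrative sketch, not part of the module ---
# The pattern above freezes the shared, read-only arguments with `functools.partial`
# so that `Pool.map` only has to send the iteration index to each worker process.
# A toy version with hypothetical names, assuming only pandas and the standard
# library (the worker is module-level because Pool must be able to pickle it):
def _example_one_shuffle(frame: pd.DataFrame, iteration_number: int) -> pd.Series:
    # Permute the rows and return the per-column means for this iteration
    return frame.sample(frac=1).mean()


def _example_shuffled_means(frame: pd.DataFrame, iterations: int, threads: int) -> list:
    from functools import partial
    from multiprocessing.pool import Pool

    with Pool(processes=threads) as pool:
        worker = partial(_example_one_shuffle, frame)
        return pool.map(worker, range(iterations))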
def prefilters(interactions: pd.DataFrame,
               counts: pd.DataFrame,
               genes: pd.DataFrame,
               complexes: pd.DataFrame,
               complex_compositions: pd.DataFrame,
               counts_data: str,
               ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    - Finds the complexes defined in counts and calculates their counts values
    - Removes interactions if the simple component ensembl is not in the counts list
    - Removes interactions if the complex component is not in the calculated complex list
    - Removes undefined simple counts
    - Merges simple filtered counts and calculated complex counts
    - Removes duplicated counts
    """
    core_logger.info('Running Complex Prefilters')
    clusters_names = sorted(counts.columns.values)
    counts['gene'] = counts.index

    counts_multidata = cluster_counts_filter.filter_by_gene(counts, genes, counts_data)
    complex_in_counts, counts_multidata_complex = get_involved_complex_from_counts(
        counts_multidata, clusters_names, complexes, complex_compositions)

    if complex_in_counts.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    interactions_filtered = filter_interactions_by_genes(
        interactions, counts['gene'].tolist(), counts_data=counts_data)
    interactions_filtered = filter_interactions_by_complexes(
        interactions_filtered, complex_in_counts)

    counts_simple = filter_counts_by_interactions(
        counts_multidata, interactions_filtered, counts_data=counts_data)

    counts_filtered = counts_simple.append(counts_multidata_complex, sort=False)
    # TODO: we need to add it to method log
    counts_filtered.drop_duplicates(['gene'], inplace=True)
    counts_filtered.set_index(counts_filtered['gene'], inplace=True)

    return interactions_filtered, counts_filtered, complex_in_counts
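# --- Illustrative sketch, not part of the module ---
# The final merge-and-dedupe step above stacks simple and complex counts, drops
# duplicated genes (first occurrence wins), and re-indexes by gene. With toy frames:
def _example_merge_counts() -> pd.DataFrame:
    simple = pd.DataFrame({'gene': ['A', 'B'], 'cell1': [1.0, 2.0]})
    complexes = pd.DataFrame({'gene': ['B', 'C'], 'cell1': [2.5, 3.0]})

    merged = pd.concat([simple, complexes], sort=False)  # DataFrame.append wraps concat
    merged.drop_duplicates(['gene'], inplace=True)       # the simple 'B' row survives
    merged.set_index(merged['gene'], inplace=True)
    return merged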
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         interactions: pd.DataFrame,
         threshold: float = 0.1,
         result_precision: int = 3
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info('[Non Statistical Method] Threshold:{} Precision:{}'.format(threshold, result_precision))

    interactions_filtered, counts_filtered = prefilters(counts, interactions)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Simple Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                   clusters,
                                                                   cluster_interactions,
                                                                   base_result,
                                                                   suffixes=('_1', '_2'))

    percent_analysis = cpdb_analysis_helper.percent_analysis(clusters,
                                                             threshold,
                                                             interactions_filtered,
                                                             cluster_interactions,
                                                             base_result,
                                                             suffixes=('_1', '_2'))

    means_result, significant_means, deconvoluted_result = build_results(interactions_filtered,
                                                                         mean_analysis,
                                                                         percent_analysis,
                                                                         clusters['means'],
                                                                         result_precision)

    return means_result, significant_means, deconvoluted_result
def __init__(self, config: dict):
    core_logger.setLevel(config['logger']['level'])
    core_logger.info('Initializing SqlAlchemy CellPhoneDB Core')
    uri = self._build_uri(config)
    core_logger.debug('Database Uri: {}'.format(uri))

    engine = create_engine(uri)
    database = Database(engine)
    database.base_model = Base
    database_manager = DatabaseManager(None, database)

    # TODO: Auto-load repositories
    database_manager.add_repository(ComplexRepository)
    database_manager.add_repository(GeneRepository)
    database_manager.add_repository(InteractionRepository)
    database_manager.add_repository(MultidataRepository)
    database_manager.add_repository(ProteinRepository)

    Cellphonedb.__init__(self, database_manager, config)
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         threshold: float = 0.1,
         round_decimals: int = 1
         ) -> (pd.DataFrame, pd.DataFrame):
    core_logger.info('[Non Statistical Method] Threshold:{}'.format(threshold))
    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(interactions,
                                                                           counts,
                                                                           genes,
                                                                           complexes,
                                                                           complex_compositions)

    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(complex_in_counts,
                                                              counts_filtered,
                                                              complex_compositions,
                                                              cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Real Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    interactions_processed = get_interactions_processed(interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_processed,
                                                                       cluster_interactions)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_processed,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result)

    means_result, deconvoluted_result = build_results(interactions_filtered,
                                                      real_mean_analysis,
                                                      clusters['means'],
                                                      complex_compositions,
                                                      counts,
                                                      genes,
                                                      round_decimals)

    return means_result, deconvoluted_result
def cpdb_statistical_analysis_launcher(self,
                                       raw_meta: pd.DataFrame,
                                       counts: pd.DataFrame,
                                       iterations: int,
                                       threshold: float,
                                       threads: int,
                                       debug_seed: int,
                                       result_precision: int
                                       ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    if threads < 1:
        core_logger.info('Using Default thread number: %s' % self.default_threads)
        threads = self.default_threads

    if threshold < 0 or threshold > 1:
        raise ThresholdValueException(threshold)

    meta = method_preprocessors.meta_preprocessor(raw_meta)
    counts = self._counts_validations(counts, meta)

    interactions = self.database_manager.get_repository('interaction').get_all_expanded(
        only_cellphonedb_interactor=True)
    genes = self.database_manager.get_repository('gene').get_all_expanded()
    complex_composition = self.database_manager.get_repository('complex').get_all_compositions()
    complex_expanded = self.database_manager.get_repository('complex').get_all_expanded()

    pvalues, means, significant_means, mean_pvalue, deconvoluted = \
        cpdb_statistical_analysis_method.call(meta,
                                              counts,
                                              interactions,
                                              genes,
                                              complex_expanded,
                                              complex_composition,
                                              iterations,
                                              threshold,
                                              threads,
                                              debug_seed,
                                              result_precision)

    return pvalues, means, significant_means, mean_pvalue, deconvoluted
def cpdb_statistical_analysis_launcher(self,
                                       raw_meta: pd.DataFrame,
                                       counts: pd.DataFrame,
                                       threads: int,
                                       debug_seed: int,
                                       result_precision: int,
                                       log2_transform: bool
                                       ) -> pd.DataFrame:
    if threads < 1:
        core_logger.info('Using Default thread number: %s' % self.default_threads)
        threads = self.default_threads

    meta = method_preprocessors.meta_preprocessor(raw_meta)
    counts = self._counts_validations(counts, meta)

    winsorized = cpdb_statistical_analysis_method.call(meta,
                                                       counts,
                                                       threads,
                                                       debug_seed,
                                                       result_precision,
                                                       log2_transform)

    return winsorized
def build_results(interactions: pd.DataFrame,
                  interactions_original: pd.DataFrame,
                  counts_relations: pd.DataFrame,
                  real_mean_analysis: pd.DataFrame,
                  result_percent: pd.DataFrame,
                  clusters_means: pd.DataFrame,
                  complex_compositions: pd.DataFrame,
                  counts: pd.DataFrame,
                  genes: pd.DataFrame,
                  result_precision: int,
                  pvalue: float,
                  counts_data: str
                  ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Sets the results data structure from method generated data. Result documents are defined by specs.
    """
    core_logger.info('Building results')
    interactions: pd.DataFrame = interactions_original.loc[interactions.index]
    interactions['interaction_index'] = interactions.index
    interactions = interactions.merge(counts_relations, how='left',
                                      left_on='multidata_1_id', right_on='id_multidata')
    interactions = interactions.merge(counts_relations, how='left',
                                      left_on='multidata_2_id', right_on='id_multidata',
                                      suffixes=('_1', '_2'))
    interactions.set_index('interaction_index', inplace=True, drop=True)

    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    def simple_complex_indicator(interaction: pd.Series, suffix: str) -> str:
        """
        Adds simple/complex prefixes to interaction components
        """
        if interaction['is_complex{}'.format(suffix)]:
            return 'complex:{}'.format(interaction['name{}'.format(suffix)])

        return 'simple:{}'.format(interaction['name{}'.format(suffix)])

    interactions['partner_a'] = interactions.apply(
        lambda interaction: simple_complex_indicator(interaction, '_1'), axis=1)
    interactions['partner_b'] = interactions.apply(
        lambda interaction: simple_complex_indicator(interaction, '_2'), axis=1)

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent, pvalue)

    gene_columns = ['{}_{}'.format(counts_data, suffix) for suffix in ('1', '2')]
    gene_renames = {column: 'gene_{}'.format(suffix) for column, suffix in zip(gene_columns, ['a', 'b'])}

    # Remove useless columns
    interactions_data_result = pd.DataFrame(
        interactions[['id_cp_interaction', 'partner_a', 'partner_b', 'receptor_1', 'receptor_2',
                      *gene_columns, 'annotation_strategy']].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secreted_1'] | interactions['secreted_2'])
    interactions_data_result['is_integrin'] = (interactions['integrin_1'] | interactions['integrin_2'])

    interactions_data_result.rename(
        columns={**gene_renames, 'receptor_1': 'receptor_a', 'receptor_2': 'receptor_b'},
        inplace=True)

    # Dedupe rows and filter only desired columns
    interactions_data_result.drop_duplicates(inplace=True)

    means_columns = ['id_cp_interaction', 'interacting_pair', 'partner_a', 'partner_b', 'gene_a', 'gene_b',
                     'secreted', 'receptor_a', 'receptor_b', 'annotation_strategy', 'is_integrin']

    interactions_data_result = interactions_data_result[means_columns]

    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)

    # Round result decimals
    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 1
    pvalues_result = pd.concat([interactions_data_result, result_percent], axis=1, join='inner', sort=False)

    # Document 2
    means_result = pd.concat([interactions_data_result, real_mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_means_result = pd.concat([interactions_data_result, significant_mean_rank, significant_means],
                                         axis=1, join='inner', sort=False)

    # Document 5
    deconvoluted_result = deconvoluted_complex_result_build(clusters_means,
                                                            interactions,
                                                            complex_compositions,
                                                            counts,
                                                            genes,
                                                            counts_data)

    return pvalues_result, means_result, significant_means_result, deconvoluted_result
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(
            threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)

    # Add id multidata to counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True,
                                        right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)

    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered,
                                                               complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result,
                                                                        separator)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                               threshold,
                                                                               interactions_filtered,
                                                                               cluster_interactions,
                                                                               base_result,
                                                                               separator)

    core_logger.info('Running Statistical Analysis')
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations,
                                                                                   meta,
                                                                                   counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions,
                                                                                   complex_composition_filtered,
                                                                                   base_result,
                                                                                   threads,
                                                                                   separator)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions,
                                                                           base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        pvalue,
        counts_data
    )
    return pvalues_result, means_result, significant_means, deconvoluted_result
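# --- Illustrative sketch, not part of the module ---
# The `groupby(counts.index).mean()` step above collapses count rows that map to the
# same id_multidata (e.g. two gene identifiers for one protein) into their mean:
def _example_collapse_duplicated_ids() -> pd.DataFrame:
    counts = pd.DataFrame({'cellA': [2.0, 4.0], 'cellB': [1.0, 3.0]},
                          index=[7, 7])  # two rows sharing one id_multidata
    return counts.groupby(counts.index).mean()  # one row: cellA == 3.0, cellB == 2.0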
def build_percent_result(real_mean_analysis: pd.DataFrame,
                         real_percents_analysis: pd.DataFrame,
                         statistical_mean_analysis: list,
                         interactions: pd.DataFrame,
                         cluster_interactions: list,
                         base_result: pd.DataFrame,
                         separator: str) -> pd.DataFrame:
    """
    Calculates the pvalues after the statistical analysis.

    If real_percent or real_mean is zero, result_percent is 1.
    Otherwise it counts how many shuffled means are bigger than the real mean
    and divides that count by the total number of iterations.

    EXAMPLE:
        INPUT:

        real_mean_analysis:
                      cluster1_cluster1   cluster1_cluster2
        interaction1  0.5                 0.4
        interaction2  0.0                 0.2

        real_percents_analysis:
                      cluster1_cluster1   cluster1_cluster2
        interaction1  1                   0
        interaction2  0                   1

        statistical means:
        [
                      cluster1_cluster1   cluster1_cluster2
        interaction1  0.6                 0.1
        interaction2  0.0                 0.2
        ,
                      cluster1_cluster1   cluster1_cluster2
        interaction1  0.5                 0.4
        interaction2  0.0                 0.6
        ]

        iterations = 2

        RESULT:
                      cluster1_cluster1   cluster1_cluster2
        interaction1  0.5                 1
        interaction2  1                   0.5
    """
    core_logger.info('Building Pvalues result')
    percent_result = base_result.copy()

    for interaction_index, interaction in interactions.iterrows():
        for cluster_interaction in cluster_interactions:
            cluster_interaction_string = '{}{}{}'.format(cluster_interaction[0],
                                                         separator,
                                                         cluster_interaction[1])
            real_mean = real_mean_analysis.at[interaction_index, cluster_interaction_string]
            real_percent = real_percents_analysis.at[interaction_index, cluster_interaction_string]

            if int(real_percent) == 0 or real_mean == 0:
                result_percent = 1.0
            else:
                shuffled_bigger = 0

                for statistical_mean in statistical_mean_analysis:
                    mean = statistical_mean.at[interaction_index, cluster_interaction_string]
                    if mean > real_mean:
                        shuffled_bigger += 1

                result_percent = shuffled_bigger / len(statistical_mean_analysis)

            percent_result.at[interaction_index, cluster_interaction_string] = result_percent

    return percent_result
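# --- Illustrative sketch, not part of the module ---
# The nested loops above compute, per cell, the fraction of shuffled means larger
# than the real mean. Stacking the shuffled frames into a 3-D array lets numpy do
# the same in one expression (the zero guards are folded into the final mask):
def _example_vectorized_pvalues(real_means: pd.DataFrame,
                                real_percents: pd.DataFrame,
                                shuffled_means: list) -> pd.DataFrame:
    stacked = np.stack([frame.values for frame in shuffled_means])  # (iterations, rows, cols)
    pvalues = (stacked > real_means.values).sum(axis=0) / len(shuffled_means)
    # Zero real mean or zero real percent forces a p-value of 1, as in the loop above
    pvalues[(real_means.values == 0) | (real_percents.values == 0)] = 1.0
    return pd.DataFrame(pvalues, index=real_means.index, columns=real_means.columns)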
def __getattribute__(self, name: str):
    method = object.__getattribute__(self, name)
    if hasattr(method, '__call__'):
        core_logger.info('Collecting {}'.format(name))

    return method
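# --- Illustrative sketch, not part of the module ---
# The hook above logs every attribute lookup that resolves to a callable, which means
# merely *accessing* a method already emits a log line, before any call happens:
class _ExampleCollector:
    def collect_genes(self):
        return []

    def __getattribute__(self, name: str):
        attribute = object.__getattribute__(self, name)
        if hasattr(attribute, '__call__'):
            print('Collecting {}'.format(name))
        return attribute

# _ExampleCollector().collect_genes   # prints "Collecting collect_genes" without calling it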
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         separator: str,
         threshold: float = 0.1,
         result_precision: int = 3
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info('[Non Statistical Method] Threshold:{} Precision:{}'.format(threshold, result_precision))

    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)

    # Add id multidata to counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True,
                                        right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)

    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered,
                                                               complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                   clusters,
                                                                   cluster_interactions,
                                                                   base_result,
                                                                   separator)

    percent_analysis = cpdb_analysis_helper.percent_analysis(clusters,
                                                             threshold,
                                                             interactions_filtered,
                                                             cluster_interactions,
                                                             base_result,
                                                             separator)

    means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        mean_analysis,
        percent_analysis,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        counts_data
    )
    return means_result, significant_means, deconvoluted_result
def _build_sqlite_uri(database_file):
    path = os.path.realpath(os.path.expanduser(database_file))
    core_logger.info('Using custom database at {}'.format(path))
    return 'sqlite:///{}'.format(path)
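# Usage sketch (hypothetical path): expanduser resolves '~' and realpath makes the
# result absolute, so the URI always carries an absolute filesystem path:
#
#   _build_sqlite_uri('~/cellphonedb/cellphone.db')
#   # -> 'sqlite:////home/<user>/cellphonedb/cellphone.db'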
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Complex] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(
            threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(interactions,
                                                                           counts,
                                                                           genes,
                                                                           complexes,
                                                                           complex_compositions)
    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(complex_in_counts,
                                                              counts_filtered,
                                                              complex_compositions,
                                                              cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Real Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    interactions_processed = get_interactions_processed(interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_processed,
                                                                       cluster_interactions)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_processed,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                               threshold,
                                                                               interactions_processed,
                                                                               cluster_interactions,
                                                                               base_result)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations,
                                                                                   meta,
                                                                                   counts_filtered,
                                                                                   interactions_processed,
                                                                                   cluster_interactions,
                                                                                   base_result,
                                                                                   threads)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_processed,
                                                                           cluster_interactions,
                                                                           base_result)

    pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result = build_results(
        interactions_filtered,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_compositions,
        counts,
        genes,
        result_precision)

    return pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result
def build_results(interactions: pd.DataFrame,
                  real_mean_analysis: pd.DataFrame,
                  result_percent: pd.DataFrame,
                  clusters_means: dict,
                  complex_compositions: pd.DataFrame,
                  counts: pd.DataFrame,
                  genes: pd.DataFrame,
                  result_precision: int
                  ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Sets the results data structure from method generated data. Result documents are defined by specs.
    """
    core_logger.info('Building Complex results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    interactions = interactions.copy()

    def simple_complex_indicator(interaction: pd.Series, suffix: str) -> str:
        """
        Adds simple/complex prefixes to interaction components
        """
        if interaction['is_complex{}'.format(suffix)]:
            return 'complex:{}'.format(interaction['name{}'.format(suffix)])

        return 'simple:{}'.format(interaction['name{}'.format(suffix)])

    interactions['partner_a'] = interactions.apply(
        lambda interaction: simple_complex_indicator(interaction, '_1'), axis=1)
    interactions['partner_b'] = interactions.apply(
        lambda interaction: simple_complex_indicator(interaction, '_2'), axis=1)

    # Remove useless columns
    interactions_data_result = pd.DataFrame(interactions[[
        'id_cp_interaction', 'partner_a', 'partner_b', 'ensembl_1', 'ensembl_2', 'source'
    ]].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secretion_1'] | interactions['secretion_2'])
    interactions_data_result['is_integrin'] = (
        interactions['integrin_interaction_1'] | interactions['integrin_interaction_2'])

    interactions_data_result.rename(columns={'ensembl_1': 'ensembl_a', 'ensembl_2': 'ensembl_b'},
                                    inplace=True)

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent)

    result_percent = result_percent.round(result_precision)
    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)

    # Round result decimals
    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 1
    pvalues_result = pd.concat([interactions_data_result, result_percent], axis=1, join='inner', sort=False)

    # Document 2
    means_result = pd.concat([interactions_data_result, real_mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_mean_result = pd.concat(
        [interactions_data_result, significant_mean_rank, significant_means],
        axis=1, join='inner', sort=False)

    # Document 4
    mean_pvalue_result = cpdb_statistical_analysis_helper.mean_pvalue_result_build(
        real_mean_analysis, result_percent, interactions_data_result)

    # Document 5
    deconvoluted_result = deconvoluted_complex_result_build(clusters_means,
                                                            interactions,
                                                            complex_compositions,
                                                            counts,
                                                            genes)

    return pvalues_result, means_result, significant_mean_result, mean_pvalue_result, deconvoluted_result
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(
            threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    interactions_filtered, counts_filtered = prefilters(counts, interactions, counts_data)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Real Simple Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result,
                                                                        separator,
                                                                        suffixes=('_1', '_2'),
                                                                        counts_data=counts_data)

    real_percent_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                              threshold,
                                                                              interactions_filtered,
                                                                              cluster_interactions,
                                                                              base_result,
                                                                              separator,
                                                                              suffixes=('_1', '_2'),
                                                                              counts_data=counts_data)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations,
                                                                                   meta,
                                                                                   counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions,
                                                                                   base_result,
                                                                                   threads,
                                                                                   separator,
                                                                                   suffixes=('_1', '_2'),
                                                                                   counts_data=counts_data)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percent_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions,
                                                                           base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        result_precision,
        pvalue,
        counts_data)

    return pvalues_result, means_result, significant_means, deconvoluted_result
def build_results(interactions: pd.DataFrame,
                  real_mean_analysis: pd.DataFrame,
                  result_percent: pd.DataFrame,
                  clusters_means: dict,
                  result_precision: int,
                  pvalue: float,
                  counts_data: str,
                  ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info('Building Simple results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    gene_columns = ['{}_{}'.format(counts_data, suffix) for suffix in ('1', '2')]
    gene_renames = {column: 'gene_{}'.format(suffix) for column, suffix in zip(gene_columns, ['a', 'b'])}

    interactions_data_result = pd.DataFrame(interactions[[
        'id_cp_interaction', 'name_1', 'name_2', 'receptor_1', 'receptor_2',
        *gene_columns, 'annotation_strategy'
    ]].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secreted_1'] | interactions['secreted_2'])
    interactions_data_result['is_integrin'] = (interactions['integrin_1'] | interactions['integrin_2'])

    interactions_data_result.rename(columns={
        'name_1': 'partner_a',
        'name_2': 'partner_b',
        'receptor_1': 'receptor_a',
        'receptor_2': 'receptor_b',
        **gene_renames
    }, inplace=True)

    interactions_data_result['partner_a'] = interactions_data_result['partner_a'].apply(
        lambda name: 'simple:{}'.format(name))
    interactions_data_result['partner_b'] = interactions_data_result['partner_b'].apply(
        lambda name: 'simple:{}'.format(name))

    # Dedupe rows and filter only desired columns
    interactions_data_result.drop_duplicates(inplace=True)

    means_columns = ['id_cp_interaction', 'interacting_pair', 'partner_a', 'partner_b', 'gene_a', 'gene_b',
                     'secreted', 'receptor_a', 'receptor_b', 'annotation_strategy', 'is_integrin']

    interactions_data_result = interactions_data_result[means_columns]

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent, pvalue)

    result_percent = result_percent.round(result_precision)
    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)

    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 1
    pvalues_result = pd.concat([interactions_data_result, result_percent], axis=1, join='inner', sort=False)

    # Document 2
    means_result = pd.concat([interactions_data_result, real_mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_mean_result = pd.concat(
        [interactions_data_result, significant_mean_rank, significant_means],
        axis=1, join='inner', sort=False)

    # Document 5
    deconvoluted_result = deconvoluted_result_build(clusters_means, interactions, counts_data=counts_data)

    return pvalues_result, means_result, significant_mean_result, deconvoluted_result
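# --- Illustrative sketch, not part of the module ---
# The `axis=1, join='inner'` concats above align the annotation block and the analysis
# matrices by row index (not position) and keep only interactions present in both:
def _example_inner_concat() -> pd.DataFrame:
    annotations = pd.DataFrame({'id_cp_interaction': ['CPI-1', 'CPI-2']}, index=[10, 11])
    means = pd.DataFrame({'clusterA_clusterB': [0.5]}, index=[10])
    return pd.concat([annotations, means], axis=1, join='inner', sort=False)  # only index 10 survives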
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         separator: str,
         threshold: float = 0.1,
         result_precision: int = 3
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info('[Non Statistical Method] Threshold:{} Precision:{}'.format(threshold, result_precision))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(interactions,
                                                                           counts,
                                                                           genes,
                                                                           complexes,
                                                                           complex_compositions,
                                                                           counts_data)

    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(complex_in_counts,
                                                              counts_filtered,
                                                              complex_compositions,
                                                              cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    interactions_processed = get_interactions_processed(interactions_filtered,
                                                        complex_significative_protein,
                                                        counts_data=counts_data)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_processed,
                                                                       cluster_interactions,
                                                                       separator)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_processed,
                                                                   clusters,
                                                                   cluster_interactions,
                                                                   base_result,
                                                                   separator,
                                                                   suffixes=('_1', '_2'),
                                                                   counts_data=counts_data)

    percent_analysis = cpdb_analysis_helper.percent_analysis(clusters,
                                                             threshold,
                                                             interactions_processed,
                                                             cluster_interactions,
                                                             base_result.copy(),
                                                             separator,
                                                             suffixes=('_1', '_2'),
                                                             counts_data=counts_data)

    means_result, significant_means, deconvoluted_result = build_results(interactions_filtered,
                                                                         mean_analysis,
                                                                         percent_analysis,
                                                                         clusters['means'],
                                                                         complex_compositions,
                                                                         counts,
                                                                         genes,
                                                                         result_precision,
                                                                         counts_data)

    return means_result, significant_means, deconvoluted_result
def build_results(winsorized_counts: pd.DataFrame,
                  result_precision: int
                  ) -> pd.DataFrame:
    core_logger.info('Building winsorized results')
    return winsorized_counts.round(result_precision)
def build_results(interactions: pd.DataFrame,
                  mean_analysis: pd.DataFrame,
                  percent_analysis: pd.DataFrame,
                  clusters_means: dict,
                  result_precision: int
                  ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info('Building Simple results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    interactions_data_result = pd.DataFrame(interactions[[
        'id_cp_interaction', 'name_1', 'name_2', 'ensembl_1', 'ensembl_2', 'source'
    ]].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secretion_1'] | interactions['secretion_2'])
    interactions_data_result['is_integrin'] = (
        interactions['integrin_interaction_1'] | interactions['integrin_interaction_2'])

    interactions_data_result.rename(columns={
        'name_1': 'partner_a',
        'name_2': 'partner_b',
        'ensembl_1': 'ensembl_a',
        'ensembl_2': 'ensembl_b'
    }, inplace=True)

    interactions_data_result['partner_a'] = interactions_data_result['partner_a'].apply(
        lambda name: 'simple:{}'.format(name))
    interactions_data_result['partner_b'] = interactions_data_result['partner_b'].apply(
        lambda name: 'simple:{}'.format(name))

    significant_mean_rank, significant_means = cpdb_analysis_helper.build_significant_means(
        mean_analysis, percent_analysis)

    significant_means = significant_means.round(result_precision)
    mean_analysis = mean_analysis.round(result_precision)

    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 2
    means_result = pd.concat([interactions_data_result, mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_means_result = pd.concat(
        [interactions_data_result, significant_mean_rank, significant_means],
        axis=1, join='inner', sort=False)

    # Document 5
    deconvoluted_result = deconvoluted_result_build(clusters_means, interactions)

    return means_result, significant_means_result, deconvoluted_result
def __getattribute__(self, name):
    method = object.__getattribute__(self, name)
    if hasattr(method, '__call__'):
        core_logger.info('Launching Method {}'.format(name))

    return method
def build_percent_result(real_mean_analysis: pd.DataFrame,
                         real_percents_analysis: pd.DataFrame,
                         statistical_mean_analysis: list,
                         interactions: pd.DataFrame,
                         cluster_interactions: list,
                         base_result: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the pvalues after the statistical analysis.

    If real_percent or real_mean is zero, result_percent is 1.
    Otherwise it computes a two-tailed empirical p-value: the fraction of shuffled
    means beyond the real mean on its own side of zero, doubled.
    """
    core_logger.info('Building Pvalues result')
    percent_result = base_result.copy()

    for interaction_index, interaction in interactions.iterrows():
        for cluster_interaction in cluster_interactions:
            cluster_interaction_string = '{}_{}'.format(cluster_interaction[0], cluster_interaction[1])
            real_mean = real_mean_analysis.at[interaction_index, cluster_interaction_string]
            real_percent = real_percents_analysis.at[interaction_index, cluster_interaction_string]

            if int(real_percent) == 0 or real_mean == 0:
                result_percent = 1.0
            else:
                mean_per_pair = []
                for statistical_mean in statistical_mean_analysis:
                    mean_per_pair.append(statistical_mean.at[interaction_index, cluster_interaction_string])

                mean_per_pair = np.array([x for x in mean_per_pair if not np.isnan(x)])

                if real_mean > 0:
                    shuffled_beyond = len(mean_per_pair[mean_per_pair > real_mean])
                    result_percent = 2 * shuffled_beyond / len(mean_per_pair)  # two-tailed
                elif real_mean < 0:
                    shuffled_beyond = len(mean_per_pair[mean_per_pair < real_mean])
                    result_percent = 2 * shuffled_beyond / len(mean_per_pair)  # two-tailed

            percent_result.at[interaction_index, cluster_interaction_string] = result_percent

    return percent_result
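# --- Illustrative sketch, not part of the module ---
# The branch above is a two-tailed empirical p-value: count the shuffled means beyond
# the observed mean on its own side of zero, then double the fraction. Distilled to
# a single interaction/cluster pair:
def _example_two_tailed_pvalue(real_mean: float, shuffled_means: list) -> float:
    means = np.array([x for x in shuffled_means if not np.isnan(x)])
    if real_mean > 0:
        tail = len(means[means > real_mean])
    else:
        tail = len(means[means < real_mean])
    return 2 * tail / len(means)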