コード例 #1
0
def shuffled_analysis(iterations: int, meta: pd.DataFrame, counts: pd.DataFrame, interactions: pd.DataFrame,
                      cluster_interactions: list, base_result: pd.DataFrame, threads: int,
                      suffixes: tuple = ('_1', '_2')) -> list:
    """
    Runs the statistical analysis over `iterations` shuffled versions of meta.

    The per-iteration work is fanned out over a pool of `threads` worker
    processes; the list of per-iteration mean results is returned.
    """
    core_logger.info('Running Statistical Analysis')
    shuffled_iteration = partial(_statistical_analysis, base_result, cluster_interactions, counts, interactions,
                                 meta, suffixes)
    with Pool(processes=threads) as pool:
        return pool.map(shuffled_iteration, range(iterations))
コード例 #2
0
def prefilters(
    interactions: pd.DataFrame,
    counts: pd.DataFrame,
    genes: pd.DataFrame,
    complexes: pd.DataFrame,
    complex_compositions: pd.DataFrame,
    counts_data: str,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    - Finds the complex defined in counts and calculates their counts values
    - Remove interactions if the simple component ensembl is not in the counts list
    - Remove interactions if the complex component is not in the calculated complex list
    - Remove undefined simple counts
    - Merge simple filtered counts and calculated complex counts
    - Remove duplicated counts

    Returns (interactions_filtered, counts_filtered, complex_in_counts);
    three empty frames when no complex is present in the counts.

    NOTE(review): mutates the caller's `counts` frame by adding a 'gene'
    column — confirm callers do not rely on the original column set.
    """
    core_logger.info('Running Complex Prefilters')
    clusters_names = sorted(counts.columns.values)
    counts['gene'] = counts.index

    counts_multidata = cluster_counts_filter.filter_by_gene(
        counts, genes, counts_data)

    complex_in_counts, counts_multidata_complex = get_involved_complex_from_counts(
        counts_multidata, clusters_names, complexes, complex_compositions)

    if complex_in_counts.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    interactions_filtered = filter_interactions_by_genes(
        interactions, counts['gene'].tolist(), counts_data=counts_data)

    interactions_filtered = filter_interactions_by_complexes(
        interactions_filtered, complex_in_counts)

    counts_simple = filter_counts_by_interactions(counts_multidata,
                                                  interactions_filtered,
                                                  counts_data=counts_data)

    # BUGFIX: DataFrame.append was deprecated (pandas 1.4) and removed in
    # pandas 2.0 — pd.concat is the equivalent, supported replacement.
    counts_filtered = pd.concat([counts_simple, counts_multidata_complex],
                                sort=False)

    # TODO: we need to add it to method log
    counts_filtered.drop_duplicates(['gene'], inplace=True)

    counts_filtered.set_index(counts_filtered['gene'], inplace=True)

    return interactions_filtered, counts_filtered, complex_in_counts
コード例 #3
0
def call(
        meta: pd.DataFrame,
        counts: pd.DataFrame,
        interactions: pd.DataFrame,
        threshold: float = 0.1,
        result_precision: int = 3
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Non-statistical analysis: prefilters the inputs, computes the mean and
    percent analyses per cluster combination and builds the result documents.

    Returns (means_result, significant_means, deconvoluted_result); three
    empty frames when the prefilters leave nothing to analyse.
    """
    # BUGFIX: log message said "Precission" — fixed to match the spelling
    # used by the sibling module ('[Non Statistical Method] ... Precision').
    core_logger.info(
        '[Non Statistical Method] Threshold:{} Precision:{}'.format(
            threshold, result_precision))

    interactions_filtered, counts_filtered = prefilters(counts, interactions)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, counts_filtered)
    core_logger.info('Running Simple Analysis')
    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(
        interactions_filtered, cluster_interactions)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        interactions_filtered,
        clusters,
        cluster_interactions,
        base_result,
        suffixes=('_1', '_2'))

    percent_analysis = cpdb_analysis_helper.percent_analysis(
        clusters,
        threshold,
        interactions_filtered,
        cluster_interactions,
        base_result,
        suffixes=('_1', '_2'))

    means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered, mean_analysis, percent_analysis,
        clusters['means'], result_precision)

    return means_result, significant_means, deconvoluted_result
コード例 #4
0
    def __init__(self, config: dict):
        """
        Configures logging, builds the SQLAlchemy database stack from
        `config` and registers the repositories before delegating to the
        base Cellphonedb initializer.
        """
        core_logger.setLevel(config['logger']['level'])
        core_logger.info('Initializing SqlAlchemy CellPhoneDB Core')

        uri = self._build_uri(config)
        core_logger.debug('Database Uri: {}'.format(uri))

        database = Database(create_engine(uri))
        database.base_model = Base
        database_manager = DatabaseManager(None, database)
        # TODO: Auto-load repositories
        for repository_class in (ComplexRepository,
                                 GeneRepository,
                                 InteractionRepository,
                                 MultidataRepository,
                                 ProteinRepository):
            database_manager.add_repository(repository_class)
        Cellphonedb.__init__(self, database_manager, config)
コード例 #5
0
def call(
    meta: pd.DataFrame,
    counts: pd.DataFrame,
    interactions: pd.DataFrame,
    genes: pd.DataFrame,
    complexes: pd.DataFrame,
    complex_compositions: pd.DataFrame,
    threshold: float = 0.1,
    round_decimals: int = 1
) -> (pd.DataFrame, pd.DataFrame):
    """
    Non-statistical complex analysis: prefilters the inputs, runs the real
    mean analysis over all cluster combinations and builds the result
    documents.

    Returns (means_result, deconvoluted_result); two empty frames when no
    interaction survives the prefilters.
    """
    core_logger.info('[Non Statistical Method] Threshold:{}'.format(threshold))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(
        interactions, counts, genes, complexes, complex_compositions)
    if interactions_filtered.empty:
        # BUGFIX: this branch returned five empty frames while the success
        # path returns two values — callers unpacking two would crash here.
        # Both paths (and the return annotation) now agree on a 2-tuple.
        return pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(
        complex_in_counts, counts_filtered, complex_compositions, cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, counts_filtered)
    core_logger.info('Running Real Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])
    interactions_processed = get_interactions_processed(
        interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(
        interactions_processed, cluster_interactions)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        interactions_processed, clusters, cluster_interactions, base_result)

    means_result, deconvoluted_result = build_results(
        interactions_filtered, real_mean_analysis, clusters['means'],
        complex_compositions, counts, genes, round_decimals)
    return means_result, deconvoluted_result
コード例 #6
0
    def cpdb_statistical_analysis_launcher(self,
                                           raw_meta: pd.DataFrame,
                                           counts: pd.DataFrame,
                                           iterations: int,
                                           threshold: float,
                                           threads: int,
                                           debug_seed: int,
                                           result_precision: int
                                           ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
        """
        Validates launcher arguments, loads the database inputs and runs
        the cluster statistical analysis method.

        Returns (pvalues, means, significant_means, mean_pvalue, deconvoluted).

        Raises:
            ThresholdValueException: when threshold is outside [0, 1].
        """
        if threads < 1:
            core_logger.info('Using Default thread number: %s' % self.default_threads)
            threads = self.default_threads

        if threshold < 0 or threshold > 1:
            raise ThresholdValueException(threshold)

        meta = method_preprocessors.meta_preprocessor(raw_meta)
        counts = self._counts_validations(counts, meta)

        get_repository = self.database_manager.get_repository
        interactions = get_repository('interaction').get_all_expanded(only_cellphonedb_interactor=True)
        genes = get_repository('gene').get_all_expanded()
        complex_composition = get_repository('complex').get_all_compositions()
        complex_expanded = get_repository('complex').get_all_expanded()

        analysis_result = cpdb_statistical_analysis_method.call(
            meta, counts, interactions, genes, complex_expanded,
            complex_composition, iterations, threshold, threads, debug_seed,
            result_precision)
        deconvoluted, mean_pvalue, means, pvalues, significant_means = analysis_result

        return pvalues, means, significant_means, mean_pvalue, deconvoluted
コード例 #7
0
    def cpdb_statistical_analysis_launcher(
        self, raw_meta: pd.DataFrame, counts: pd.DataFrame, threads: int,
        debug_seed: int, result_precision: int, log2_transform: bool
    ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
          ):
        """
        Preprocesses meta/counts and forwards them to the statistical
        analysis method, returning its result unchanged.
        """
        if threads < 1:
            core_logger.info('Using Default thread number: %s' %
                             self.default_threads)
            threads = self.default_threads

        meta = method_preprocessors.meta_preprocessor(raw_meta)
        validated_counts = self._counts_validations(counts, meta)

        return cpdb_statistical_analysis_method.call(
            meta, validated_counts, threads, debug_seed, result_precision,
            log2_transform)
コード例 #8
0
def build_results(interactions: pd.DataFrame,
                  interactions_original: pd.DataFrame,
                  counts_relations: pd.DataFrame,
                  real_mean_analysis: pd.DataFrame,
                  result_percent: pd.DataFrame,
                  clusters_means: pd.DataFrame,
                  complex_compositions: pd.DataFrame,
                  counts: pd.DataFrame,
                  genes: pd.DataFrame,
                  result_precision: int,
                  pvalue: float,
                  counts_data: str
                  ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Sets the results data structure from method generated data. Results documents are defined by specs.

    Returns (pvalues_result, means_result, significant_means_result,
    deconvoluted_result).

    NOTE: the return annotation previously declared five frames while four
    are returned; it now matches the actual return value.
    """
    core_logger.info('Building results')
    # Re-fetch the full interaction rows for the filtered ids and attach the
    # gene/counts relations for both partners (suffixes _1/_2).
    interactions: pd.DataFrame = interactions_original.loc[interactions.index]
    interactions['interaction_index'] = interactions.index
    interactions = interactions.merge(counts_relations, how='left', left_on='multidata_1_id', right_on='id_multidata', )
    interactions = interactions.merge(counts_relations, how='left', left_on='multidata_2_id', right_on='id_multidata',
                                      suffixes=('_1', '_2'))
    interactions.set_index('interaction_index', inplace=True, drop=True)

    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(interactions)

    def simple_complex_indicator(interaction: pd.Series, suffix: str) -> str:
        """
        Add simple/complex prefixes to interaction components
        """
        if interaction['is_complex{}'.format(suffix)]:
            return 'complex:{}'.format(interaction['name{}'.format(suffix)])

        return 'simple:{}'.format(interaction['name{}'.format(suffix)])

    interactions['partner_a'] = interactions.apply(lambda interaction: simple_complex_indicator(interaction, '_1'),
                                                   axis=1)
    interactions['partner_b'] = interactions.apply(lambda interaction: simple_complex_indicator(interaction, '_2'),
                                                   axis=1)

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent, pvalue)

    gene_columns = ['{}_{}'.format(counts_data, suffix) for suffix in ('1', '2')]
    gene_renames = {column: 'gene_{}'.format(suffix) for column, suffix in zip(gene_columns, ['a', 'b'])}

    # Remove useless columns
    interactions_data_result = pd.DataFrame(
        interactions[['id_cp_interaction', 'partner_a', 'partner_b', 'receptor_1', 'receptor_2', *gene_columns,
                      'annotation_strategy']].copy())

    interactions_data_result = pd.concat([interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secreted_1'] | interactions['secreted_2'])
    interactions_data_result['is_integrin'] = (interactions['integrin_1'] | interactions['integrin_2'])

    interactions_data_result.rename(
        columns={**gene_renames, 'receptor_1': 'receptor_a', 'receptor_2': 'receptor_b'},
        inplace=True)

    # Dedupe rows and filter only desired columns
    interactions_data_result.drop_duplicates(inplace=True)

    means_columns = ['id_cp_interaction', 'interacting_pair', 'partner_a', 'partner_b', 'gene_a', 'gene_b', 'secreted',
                     'receptor_a', 'receptor_b', 'annotation_strategy', 'is_integrin']

    interactions_data_result = interactions_data_result[means_columns]

    # Round result decimals once (significant_means was previously rounded a
    # second time right after build_significant_means — rounding is
    # idempotent, so the duplicate pass was removed).
    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)

    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 1
    pvalues_result = pd.concat([interactions_data_result, result_percent], axis=1, join='inner', sort=False)

    # Document 2
    means_result = pd.concat([interactions_data_result, real_mean_analysis], axis=1, join='inner', sort=False)

    # Document 3
    significant_means_result = pd.concat([interactions_data_result, significant_mean_rank, significant_means], axis=1,
                                         join='inner', sort=False)

    # Document 5 (no document 4 is built in this version)
    deconvoluted_result = deconvoluted_complex_result_build(clusters_means,
                                                            interactions,
                                                            complex_compositions,
                                                            counts,
                                                            genes,
                                                            counts_data)

    return pvalues_result, means_result, significant_means_result, deconvoluted_result
コード例 #9
0
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Runs the cluster statistical analysis pipeline end to end.

    Steps: optionally seed the RNG, join gene metadata onto the counts,
    prefilter interactions/counts/complexes, compute the real mean and
    percent analyses per cluster combination, build the null distribution
    from `iterations` shuffles, and assemble the result documents.

    Returns (pvalues, means, significant_means, deconvoluted) DataFrames.

    Raises:
        AllCountsFilteredException: when no counts row matches a known gene.
        NoInteractionsFound: when all interactions are filtered out.

    NOTE(review): mutates `interactions` and `complex_compositions` in
    place (set_index) — callers should not reuse them afterwards.
    """
    core_logger.info(
        '[Cluster Statistical Analysis] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(threshold,
                                                                                  iterations,
                                                                                  debug_seed,
                                                                                  threads,
                                                                                  result_precision))
    if debug_seed >= 0:
        # A fixed seed makes the shuffled (null) means reproducible for debugging.
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Setted to {}'.format(debug_seed))
    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)
    # Add id multidata to counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True, right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    # Collapse rows sharing a multidata id (several genes can map to one) by mean.
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)

    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered, complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    # Real (unshuffled) mean and percent analyses over the observed clusters.
    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result,
                                                                        separator)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                               threshold,
                                                                               interactions_filtered,
                                                                               cluster_interactions,
                                                                               base_result,
                                                                               separator)

    core_logger.info('Running Statistical Analysis')
    # Null distribution: the same mean analysis over shuffled metas.
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations,
                                                                                   meta,
                                                                                   counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions,
                                                                                   complex_composition_filtered,
                                                                                   base_result,
                                                                                   threads,
                                                                                   separator)

    # Empirical pvalues: fraction of shuffled means exceeding the real mean.
    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions,
                                                                           base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        pvalue,
        counts_data
    )
    return pvalues_result, means_result, significant_means, deconvoluted_result
コード例 #10
0
def build_percent_result(real_mean_analysis: pd.DataFrame, real_perecents_analysis: pd.DataFrame,
                         statistical_mean_analysis: list, interactions: pd.DataFrame, cluster_interactions: list,
                         base_result: pd.DataFrame, separator: str) -> pd.DataFrame:
    """
    Calculates the pvalues after statistical analysis.

    For every (interaction, cluster-pair) cell:

    - If the real percent or the real mean is zero the pvalue is 1.
    - Otherwise the pvalue is the fraction of shuffled iterations whose
      mean exceeds the real mean, i.e.
      count(shuffled_mean > real_mean) / len(statistical_mean_analysis).

    Columns are named '<cluster1><separator><cluster2>' as in base_result.
    """
    core_logger.info('Building Pvalues result')
    percent_result = base_result.copy()
    total_iterations = len(statistical_mean_analysis)

    for interaction_index in interactions.index:
        for cluster_interaction in cluster_interactions:
            column_name = '{}{}{}'.format(cluster_interaction[0], separator, cluster_interaction[1])
            real_mean = real_mean_analysis.at[interaction_index, column_name]
            real_percent = real_perecents_analysis.at[interaction_index, column_name]

            if int(real_percent) == 0 or real_mean == 0:
                cell_pvalue = 1.0
            else:
                shuffled_bigger = sum(
                    1 for shuffled in statistical_mean_analysis
                    if shuffled.at[interaction_index, column_name] > real_mean)
                cell_pvalue = shuffled_bigger / total_iterations

            percent_result.at[interaction_index, column_name] = cell_pvalue

    return percent_result
コード例 #11
0
    def __getattribute__(self, name: str):
        """
        Intercepts every attribute access; logs accesses whose resolved
        value is callable, then returns the value unchanged.
        """
        attribute = object.__getattribute__(self, name)
        if hasattr(attribute, '__call__'):
            core_logger.info('Collecting {}'.format(name))
        return attribute
コード例 #12
0
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         separator: str,
         threshold: float = 0.1,
         result_precision: int = 3
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Runs the non-statistical analysis pipeline: joins gene metadata onto
    the counts, prefilters interactions/counts/complexes, computes the
    mean and percent analyses per cluster combination and assembles the
    result documents.

    Returns (means_result, significant_means, deconvoluted_result).

    Raises:
        AllCountsFilteredException: when no counts row matches a known gene.
        NoInteractionsFound: when all interactions are filtered out.

    NOTE(review): mutates `interactions` and `complex_compositions` in
    place (set_index) — callers should not reuse them afterwards.
    """
    core_logger.info(
        '[Non Statistical Method] Threshold:{} Precision:{}'.format(threshold,
                                                                    result_precision))

    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)
    # Add id multidata to counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True, right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    # Collapse rows sharing a multidata id (several genes can map to one) by mean.
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)
    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered, complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    # Mean and percent analyses over the observed clusters.
    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                   clusters,
                                                                   cluster_interactions,
                                                                   base_result,
                                                                   separator)

    percent_analysis = cpdb_analysis_helper.percent_analysis(clusters,
                                                             threshold,
                                                             interactions_filtered,
                                                             cluster_interactions,
                                                             base_result,
                                                             separator)

    means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        mean_analysis,
        percent_analysis,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        counts_data
    )
    return means_result, significant_means, deconvoluted_result
コード例 #13
0
    def _build_sqlite_uri(database_file):
        """
        Builds a SQLAlchemy SQLite URI for a user-supplied database file,
        expanding `~` and resolving symlinks/relative segments first.
        """
        expanded_path = os.path.expanduser(database_file)
        path = os.path.realpath(expanded_path)
        core_logger.info('Using custom database at {}'.format(path))
        return 'sqlite:///{}'.format(path)
コード例 #14
0
def call(
    meta: pd.DataFrame,
    counts: pd.DataFrame,
    interactions: pd.DataFrame,
    genes: pd.DataFrame,
    complexes: pd.DataFrame,
    complex_compositions: pd.DataFrame,
    iterations: int = 1000,
    threshold: float = 0.1,
    threads: int = 4,
    debug_seed: int = -1,
    result_precision: int = 3
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Cluster statistical analysis over complexes: prefilters the inputs,
    computes the real mean/percent analyses, builds the shuffled null
    distribution and assembles the five result documents.

    Returns (pvalues, means, significant_means, mean_pvalue, deconvoluted);
    five empty frames when no interaction survives the prefilters.
    """
    core_logger.info(
        '[Cluster Statistical Analysis Complex] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.
        format(threshold, iterations, debug_seed, threads, result_precision))
    if debug_seed >= 0:
        # BUGFIX: `pd.np` was deprecated in pandas 0.25 and removed in
        # pandas 1.0 — seed numpy directly instead.
        import numpy as np
        np.random.seed(debug_seed)
        core_logger.warning(
            'Debug random seed enabled. Setted to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(
        interactions, counts, genes, complexes, complex_compositions)
    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(
        ), pd.DataFrame()

    complex_significative_protein = get_complex_significative(
        complex_in_counts, counts_filtered, complex_compositions, cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, counts_filtered)
    core_logger.info('Running Real Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])
    interactions_processed = get_interactions_processed(
        interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(
        interactions_processed, cluster_interactions)

    # Real (unshuffled) analyses over the observed clusters.
    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        interactions_processed, clusters, cluster_interactions, base_result)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(
        clusters, threshold, interactions_processed, cluster_interactions,
        base_result)

    # Null distribution from shuffled metas, then the empirical pvalues.
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(
        iterations, meta, counts_filtered, interactions_processed,
        cluster_interactions, base_result, threads)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(
        real_mean_analysis, real_percents_analysis, statistical_mean_analysis,
        interactions_processed, cluster_interactions, base_result)
    pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result = build_results(
        interactions_filtered, real_mean_analysis, result_percent,
        clusters['means'], complex_compositions, counts, genes,
        result_precision)
    return pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result
コード例 #15
0
def build_results(
    interactions: pd.DataFrame, real_mean_analysis: pd.DataFrame,
    result_percent: pd.DataFrame, clusters_means: dict,
    complex_compositions: pd.DataFrame, counts: pd.DataFrame,
    genes: pd.DataFrame, result_precision: int
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Assembles the five spec-defined result documents (pvalues, means,
    significant means, mean/pvalue and deconvoluted) from the analysis data.
    """
    core_logger.info('Building Complex results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(
        interactions)

    # Work on a copy so the caller's frame is left untouched.
    interactions = interactions.copy()

    def annotate_partner(row: pd.Series, suffix: str) -> str:
        """Prefix the component name with 'simple:' or 'complex:'."""
        prefix = 'complex' if row['is_complex{}'.format(suffix)] else 'simple'
        return '{}:{}'.format(prefix, row['name{}'.format(suffix)])

    interactions['partner_a'] = interactions.apply(
        lambda row: annotate_partner(row, '_1'), axis=1)
    interactions['partner_b'] = interactions.apply(
        lambda row: annotate_partner(row, '_2'), axis=1)

    # Keep only the columns the output documents actually need.
    base_columns = [
        'id_cp_interaction', 'partner_a', 'partner_b', 'ensembl_1',
        'ensembl_2', 'source'
    ]
    interactions_data_result = pd.DataFrame(interactions[base_columns].copy())
    interactions_data_result = pd.concat(
        [interacting_pair, interactions_data_result], axis=1, sort=False)

    interactions_data_result['secreted'] = (interactions['secretion_1']
                                            | interactions['secretion_2'])
    interactions_data_result['is_integrin'] = (
        interactions['integrin_interaction_1']
        | interactions['integrin_interaction_2'])

    interactions_data_result.rename(
        columns={'ensembl_1': 'ensembl_a', 'ensembl_2': 'ensembl_b'},
        inplace=True)

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent)

    # Round every numeric document to the requested precision.
    result_percent = result_percent.round(result_precision)
    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)
    for cluster_name in clusters_means:
        clusters_means[cluster_name] = clusters_means[cluster_name].round(
            result_precision)

    # Document 1: pvalues per interaction/cluster pair.
    pvalues_result = pd.concat([interactions_data_result, result_percent],
                               axis=1,
                               join='inner',
                               sort=False)

    # Document 2: real means per interaction/cluster pair.
    means_result = pd.concat([interactions_data_result, real_mean_analysis],
                             axis=1,
                             join='inner',
                             sort=False)

    # Document 3: significant means with their rank.
    significant_mean_result = pd.concat(
        [interactions_data_result, significant_mean_rank, significant_means],
        axis=1,
        join='inner',
        sort=False)

    # Document 4: combined mean/pvalue document.
    mean_pvalue_result = cpdb_statistical_analysis_helper.mean_pvalue_result_build(
        real_mean_analysis, result_percent, interactions_data_result)

    # Document 5: deconvoluted complex components.
    deconvoluted_result = deconvoluted_complex_result_build(
        clusters_means, interactions, complex_compositions, counts, genes)

    return pvalues_result, means_result, significant_mean_result, mean_pvalue_result, deconvoluted_result
コード例 #16
0
def call(
    meta: pd.DataFrame,
    counts: pd.DataFrame,
    counts_data: str,
    interactions: pd.DataFrame,
    pvalue: float,
    separator: str,
    iterations: int = 1000,
    threshold: float = 0.1,
    threads: int = 4,
    debug_seed: int = -1,
    result_precision: int = 3,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Runs the cluster statistical analysis for simple (non-complex) interactions.

    Prefilters interactions/counts, computes the real mean and percent
    matrices, builds a null distribution by shuffling `meta` for `iterations`
    rounds across `threads` workers, and derives pvalues from it.

    Returns the (pvalues, means, significant_means, deconvoluted) documents,
    or four empty DataFrames when nothing survives the prefilters.
    """
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.
        format(threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        # BUGFIX: `pd.np` was deprecated in pandas 1.0 and removed in 2.0;
        # seed numpy directly (np is already imported at module level).
        np.random.seed(debug_seed)
        core_logger.warning(
            'Debug random seed enabled. Setted to {}'.format(debug_seed))

    interactions_filtered, counts_filtered = prefilters(
        counts, interactions, counts_data)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, counts_filtered)
    core_logger.info('Running Real Simple Analysis')
    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])

    # Empty result skeleton: one row per interaction, one column per pair.
    base_result = cpdb_statistical_analysis_helper.build_result_matrix(
        interactions_filtered, cluster_interactions, separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        interactions_filtered,
        clusters,
        cluster_interactions,
        base_result,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    real_percent_analysis = cpdb_statistical_analysis_helper.percent_analysis(
        clusters,
        threshold,
        interactions_filtered,
        cluster_interactions,
        base_result,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    # Null distribution: means recomputed on shuffled metas.
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(
        iterations,
        meta,
        counts_filtered,
        interactions_filtered,
        cluster_interactions,
        base_result,
        threads,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(
        real_mean_analysis, real_percent_analysis, statistical_mean_analysis,
        interactions_filtered, cluster_interactions, base_result, separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered, real_mean_analysis, result_percent,
        clusters['means'], result_precision, pvalue, counts_data)

    return pvalues_result, means_result, significant_means, deconvoluted_result
コード例 #17
0
def build_results(
    interactions: pd.DataFrame,
    real_mean_analysis: pd.DataFrame,
    result_percent: pd.DataFrame,
    clusters_means: dict,
    result_precision: int,
    pvalue: float,
    counts_data: str,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Builds the simple-interaction result documents from the analysis data.

    Returns (pvalues, means, significant_means, deconvoluted) DataFrames.
    FIX: the return annotation previously declared five DataFrames while the
    function returns four.
    """
    core_logger.info('Building Simple results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(
        interactions)

    # Gene columns depend on the counts type, e.g. 'ensembl_1'/'ensembl_2'.
    gene_columns = [
        '{}_{}'.format(counts_data, suffix) for suffix in ('1', '2')
    ]
    gene_renames = {
        column: 'gene_{}'.format(suffix)
        for column, suffix in zip(gene_columns, ['a', 'b'])
    }

    interactions_data_result = pd.DataFrame(interactions[[
        'id_cp_interaction', 'name_1', 'name_2', 'receptor_1', 'receptor_2',
        *gene_columns, 'annotation_strategy'
    ]].copy())

    interactions_data_result = pd.concat(
        [interacting_pair, interactions_data_result], axis=1, sort=False)

    # An interaction is secreted/integrin if either partner is.
    interactions_data_result['secreted'] = (interactions['secreted_1']
                                            | interactions['secreted_2'])
    interactions_data_result['is_integrin'] = (interactions['integrin_1']
                                               | interactions['integrin_2'])

    interactions_data_result.rename(columns={
        'name_1': 'partner_a',
        'name_2': 'partner_b',
        'receptor_1': 'receptor_a',
        'receptor_2': 'receptor_b',
        **gene_renames
    },
                                    inplace=True)

    # Every partner in this document is a simple protein.
    interactions_data_result['partner_a'] = interactions_data_result[
        'partner_a'].apply(lambda name: 'simple:{}'.format(name))
    interactions_data_result['partner_b'] = interactions_data_result[
        'partner_b'].apply(lambda name: 'simple:{}'.format(name))

    # Dedupe rows and filter only desired columns
    interactions_data_result.drop_duplicates(inplace=True)

    means_columns = [
        'id_cp_interaction', 'interacting_pair', 'partner_a', 'partner_b',
        'gene_a', 'gene_b', 'secreted', 'receptor_a', 'receptor_b',
        'annotation_strategy', 'is_integrin'
    ]

    interactions_data_result = interactions_data_result[means_columns]

    significant_mean_rank, significant_means = cpdb_statistical_analysis_helper.build_significant_means(
        real_mean_analysis, result_percent, pvalue)

    # Round every numeric document to the requested precision.
    result_percent = result_percent.round(result_precision)
    real_mean_analysis = real_mean_analysis.round(result_precision)
    significant_means = significant_means.round(result_precision)
    for key, cluster_means in clusters_means.items():
        clusters_means[key] = cluster_means.round(result_precision)

    # Document 1: pvalues per interaction/cluster pair.
    pvalues_result = pd.concat([interactions_data_result, result_percent],
                               axis=1,
                               join='inner',
                               sort=False)

    # Document 2: real means per interaction/cluster pair.
    means_result = pd.concat([interactions_data_result, real_mean_analysis],
                             axis=1,
                             join='inner',
                             sort=False)

    # Document 3: significant means with their rank.
    significant_mean_result = pd.concat(
        [interactions_data_result, significant_mean_rank, significant_means],
        axis=1,
        join='inner',
        sort=False)

    # Document 5: per-gene deconvoluted means.
    deconvoluted_result = deconvoluted_result_build(clusters_means,
                                                    interactions,
                                                    counts_data=counts_data)

    return pvalues_result, means_result, significant_mean_result, deconvoluted_result
コード例 #18
0
def call(
        meta: pd.DataFrame,
        counts: pd.DataFrame,
        counts_data: str,
        interactions: pd.DataFrame,
        genes: pd.DataFrame,
        complexes: pd.DataFrame,
        complex_compositions: pd.DataFrame,
        separator: str,
        threshold: float = 0.1,
        result_precision: int = 3
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Non-statistical analysis of complex interactions: prefilters the input,
    computes per-cluster means/percents and builds the (means,
    significant_means, deconvoluted) result documents. Returns three empty
    DataFrames when the prefilters leave no interactions.
    """
    core_logger.info(
        '[Non Statistical Method] Threshold:{} Precision:{}'.format(
            threshold, result_precision))

    cell_names = sorted(counts.columns)

    filtered_interactions, filtered_counts, complexes_in_counts = prefilters(
        interactions, counts, genes, complexes, complex_compositions,
        counts_data)
    if filtered_interactions.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Representative protein chosen for each complex.
    significative_protein = get_complex_significative(
        complexes_in_counts, filtered_counts, complex_compositions,
        cell_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, filtered_counts)
    core_logger.info('Running Complex Analysis')

    cluster_pairs = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])
    processed_interactions = get_interactions_processed(
        filtered_interactions,
        significative_protein,
        counts_data=counts_data)

    # Empty result skeleton: one row per interaction, one column per pair.
    result_skeleton = cpdb_statistical_analysis_helper.build_result_matrix(
        processed_interactions, cluster_pairs, separator)

    mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        processed_interactions,
        clusters,
        cluster_pairs,
        result_skeleton,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    percent_analysis = cpdb_analysis_helper.percent_analysis(
        clusters,
        threshold,
        processed_interactions,
        cluster_pairs,
        result_skeleton.copy(),
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    means_result, significant_means, deconvoluted_result = build_results(
        filtered_interactions, mean_analysis, percent_analysis,
        clusters['means'], complex_compositions, counts, genes,
        result_precision, counts_data)
    return means_result, significant_means, deconvoluted_result
コード例 #19
0
def build_results(winsorized_counts: pd.DataFrame,
                  result_precision: int
                  ) -> (pd.DataFrame):
    """Round the winsorized counts matrix to the requested precision."""
    core_logger.info('Building winsorized results')
    rounded_counts = winsorized_counts.round(result_precision)
    return rounded_counts
コード例 #20
0
def build_results(
        interactions: pd.DataFrame, mean_analysis: pd.DataFrame,
        percent_analysis: pd.DataFrame, clusters_means: dict,
        result_precision: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Builds the means, significant-means and deconvoluted result documents
    for simple interactions (non-statistical method).
    """
    core_logger.info('Building Simple results')
    interacting_pair = cpdb_statistical_analysis_helper.interacting_pair_build(
        interactions)

    # Columns carried through to every output document.
    info_columns = [
        'id_cp_interaction', 'name_1', 'name_2', 'ensembl_1', 'ensembl_2',
        'source'
    ]
    info = pd.DataFrame(interactions[info_columns].copy())
    info = pd.concat([interacting_pair, info], axis=1, sort=False)

    # An interaction is secreted/integrin if either partner is.
    info['secreted'] = (interactions['secretion_1']
                        | interactions['secretion_2'])
    info['is_integrin'] = (interactions['integrin_interaction_1']
                           | interactions['integrin_interaction_2'])

    column_renames = {
        'name_1': 'partner_a',
        'name_2': 'partner_b',
        'ensembl_1': 'ensembl_a',
        'ensembl_2': 'ensembl_b'
    }
    info.rename(columns=column_renames, inplace=True)

    # Every partner in this document is a simple protein.
    for partner_column in ('partner_a', 'partner_b'):
        info[partner_column] = info[partner_column].apply(
            'simple:{}'.format)

    significant_mean_rank, significant_means = cpdb_analysis_helper.build_significant_means(
        mean_analysis, percent_analysis)
    significant_means = significant_means.round(result_precision)

    # Round every numeric document to the requested precision.
    mean_analysis = mean_analysis.round(result_precision)
    for cluster_name in clusters_means:
        clusters_means[cluster_name] = clusters_means[cluster_name].round(
            result_precision)

    # Document 2: means per interaction/cluster pair.
    means_result = pd.concat([info, mean_analysis],
                             axis=1,
                             join='inner',
                             sort=False)

    # Document 3: significant means with their rank.
    significant_means_result = pd.concat(
        [info, significant_mean_rank, significant_means],
        axis=1,
        join='inner',
        sort=False)

    # Document 5: per-gene deconvoluted means.
    deconvoluted_result = deconvoluted_result_build(clusters_means,
                                                    interactions)

    return means_result, significant_means_result, deconvoluted_result
コード例 #21
0
    def __getattribute__(self, name):
        """Log each callable attribute access before handing it back."""
        attribute = object.__getattribute__(self, name)
        if callable(attribute):
            core_logger.info('Launching Method {}'.format(name))
        return attribute
コード例 #22
0
def build_percent_result(real_mean_analysis: pd.DataFrame,
                         real_perecents_analysis: pd.DataFrame,
                         statistical_mean_analysis: list,
                         interactions: pd.DataFrame,
                         cluster_interactions: list,
                         base_result: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the pvalues after statistical analysis.

    If real_percent or real_mean are zero, result_percent is 1

    If not:
    Counts how many shuffled means are more extreme than the real mean,
    divides it by the number of iterations and doubles it (two-tailed test).

    NOTE: the parameter name 'real_perecents_analysis' is misspelled but is
    kept as-is so keyword callers do not break.

    EXAMPLE:
        INPUT:

        real_mean_analysis:
                      cluster1_cluster1   cluster1_cluster2 ...
        interaction1  0.5                 0.4
        interaction2  0.0                 0.2


        real_percents_analysis:
                      cluster1_cluster1   cluster1_cluster2 ...
        interaction1  1                   0
        interaction2  0                   1

        statistical means:
        [
                        cluster1_cluster1   cluster1_cluster2 ...
        interaction1    0.6                 0.1
        interaction2    0.0                 0.2

        ,
                      cluster1_cluster1   cluster1_cluster2 ...
        interaction1  0.5                 0.4
        interaction2  0.0                 0.6
        ]

        iterations = 2


        RESULT (two-tailed; interaction2/cluster1_cluster2 is 1/2 * 2 = 1):

                        cluster1_cluster1   cluster1_cluster2 ...
        interaction1    1                   1
        interaction2    1                   1


    """
    core_logger.info('Building Pvalues result')
    percent_result = base_result.copy()

    for interaction_index, interaction in interactions.iterrows():
        for cluster_interaction in cluster_interactions:
            cluster_interaction_string = '{}_{}'.format(
                cluster_interaction[0], cluster_interaction[1])
            real_mean = real_mean_analysis.at[interaction_index,
                                              cluster_interaction_string]
            real_percent = real_perecents_analysis.at[
                interaction_index, cluster_interaction_string]
            if int(real_percent) == 0 or real_mean == 0:
                result_percent = 1.0

            else:
                # Collect this pair's mean from every shuffled iteration,
                # dropping NaNs before comparing against the real mean.
                mean_per_pair = [
                    statistical_mean.at[interaction_index,
                                        cluster_interaction_string]
                    for statistical_mean in statistical_mean_analysis
                ]
                mean_per_pair = np.array(
                    [x for x in mean_per_pair if ~np.isnan(x)])

                # NOTE(review): if every shuffled mean is NaN this divides by
                # zero — assumed not to happen in practice; confirm upstream.
                if real_mean > 0:
                    shuffled_bigger_smaller = len(
                        mean_per_pair[mean_per_pair > real_mean])
                    result_percent = shuffled_bigger_smaller / len(
                        mean_per_pair)
                    result_percent = result_percent * 2  # two-tails
                elif real_mean < 0:
                    shuffled_bigger_smaller = len(
                        mean_per_pair[mean_per_pair < real_mean])
                    result_percent = shuffled_bigger_smaller / len(
                        mean_per_pair)
                    result_percent = result_percent * 2  # two-tails

            percent_result.at[interaction_index,
                              cluster_interaction_string] = result_percent

    return percent_result