Example 1
    def subsample(self, counts: pd.DataFrame) -> pd.DataFrame:
        # counts is genes x cells, so shape[1] is the number of input cells
        input_cells = counts.shape[1]

        if self.num_cells is None:
            self.num_cells = int(input_cells / 3)

        core_logger.info('Subsampling {} to {}'.format(input_cells, self.num_cells))

        counts_t = counts.T

        if self.log:
            pca_input = np.log1p(counts_t)
        else:
            pca_input = counts_t

        try:
            u, s, vt = pca(pca_input.values, k=self.num_pc)
            x_dimred = u[:, :self.num_pc] * s[:self.num_pc]
            sketch_index = gs(x_dimred, self.num_cells, replace=False)
            x_matrix = counts_t.iloc[sketch_index]
        except Exception as e:
            core_logger.warning('Subsampling failed: returning original counts.')
            if self.verbose:
                core_logger.warning(str(e))
            return counts

        core_logger.info('Done subsampling {} to {}'.format(input_cells, self.num_cells))

        return x_matrix.T
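
The method above depends on fbpca.pca and geosketch.gs, which are not shown here. A minimal self-contained sketch of the same idea, substituting numpy's exact SVD for fbpca's randomized PCA and plain uniform sampling for the geometric sketch (so this toy version loses gs's even coverage of the embedding; every name below is illustrative):

import numpy as np
import pandas as pd

def subsample_sketch(counts: pd.DataFrame, num_cells: int, num_pc: int = 30,
                     log: bool = True, seed: int = 0) -> pd.DataFrame:
    """Toy version of the method above; counts is genes x cells."""
    counts_t = counts.T                                  # cells x genes
    x = np.log1p(counts_t.values) if log else counts_t.values
    # Exact SVD stands in for fbpca's randomized PCA.
    u, s, _ = np.linalg.svd(x - x.mean(axis=0), full_matrices=False)
    x_dimred = u[:, :num_pc] * s[:num_pc]
    # geosketch.gs would pick cells covering x_dimred evenly; uniform
    # sampling is used here only to keep the sketch dependency-free.
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(x_dimred), size=num_cells, replace=False)
    return counts_t.iloc[idx].T                          # genes x sampled cells

Example 2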
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         log2_transform: bool = True
         ) -> pd.DataFrame:
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Debug-seed:{} Threads:{} Precision:{} Log2-Transformed: {}'.format(debug_seed,
                                                                            threads, 
                                                                            result_precision,
                                                                            log2_transform))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    core_logger.info('Running Winsorization')
    
    winsorized_counts = cpdb_statistical_analysis_helper.log2tf_winsorizer(meta,
                                                                           counts,
                                                                           log2_transform, 
                                                                           threads)

    return build_results(winsorized_counts, result_precision)
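
cpdb_statistical_analysis_helper.log2tf_winsorizer is not shown. Assuming it clips each gene's extreme values to per-gene quantile bounds before an optional log2 transform, a minimal stand-in (the function name and the quantile cutoffs here are hypothetical) could look like:

import numpy as np
import pandas as pd

def log2tf_winsorize(counts: pd.DataFrame, lower_q: float = 0.01,
                     upper_q: float = 0.99,
                     log2_transform: bool = True) -> pd.DataFrame:
    """Clip each gene (row) to its own quantile bounds, then log2(x + 1)."""
    lo = counts.quantile(lower_q, axis=1)
    hi = counts.quantile(upper_q, axis=1)
    clipped = counts.clip(lower=lo, upper=hi, axis=0)
    return np.log2(clipped + 1) if log2_transform else clipped

Example 3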
def call(meta: pd.DataFrame, counts: pd.DataFrame, interactions: pd.DataFrame, genes: pd.DataFrame,
         complexes: pd.DataFrame, complex_compositions: pd.DataFrame, iterations: int = 1000, threshold: float = 0.1,
         threads: int = 4, debug_seed: int = -1, round_decimals: int = 1) -> (
        pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Complex] Threshold:{} Iterations:{} Debug-seed:{} Threads:{}'.format(
            threshold, iterations, debug_seed, threads))
    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(interactions, counts, genes, complexes,
                                                                           complex_compositions)
    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(complex_in_counts, counts_filtered, complex_compositions,
                                                              cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)
    core_logger.info('Running Real Complex Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])
    interactions_processed = get_interactions_processed(interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_processed, cluster_interactions)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_processed, clusters,
                                                                        cluster_interactions, base_result)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters, threshold,
                                                                               interactions_processed,
                                                                               cluster_interactions,
                                                                               base_result)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations, meta, counts_filtered,
                                                                                   interactions_processed,
                                                                                   cluster_interactions, base_result,
                                                                                   threads)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_processed,
                                                                           cluster_interactions, base_result)
    pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result = build_results(
        interactions_filtered,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_compositions,
        counts,
        genes,
        round_decimals
    )
    return pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result
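
The statistical core of shuffled_analysis and build_percent_result is a permutation test: cluster labels are shuffled, cluster means recomputed, and the p-value is the fraction of shuffles whose mean reaches the real one. A toy version for a single interaction and a single ordered cluster pair (the names below are illustrative, not the helper's API):

import numpy as np
import pandas as pd

def permutation_pvalue(expr_a: pd.Series, expr_b: pd.Series, labels: pd.Series,
                       cluster_1: str, cluster_2: str,
                       iterations: int = 1000, seed: int = 0) -> float:
    """Fraction of label shufflings whose pair mean reaches the real pair mean."""
    def pair_mean(lbls: pd.Series) -> float:
        return (expr_a[lbls == cluster_1].mean() +
                expr_b[lbls == cluster_2].mean()) / 2

    real = pair_mean(labels)
    rng = np.random.default_rng(seed)
    shuffled = labels.to_numpy().copy()
    hits = 0
    for _ in range(iterations):
        rng.shuffle(shuffled)
        if pair_mean(pd.Series(shuffled, index=labels.index)) >= real:
            hits += 1
    return hits / iterations

Here expr_a and expr_b would be the two partners' expression across all cells and labels the per-cell cluster assignment; the real helpers vectorize this over every interaction and cluster pair at once, across threads.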
Example 4
    def add(self, complexes):
        """
        Uploads complex data from csv.

        - Creates new complexes in Multidata table
        - Creates reference in Complex table
        - Creates complex composition entries to define each complex.
        """

        if complexes.empty:
            return

        existing_complexes = self.database_manager.database.session.query(Multidata.name).all()
        existing_complexes = [c[0] for c in existing_complexes]
        proteins = self.database_manager.database.session.query(Multidata.name, Multidata.id_multidata).join(
            Protein).all()
        proteins = {p[0]: p[1] for p in proteins}

        # Get complex composition info
        complete_indices = []
        incomplete_indices = []
        missing_proteins = []
        complex_map = {}
        for index, row in complexes.iterrows():
            missing = False
            protein_id_list = []
            for protein in ['protein_1', 'protein_2',
                            'protein_3', 'protein_4']:
                if not pd.isnull(row[protein]):
                    protein_id = proteins.get(row[protein])
                    if protein_id is None:
                        missing = True
                        missing_proteins.append(row[protein])
                    else:
                        protein_id_list.append(protein_id)
            if not missing:
                complex_map[row['name']] = protein_id_list
                complete_indices.append(int(index))
            else:
                incomplete_indices.append(index)

        if len(incomplete_indices) > 0:
            core_logger.warning('MISSING PROTEINS:')
            for protein in missing_proteins:
                core_logger.warning(protein)

            core_logger.warning('COMPLEXES WITH MISSING PROTEINS:')
            core_logger.warning(complexes.iloc[incomplete_indices, :]['name'])

        # Insert complexes
        if not complexes.empty:
            # Remove unwanted columns
            removal_columns = list(
                [x for x in complexes.columns if 'protein_' in x or 'Name_' in x or 'Unnamed' in x])
            # removal_columns += ['comments']
            complexes.drop(removal_columns, axis=1, inplace=True)

            # Remove rows with missing complexes
            complexes = complexes.iloc[complete_indices, :]

            # Convert ints to bool
            bools = ['receptor', 'other', 'secreted_highlight', 'transmembrane', 'secreted',
                     'peripheral']
            complexes[bools] = complexes[bools].astype(bool)

            # Drop existing complexes
            complexes = complexes[complexes['name'].apply(
                lambda x: x not in existing_complexes)]

            multidata_df = filters.remove_not_defined_columns(complexes.copy(),
                                                              self.database_manager.get_column_table_names(
                                                                  'multidata_table'))

            multidata_df = self._add_complex_optimitzations(multidata_df)
            multidata_df.to_sql(name='multidata_table', if_exists='append', con=self.database_manager.database.engine,
                                index=False, chunksize=50)

        # Now find the ids of the new complex rows
        new_complexes = self.database_manager.database.session.query(Multidata.name, Multidata.id_multidata).all()
        new_complexes = {c[0]: c[1] for c in new_complexes}

        # Build set of complexes
        complex_set = []
        complex_table = []
        for complex_name in complex_map:
            complex_id = new_complexes[complex_name]
            for protein_id in complex_map[complex_name]:
                complex_set.append((complex_id, protein_id, len(complex_map[complex_name])))
            complex_table.append({'complex_multidata_id': complex_id, 'name': complex_name})

        # Insert complex composition
        complex_set_df = pd.DataFrame(complex_set,
                                      columns=['complex_multidata_id', 'protein_multidata_id', 'total_protein'])

        complex_table_df = pd.DataFrame(complex_table)
        complex_table_df = pd.merge(complex_table_df, complexes, on='name')

        filters.remove_not_defined_columns(complex_table_df,
                                           self.database_manager.get_column_table_names('complex_table'))

        complex_table_df.to_sql(
            name='complex_table', if_exists='append',
            con=self.database_manager.database.engine, index=False, chunksize=50)

        complex_set_df.to_sql(
            name='complex_composition_table', if_exists='append',
            con=self.database_manager.database.engine, index=False, chunksize=50)
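
The inserts above rely on pandas' DataFrame.to_sql with a SQLAlchemy engine. A self-contained sketch against an in-memory SQLite engine (standing in for database_manager.database.engine, with made-up rows) shows the append-with-chunksize pattern:

import pandas as pd
from sqlalchemy import create_engine

# An in-memory SQLite engine stands in for database_manager.database.engine.
engine = create_engine('sqlite://')

complex_set_df = pd.DataFrame(
    [(1, 10, 2), (1, 11, 2)],
    columns=['complex_multidata_id', 'protein_multidata_id', 'total_protein'])

# chunksize batches the INSERTs; if_exists='append' preserves existing rows.
complex_set_df.to_sql(name='complex_composition_table', con=engine,
                      if_exists='append', index=False, chunksize=50)

print(pd.read_sql('SELECT * FROM complex_composition_table', engine))

Example 5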
def call(
    meta: pd.DataFrame,
    counts: pd.DataFrame,
    counts_data: str,
    interactions: pd.DataFrame,
    pvalue: float,
    separator: str,
    iterations: int = 1000,
    threshold: float = 0.1,
    threads: int = 4,
    debug_seed: int = -1,
    result_precision: int = 3,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.
        format(threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning(
            'Debug random seed enabled. Set to {}'.format(debug_seed))

    interactions_filtered, counts_filtered = prefilters(
        counts, interactions, counts_data)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(
        meta, counts_filtered)
    core_logger.info('Running Real Simple Analysis')
    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(
        clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(
        interactions_filtered, cluster_interactions, separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(
        interactions_filtered,
        clusters,
        cluster_interactions,
        base_result,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    real_percent_analysis = cpdb_statistical_analysis_helper.percent_analysis(
        clusters,
        threshold,
        interactions_filtered,
        cluster_interactions,
        base_result,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(
        iterations,
        meta,
        counts_filtered,
        interactions_filtered,
        cluster_interactions,
        base_result,
        threads,
        separator,
        suffixes=('_1', '_2'),
        counts_data=counts_data)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(
        real_mean_analysis, real_percent_analysis, statistical_mean_analysis,
        interactions_filtered, cluster_interactions, base_result, separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered, real_mean_analysis, result_percent,
        clusters['means'], result_precision, pvalue, counts_data)

    return pvalues_result, means_result, significant_means, deconvoluted_result
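
mean_analysis and percent_analysis are not shown; conceptually they compute, per cluster, the mean expression of each gene and the fraction of cells expressing it, with threshold deciding which genes count as expressed. A minimal sketch, assuming meta is indexed by cell barcode and carries a cell_type column (both assumptions, not the helper's real signature):

import pandas as pd

def cluster_means_and_percents(counts: pd.DataFrame, meta: pd.DataFrame,
                               threshold: float = 0.1):
    """counts: genes x cells; meta: indexed by cell barcode, 'cell_type' column."""
    means = {}
    percents = {}
    for cluster, cells in meta.groupby('cell_type').groups.items():
        sub = counts[list(cells)]
        means[cluster] = sub.mean(axis=1)
        percents[cluster] = (sub > 0).mean(axis=1)
    # A gene "passes" in a cluster when enough of its cells express it.
    passes = pd.DataFrame(percents) > threshold
    return pd.DataFrame(means), passes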
Example 6
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(threshold,
                                                                                  iterations,
                                                                                  debug_seed,
                                                                                  threads,
                                                                                  result_precision))
    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))
    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)
    # Add id multidata to counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True, right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced,
                                                    counts,
                                                    complexes,
                                                    complex_compositions)

    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered, complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions,
                                                                       separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered,
                                                                        clusters,
                                                                        cluster_interactions,
                                                                        base_result,
                                                                        separator)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters,
                                                                               threshold,
                                                                               interactions_filtered,
                                                                               cluster_interactions,
                                                                               base_result,
                                                                               separator)

    core_logger.info('Running Statistical Analysis')
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations,
                                                                                   meta,
                                                                                   counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions,
                                                                                   complex_composition_filtered,
                                                                                   base_result,
                                                                                   threads,
                                                                                   separator)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions,
                                                                           base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        pvalue,
        counts_data
    )
    return pvalues_result, means_result, significant_means, deconvoluted_result
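
The preamble above attaches id_multidata to the counts matrix and then averages duplicate ids. A toy run of those merge/collapse steps, plus the ordered cluster pairs that get_cluster_combinations presumably produces (a guess from its name; all data here is made up):

import itertools
import pandas as pd

counts = pd.DataFrame({'c1': [1, 3, 5], 'c2': [2, 4, 6]},
                      index=['ENSG1', 'ENSG2', 'ENSG3'])
genes = pd.DataFrame({'id_multidata': [10, 10, 11],
                      'ensembl': ['ENSG1', 'ENSG2', 'ENSG3']})

# Attach id_multidata by matching the counts index to the ensembl column.
merged = counts.merge(genes, left_index=True, right_on='ensembl')
merged = merged.set_index('id_multidata')[counts.columns].astype('float32')

# Two ensembl ids mapping to one multidata id are averaged away.
collapsed = merged.groupby(merged.index).mean()

# Ordered cluster pairs, e.g. [('A', 'A'), ('A', 'B'), ('B', 'A'), ('B', 'B')]:
combinations = list(itertools.product(['A', 'B'], repeat=2))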
Example 7
    def blend_dataframes(left_df,
                         left_column_names,
                         right_df,
                         db_column_name,
                         db_table_name,
                         quiet=False):
        result_df = left_df.copy()

        if not quiet and db_column_name in left_df.columns:
            core_logger.debug(
                'WARNING | BLENDING: column "%s" already exists in original df'
                % (db_column_name))

        unique_slug = '_EDITNAME'
        unique_original_column_names = [("%s%s" % (column_name, unique_slug))
                                        for column_name in left_column_names]

        result_df.rename(index=str,
                         columns=dict(
                             zip(left_column_names,
                                 unique_original_column_names)),
                         inplace=True)

        not_existent_proteins = []

        for i in range(0, len(unique_original_column_names)):
            result_df = Repository._blend_column(
                result_df, right_df, unique_original_column_names[i],
                db_column_name, db_table_name, i + 1)

            not_existent_proteins = not_existent_proteins + \
                                    result_df[result_df['_merge_%s' % (i + 1)] == 'left_only'][
                                        unique_original_column_names[i]].drop_duplicates().tolist()
        not_existent_proteins = list(set(not_existent_proteins))

        for i in range(1, len(unique_original_column_names) + 1):
            result_df = result_df[(result_df['_merge_%s' % i] == 'both')]

        result_df.drop([
            '_merge_%s' % merge_column
            for merge_column in range(1,
                                      len(unique_original_column_names) + 1)
        ] + unique_original_column_names,
                       axis=1,
                       inplace=True)

        if len(left_column_names) == 1:
            result_df.rename(index=str,
                             columns={
                                 '%s_1' % db_column_name: db_column_name,
                                 '%s_1_id' % db_table_name:
                                 '%s_id' % db_table_name
                             },
                             inplace=True)

        if not quiet and not_existent_proteins:
            core_logger.warning(
                'WARNING | BLENDING: SOME %s VALUES DID NOT EXIST IN %s' %
                (db_column_name, db_table_name))
            core_logger.warning(not_existent_proteins)

        return result_df
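
_blend_column is not shown, but the '_merge_%s' columns and 'left_only' checks point to pandas' merge indicator. A small sketch of that pattern (column and table contents are made up):

import pandas as pd

left = pd.DataFrame({'protein_EDITNAME': ['P1', 'P2', 'PX']})
right = pd.DataFrame({'name': ['P1', 'P2'], 'protein_id': [1, 2]})

# indicator adds a column flagging 'left_only' rows, i.e. values
# missing from the database side of the merge.
blended = left.merge(right, how='left',
                     left_on='protein_EDITNAME', right_on='name',
                     indicator='_merge_1')

missing = blended.loc[blended['_merge_1'] == 'left_only',
                      'protein_EDITNAME'].drop_duplicates().tolist()
print(missing)  # ['PX']

Keeping only the 'both' rows afterwards drops the unmatched entries, which is exactly what the loop over '_merge_%s' columns does in blend_dataframes.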