def filter_all(input_path, result_path):
    interactions = pd.read_csv(os.path.join(input_path, 'interaction_input.csv'))
    complexes = pd.read_csv(os.path.join(input_path, 'complex_generated.csv'))
    proteins = pd.read_csv(os.path.join(input_path, 'protein_generated.csv'))
    genes = pd.read_csv(os.path.join(input_path, 'gene_generated.csv'))

    output_path = _set_paths(output_dir, result_path)

    # every uniprot/complex name appearing on either side of an interaction
    interacting_partners = pd.concat(
        [interactions['partner_a'], interactions['partner_b']]).drop_duplicates()

    filtered_complexes = _filter_complexes(complexes, interacting_partners)
    write_to_file(filtered_complexes, 'complex_input.csv', output_path=output_path)

    filtered_proteins, interacting_proteins = _filter_proteins(
        proteins, filtered_complexes, interacting_partners)
    write_to_file(filtered_proteins, 'protein_input.csv', output_path=output_path)

    filtered_genes = _filter_genes(genes, filtered_proteins['uniprot'])
    write_to_file(filtered_genes, 'gene_input.csv', output_path=output_path)

    # partners that resolved to neither a kept complex nor a kept protein
    rejected_members = interacting_partners[~(
        interacting_partners.isin(filtered_complexes['complex_name'])
        | interacting_partners.isin(filtered_proteins['uniprot']))]

    if len(rejected_members):
        app_logger.warning(
            'Some interaction partners are neither known proteins nor complexes: `{}`'.format(
                ', '.join(rejected_members)))
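# A minimal, self-contained sketch of the partner-filtering pattern used in
# filter_all above, on made-up toy data (the column names mirror filter_all,
# everything else is illustrative only):
def _example_partner_filtering():
    import pandas as pd

    interactions = pd.DataFrame({'partner_a': ['P01', 'P02'],
                                 'partner_b': ['C_alpha', 'P03']})
    proteins = pd.DataFrame({'uniprot': ['P01', 'P02', 'P99']})
    complexes = pd.DataFrame({'complex_name': ['C_alpha']})

    # every id appearing on either side of an interaction, deduplicated
    partners = pd.concat([interactions['partner_a'],
                          interactions['partner_b']]).drop_duplicates()

    # partners resolvable to neither a curated protein nor a complex ('P03')
    rejected = partners[~(partners.isin(complexes['complex_name'])
                          | partners.isin(proteins['uniprot']))]
    return rejected.tolist()  # ['P03']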
def test_duplicated_gene_ensembl_is_not_in_interaction(self):
    all_genes = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'gene').get_all_expanded()
    all_interactions = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'interaction').get_all()

    genes_duplicated_ensembl = all_genes[all_genes.duplicated('ensembl', keep=False)]
    all_interactions_multidata_ids = all_interactions['multidata_1_id'].tolist() + \
        all_interactions['multidata_2_id'].tolist()

    duplicated_gene_ensembls_in_interactions = genes_duplicated_ensembl[
        genes_duplicated_ensembl['id_multidata'].apply(
            lambda id: id in all_interactions_multidata_ids)]

    if not duplicated_gene_ensembls_in_interactions.empty:
        app_logger.warning('Some duplicated ensembl ids appear in interactions')
        app_logger.warning(
            duplicated_gene_ensembls_in_interactions.to_csv(index=False))

    self.assertTrue(duplicated_gene_ensembls_in_interactions.empty,
                    'Some duplicated ensembl genes appear in interactions')
def test_interaction(self):
    interaction_df = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'interaction').get_all_expanded()

    data_not_match = False

    for interaction in interaction_entries:
        db_interaction = interaction_df
        non_match_properties = []

        for column_name in interaction:
            if interaction[column_name] is None:
                db_interaction = db_interaction[pd.isnull(db_interaction[column_name])]
            else:
                db_interaction = db_interaction[
                    db_interaction[column_name] == interaction[column_name]]

            if len(db_interaction) < 1:
                non_match_properties.append(column_name)

        if len(db_interaction) < 1:
            app_logger.warning('Failed checking interaction:')
            app_logger.warning('Expected data:')
            app_logger.warning(interaction)
            app_logger.warning('Non-matching properties:')
            app_logger.warning(non_match_properties)
            data_not_match = True

    self.assertFalse(data_not_match, 'Some interactions do not match')
def write_to_file(df: pd.DataFrame, filename: str, output_path: str,
                  output_format: Optional[str] = None):
    _, file_extension = os.path.splitext(filename)

    if output_format is None:
        if not file_extension:
            default_format = 'txt'
            default_extension = '.{}'.format(default_format)

            separator = _get_separator(default_extension)
            filename = '{}{}'.format(filename, default_extension)
        else:
            separator = _get_separator(file_extension)
    else:
        selected_extension = '.{}'.format(output_format)

        if file_extension != selected_extension:
            separator = _get_separator(selected_extension)
            filename = '{}{}'.format(filename, selected_extension)

            if file_extension:
                app_logger.warning(
                    'Selected extension mismatches output filename ({}, {}): '
                    'it will be appended => {}'.format(
                        selected_extension, file_extension, filename))
        else:
            separator = _get_separator(selected_extension)

    df.to_csv('{}/{}'.format(output_path, filename), sep=separator, index=False)
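# A hedged usage sketch for write_to_file. It assumes _get_separator maps
# '.csv' to ',' and '.txt' to '\t' (the naming suggests this, but it is an
# assumption) and that output_path already exists:
def _example_write_to_file(df):
    # no extension and no output_format: written as tab-separated 'result.txt'
    write_to_file(df, 'result', output_path='out')

    # an explicit format wins over the filename's extension: 'result.txt'
    # becomes 'result.txt.csv' and a mismatch warning is logged
    write_to_file(df, 'result.txt', output_path='out', output_format='csv')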
def analysis(meta_filename: str,
             counts_filename: str,
             counts_data: str,
             project_name: str,
             threshold: float,
             result_precision: int,
             output_path: str,
             output_format: str,
             means_result_name: str,
             significant_means_result_name: str,
             deconvoluted_result_name: str,
             verbose: bool,
             database: Optional[str],
             subsampling: bool,
             subsampling_log: bool,
             subsampling_num_pc: int,
             subsampling_num_cells: Optional[int]):
    try:
        subsampler = Subsampler(subsampling_log,
                                subsampling_num_pc,
                                subsampling_num_cells,
                                verbose) if subsampling else None

        LocalMethodLauncher(cpdb_app.create_app(verbose, database)). \
            cpdb_analysis_local_method_launcher(meta_filename,
                                                counts_filename,
                                                counts_data,
                                                project_name,
                                                threshold,
                                                output_path,
                                                output_format,
                                                means_result_name,
                                                significant_means_result_name,
                                                deconvoluted_result_name,
                                                result_precision,
                                                subsampler)
    except (ReadFileException, ParseMetaException, ParseCountsException,
            ThresholdValueException, AllCountsFilteredException) as e:
        app_logger.error(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except EmptyResultException as e:
        app_logger.warning(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except Exception:
        app_logger.error('Unexpected error')

        if verbose:
            traceback.print_exc(file=sys.stdout)
def analysis_scanpy(adata,
                    var_names,
                    obs_names,
                    obs_key,
                    var_key=None,
                    gene_id_format=None,
                    project_name='',
                    threshold=0.1,
                    result_precision='3',
                    output_path='',
                    output_format='csv',
                    means_result_name='means',
                    significant_means_result_name='significant_means',
                    deconvoluted_result_name='deconvoluted',
                    verbose=True,
                    database='latest',
                    subsampling=False,
                    subsampling_log=True,
                    subsampling_num_pc=100,
                    subsampling_num_cells=None,
                    write=False,
                    add_to_uns=True):
    try:
        subsampler = Subsampler(subsampling_log,
                                subsampling_num_pc,
                                subsampling_num_cells,
                                verbose) if subsampling else None

        out = LocalMethodLauncher(cpdb_app.create_app(verbose, database)). \
            cpdb_analysis_local_method_launcher_scanpy(adata,
                                                       var_names,
                                                       obs_names,
                                                       obs_key,
                                                       var_key,
                                                       gene_id_format,
                                                       project_name,
                                                       threshold,
                                                       output_path,
                                                       output_format,
                                                       means_result_name,
                                                       significant_means_result_name,
                                                       deconvoluted_result_name,
                                                       result_precision,
                                                       subsampler,
                                                       write,
                                                       add_to_uns)
        return out
    except (ReadFileException, ParseMetaException, ParseCountsException,
            ThresholdValueException, AllCountsFilteredException) as e:
        app_logger.error(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except EmptyResultException as e:
        app_logger.warning(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except Exception:
        app_logger.error('Unexpected error')

        if verbose:
            traceback.print_exc(file=sys.stdout)
def _set_paths(output_path, project_name):
    if not output_path:
        output_path = output_dir

    if project_name:
        output_path = os.path.realpath(
            os.path.expanduser('{}/{}'.format(output_path, project_name)))

    os.makedirs(output_path, exist_ok=True)

    # warn when the directory already contains files from a previous run
    if os.listdir(output_path):
        app_logger.warning(
            'Output directory ({}) exists and is not empty. '
            'Results can overwrite old results'.format(output_path))

    return output_path
def statistical_analysis(meta_filename: str,
                         counts_filename: str,
                         project_name: str,
                         iterations: int,
                         threshold: float,
                         result_precision: int,
                         output_path: str,
                         means_result_name: str,
                         pvalues_result_name: str,
                         significant_mean_result_name: str,
                         means_pvalues_result_name: str,
                         deconvoluted_result_name: str,
                         debug_seed: int,
                         threads: int,
                         verbose: bool) -> None:
    try:
        LocalMethodLauncher(cpdb_app.create_app(verbose)). \
            cpdb_statistical_analysis_local_method_launcher(meta_filename,
                                                            counts_filename,
                                                            project_name,
                                                            iterations,
                                                            threshold,
                                                            output_path,
                                                            means_result_name,
                                                            pvalues_result_name,
                                                            significant_mean_result_name,
                                                            means_pvalues_result_name,
                                                            deconvoluted_result_name,
                                                            debug_seed,
                                                            threads,
                                                            result_precision)
    except (ReadFileException, ParseMetaException, ParseCountsException,
            ThresholdValueException, AllCountsFilteredException) as e:
        app_logger.error(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except EmptyResultException as e:
        app_logger.warning(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except Exception:
        app_logger.error('Unexpected error')

        if verbose:
            traceback.print_exc(file=sys.stdout)
def _set_paths(output_path, subfolder):
    if not output_path:
        output_path = output_dir

    if subfolder:
        output_path = os.path.realpath(
            os.path.expanduser('{}/{}'.format(output_path, subfolder)))

    os.makedirs(output_path, exist_ok=True)

    if _path_is_not_empty(output_path):
        app_logger.warning(
            'Output directory ({}) exists and is not empty. '
            'Results can overwrite old results'.format(output_path))

    return output_path
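# A small sketch of the _set_paths behaviour: '~' is expanded, nested folders
# are created on demand, and a non-empty target only triggers a warning:
def _example_set_paths():
    import os
    import tempfile

    base = tempfile.mkdtemp()              # stands in for output_dir
    path = _set_paths(base, 'my_project')  # -> <base>/my_project, created
    assert os.path.isdir(path)

    # a second call on the now non-empty directory logs a warning, never raises
    open(os.path.join(path, 'old_result.txt'), 'w').close()
    _set_paths(base, 'my_project')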
def test_complex(self):
    dataframe = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'complex').get_all_expanded()

    data_not_match = False

    for complex in complex_entries:
        db_complex = dataframe[dataframe['name'] == complex['data']['name']]

        for complex_data in complex['data']:
            if db_complex[complex_data].iloc[0] != complex['data'][complex_data]:
                app_logger.warning(
                    'Failed checking column \'%s\' of multidata/complex with name \'%s\'' % (
                        complex_data, complex['data']['name']))
                app_logger.warning('Expected value: %s' % complex['data'][complex_data])
                app_logger.warning('Database value: %s' % db_complex[complex_data].iloc[0])
                app_logger.warning('---')
                data_not_match = True

    self.assertFalse(data_not_match, 'Some complexes do not match')
def test_protein(self):
    dataframe = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'protein').get_all_expanded()

    data_not_match = False

    for protein in protein_entries:
        db_protein = dataframe[dataframe['name'] == protein['name']]

        if db_protein.empty:
            app_logger.warning('Protein {} does not exist'.format(protein['name']))
            data_not_match = True
            continue

        for column_name in protein:
            if db_protein[column_name].iloc[0] != protein[column_name]:
                app_logger.warning(
                    'Failed checking column \'%s\' of multidata/protein with name \'%s\'' % (
                        column_name, protein['name']))
                app_logger.warning('Expected value: %s' % protein[column_name])
                app_logger.warning('Database value: %s' % db_protein[column_name].iloc[0])
                app_logger.warning('---')
                data_not_match = True

    self.assertFalse(data_not_match, 'Some proteins do not match or do not exist')
def call(downloads_path: str, fetch: bool, save_backup: bool = True) -> pd.DataFrame:
    url = 'http://www.guidetopharmacology.org/DATA/interactions.csv'
    compression = 'xz'
    file_name = 'iuphar_interaction_raw.csv.{}'.format(compression)
    download_file_path = os.path.join(downloads_path, file_name)

    def best_path():
        # prefer a fresh download over the copy bundled with the sources
        saved_file_path = os.path.join(data_dir, 'sources', file_name)

        if os.path.exists(download_file_path):
            return download_file_path

        if os.path.exists(saved_file_path):
            return saved_file_path

        app_logger.error('Could not find local source for iuphar')
        exit(1)

    try:
        if fetch:
            response = requests.get(url)

            if response.text:
                s = StringIO(response.text)
                df = pd.read_csv(s, dtype=str)
                df.drop_duplicates(inplace=True)

                if save_backup:
                    df.to_csv(download_file_path, index=False, compression=compression)
                    add_to_meta(file_name, os.path.join(downloads_path, 'meta.json'))

                return df

            # empty payload: treat a non-200 response as a failed fetch and
            # fall back to the local copy below
            if response.status_code != 200:
                raise CouldNotFetchFromApiException()

        app_logger.warning('Using local version for iuphar')
        df = pd.read_csv(best_path(), compression=compression, dtype=str)
        return df
    except (requests.exceptions.ConnectionError,
            requests.exceptions.SSLError,
            CouldNotFetchFromApiException):
        app_logger.warning('Could not fetch remote source for iuphar, using local backup')
        df = pd.read_csv(best_path(), compression=compression, dtype=str)
        return df
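# The download-or-fall-back pattern used by call above, reduced to a
# self-contained sketch (the URL and cache path are illustrative, not the
# real iuphar source):
def _fetch_csv_with_cache(url: str, cache_path: str) -> pd.DataFrame:
    import os
    from io import StringIO

    import pandas as pd
    import requests

    try:
        response = requests.get(url)
        response.raise_for_status()
        df = pd.read_csv(StringIO(response.text), dtype=str).drop_duplicates()
        df.to_csv(cache_path, index=False)  # keep a local backup for next time
        return df
    except requests.exceptions.RequestException:
        if os.path.exists(cache_path):
            return pd.read_csv(cache_path, dtype=str)
        raise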
def test_gene_are_not_duplicated(self):
    query = cellphonedb_app.cellphonedb.database_manager.database.session.query(Gene)
    dataframe = pd.read_sql(
        query.statement,
        cellphonedb_app.cellphonedb.database_manager.database.engine)

    duplicated_genes = dataframe[dataframe.duplicated(keep=False)]

    if len(duplicated_genes):
        app_logger.warning(duplicated_genes.sort_values('gene_name').to_csv(index=False))

    self.assertEqual(
        len(duplicated_genes), 0,
        'There are %s duplicated genes in the database. '
        'Please check the WARNING_duplicated_genes.csv file' % len(duplicated_genes))
def _merge_proteins(base_protein: pd.DataFrame,
                    additional: pd.DataFrame,
                    default_values: dict,
                    default_types: dict,
                    result_columns: list,
                    log_file: str,
                    quiet: bool = False) -> pd.DataFrame:
    additional = additional.copy()

    # Here we set defaults for uniprot & curated data
    base_protein = generator_helper.set_defaults(base_protein, default_values, quiet)
    additional = generator_helper.set_defaults(additional, default_values, quiet)

    # we will only use these columns
    additional = additional[result_columns]
    base_protein = base_protein[result_columns]

    # Type casting to ensure they are equal
    additional = additional.astype(default_types)
    base_protein = base_protein.astype(default_types)

    join_key = 'uniprot'

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x
    merged_protein = pd.concat([base_protein, additional],
                               ignore_index=True, sort=False).drop_duplicates()

    if not quiet and merged_protein.duplicated(join_key).any():
        app_logger.warning(
            'There are differences between merged files: logged to {}'.format(log_file))

        log = merged_protein[merged_protein.duplicated(
            join_key, keep=False)].sort_values(join_key)
        log.to_csv(log_file, index=False, sep='\t')

    # on conflicting rows the additional (user-provided) version wins
    merged_protein.drop_duplicates(join_key, keep='last', inplace=True)

    return merged_protein
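# A toy illustration of the merge semantics above: frames are concatenated,
# fully identical rows collapse, and on a conflicting 'uniprot' the row from
# the additional frame (appended last) wins:
def _example_merge_semantics():
    import pandas as pd

    base = pd.DataFrame({'uniprot': ['P01', 'P02'], 'receptor': [False, False]})
    extra = pd.DataFrame({'uniprot': ['P02'], 'receptor': [True]})

    merged = pd.concat([base, extra], ignore_index=True, sort=False).drop_duplicates()
    merged = merged.drop_duplicates('uniprot', keep='last')
    return merged  # P01 keeps receptor=False, P02 takes the curated receptor=True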
def test_duplicated_gene_ensembl_is_not_in_interaction(self):
    all_genes = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'gene').get_all_expanded()
    all_interactions = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'interaction').get_all()

    genes_duplicated_ensembl = all_genes[all_genes.duplicated('ensembl', keep=False)]
    all_interactions_multidata_ids = all_interactions['multidata_1_id'].tolist() + \
        all_interactions['multidata_2_id'].tolist()

    duplicated_gene_ensembls_in_interactions = genes_duplicated_ensembl[
        genes_duplicated_ensembl['id_multidata'].apply(
            lambda id: id in all_interactions_multidata_ids)]

    unknown_duplicated_ensembl = False

    if not duplicated_gene_ensembls_in_interactions[
            duplicated_gene_ensembls_in_interactions['ensembl'].apply(
                lambda ensembl: ensembl not in KNOWED_DUPLICATED_ENSEMBL.tolist())].empty:
        app_logger.warning('Some duplicated ensembl ids appear in interactions')
        app_logger.warning(
            duplicated_gene_ensembls_in_interactions.to_csv(index=False))
        unknown_duplicated_ensembl = True

    self.assertFalse(unknown_duplicated_ensembl,
                     'Some duplicated ensembl genes appear in interactions')

    self.assertEqual(
        len(KNOWED_DUPLICATED_ENSEMBL.drop_duplicates()),
        len(duplicated_gene_ensembls_in_interactions.drop_duplicates('ensembl')),
        'Some duplicated ensembl genes appear in interactions')
def test_all_protein_have_gene(self):
    expected_protein_without_gene = 235

    protein_query = cellphonedb_app.cellphonedb.database_manager.database.session.query(
        Protein, Multidata.name).join(Multidata)
    protein_df = pd.read_sql(
        protein_query.statement,
        cellphonedb_app.cellphonedb.database_manager.database.engine)
    protein_ids = protein_df['id_protein'].tolist()

    gene_query = cellphonedb_app.cellphonedb.database_manager.database.session.query(
        Gene.protein_id)
    gene_protein_ids = pd.read_sql(
        gene_query.statement,
        cellphonedb_app.cellphonedb.database_manager.database.engine)['protein_id'].tolist()

    protein_without_gene = []
    for protein_id in protein_ids:
        if protein_id not in gene_protein_ids:
            protein_without_gene.append(
                protein_df[protein_df['id_protein'] == protein_id]['name'].iloc[0])

    if len(protein_without_gene) != expected_protein_without_gene:
        app_logger.warning('There are {} proteins without a gene'.format(
            len(protein_without_gene)))
        app_logger.warning(protein_without_gene)

    unknown_proteins_without_gene = []
    for protein in protein_without_gene:
        if protein not in KNOWED_PROTEINS_WITHOUT_GENE:
            unknown_proteins_without_gene.append(protein)

    if unknown_proteins_without_gene:
        app_logger.warning('There are {} unknown proteins without a gene'.format(
            len(unknown_proteins_without_gene)))
        app_logger.warning(
            pd.Series(unknown_proteins_without_gene).drop_duplicates().tolist())

    self.assertEqual(expected_protein_without_gene, len(protein_without_gene),
                     'There are proteins without a gene.')
def find_database_for(value: str) -> str:
    file_candidate = os.path.expanduser(value)

    if os.path.exists(file_candidate):
        # todo: warning is perhaps not appropriate, logger doesn't allow info at this point
        app_logger.warning(
            'User selected database `{}` is available, using it'.format(file_candidate))
        return file_candidate

    _ensure_core_version_in_user_dbs()
    user_databases_prefix = os.path.expanduser(cpdb_releases)

    if not os.path.isdir(user_databases_prefix):
        app_logger.error(
            'No downloaded databases found, '
            'run the `database download` command from the cli first')
        exit(1)

    if value == 'latest' or not value:
        available = list_local_versions()
        latest_available = available[0]
        app_logger.warning(
            'Latest local available version is `{}`, using it'.format(latest_available))
        value = latest_available

    downloaded_candidate = os.path.join(user_databases_prefix, value, database_file)
    valid_database = os.path.exists(downloaded_candidate)

    if valid_database:
        # todo: warning is perhaps not appropriate, logger doesn't allow info at this point
        app_logger.warning(
            'User selected downloaded database `{}` is available, using it'.format(value))
    else:
        app_logger.warning(
            'User selected database `{}` not available, trying to download it'.format(value))
        download_database(value)
        return find_database_for(value)

    return downloaded_candidate
def test_gene(self):
    dataframe = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'gene').get_all_expanded()

    data_not_match = False

    for gene in gene_entries:
        db_gene = dataframe

        for column_name in gene:
            if gene[column_name] is None:
                db_gene = db_gene[pd.isnull(db_gene[column_name])]
            else:
                db_gene = db_gene[db_gene[column_name] == gene[column_name]]

        if len(db_gene) < 1:
            app_logger.warning('Failed checking gene:')
            app_logger.warning('Expected data:')
            app_logger.warning(gene)
            data_not_match = True

    self.assertFalse(data_not_match, 'Some genes do not match')
def _merge_complex(base_complex: pd.DataFrame,
                   additional: pd.DataFrame,
                   log_file: str) -> pd.DataFrame:
    additional = additional.copy()

    defaults = {
        'uniprot_3': np.nan,
        'uniprot_4': np.nan,
        'receptor': False,
        'integrin': False,
        'other': False,
        'other_desc': np.nan,
        'peripheral': False,
        'receptor_desc': np.nan,
        'secreted_desc': np.nan,
        'secreted_highlight': False,
        'secreted': False,
        'transmembrane': False,
        'pdb_structure': False,
        'pdb_id': np.nan,
        'stoichiometry': np.nan,
        'comments_complex': np.nan
    }

    default_types = {
        'complex_name': str,
        'uniprot_1': str,
        'uniprot_2': str,
        'uniprot_3': str,
        'uniprot_4': str,
        'transmembrane': bool,
        'peripheral': bool,
        'secreted': bool,
        'secreted_desc': str,
        'secreted_highlight': bool,
        'receptor': bool,
        'receptor_desc': str,
        'integrin': bool,
        'other': bool,
        'other_desc': str,
        'pdb_id': str,
        'pdb_structure': str,
        'stoichiometry': str,
        'comments_complex': str,
    }

    result_columns = list(default_types.keys())
    required_columns = ['complex_name', 'uniprot_1', 'uniprot_2']

    if not set(required_columns).issubset(additional):
        raise MissingRequiredColumns(list(set(required_columns).difference(additional)))

    # TODO: Fill NA
    # Here we set defaults for additional data
    additional = set_defaults(additional, defaults)

    # we will only use these columns
    additional = additional[result_columns]
    base_complex = base_complex[result_columns]

    # Type casting to ensure they are equal
    base_complex = base_complex.astype(default_types)
    additional = additional.astype(default_types)

    join_key = 'complex_name'

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x
    merged_complex = pd.concat([base_complex, additional],
                               ignore_index=True, sort=False).drop_duplicates()

    if merged_complex.duplicated(join_key).any():
        app_logger.warning(
            'There are differences between merged files: logged to {}'.format(log_file))

        log = merged_complex[merged_complex.duplicated(
            join_key, keep=False)].sort_values(join_key)
        log.to_csv(log_file, index=False, sep='\t')

    # on conflicting rows the additional (user-provided) version wins
    merged_complex.drop_duplicates(join_key, keep='last', inplace=True)

    return merged_complex
def test_complex_composition_table(self):
    df_multidata = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'multidata').get_all()
    df_complex_composition = cellphonedb_app.cellphonedb.database_manager.get_repository(
        'complex').get_all_compositions()

    number_compositions_not_match = False
    some_protein_didnt_exist = False
    some_protein_not_part_of_complex = False

    for complex in complex_entries:
        db_complex_id = df_multidata[
            df_multidata['name'] == complex['data']['name']]['id_multidata'].iloc[0]

        db_compositions = df_complex_composition[
            df_complex_composition['complex_multidata_id'] == db_complex_id]

        if len(db_compositions) != len(complex['composition']):
            app_logger.warning(
                'Failed checking number of complex_composition with name \'%s\'' % (
                    complex['data']['name']))
            app_logger.warning('Expected value: %s' % len(complex['composition']))
            app_logger.warning('Database value: %s' % len(db_compositions))
            app_logger.warning('---')
            number_compositions_not_match = True

        for protein_name in complex['composition']:
            db_complex_composition_ids = db_compositions['protein_multidata_id'].tolist()

            composition_multidata_id = df_multidata[
                df_multidata['name'] == protein_name]['id_multidata']

            if not len(composition_multidata_id):
                app_logger.warning(
                    'Failed finding protein \'%s\' in multidata from complex name \'%s\'' % (
                        protein_name, complex['data']['name']))
                some_protein_didnt_exist = True
                continue

            if composition_multidata_id.iloc[0] not in db_complex_composition_ids:
                app_logger.warning(
                    'Failed finding protein \'%s\' in composition from complex name \'%s\'' % (
                        protein_name, complex['data']['name']))
                some_protein_not_part_of_complex = True

    self.assertFalse(number_compositions_not_match,
                     'Number of complex compositions does not match')
    self.assertFalse(some_protein_didnt_exist,
                     'Some complex_composition proteins do not exist')
    self.assertFalse(some_protein_not_part_of_complex,
                     'Some complex_composition proteins are not part of their complex')
def statistical_analysis(meta_filename: str,
                         counts_filename: str,
                         counts_data='ensembl',
                         project_name='',
                         threshold=0.1,
                         result_precision='3',
                         output_path='',
                         output_format='csv',
                         means_result_name='means',
                         significant_means_result_name='significant_means',
                         deconvoluted_result_name='deconvoluted',
                         verbose=True,
                         database='latest',
                         subsampling=False,
                         subsampling_log=True,
                         subsampling_num_pc=100,
                         subsampling_num_cells=None,
                         debug_seed='-1',
                         pvalue=0.05,
                         pvalues_result_name='pvalues',
                         iterations=1000,
                         threads=4) -> None:
    database = choose_database(None, None, value=database)

    try:
        subsampler = Subsampler(subsampling_log,
                                subsampling_num_pc,
                                subsampling_num_cells,
                                verbose) if subsampling else None

        LocalMethodLauncher(cpdb_app.create_app(verbose, database)). \
            cpdb_statistical_analysis_local_method_launcher(meta_filename,
                                                            counts_filename,
                                                            counts_data,
                                                            project_name,
                                                            iterations,
                                                            threshold,
                                                            output_path,
                                                            output_format,
                                                            means_result_name,
                                                            pvalues_result_name,
                                                            significant_means_result_name,
                                                            deconvoluted_result_name,
                                                            debug_seed,
                                                            threads,
                                                            result_precision,
                                                            pvalue,
                                                            subsampler)
    except (ReadFileException, ParseMetaException, ParseCountsException,
            ThresholdValueException, AllCountsFilteredException) as e:
        app_logger.error(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except EmptyResultException as e:
        app_logger.warning(
            str(e)
            + (':' if (hasattr(e, 'description') and e.description)
               or (hasattr(e, 'hint') and e.hint) else '')
            + (' {}.'.format(e.description)
               if hasattr(e, 'description') and e.description else '')
            + (' {}.'.format(e.hint) if hasattr(e, 'hint') and e.hint else ''))
    except Exception:
        app_logger.error('Unexpected error')

        if verbose:
            traceback.print_exc(file=sys.stdout)
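# A hedged usage sketch for the Python API above; the file names are
# illustrative and a CellPhoneDB database is assumed to be downloaded already:
def _example_statistical_analysis():
    statistical_analysis('meta.txt',
                         'counts.txt',
                         counts_data='ensembl',
                         iterations=1000,
                         threshold=0.1,
                         output_path='out/statistical',
                         subsampling=True,
                         subsampling_num_cells=5000)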