def _load_meta_counts(counts_filename: str, meta_filename: str) -> (pd.DataFrame, pd.DataFrame):
    """Read the counts and meta tables from disk and return them as-is.

    NOTE(review): the previous docstring claimed ``:raise ParseMetaException``,
    but nothing in this body raises it — only the underlying file readers can
    fail. Also, unlike sibling loaders in this file, ``meta`` is read WITHOUT
    ``index_column_first=True``; confirm whether that asymmetry is intended.

    :param counts_filename: path to the counts table (first column used as index).
    :param meta_filename: path to the meta table (read with default indexing).
    :return: tuple ``(counts, meta)`` of DataFrames.
    """
    meta = utils.read_data_table_from_file(os.path.realpath(meta_filename))
    counts = utils.read_data_table_from_file(os.path.realpath(counts_filename), index_column_first=True)

    return counts, meta
def add_hla_genes(
        gene_base_filename: str,
        hla_genes_filename: str,
        result_filename: str = 'gene_hla.csv',
) -> None:
    """Append the curated HLA gene rows to the base gene table and save it.

    :param gene_base_filename: table with the base gene list.
    :param hla_genes_filename: table with the HLA genes to append.
    :param result_filename: output CSV path (written without the index).
    """
    gene_base = utils.read_data_table_from_file(gene_base_filename)
    hla_genes = utils.read_data_table_from_file(hla_genes_filename)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with ignore_index=True is the supported equivalent.
    genes_merged = pd.concat([gene_base, hla_genes], ignore_index=True)
    genes_merged.to_csv(result_filename, index=False)
def remove_genes_in_file(
        gene_base_filename: str,
        remove_genes_filename: str,
        result_filename: str = 'gene_filtered.csv',
) -> None:
    """Drop the genes listed in one table from the base gene table.

    Reads both input tables, delegates the actual filtering to
    ``remove_genes.remove_genes_in_file`` and writes the result as CSV.
    """
    base_table = utils.read_data_table_from_file(gene_base_filename)
    removal_table = utils.read_data_table_from_file(remove_genes_filename)

    filtered_table = remove_genes.remove_genes_in_file(base_table, removal_table)
    filtered_table.to_csv(result_filename, index=False)
def _load_meta_counts(counts_filename, meta_filename):
    """Read counts and meta tables and normalize meta to a single
    'cell_type' column indexed by cell name.

    Exits the process with status 1 if either file cannot be read.

    :return: tuple ``(counts, meta)`` where ``meta`` has one 'cell_type'
        column built from the first column of the raw meta table.
    """
    try:
        meta_raw = utils.read_data_table_from_file(
            os.path.realpath(meta_filename), index_column_first=True)
        counts = utils.read_data_table_from_file(
            os.path.realpath(counts_filename), index_column_first=True)
    except ReadFileException as e:
        app_logger.error(e)
        exit(1)

    # Keep only the first metadata column, exposed under a fixed name.
    meta = pd.DataFrame(index=meta_raw.index)
    meta['cell_type'] = meta_raw.iloc[:, 0]

    # BUG FIX: the normalized `meta` frame was built and then discarded —
    # the raw table was returned instead, making the normalization dead
    # code. Sibling loaders in this file return the normalized frame.
    return counts, meta
def remove_genes_in_file(gene_base_filename: str,
                         remove_genes_filename: str,
                         result_filename: str = 'gene_filtered.csv') -> None:
    """Filter genes listed in one app-data file out of the base gene table.

    Both inputs are resolved under ``app.data_dir``; the filtered table is
    written under ``app.output_dir``.
    """
    base_path = '{}/{}'.format(app.data_dir, gene_base_filename)
    removal_path = '{}/{}'.format(app.data_dir, remove_genes_filename)

    base_table = utils.read_data_table_from_file(base_path)
    removal_table = utils.read_data_table_from_file(removal_path)

    filtered_table = remove_genes.remove_genes_in_file(base_table, removal_table)
    filtered_table.to_csv('{}/{}'.format(app.output_dir, result_filename), index=False)
def generate_genes_from_uniprot_ensembl_db(
        uniprot_db_filename: str,
        ensembl_db_filename: str,
        proteins_filename: str,
        result_filename: str = 'gene_uniprot_ensembl_merged.csv',
) -> None:
    """Merge the UniProt, Ensembl and protein source tables into one gene
    table and write it as CSV (no index column)."""
    source_tables = [
        utils.read_data_table_from_file(filename)
        for filename in (uniprot_db_filename, ensembl_db_filename, proteins_filename)
    ]
    uniprots, ensembls, proteins = source_tables

    merged_genes = mergers_genes.merge_genes_from_uniprot_ensembl_db(
        ensembls, proteins, uniprots)
    merged_genes.to_csv(result_filename, index=False)
def validate_gene_list(gene_filename: str) -> None:
    """Read a gene table and print whether it passes validation."""
    gene_table = utils.read_data_table_from_file(gene_filename)

    verdict = ('GENE LIST IS VALID'
               if gene_validators.validate_genes(gene_table)
               else 'GENE LIST IS NOT VALID')
    print(verdict)
def _load_meta_counts(counts_filename: str, meta_filename: str) -> (pd.DataFrame, pd.DataFrame):
    """Read counts and meta tables and normalize meta to a single
    'cell_type' column indexed by cell name.

    :raise ParseMetaException: when the meta table cannot be normalized.
    :return: tuple ``(counts, meta)`` of DataFrames.
    """
    meta_raw = utils.read_data_table_from_file(
        os.path.realpath(meta_filename), index_column_first=True)
    counts = utils.read_data_table_from_file(
        os.path.realpath(counts_filename), index_column_first=True)

    try:
        # Keep only the first metadata column, exposed under a fixed name.
        meta = pd.DataFrame(index=meta_raw.index)
        meta['cell_type'] = meta_raw.iloc[:, 0]
    except Exception as e:
        # A bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # narrow it and chain the cause so debugging keeps the traceback.
        raise ParseMetaException from e

    return counts, meta
def add_hla_genes(gene_base_filename: str,
                  hla_genes_filename: str,
                  data_path: str = '',
                  result_filename: str = 'gene_hla.csv',
                  result_path: str = '') -> None:
    """Append the curated HLA gene rows to the base gene table and save it.

    :param data_path: directory holding both inputs; defaults to ``app.data_dir``.
    :param result_path: output directory; defaults to ``app.output_dir``.
    """
    if not data_path:
        data_path = app.data_dir
    # BUG FIX: an empty result_path previously produced '/gene_hla.csv'
    # (filesystem root). Default it to the app output dir, matching the
    # other generators in this file (cf. generate_genes_from_uniprot_ensembl_db).
    if not result_path:
        result_path = app.output_dir

    gene_base_filename = '{}/{}'.format(data_path, gene_base_filename)
    hla_genes_filename = '{}/{}'.format(data_path, hla_genes_filename)

    gene_base = utils.read_data_table_from_file(gene_base_filename)
    hla_genes = utils.read_data_table_from_file(hla_genes_filename)

    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
    genes_merged = pd.concat([gene_base, hla_genes], ignore_index=True)
    genes_merged.to_csv('{}/{}'.format(result_path, result_filename), index=False)
def assert_open_file(self, base_name, extension, index_column_first, separator):
    """Assert that parsing ``<fixtures>/<base_name>.<extension>`` yields the
    reference table stored in ``example_data.csv``."""
    fixtures_dir = '{}/fixtures'.format(self.current_dir)
    fixture_path = '{}/{}.{}'.format(fixtures_dir, base_name, extension)

    parsed_table = utils.read_data_table_from_file(
        fixture_path, index_column_first, separator)
    expected_result = pd.read_csv('{}/example_data.csv'.format(fixtures_dir))

    self.assertTrue(parsed_table.equals(expected_result))
def generate_genes_from_uniprot_ensembl_db(
        uniprot_db_filename: str,
        ensembl_db_filename: str,
        proteins_filename: str,
        result_filename: str = 'gene_uniprot_ensembl_merged.csv',
        result_path: str = ''):
    """Merge UniProt/Ensembl/protein source tables (read from the app data
    dir) into one gene table and write it under ``result_path``, falling
    back to the app output dir when no path is given."""
    destination_dir = result_path if result_path else app.output_dir

    uniprots = utils.read_data_table_from_file(
        '{}/{}'.format(app.data_dir, uniprot_db_filename))
    ensembls = utils.read_data_table_from_file(
        '{}/{}'.format(app.data_dir, ensembl_db_filename))
    proteins = utils.read_data_table_from_file(
        '{}/{}'.format(app.data_dir, proteins_filename))

    merged_genes = mergers_genes.merge_genes_from_uniprot_ensembl_db(
        ensembls, proteins, uniprots)
    merged_genes.to_csv('{}/{}'.format(destination_dir, result_filename), index=False)
def validate_gene_list(gene_filename: str, data_path: str) -> None:
    """Validate the gene table at ``data_path/gene_filename`` and print the
    verdict; an empty ``data_path`` falls back to the app output dir."""
    base_dir = data_path if data_path else app.output_dir
    gene_table = utils.read_data_table_from_file(
        '{}/{}'.format(base_dir, gene_filename))

    verdict = ('GENE LIST IS VALID'
               if gene_validators.validate_genes(gene_table)
               else 'GENE LIST IS NOT VALID')
    print(verdict)
def generate_interactions(imex_raw_filename: str,
                          iuphar_raw_filename: str,
                          database_proteins_filename: str,
                          database_gene_filename: str,
                          database_complex_filename: str,
                          interaction_to_remove_filename: str,
                          interaction_curated_filename: str) -> None:
    """Build the final interaction table from IMEx + IUPHAR sources.

    Pipeline: parse IMEx, fetch/parse IUPHAR, merge both, drop complex
    interactions, remove blacklisted interactions, add curated ones, and
    write ``interaction.csv`` to the module-level ``output_dir``.

    NOTE(review): ``data_dir`` and ``output_dir`` are module-level names not
    visible in this block — confirm where they are defined.
    """
    # Load all inputs up front; IMEx uses '-' as its NA marker.
    interactions_base = utils.read_data_table_from_file(
        '%s/%s' % (data_dir, imex_raw_filename), na_values='-')
    proteins = pd.read_csv('%s/%s' % (data_dir, database_proteins_filename))
    genes = pd.read_csv('%s/%s' % (data_dir, database_gene_filename))
    complexes = pd.read_csv('%s/%s' % (data_dir, database_complex_filename))
    interactions_to_remove = pd.read_csv(
        '%s/%s' % (data_dir, interaction_to_remove_filename))
    interaction_curated = pd.read_csv('%s/%s' % (data_dir, interaction_curated_filename))

    print('generating imex file')
    imex_interactions = parse_interactions_imex(interactions_base, proteins, genes)

    print('Getting Iuphar interactions')
    # Downloads (or reuses) the raw IUPHAR file; 'yes' auto-confirms the download prompt.
    iuphar_original = get_iuphar_guidetopharmacology.call(
        iuphar_raw_filename, data_dir, '{}/downloads'.format(data_dir),
        default_download_response='yes')

    print('generating iuphar file')
    iuphar_interactions = parse_iuphar_guidetopharmacology.call(
        iuphar_original, genes, proteins)

    print('merging iuphar/imex')
    merged_interactions = merge_iuphar_imex_interactions(
        iuphar_interactions, imex_interactions)

    print('removing complex interactions')
    no_complex_interactions = only_noncomplex_interactions(
        merged_interactions, complexes)

    print('removing selected interactions')
    clean_interactions = remove_interactions_in_file(no_complex_interactions,
                                                     interactions_to_remove)

    print('adding curated interaction')
    interactions_with_curated = add_curated(clean_interactions, interaction_curated)

    interactions_with_curated.to_csv('%s/interaction.csv' % output_dir, index=False)
def wrapper(namefile='', data_path=''):
    """Read a collector input table and feed it to the named collect method.

    NOTE(review): this is a closure — ``method_name``, ``data_dir``, ``self``,
    ``create_app`` and ``cellphonedb_app`` come from the enclosing scope not
    visible here; confirm their semantics against the outer function.
    """
    app_logger.info('Collecting {}'.format(method_name))

    # Default the input file to '<method_name>_input.csv' in data_dir.
    if not namefile:
        namefile = '{}_input.csv'.format(method_name)
    if not data_path:
        data_path = data_dir

    data = utils.read_data_table_from_file('{}/{}'.format(
        data_path, namefile))

    # Dispatch on the configured database: a dedicated app instance when a
    # database file is set, otherwise the shared cellphonedb app.
    if self.database_file:
        getattr(
            create_app(True, self.database_file, True).collect, method_name)(data)
    else:
        getattr(cellphonedb_app.cellphonedb.collect, method_name)(data)
def assert_file_not_empty(self, file, message=''):
    """Assert that the table read from *file* contains at least one row.

    A default failure message naming the file is used when none is given.
    """
    failure_message = message if message else 'File {} is empty'.format(file)
    table = utils.read_data_table_from_file(file)
    self.assertFalse(table.empty, failure_message)
def generate_genes(
        user_gene: Optional[str],
        fetch_uniprot: bool,
        fetch_ensembl: bool,
        result_path: str,
        project_name: str,
) -> None:
    """Build 'gene_generated.csv' from Ensembl + UniProt (+ HLA, + user genes).

    Each of the Ensembl and UniProt sources is either fetched remotely (when
    the corresponding flag is set) or read from the local ``sources/`` copies
    under ``data_dir``. Curated HLA genes and an optional user-supplied gene
    file are merged in by ``gene_generator``.

    :param user_gene: optional path to a user gene file (separator inferred
        from its extension).
    :param fetch_uniprot: fetch UniProt remotely; falls back to local on error.
    :param fetch_ensembl: fetch Ensembl via BioMart instead of the local file.
    :param result_path: base output directory.
    :param project_name: subdirectory name appended to result_path.
    """
    output_path = _set_paths(result_path, project_name)

    # TODO: Add logger
    if fetch_ensembl:
        print('fetching remote ensembl data ... ', end='')
        # BioMart XML query requesting gene/transcript ids, names and the
        # Swiss-Prot cross-reference, returned as CSV with a header row.
        source_url = 'http://www.ensembl.org/biomart/martservice?query={}'
        query = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" ' \
                'formatter = "CSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >' \
                '<Dataset name = "hsapiens_gene_ensembl" interface = "default" >' \
                '<Attribute name = "ensembl_gene_id" />' \
                '<Attribute name = "ensembl_transcript_id" />' \
                '<Attribute name = "external_gene_name" />' \
                '<Attribute name = "hgnc_symbol" />' \
                '<Attribute name = "uniprotswissprot" />' \
                '</Dataset>' \
                '</Query>'

        url = source_url.format(urllib.parse.quote(query))
        ensembl_db = pd.read_csv(url)
        print('done')
    else:
        ensembl_db = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/ensembl.txt'))
        print('read local ensembl file')

    # additional data comes from given file or uniprot remote url
    if fetch_uniprot:
        try:
            print('fetching remote uniprot file ... ', end='')
            # Tab-separated, gzip-compressed export of reviewed human entries.
            source_url = 'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true' \
                         '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length' \
                         '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes' \
                         '&compress=yes'

            uniprot_db = pd.read_csv(source_url, sep='\t', compression='gzip')
            print('done')
        except Exception as e:
            # Best-effort: any fetch/parse failure falls back to the local copy.
            print('Error fetching remote UniProt data, fetching local data')
            uniprot_db = pd.read_csv(os.path.join(data_dir, 'sources/uniprot.tab'), sep='\t')
            print('read local uniprot file')
    else:
        uniprot_db = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/uniprot.tab'))
        print('read local uniprot file')

    # Normalize source-specific column names to the shared schema.
    ensembl_columns = {
        'Gene name': 'gene_name',
        'Gene stable ID': 'ensembl',
        'HGNC symbol': 'hgnc_symbol',
        'UniProtKB/Swiss-Prot ID': 'uniprot'
    }

    uniprot_columns = {'Entry': 'uniprot', 'Gene names': 'gene_names'}

    result_columns = ['gene_name', 'uniprot', 'hgnc_symbol', 'ensembl']

    ensembl_db = ensembl_db[list(
        ensembl_columns.keys())].rename(columns=ensembl_columns)
    uniprot_db = uniprot_db[list(
        uniprot_columns.keys())].rename(columns=uniprot_columns)

    hla_genes = utils.read_data_table_from_file(
        os.path.join(data_dir, 'sources/hla_curated.csv'))

    if user_gene:
        separator = _get_separator(os.path.splitext(user_gene)[-1])
        user_gene = pd.read_csv(user_gene, sep=separator)

    cpdb_genes = gene_generator(ensembl_db, uniprot_db, hla_genes, user_gene,
                                result_columns)

    cpdb_genes[result_columns].to_csv('{}/{}'.format(output_path,
                                                     'gene_generated.csv'),
                                      index=False)
def generate_interactions(
        proteins: str,
        genes: str,
        complex: str,
        user_interactions: Optional[str],
        user_interactions_only: bool,
        result_path: str,
        fetch_imex: bool,
        fetch_iuphar: bool,
        project_name: str,
) -> None:
    """Build 'interaction_input.csv' from IMEx/IUPHAR sources and/or a
    user-supplied interaction file.

    When ``user_interactions_only`` is set only the user file is used (and
    is then mandatory); otherwise IMEx and IUPHAR interactions are parsed,
    merged, cleaned against the curated exclusion/inclusion lists and
    combined with any user interactions (user rows win on duplicates).

    :raise Exception: if user_interactions_only is set without a file.
    """
    if user_interactions_only and not user_interactions:
        raise Exception('You need to set --user-interactions parameter')

    output_path = utils.set_paths(result_path, project_name)
    downloads_path = utils.set_paths(
        utils.set_paths(result_path, project_name), 'downloads')

    proteins = utils.read_data_table_from_file(proteins)
    genes = utils.read_data_table_from_file(genes)
    complexes = utils.read_data_table_from_file(complex)

    if not user_interactions_only:
        raw_imex = get_imex.call(genes, downloads_path, fetch_imex)

        interactions_to_remove = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/excluded_interaction.csv'))
        interaction_curated = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/interaction_curated.csv'))

    if user_interactions:
        separator = _get_separator(os.path.splitext(user_interactions)[-1])
        user_interactions = pd.read_csv(user_interactions, sep=separator)
        # Normalize partner columns and tag the rows' provenance.
        user_interactions['partner_a'] = user_interactions['partner_a'].apply(
            lambda x: str(x).strip())
        user_interactions['partner_b'] = user_interactions['partner_b'].apply(
            lambda x: str(x).strip())
        user_interactions['annotation_strategy'] = 'user_curated'
        # `not x in y` is un-idiomatic; use `x not in y`.
        if 'protein_name_a' not in user_interactions.columns:
            user_interactions['protein_name_a'] = ''
        if 'protein_name_b' not in user_interactions.columns:
            user_interactions['protein_name_b'] = ''

    result_columns = [
        'partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
        'annotation_strategy', 'source'
    ]

    if not user_interactions_only:
        print('Parsing IMEX file')
        imex_interactions = parse_interactions_imex(raw_imex, proteins, genes)

        print('Getting iuphar data')
        raw_iuphar = get_iuphar.call(downloads_path, fetch_iuphar)

        print('Generating iuphar interactions')
        iuphar_interactions = parse_iuphar_guidetopharmacology.call(
            raw_iuphar, genes, proteins)

        print('Merging iuphar/imex')
        merged_interactions = merge_iuphar_imex_interactions(
            iuphar_interactions, imex_interactions)

        print('Removing complex interactions')
        no_complex_interactions = only_noncomplex_interactions(
            merged_interactions, complexes)

        print('Removing selected interactions')
        clean_interactions = remove_interactions_in_file(
            no_complex_interactions, interactions_to_remove)

        print('Adding curated interaction')
        interactions_with_curated = add_curated(clean_interactions,
                                                interaction_curated)

        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the
        # same row order and column handling (sort=False).
        result = tools_helper.normalize_interactions(
            pd.concat([interactions_with_curated, user_interactions],
                      ignore_index=True, sort=False),
            'partner_a', 'partner_b').drop_duplicates(['partner_a', 'partner_b'],
                                                      keep='last')
    else:
        result = tools_helper.normalize_interactions(user_interactions,
                                                     'partner_a', 'partner_b') \
            .drop_duplicates(['partner_a', 'partner_b'], keep='last')

    result[result_columns].sort_values(['partner_a', 'partner_b']).to_csv(
        '{}/interaction_input.csv'.format(output_path), index=False)
def read_meta_file(path, filename):
    """Read ``path/filename`` as an indexed table and normalize it to a
    single 'cell_type' column taken from the first data column."""
    raw_table = utils.read_data_table_from_file(
        '{}/{}'.format(path, filename), index_column_first=True)

    normalized = pd.DataFrame(index=raw_table.index)
    normalized['cell_type'] = raw_table.iloc[:, 0]
    return normalized