def get_organisms_df(url: Optional[str] = None) -> pd.DataFrame:
    """Load the KEGG organism listing into a DataFrame.

    Only the first three tab-separated columns are read (the fourth column,
    the taxonomy hierarchy, is skipped). The raw name column is split on
    ``' ('`` after removing every ``')'``: the first piece becomes ``name``
    and the second piece, when present, becomes ``common_name`` (otherwise
    an empty string). Both are passed through :meth:`str.capitalize`.

    :param url: location of a KEGG tab separated file; when omitted (or
        falsy) the file is obtained through ``ensure_path`` for
        ``KEGG_ORGANISM_URL``
    :return: dataframe with the columns ``kegg_id``, ``kegg_code``, ``name``
        and ``common_name``
    :rtype: pandas.DataFrame
    """
    if url:
        source = url
    else:
        source = ensure_path(MODULE_NAME, KEGG_ORGANISM_URL, path='organisms.tsv')

    organisms = pd.read_csv(
        source,
        sep='\t',
        header=None,
        names=['kegg_id', 'kegg_code', 'name'],
        usecols=[0, 1, 2],
    )

    def _pieces(raw_name):
        # "Homo sapiens (human)" -> ["Homo sapiens", "human"]
        return raw_name.replace(')', '').split(' (')

    def _common_name(raw_name):
        pieces = _pieces(raw_name)
        if len(pieces) > 1:
            return pieces[1].capitalize()
        return ''

    def _scientific_name(raw_name):
        return _pieces(raw_name)[0].capitalize()

    # The common name must be derived before the raw name column is overwritten.
    organisms['common_name'] = organisms['name'].map(_common_name)
    organisms['name'] = organisms['name'].map(_scientific_name)
    return organisms
def _load_file(module_name: str = MODULE_NAME, url: str = URL) -> str:
    """Ensure the file behind ``url`` is available and return its path.

    This is a thin wrapper that delegates to ``ensure_path``.

    :param module_name: name of module (database); forwarded as ``prefix``
    :param url: URL to file from database
    :return: path of saved database file, as returned by ``ensure_path``
    """
    local_path = ensure_path(prefix=module_name, url=url)
    return local_path
def get_entity_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
    """Load a KEGG protein-to-pathway TSV file into a DataFrame.

    :param url: An optional url from a KEGG TSV file; when omitted (or
        falsy) the file is obtained through ``ensure_path`` for
        ``PROTEIN_PATHWAY_HUMAN_URL``
    :return: dataframe with the columns ``kegg_protein_id`` and
        ``kegg_pathway_id``
    """
    if url:
        source = url
    else:
        source = ensure_path(MODULE_NAME, PROTEIN_PATHWAY_HUMAN_URL, path='protein_pathway.tsv')

    column_names = ['kegg_protein_id', 'kegg_pathway_id']
    # Identifiers are returned exactly as read; no prefix stripping is applied.
    return pd.read_csv(source, sep='\t', header=None, names=column_names)
def get_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
    """Convert a tab separated KEGG pathway file to a pandas DataFrame.

    :param url: url from KEGG tab separated file; when omitted (or falsy)
        the file is obtained through ``ensure_path`` for
        ``KEGG_HUMAN_PATHWAYS_URL``
    :return: dataframe of the file with the columns ``kegg_pathway_id``
        and ``name``
    """
    if url:
        source = url
    else:
        source = ensure_path(MODULE_NAME, KEGG_HUMAN_PATHWAYS_URL, path='pathways.tsv')

    column_names = ['kegg_pathway_id', 'name']
    # Identifiers are returned exactly as read; no prefix stripping is applied.
    return pd.read_csv(source, sep='\t', header=None, names=column_names)
def path(self) -> str:  # noqa: D401
    """The path to the data, ensured via ``ensure_path`` for ``self.url``."""
    ensured = ensure_path(MODULE_NAME, self.url)
    return ensured
# Tail of a mapping literal that opens above this chunk. Keys are lower-case
# interaction-type names; values are strings that look like BEL function
# names (presumably — confirm against the code that consumes the mapping).
# An empty string is used where no value is assigned.
    'phosphotransfer reaction': 'proteinModification',
    'disulfide bond': 'complexAbundance',
    'self interaction': '',
    'deacetylation reaction': '',
    'lipoprotein cleavage reaction': 'proteinAbundance',
    'gtpase reaction': 'reaction',
    'glycosylation reaction': 'proteinModification(Glyco)',
    'palmitoylation reaction': 'proteinModification',
    'putative self interaction': '',
    'dna cleavage': 'geneAbundance',
    # NOTE(review): 'rnaAbundace' looks like a typo for 'rnaAbundance' —
    # confirm with the consumers of this mapping before changing it.
    'rna cleavage': 'rnaAbundace',
}

# Module identifier and download location; both are passed to ensure_path below.
MODULE_NAME = 'intact'
URL = 'ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip'

# NOTE(review): this call runs when the statement is executed (at import time
# if this is module level) — confirm that is intended.
path = ensure_path(MODULE_NAME, URL)

# NOTE(review): hard-coded absolute path inside one user's Downloads folder;
# it will not exist on other machines.
sample_path = '/Users/sophiakrix/Downloads/intact_sample.txt'

# Column labels for the two interactor ID columns. Note the leading '#' in the
# first one (presumably the header line's comment marker — TODO confirm).
ID_INTA = '#ID(s) interactor A'
ID_INTB = 'ID(s) interactor B'

# Names for derived columns: the database part and the bare identifier part of
# each interactor (presumably split out of the ID columns above — confirm).
DATABASE_INT_A = 'database_intA'
DATABASE_INT_B = 'database_intB'
ONLY_ID_INT_A = 'id_intA'
ONLY_ID_INT_B = 'id_intB'

# Database label for UniProtKB identifiers.
UNIPROTKB = 'uniprotkb'

# Original alternative-ID column labels and their replacement names; the two
# lists are positionally parallel (A first, then B).
ORIG_ALT_ID_COLUMN_NAMES = [
    'Alt. ID(s) interactor A',
    'Alt. ID(s) interactor B'
]
NEW_ALT_ID_COLUMN_NAMES = ['alternative_intA', 'alternative_intB']