def test_extract_protein_interactions_kgml(self, kgml_file, expected_no_rel):
    """Check how many relations are extracted from a KGML fixture file.

    The extractor's ``kegg.link``, ``kegg.conv`` and ``uniprot.mapping``
    calls are replaced by canned return values, so the result depends only
    on the fixture content.

    :param kgml_file: fixture path, relative to this test module.
    :param expected_no_rel: expected length of the extractor's result.
    """
    # Arrange
    sut = KeggProteinInteractionsExtractor()
    fixture_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(fixture_dir, kgml_file), 'r') as fixture:
        kgml_string = fixture.read()

    # Stub the KEGG operations.
    mock_kegg = KEGG()
    sut.kegg = mock_kegg
    # Whatever the input, return the ko numbers that map to hsa numbers.
    ko_to_hsa = "\n".join([
        "ko:K00922 hsa:5293",
        "ko:K00922 hsa:5291",
        "ko:K02649 hsa:5295",
    ])
    mock_kegg.link = MagicMock(return_value=ko_to_hsa)
    # Whatever the input, return the hsa numbers that map to uniprot numbers.
    mock_kegg.conv = MagicMock(return_value={"hsa:5293": "up:B0LPE5"})

    # Stub UniProt.
    mock_uniprot = UniProt()
    sut.uniprot = mock_uniprot
    mock_uniprot.mapping = MagicMock(
        return_value={"B0LPE5": ["gene1", "gene2"]})

    # Act
    actual = sut.extract_protein_interactions_kgml(kgml_string)

    # Assert
    self.assertEqual(expected_no_rel, len(actual))
def downloadPathway(kegg_color, final_com, downloaded, pair):
    """Download a coloured KEGG pathway image for each pathway not yet fetched.

    Reads the module-level names ``org`` and ``client``, which are defined
    outside this block.

    :param kegg_color: mapping of KEGG id -> colour string passed to KEGG.
    :param final_com: mapping of pathway number -> iterable of KEGG ids to
        colour on that pathway.
    :param downloaded: container of pathway numbers to skip.
    :param pair: sub-directory name used in the output path
        ``./<client>/<pair>/<org><path>.png``.
    """
    i = 1
    s = KEGG()
    for path in final_com:
        if path in downloaded:
            continue
        print('loading...%d' % i)
        i += 1
        keggid = {key: kegg_color[key] for key in final_com[path]}
        image_url = s.show_pathway("path:%s%s" % (org, path),
                                   dcolor="white", keggid=keggid)
        # Fetch the HTML page and look for the temporary image path in it.
        with urllib.request.urlopen(image_url) as response:
            page = response.read().decode()
        listurl = re.findall(r'tmp.*png', page)
        if path in ('01100', '01110'):
            # These two overview maps need the match trimmed and '.png'
            # re-appended.
            listurl = re.findall(r'tmp.*%s.{5}' % org, listurl[0])
            url = 'http://www.kegg.jp/' + listurl[0] + '.png'
        else:
            url = 'http://www.kegg.jp/' + listurl[0]
        # Download first, then write: a failed download no longer leaves an
        # empty, unclosed file behind.
        with urllib.request.urlopen(url) as response:
            buf = response.read()
        with open('./%s/%s/%s%s.png' % (client, pair, org, path), "wb") as f:
            f.write(buf)
def getPathway(org, compare):
    """Collect pathways shared by more than one entry listed in ``<compare>.txt``.

    :param org: organism code; ``<org>_pathway.txt`` is read to find the
        5-digit pathway numbers that are accepted.
    :param compare: base name of the file handed to ``read_doc``; the result
        is consumed pairwise (item ``i`` is an id, item ``i + 1`` its value).
    :returns: ``(kegg_color, final_com)`` where ``kegg_color`` maps each id to
        ``"<value>,<value>"`` and ``final_com`` maps a pathway number to the
        ids found on it.
    """
    with open('%s_pathway.txt' % org, "r") as handle:
        organism_pathways = set(re.findall(r'\d{5}', handle.read()))
    text = read_doc('%s.txt' % compare)
    kegg_color = {}
    entry_pathways = {}
    pathway_hits = []
    s = KEGG()
    final_com = {}
    for i in range(0, len(text) - 1, 2):
        newid = text[i]  # 'cpd:' + text[i]
        kegg_color[newid] = text[i + 1] + ',' + text[i + 1]
    for entry_id in kegg_color:
        a = s.get(entry_id)
        dic = s.parse(a)
        try:
            if 'PATHWAY' in dic:
                entry_pathways[entry_id] = list(dic['PATHWAY'].keys())
                pathway_hits.extend(entry_pathways[entry_id])
        except TypeError:
            # ``dic`` was not a container. ``a`` may then be a non-string
            # (presumably an HTTP status code), so convert it before
            # concatenating; otherwise this handler itself raises.
            print('Error:' + str(a))
    # Keep pathways hit more than once that also belong to the organism.
    final_map = [
        item for item in Counter(pathway_hits).items()
        if item[1] > 1 and item[0][3:] in organism_pathways
    ]
    for pathway in final_map:
        newpath = pathway[0][3:]
        final_com[newpath] = []
        for compound in entry_pathways:
            if pathway[0] in entry_pathways[compound]:
                final_com[newpath].append(compound)
    return kegg_color, final_com
def get_pdb_id_by_name_gene(gene_name):
    """Return the KEGG gene ids matching a gene name and their PDB ids.

    :param gene_name: name searched in the ``hsa`` database.
    :returns: ``(gene_ids, pdb_ids)``; both are empty when the search
        returns 400/404 or yields more than 100 gene ids.
    """
    k = KEGG()
    # Look up the gene ids by gene name.
    gen = kegg_find("hsa", gene_name)
    if gen in (400, 404):
        return [], []
    gene_ids = [row.split("\t")[0] for row in gen.split("\n") if len(row) > 0]
    if len(gene_ids) > 100:
        return [], []
    # For every gene id found, collect its PDB ids.
    pdb_ids = []
    for gene_id in gene_ids:
        entry_text = kegg_get(gene_id)
        if entry_text in (400, 404):
            continue
        entry = k.parse(entry_text)
        if "STRUCTURE" in entry:
            pdb_ids.extend(entry["STRUCTURE"]["PDB"].split())
    return gene_ids, pdb_ids
def kegg_to_uniprot(fr='hsa', cache=False):
    """Download a mapping from a `KEGG` database to `UniProt`
    (both `TrEMBL` and `SwissProt`).

    Parameters
    ----------
    fr : str, optional, default: 'hsa'
        KEGG database identifier to convert.
    cache : bool, optional, default: False
        If True, results are cached by `bioservices`. This can save time but
        you will eventually miss out on new database releases if your cache
        is old.

    Returns
    -------
    `dict`
        Mapping from `KEGG` identifiers to a list of `UniProt` accessions.
    """
    service = KEGG(cache=cache)
    by_kegg_id = {}
    for prefixed_accession, kegg_id in service.conv(fr, 'uniprot').items():
        accession = prefixed_accession.split(':')[1]  # drop the 'up:' prefix
        by_kegg_id.setdefault(kegg_id, []).append(accession)
    return by_kegg_id
def get_pathway(pathway):
    """Fetch a KEGG pathway entry and return its name and genes.

    :param pathway: identifier passed to ``KEGG.get``.
    :returns: the raw value when ``KEGG.get`` returns an ``int``; otherwise
        the tuple ``(NAME, GENE)`` taken from the parsed entry.
    """
    client = KEGG()
    raw_entry = client.get(pathway)
    if type(raw_entry) is int:
        return raw_entry
    parsed = client.parse(raw_entry)
    return (parsed['NAME'], parsed['GENE'])
def Get_Drug_IDs(Brite_ID):
    """Return the unique KEGG drug ids (``Dnnnnn``) linked from a KEGG entry.

    :param Brite_ID: identifier passed to ``KEGG.get``.
    :returns: sorted ``numpy`` array of the distinct drug ids found in the
        ``<a>`` elements of the returned document.
    """
    # verbose must be the boolean False; the string "False" is truthy.
    k = KEGG(verbose=False)
    k_id = k.get(Brite_ID)
    # 'utf-8' is the canonical codec name (the original 'utf=8' was a typo).
    e = easyXML(k_id, 'utf-8')
    results = e.soup.findChildren("a")
    all_drug_ids = re.findall(r"(D\d{5})", str(results))
    return np.unique(all_drug_ids)
def teste4():
    """Return the PATHWAY section of the first KEGG module for ``hsa``."""
    client = KEGG()
    client.organism = "hsa"  # Homo sapiens (human)
    module_ids = client.moduleIds  # pathway modules
    entry = client.parse(client.get(module_ids[0]))
    # COMPOUND and NAME are read as well, although only PATHWAY is returned.
    compound_names = entry["COMPOUND"]  # e.g. {'C00074': 'Phosphoenolpyruvate', ...}
    pathway = entry["PATHWAY"]  # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
    names = entry["NAME"]  # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    return pathway
def checkOrg(org):
    """Write the KEGG pathway list for ``org`` to ``<org>_pathway.txt`` once.

    Nothing is fetched when the file already exists in the current directory.
    """
    if os.path.exists('./%s_pathway.txt' % org):
        print('File already existed')
        return
    listing = KEGG().list('pathway', org)
    with open('%s_pathway.txt' % org, 'a', encoding='latin-1') as out:
        out.write(listing)
    print('Finish writing org file')
def __init__(self):
    """Create the KEGG client and load genes from an ``AnnotationTable``."""
    self.k = KEGG()
    self.org = "lac"
    self.genedict = {}
    self.a = AnnotationTable()
    # listed as sequence: list of genes
    self.targetdict = self.a.analyze_sequences()
    self.genelist = list(self.a.genes)
def __init__(self, gene_lists, taxon, dataframe, kegg_organism=None,
             enrichment_params=None, go_only=False, kegg_only=False,
             command=""):
    """.. rubric:: constructor

    :param gene_lists: stored as ``self.gene_lists``.
    :param taxon: taxon id; 10090 selects KEGG organism ``mmu`` and 9606
        selects ``hsa``. Any other value requires ``kegg_organism``.
    :param dataframe: stored as ``self.data``.
    :param kegg_organism: KEGG organism code, used when the taxon is neither
        human nor mouse.
    :param enrichment_params: dict of settings. When omitted, a fresh copy
        of the defaults is used (padj 0.05, log2_fc 3, max_entries 3000,
        no background/mapper/preload directory, plot_compute_levels False,
        plot_logx True).
    :param go_only: forwarded to ``create_report_content``.
    :param kegg_only: forwarded to ``create_report_content``.
    :param command: stored as ``self.command``.
    :raises NotImplementedError: taxon is not human/mouse and no
        ``kegg_organism`` was given.
    """
    super().__init__()
    # Build the defaults per call: a dict literal in the signature would be
    # one object shared (and mutable) across every instance.
    if enrichment_params is None:
        enrichment_params = {
            "padj": 0.05,
            "log2_fc": 3,
            "max_entries": 3000,
            "kegg_background": None,
            "mapper": None,
            "preload_directory": None,
            'plot_compute_levels': False,
            'plot_logx': True,
        }
    self.title = "Enrichment"
    self.command = command
    self.gene_lists = gene_lists
    self.enrichment_params = enrichment_params
    self.data = dataframe
    self.taxon = taxon
    if taxon == 10090:
        self.organism = "mmu"
    elif taxon == 9606:
        self.organism = "hsa"
    else:
        if kegg_organism is None:
            logger.error("You must specify the kegg organism name if not human or mouse: eg., eco for ecoli")
            # figure out the organism from taxon
            raise NotImplementedError
        else:
            from bioservices import KEGG
            k = KEGG()
            k.organism = kegg_organism  # validates the organism name
            self.organism = kegg_organism

    if self.enrichment_params['preload_directory']:
        pathname = self.enrichment_params['preload_directory']
        if not os.path.exists(pathname):
            logger.error(f"{pathname} does not exist")
            sys.exit(1)

    self.rnadiff = {}
    self.create_report_content(go_only=go_only, kegg_only=kegg_only)
    self.create_html("enrichment.html")
def teste5():
    """Print the name of module M00627 if it mentions the pentose phosphate
    cycle, otherwise print ``haha``."""
    client = KEGG()
    client.organism = "hsa"  # Homo sapiens (human)
    module_ids = client.moduleIds  # pathway modules (read but not used)
    entry = client.parse(client.get("M00627"))
    module_name = entry["NAME"][0]
    reactions = entry["REACTION"]  # read but not used
    message = module_name if "Pentose phosphate cycle" in module_name else "haha"
    print(message)
def target_paths(target_dict):
    """Map each target name to the KEGG pathway ids of its gene entry.

    :param target_dict: mapping of target name -> KEGG gene id string.
    :returns: dict keyed by target name. The value is a list of pathway ids,
        or the placeholder ``" "`` when the id has length <= 1 or the parsed
        entry has no ``PATHWAY`` section.
    """
    k = KEGG(verbose=False)
    gene_path = {}
    for target_name, hsa_id in target_dict.items():
        pathway_ids = None
        # Only query KEGG where an id is available.
        if len(hsa_id) > 1:
            entry = k.parse(k.get(hsa_id.lower()))
            if "PATHWAY" in entry.keys():
                pathway_ids = list(entry["PATHWAY"].keys())
        gene_path[target_name] = " " if pathway_ids is None else pathway_ids
    return gene_path
def teste6():
    """Return reaction id -> space-split reaction string for four fixed
    KEGG modules."""
    client = KEGG()
    client.organism = "hsa"
    dic_reac = {}
    for module_id in ["M00001", "M00002", "M00013", "M00034"]:
        reactions = client.parse(client.get(module_id))["REACTION"]
        dic_reac.update(
            {reaction_id: reactions[reaction_id].split(" ")
             for reaction_id in reactions})
    return dic_reac
def extract_all(self):
    """Return a dict of pathway id -> name from KEGG's ``pathway/hsa`` list.

    Each non-empty line of the listing is split on tabs; the first field is
    the key and the second the value.
    """
    from bioservices import KEGG

    listing = KEGG().list("pathway/hsa")
    pathway_dict = {}
    for row in listing.split("\n"):
        if not row:
            continue
        fields = row.split("\t")
        pathway_dict[fields[0]] = fields[1]
    return pathway_dict
def teste2():
    """Print the fourth ``hsa`` module id and return its reactions.

    :returns: dict with reaction ids as keys and the space-split reaction
        string as values.
    """
    client = KEGG()
    client.organism = "hsa"
    module_ids = client.moduleIds
    print(module_ids[3])
    reactions = client.parse(client.get(module_ids[3]))["REACTION"]
    return {reaction_id: reactions[reaction_id].split(" ")
            for reaction_id in reactions}
def __init__(self, folder, organism, alpha=0.05, log2_fc=0, progress=True,
             mapper=None, background=None):
    """Load RNA-diff results, normalise gene ids and compute KEGG enrichment.

    :param folder: passed to ``RNADiffResults``.
    :param organism: KEGG organism code assigned to the KEGG client.
    :param alpha: passed to ``RNADiffResults``.
    :param log2_fc: passed to ``RNADiffResults``.
    :param progress: forwarded to ``_load_pathways``.
    :param mapper: stored as ``self.mapper``.
    :param background: background gene count; when falsy, the number of
        lines in the KEGG listing of the organism is used.
    """
    print("DRAFT in progress")
    from bioservices import KEGG
    self.kegg = KEGG(cache=True)
    self.kegg.organism = organism
    self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

    # Some clean up: drop the "gene:" prefix everywhere ids appear.
    # NOTE(review): the original repeated the index clean-up twice; the
    # replacement is idempotent, so it is applied once here.
    if "ID" in self.rnadiff.df.columns:
        self.rnadiff.df['ID'] = [
            x.replace("gene:", "") for x in self.rnadiff.df['ID']
        ]
    self.rnadiff.df.index = [
        x.replace("gene:", "") for x in self.rnadiff.df.index
    ]
    for key, values in self.rnadiff.gene_lists.items():
        self.rnadiff.gene_lists[key] = [
            x.replace("gene:", "") for x in values
        ]

    if background:
        self.background = background
    else:
        self.background = len(
            self.kegg.list(self.kegg.organism).split("\n"))
    logger.info("Set number of genes to {}".format(self.background))

    self._load_pathways(progress=progress)
    self.mapper = mapper

    try:
        self.compute_enrichment()
    except Exception as err:
        # Best-effort by design: construction must not fail here, but the
        # cause is now logged instead of being discarded.
        logger.critical("An error occured while computing enrichments")
        logger.critical(repr(err))
def drug_dict(disease):
    """Return the ``DRUG`` section of a parsed KEGG disease entry.

    :param disease: identifier passed to ``KEGG.get``.
    """
    client = KEGG(verbose=False)
    # Fetch the entry and turn it into a dictionary with KEGG.parse.
    parsed_entry = client.parse(client.get(disease))
    return parsed_entry["DRUG"]
def kegg_find(*args):
    """Call ``KEGG().find(*args)`` with an in-memory and on-disk cache.

    The cache is a dict keyed by the argument tuple, kept on the function
    object and persisted to ``kegg_find.cache`` in the current directory.
    Entries whose stored value is ``None`` are fetched again.

    :param args: positional arguments forwarded to ``KEGG.find``.
    :returns: the cached or freshly fetched result.
    """
    if not hasattr(kegg_find, "cache"):
        if os.path.isfile("kegg_find.cache"):
            # NOTE: pickle must only be used on files this program wrote
            # itself; never point this at untrusted data.
            with open("kegg_find.cache", "rb") as f:
                kegg_find.cache = pickle.load(f)
        else:
            kegg_find.cache = {}

    cached = kegg_find.cache.get(args)
    if cached is not None:
        return cached

    result = KEGG().find(*args)
    kegg_find.cache[args] = result
    # Write to a temporary name, then swap it in. os.replace overwrites an
    # existing target on every platform (os.rename fails on Windows).
    with open("kegg_find.cache~", "wb") as f:
        pickle.dump(kegg_find.cache, f)
    os.replace("kegg_find.cache~", "kegg_find.cache")
    return result
def tcell_read_metabolomics_data():
    """Download the metabolomics Excel file of a publication and return it
    as a dataframe indexed by ``MetaboliteID``.

    The downloaded file is cached. Technical replicates (columns ending in
    ".1") are merged into their base column by geometric mean, and values
    are converted from log2 space with ``exp2``.
    """
    xlsx_path = os.path.join(cache.get_cache_path(),
                             metabolite_expression_name + ".xlsx")
    tcell_metabol_xls = cache.UrlFileCache(xlsx_path, metabolomics_data_url)
    metabolomics_df = pd.read_excel(
        tcell_metabol_xls.get_file_name(),
        sheet_name = "normalized by sample mean",
        index_col=0,
        usecols="A,C:HN",
        skiprows = [0])
    for col in metabolomics_df.columns:
        # Average all technical replicates (named by a trailing ".1").
        name_parts = col.split('.')
        if len(name_parts) > 1 and name_parts[1] == "1":
            remcol = name_parts[0]
            metabolomics_df[remcol] = scipy.stats.gmean(
                metabolomics_df[[remcol, col]], axis=1)
            metabolomics_df.drop(col, axis=1, inplace=True)
    metabolomics_df.index.name = "KEGG_ID"
    # The excel data is in log2 space, return it to normal.
    metabolomics_df = metabolomics_df.apply(np.exp2)
    k = KEGG(verbose=False)
    map_kegg_chebi = k.conv("chebi", "compound")
    grouped = metabolomics_df.groupby("KEGG_ID", group_keys=False)
    metabolomics_df = grouped.apply(
        lambda x: one_row_per_compound_convert(x, map_kegg_chebi)
    ).reset_index(drop=True)
    metabolomics_df.set_index("MetaboliteID", inplace=True)
    return metabolomics_df
class KeggInfo:
    """Collect KEGG definitions for genes listed in an ``AnnotationTable``."""

    def __init__(self):
        self.k = KEGG()
        self.org = "lac"
        self.genelist = []
        self.genedict = {}
        self.a = AnnotationTable()
        # listed as sequence: list of genes
        self.targetdict = self.a.analyze_sequences()
        self.genelist.extend(self.a.genes)

    def get_info(self, gene):
        """Fetch one gene entry and store its definition in ``genedict``.

        The stored value is the entry's DEFINITION converted to ``str`` with
        its first nine characters removed.

        :param gene: gene name; the organism code and ":" are prepended.
        """
        gene_id = self.org + ":" + gene
        res = self.k.get(gene_id)
        d = self.k.parse(res)
        # The ORTHOLOGY / MOTIF / Pfam lookups of the original were never
        # used and relied on dict.has_key, which Python 3 removed.
        definition = str(d['DEFINITION'])[9:]
        self.genedict[gene] = definition
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(gene + " info obtained")

    def make_file(self):
        """Write one line per target sequence: sequence, gene count, genes.

        Fields are separated by ";". Each gene name is written without its
        last two characters.
        """
        with open("/Users/brianmendoza/Desktop/CRISPRs/lac_multi_data.txt", 'w') as f:
            for sequence in self.targetdict:
                sequenceLine = sequence + ";" + str(len(self.targetdict[sequence]))
                for gene in self.targetdict[sequence]:
                    sequenceLine += ";" + gene[:-2]
                f.write(sequenceLine + "\n")
def __init__(self, info_path):
    """Load the 'New Genome' window, connect its widgets and reset job state.

    :param info_path: stored unchanged as ``self.info_path``.
    """
    super(NewGenome, self).__init__()
    uic.loadUi('NewGenome.ui', self)
    self.setWindowTitle('New Genome')
    self.k = KEGG()
    self.info_path = info_path

    # ---Button Modifications---#
    self.setWindowIcon(Qt.QIcon("cas9image.png"))
    signal_handlers = (
        (self.whatsthisButton.clicked, self.whatsthisclicked),
        (self.KeggSearchButton.clicked, self.updatekegglist),
        (self.resetButton.clicked, self.reset),
        (self.submitButton.clicked, self.submit),
        (self.browseForFile.clicked, self.selectFasta),
        (self.NCBI_File_Search.clicked, self.prep_ncbi_search),
        (self.contButton.clicked, self.continue_to_main),
        (self.comboBoxEndo.currentIndexChanged, self.endo_settings),
        (self.runButton.clicked, self.run_jobs),
        (self.clearButton.clicked, self.clear_job_queue),
    )
    for signal, handler in signal_handlers:
        signal.connect(handler)

    self.JobsQueueBox.setReadOnly(True)
    self.output_browser.setText("Waiting for program initiation...")
    self.CompletedJobs.setText(" ")
    self.viewStatButton.setEnabled(False)

    self.JobsQueue = []  # holds Job classes.
    self.Endos = dict()
    self.file = ""

    self.process = QtCore.QProcess()
    self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
    self.process.finished.connect(self.upon_process_finishing)
    self.seqTrans = SeqTranslate()

    self.first = False
    # show functionalities on window
    self.fillEndo()
    self.num_chromo_next = False
def download_pathway_ids(organism, cache=False):
    """Query KEGG for the current list of pathways of an organism.

    Parameters
    ----------
    organism : str
        A KEGG organism code. For example 'hsa'.
    cache : bool, optional, default: False
        If True, results are cached by `bioservices`. This can save time but
        you will eventually miss out on new database releases if your cache
        is old.

    Returns
    -------
    `list`
        List of str pathway identifiers.
    """
    service = KEGG(cache=cache)
    service.organism = organism
    return service.pathwayIds
def search_kegg(accessions):
    """Look up KEGG pathways for gene accessions and count them.

    :param accessions: object with ``dropna()`` yielding strings of the form
        ``"<organism>:<gene>"``.
    :returns: dataframe with columns ``accession``, ``description`` and
        ``count`` (one row per distinct pathway, most frequent first), or an
        empty dataframe when nothing was found.
    """
    # Local import: pandas.compat.StringIO is gone from current pandas, so
    # the standard-library StringIO is used instead.
    import io

    start_time = datetime.datetime.now()
    with yaspin(text="Retrieving KEGG annotations...", color="cyan") as sp:
        path = KEGG()  # one client for all accessions
        rows = []
        for accession in accessions.dropna():
            res = accession.split(":")
            try:
                for k, val in path.get_pathway_by_gene(res[1], res[0]).items():
                    _id = re.search(r"\d+", k).group(0)
                    rows.append(f"map{_id}\t\"{val}\"\n")
            except AttributeError:
                # Deliberate best-effort: no pathways for this gene (the
                # lookup did not return a mapping) or a key without digits.
                continue
        raw_data = "".join(rows)
        try:
            kegg = pandas.read_csv(io.StringIO(raw_data), sep="\t",
                                   header=None)
            kegg.columns = ["accession", "description"]
            # Add column of counts.
            kegg["count"] = kegg.groupby("accession")["accession"].transform(
                "count")
            kegg = (kegg.drop_duplicates(subset="accession").sort_values(
                by="count", ascending=False).reset_index(drop=True))
            mssg = f"* Found {sum(kegg['count'])} KEGG pathways from which {len(kegg)} were unique."
        except pandas.errors.EmptyDataError:
            kegg = pandas.DataFrame()
            mssg = "* Found 0 KEGG Pathways."
        time_diff = (datetime.datetime.now() - start_time).total_seconds()
        sp.text = f"Retrieving KEGG annotations => Task done in {time_diff} seconds."
        sp.ok("✔")
    print(mssg)
    return kegg
def build_csv(self, filename=None, Nmax=None):
    """rebuild the entire dataframe (1hour) and stores as attribute

    One ``gn:<organism>`` entry is fetched per KEGG organism id; its NAME
    and DEFINITION fields populate the columns ``ID``, ``taxon``,
    ``shortname`` and ``definition`` of ``self.df``.

    :param filename: when given, the dataframe is also written there as CSV.
    :param Nmax: for testing; stop after this many organisms.
    """
    logger.info("Retrieving the kegg organisms and their definitions")
    from bioservices import KEGG
    import pandas as pd

    k = KEGG()
    results = []
    definition = []
    for i, item in enumerate(k.organismIds):
        # Fetch and parse each entry once; the original requested it twice.
        entry = k.parse(k.get(f"gn:{item}"))
        results.append(entry['NAME'])
        definition.append(entry['DEFINITION'])
        print(i, Nmax)
        if Nmax and i + 1 >= Nmax:
            break

    results = [x[0] for x in results]
    IDs = [x.split(",")[0] for x in results]
    taxon = [x.split(",")[-1] for x in results]
    # The short name is only present when NAME has exactly three fields.
    names = [
        x.split(",")[1].strip() if len(x.split(",")) == 3 else None
        for x in results
    ]

    df = pd.DataFrame({
        'ID': IDs,
        'taxon': taxon,
        'shortname': names,
        'definition': definition,
    })
    df = df.fillna("")
    df['definition'] = [x.lower() for x in df.definition]
    df['shortname'] = [x.lower() for x in df.shortname]
    self.df = df
    if filename:
        df.to_csv(filename)
from bioservices import ChEMBL, QuickGO, Reactome, KEGG
from py2neo import Graph
from model.core import *
from ncbi import fetch_publication_list
from quickgo import fetch_quick_go_data
from uniprot import *

# Neo4j connection; host and password come from the DB and NEO4J_PASSWORD
# environment variables, defaulting to "localhost" and an empty password.
# NOTE(review): `os` is not imported in the lines visible here; presumably it
# arrives through one of the star imports above - confirm.
graph = Graph(host=os.environ.get("DB", "localhost"), bolt=True, password=os.environ.get("NEO4J_PASSWORD", ""))

# One shared, non-verbose client per bioservices web service.
chembl = ChEMBL(verbose=False)
quick_go = QuickGO(verbose=False)
reactome = Reactome(verbose=False)
kegg = KEGG(verbose=False)

# watch("neo4j.bolt")

# Module-level containers, all created empty here.
gene_dict = dict()
transcript_dict = dict()
pseudogene_dict = dict()
cds_dict = dict()
exon_dict = dict()
rrna_dict = dict()
trna_dict = dict()
ncrna_dict = dict()
location_dict = dict()
go_term_set = set()

# Path to the DrugBank target polypeptide id CSV.
target_protein_ids_csv = "data/drugbank/all_target_polypeptide_ids.csv"
def search_organism(organism):
    """Return ``KEGG.lookfor_organism`` results for ``organism``."""
    service = KEGG()
    matches = service.lookfor_organism(organism)
    return matches
def test_KEGGParser():
    """Fetch and parse one entry of each listed kind, then check the
    SEQUENCE section of the last one (C15682)."""
    s = KEGG()
    entry_ids = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
        'C15682',
    )
    for entry_id in entry_ids:
        d = s.parse(s.get(entry_id))
    # ``d`` now holds the parsed C15682 entry.
    first_sequence = d['SEQUENCE'][0]
    assert first_sequence['TYPE'] == 'PK'
    assert first_sequence['GENE'] =="0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII[UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert first_sequence['ORGANISM'] == "Micromonospora griseorubida"
def test_KEGGParser():
    """Fetch and parse one entry of each listed kind; passes when no call
    raises."""
    s = KEGG()
    entry_ids = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    for entry_id in entry_ids:
        d = s.parse(s.get(entry_id))
def kegg():
    """Return a KEGG client with its organism set to ``hsa``."""
    service = KEGG()
    # Attribute is read for its effect only; presumably this loads the
    # organism list before ``organism`` is assigned - confirm.
    service.organismIds
    service.organism = "hsa"
    return service
""" import logging import collections import matplotlib.pyplot as plt from matplotlib import gridspec import numpy as np from scipy.stats import linregress from scipy.sparse import issparse from bioservices import KEGG #Setting logging preferences logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) k = KEGG() k.settings.TIMEOUT = 1000 #Changing timeout COMPLETE_BACT_MEDIUM = [ #letort c et al, 2001 : https://mediadb.systemsbiology.net/defined_media/media/322/ "C00568", #4-Aminobenzoate "C00147", #Adenine "C00041", #Alanine "C01342", "C00158", #Ammonium citrate "C00062", #Arginine "C00072", #Ascorbate "C00152", #Asparagine "C00049", #Aspartate "C00120", #Biotin "C08130", "C00076",
at any time and I'll get back to you with instructions on how to use it. ''' import os import click import json import requests import time import xmltodict import bioservices from bioservices import KEGG, ChEBI from zeep import Client from tqdm import tqdm k = KEGG(verbose=False) map_kegg_chebi = k.conv("chebi", "compound") c = ChEBI(verbose=False) chebi_client = Client( "https://www.ebi.ac.uk/webservices/chebi/2.0/webservice?wsdl") chemspider_client = Client("https://www.chemspider.com/InChI.asmx?WSDL") # For compounds that cant be found at all. not_founds = [] # Need to create a global dictonary for these annotations, as I don't # want to take the piss with the web services these wonderful people # provide to us free of charge. global CONVERTED_COMPOUNDS
import io import logging import os.path as op from collections import defaultdict from bioservices import KEGG from slugify import Slugify import ssbio.utils from ssbio.protein.sequence.seqprop import SeqProp log = logging.getLogger(__name__) custom_slugify = Slugify(safe_chars='-_') bs_kegg = KEGG() class KEGGProp(SeqProp): def __init__(self, seq, id, fasta_path=None, txt_path=None, gff_path=None): SeqProp.__init__(self, seq=seq, id=id, sequence_path=fasta_path, metadata_path=txt_path, feature_path=gff_path) self.kegg = id @SeqProp.metadata_path.setter def metadata_path(self, m_path): """Provide pointers to the paths of the metadata file Args:
def drug_targets(drug_dic):
    """Collect the target genes of every drug referenced in ``drug_dic``.

    A KEGG drug id (``D`` followed by five digits) is extracted from each
    value of ``drug_dic``; each drug entry is fetched and its ``TARGET``
    field is split into gene names and the bracketed text that follows them.

    NOTE(review): the splitting relies on exact whitespace inside the string
    literals below (" PATHWAY", "\n ", " [", " "). Verify them against real
    KEGG output before changing anything here.

    :param drug_dic: mapping whose values (converted with ``str``) contain a
        KEGG drug id.
    :returns: ``(target_gene_dic, gene_drug)``. ``target_gene_dic`` maps a
        gene name to its bracketed annotation, or maps the whole target text
        to ``""`` when the TARGET field contains no space. ``gene_drug``
        maps a gene name to the list of drug ids that target it.
    :raises IndexError: when a value contains no drug id, or a target line
        has no " [" part in the branch that reads it.
    """
    # Create KEGG Object
    k = KEGG(verbose=False)
    # create empty list for drug IDs
    id_list = []
    # Create empty dictionary to add gene information to
    target_gene_dic = {}
    # create dictionary to link gene (key) and therapeutic drug (value)
    gene_drug = {}
    # locate each drug id and add to list
    for value in drug_dic.values():
        id = re.findall(r"(D\d{5})", str(value))
        id_list.append(id[0])
    # Loop through drug IDs to gather information
    for drug_ID in id_list:
        # create object for drug information
        page = k.get(drug_ID)
        # create dictionary of drug information to isolate target information
        d = k.parse(page)
        # check for presence of target information
        if "TARGET" in d.keys():
            # isolate target information
            targ = d["TARGET"]
            # Remove pathways: keep only the text before " PATHWAY"
            no_paths_pre = targ.split(" PATHWAY")
            # count spaces to identify presence of info
            spaces = targ.count(" ")
            # create list of genes (one per line of the target text)
            gene_list = no_paths_pre[0].split("\n ")
            # follow this if the target text contains at least one space
            if spaces > 0:
                # loop through gene list
                for x in gene_list:
                    # separate gene names and HSA ID's
                    gene_split = x.split(" [")
                    # remove extras from gene name
                    y_split = gene_split[0].split(" ")
                    # add gene information to output dictionary
                    target_gene_dic[y_split[0]] = gene_split[1].strip("]")
                    # add gene and drug to output dictionary
                    gene_drug.setdefault(y_split[0], []).append(drug_ID)
            # if Gene doesn't have HSA# enter no value
            # also add gene to drug output dictionary
            else:
                target_gene_dic[no_paths_pre[0]] = ""
                for x in gene_list:
                    # separate gene names and HSA ID's
                    gene_split = x.split(" [")
                    # remove extras from gene name
                    y_split = gene_split[0].split(" ")
                    # add gene and drug to output dictionary
                    gene_drug.setdefault(y_split[0], []).append(drug_ID)
        else:
            # Drugs without a TARGET field contribute nothing.
            pass
    return target_gene_dic, gene_drug
def __init__(self, modules, organism="hsa"):
    """Create an empty graph and a KEGG client for ``organism``.

    :param modules: stored unchanged as ``self.modules``.
    :param organism: KEGG organism code; Homo sapiens by default.
    """
    MyGraph.__init__(self, {})
    self.modules = modules
    self.gr = MyGraph()
    kegg_service = KEGG()
    kegg_service.organism = organism
    self.s = kegg_service
from pandas import DataFrame, read_csv

sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
# `wd` is defined outside the lines visible here.
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
# Proteins that occur in both the 'protein1' and the 'protein2' column.
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

# One parsed KGML document per pathway id of the organism.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')

# pathway id -> set of identifiers: for every 'gene' entry, each
# space-separated name found in k2u is mapped and its part after ':' kept.
kegg_pathways_proteins = {p: {k2u[x].split(':')[1] for i in kegg_pathways[p]['entries'] if i['type'] == 'gene' for x in i['name'].split(' ') if x in k2u} for p in kegg_pathways}

# First the set of all identifiers across pathways ...
kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
# ... then rebound to a dict: identifier -> third '|'-separated field of the
# first space-separated token of its FASTA text.
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}
class Kegg:
    """Helpers for locating genes through KEGG and handling sequence text."""

    # Shared by all instances; created when the class body is executed.
    k = KEGG()
    location = "http://www.genome.jp/dbget-bin/www_bget?"

    def gene_locator(self, gene_id):
        """Return ``(chromosome, start, end)`` from an entry's POSITION field.

        :param gene_id: identifier passed to ``KEGG.get``.
        :returns: tuple whose first item is the translated chromosome (or the
            integer 1 when POSITION has no ':'), followed by the start and
            end coordinates as ints.
        """
        res = self.k.get(gene_id)
        d = self.k.parse(res)
        newstr = d['POSITION']

        # Text before ':' names the chromosome.
        cstop = newstr.find(':')
        if cstop == -1:
            chromosome = 1
        else:
            chrom = newstr[0:cstop]
            chromosome = self.translate_chromosome(chrom)

        sense = True
        # NOTE(review): statement nesting in this method was reconstructed
        # from a whitespace-collapsed source - confirm against the original.
        if newstr.find('complement') != -1:  # it is on the opposite strand of DNA
            sense = False
            cstop = newstr.find('(')

        if newstr.find('join') != -1:
            # Several "start..end" segments separated by commas: report the
            # start of the first and the end of the last.
            cstop = newstr.find('(')
            srt = cstop + 1
            bothpos = newstr[srt:len(newstr)-1].split(",")
            totpos = []
            for segment in bothpos:
                spc = segment.find('..')
                totpos.append((segment[0:spc], segment[spc+2:]))
            startloc = totpos[0][0]
            endloc = totpos[-1][1]
        else:
            srt = cstop + 1
            spc = newstr.find('..')
            startloc = newstr[srt:spc]
            if not sense:
                # Drop the closing parenthesis of "complement(...)".
                endloc = newstr[spc+2:len(newstr)-1]
            else:
                endloc = newstr[spc+2:len(newstr)]
        totloc = (chromosome, int(startloc), int(endloc))
        return totloc

    def translate_chromosome(self, chr):
        """Translate a chromosome label to its numeric label.

        Accepts '1'..'16', 'A'..'H' or roman 'I'..'XVI'.

        :returns: the numeric label as a string, or the integer 1 when the
            label is not recognised.
        """
        numbers = ('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16')
        letters = ('A','B','C','D','E','F','G','H')
        roman = ('I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII',
                 'XIV', 'XV', 'XVI')
        for labels in (numbers, letters, roman):
            if chr in labels:
                return numbers[labels.index(chr)]
        return 1

    def revcom(self, sequence):
        """Return the reverse complement of an upper-case A/C/G/T sequence.

        :raises KeyError: for any other character.
        """
        change = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
        return "".join(change[nt] for nt in reversed(sequence))

    def added_nts(self, seqstart, seqend, vector, orgcode, chromosome):
        """Fetch a sequence page and return the position of each marked region.

        All arguments are strings concatenated into the query URL. The page's
        ``<font>`` elements are taken as the regions and located in the text
        of its ``<pre>`` element.

        :returns: list of ``(start, end)`` index tuples, one per region, in
            the order found; start is -1 when a region is not found.
        """
        url = self.location + "FROM=" + seqstart + "&TO=" + seqend + "&VECTOR="\
            + vector + "&ORG=" + orgcode + "&CHR=" + chromosome
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        exons = []
        x = soup.find('pre')
        for region in soup.findAll('font'):
            seq = str(region)
            st = seq.find('>') + 1
            z = seq.find('/font') - 2
            exons.append(seq[st:z])
        # get the sequence:
        sx = str(x)
        start = sx.find("\n", 10) + 1
        end = sx.find("/pre") - 2
        unfontseq = sx[start:end]
        # Strip markup: skip everything between '<' and '>'.
        trueseq = ""
        ingene = True
        for nt in unfontseq:
            if nt == "<":
                ingene = False
            if nt == ">":
                ingene = True
            elif ingene:
                trueseq += nt
        exon_position_tuples = []
        for exon in exons:
            spos = trueseq.find(exon)
            epos = spos + len(exon)
            exon_position_tuples.append((spos, epos))
        return exon_position_tuples
def search_pathway(gene, organism):
    """Return ``KEGG.get_pathway_by_gene`` results for a gene and organism."""
    service = KEGG()
    pathways = service.get_pathway_by_gene(gene, organism)
    return pathways
class MetabolicNetwork(MyGraph):
    """Graphs built from the REACTION sections of KEGG modules.

    All graph builders add edges to the same ``self.gr`` object, so edges
    accumulate across calls.
    """

    def __init__(self, modules, organism="hsa"):
        """Store the module list and create the graph and the KEGG client.

        :param modules: list of KEGG module ids; any non-list value makes the
            other methods fall back to every module id of the organism.
        :param organism: KEGG organism code.
        """
        MyGraph.__init__(self,{})
        self.gr=MyGraph()
        self.modules=modules
        self.s = KEGG()
        self.s.organism = organism # Homo sapiens as default

    def __kegg_dic(self):
        """Return reaction id -> space-split reaction string for all modules.

        Modules whose parsed entry has no 'REACTION' key are skipped.
        Example values:
            'R01015': ['C00111', '->', 'C00118']
            'R01070': ['C05378', '->', 'C00111', '+', 'C00118']
        """
        if type(self.modules)!=list:
            self.modules=self.s.moduleIds
        dic_reac={}
        for mod in self.modules:
            try:
                dic=self.s.parse(self.s.get(mod))
                reactions=dic['REACTION']
                for reac in reactions:
                    teste=reactions[reac]
                    string=teste.split(" ")
                    dic_reac[reac]=string
            except KeyError:
                pass
        return dic_reac

    def c_c_graph(self):
        """Compound-compound graph: one edge from substrates to products.

        Multiple compounds on one side are joined with "+" into one node.
        """
        dic_reac=self.__kegg_dic()
        gr=self.gr
        for reac in dic_reac:
            comp=dic_reac[reac]
            c=0
            if comp[c+1]=="+":
                # Two substrates.
                try:
                    # Bare comparison used only as a probe: it raises
                    # IndexError when the token list is too short for two
                    # products.
                    comp[c+5]=="+"
                    s2="+".join([str(comp[c+4]), str(comp[c+6])])
                    s3="+".join([str(comp[c]), str(comp[c+2])])
                    gr.addEdge(s3,s2)
                except IndexError:
                    # Single product.
                    s="+".join([str(comp[c]), str(comp[c+2])])
                    gr.addEdge(s,comp[c+4])
            elif comp[c+1]=="->":
                # One substrate.
                try:
                    # Same probe: IndexError means a single product.
                    comp[c+3]=="+"
                    s="+".join([str(comp[c+2]), str(comp[c+4])])
                    gr.addEdge(comp[c],s)
                except IndexError:
                    gr.addEdge(comp[c],comp[c+2])
        return gr.printGraph()

    def r_r_graph(self):
        """Reaction-reaction graph: edge k -> r when the product side of k
        equals the leading substrate(s) of r."""
        dic_reac=self.__kegg_dic()
        gr=self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                # NOTE: len(v)-2 and len(v)-3 wrap to negative indices for
                # short lists; do not replace them with v[-2] / v[-3].
                if v[len(v)-2] == "->":
                    # k has a single product.
                    if v[len(v)-1]==m[0]:
                        gr.addEdge(k, r)
                else:
                    # k has two products; compare them to r's first two
                    # substrates.
                    s="+".join([str(v[len(v)-3]), str(v[len(v)-1])])
                    try:
                        s2="+".join([str(m[0]), str(m[2])])
                        if s == s2:
                            gr.addEdge(k, r)
                    except IndexError:
                        pass
        return gr.printGraph()

    def r_c_graph(self):
        """Reaction-compound graph: for each linked pair (k, r) as in
        r_r_graph, add edges k -> equation(k) -> r -> equation(r), where an
        equation node is the reaction's tokens concatenated."""
        dic_reac=self.__kegg_dic()
        gr=self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                if v[len(v)-2] == "->":
                    if v[len(v)-1]==m[0]:
                        sv="".join(v)
                        sm="".join(m)
                        gr.addEdge(k, sv)
                        gr.addEdge(sv, r)
                        gr.addEdge(r, sm)
                else:
                    s="+".join([str(v[len(v)-3]), str(v[len(v)-1])])
                    try:
                        s2="+".join([str(m[0]), str(m[2])])
                        if s == s2:
                            sv="".join(v)
                            sm="".join(m)
                            gr.addEdge(k, sv)
                            gr.addEdge(sv, r)
                            gr.addEdge(r, sm)
                    except IndexError:
                        pass
        return gr.printGraph()

    def modules_name(self):
        """Print "<module id>-<first NAME>" for every module."""
        if type(self.modules)!=list:
            self.modules=self.s.moduleIds
        for i in self.modules:
            dic=self.s.parse(self.s.get(i))
            name=dic["NAME"][0] # e.g. 'Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate'
            s="-".join([i,name])
            print("\n".join([s]))

    def compounds_name(self):
        """Print each module id followed by its "<compound id>-<name>" lines."""
        if type(self.modules)!=list:
            self.modules=self.s.moduleIds
        for i in self.modules:
            print(i)
            dic=self.s.parse(self.s.get(i))
            comps=dic["COMPOUND"] # compound names, e.g. {'C00074': 'Phosphoenolpyruvate', ...}
            for key in comps.keys():
                s="-".join([key,comps[key]])
                print("\n".join([s]))

    def pathway_name(self):
        """Print "<pathway id>-<name>" for every pathway of every module."""
        if type(self.modules)!=list:
            self.modules=self.s.moduleIds
        for i in self.modules:
            dic=self.s.parse(self.s.get(i))
            pathway=dic["PATHWAY"] # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
            for key in pathway.keys():
                s="-".join([key, pathway[key]])
                print(s)

    def nodes_degree(self):
        """Return ``allDegrees()`` of the shared graph."""
        gr=self.gr
        return gr.allDegrees()

    def clustering(self):
        """Return ``allClusteringCoefs()`` of the shared graph."""
        gr=self.gr
        return gr.allClusteringCoefs()

    def connections(self, n1, n2):
        """Return ``distance(n1, n2)`` on the shared graph."""
        gr=self.gr
        return gr.distance(n1, n2)