def export_flat_obo(self, filepath, termPrefix):
    """Export all synonyms as a flat OBO file.

    Every entry of ``self.mSyns`` is converted with ``self.syn2goterm``;
    entries for which the conversion returns ``None`` are skipped. The
    remaining terms are added to a fresh ``GeneOntology`` instance, which
    is then written with ``saveFile``.

    :param filepath: destination handed to ``GeneOntology.saveFile``.
    :param termPrefix: currently unused. The synonym's own id is passed to
        ``syn2goterm`` instead of an id generated from this prefix; the
        parameter is kept so existing callers keep working.
    """
    go = GeneOntology()

    newterms = []
    for synid in self.mSyns:
        got = self.syn2goterm(self.mSyns[synid], synid)

        # Identity check: a term object with a custom __eq__/__ne__ must not
        # be able to masquerade as "no term".
        if got is not None:
            newterms.append(got)

    go.addterms(newterms)
    go.saveFile(filepath)
from synonymes.GeneOntology import GeneOntology from textdb.NcitTerm2Symbols import NcitTermSymbolDB from collections import defaultdict from synonymes.Synonym import Synonym from synonymes.SynonymUtils import handleCommonExcludeWords from utils.idutils import dataDir, loadExludeWords, printToFile, speciesName2TaxID ncitObo = GeneOntology(dataDir + "miRExplore/obodir/ncit.obo") ncitTerm2Sym = NcitTermSymbolDB.loadFromFolder() vAllSyns = [] for termID in ncitObo.dTerms: oboNode = ncitObo.dTerms[termID] oboID = oboNode.id oboName = oboNode.name oboSyns = oboNode.synonym oboRels = oboNode.is_a newSyn = Synonym(oboID) newSyn.addSyn(oboName) if oboSyns != None: for x in oboSyns: newSyn.addSyn(x.syn) allOrgs = [x for x in ncitTerm2Sym.org_term2symbol]
rtype="gene", normGeneSymbols=normGeneSymbols, switchLR=True, stopAfter=-1) relDBs = [mirelPMIDhsa, mirelPMIDmmu] requiredPMIDs = set() for rdb in relDBs: assert (isinstance(rdb, DataBaseDescriptor)) for rpmid in rdb.get_evidence_docids(): requiredPMIDs.add(rpmid) diseaseObo = GeneOntology(args.obodir + "/doid.obo") #{'group': 'disease', 'termid': 'DOID:1936', 'name': 'atherosclerosis'} #{'group': 'disease', 'termid': 'DOID:2349', 'name': 'arteriosclerosis'} #{'group': 'disease', 'termid': 'DOID:1287', 'name': 'cardiovascular system disease'}, elemTerm = diseaseObo['DOID:1936'] elemTerms = [x.term.id for x in elemTerm.getAllChildren()] + [elemTerm.id] cvTerm = diseaseObo['DOID:1287'] cvTerms = [x.term.id for x in cvTerm.getAllChildren()] + [cvTerm.id] + elemTerms pmid2disease = PMID2XDB.loadFromFile(args.pmidBase + "/disease.pmid", diseaseObo, requiredPMIDs) # number of genes with interaction
def start_app_from_args(args):
    """Load every database the application needs into module-level globals.

    The function reads the following attributes from *args*: ``textmine``,
    ``obodir``, ``sentdir``, ``sentdir_pmc``, ``feedback`` and the switches
    ``load_mirecords``, ``load_mirtarbase``, ``load_diana`` and ``load_pmc``.
    It also relies on the module-level ``fileurl`` for the location of the
    bundled database files.

    Steps, in order:

    1. gene symbol normalisation, symbol->Ensembl and MI->miRNA mappings,
    2. optional external interaction DBs (miRecords, miRTarBase, DIANA),
    3. text-mined miRNA/gene relations for PMID and (optionally) PMC,
    4. publication dates and sentences for all documents that carry evidence,
    5. ontologies and the per-document context DBs (GO, disease, FMA,
       cell lines, NCIT). Context DBs are cached as ``dbs.pickle`` inside the
       respective aggregated folder and re-used when the file exists.

    When ``args.load_pmc`` is set, the PMC variants are merged into the PMID
    databases so the rest of the application sees one combined DB each.

    :param args: parsed command line arguments (see above).
    :return: ``None``; results are published through the globals below.
    """
    global mirFeedback
    global mirandaDB_mm10
    global relDBs
    global diseaseObo
    global goObo
    global cellObo
    global pmid2go
    global pmid2disease
    global pmid2fma
    global pmid2cell
    global testRels
    global mirelPMID
    global sentDB
    global featureViewer
    global symbol2ensemblDB
    global pmid2ncit
    global ncitObo
    global humanGeneNeighbourDB
    global mouseGeneNeighbourDB
    global geneNeighbourHoods
    global mi2mirna
    global dateDB

    pmidBase = args.textmine + '/aggregated_pmid/'
    pmcBase = args.textmine + '/aggregated_pmc/'

    normGeneSymbols = normalize_gene_names(path=fileurl + "/hgnc_no_withdrawn.syn")

    #mouseGeneNeighbourDB = GeneNeighbourDB.loadFromFile("mmu", inputgff=args.obodir + "/mm10_primary_assembly_and_lncRNA.gtf")
    #geneNeighbourHoods[mouseGeneNeighbourDB.orgid] = mouseGeneNeighbourDB

    print(datetime.datetime.now(), "Loading PMID2PMC")
    # allInteractions = defaultdict(list)

    print(datetime.datetime.now(), "Loading Sym2Ens")
    symbol2ensemblDB = SymbolEnsemblDB.loadFromFile(fileurl + "/sym2ens/")

    print(datetime.datetime.now(), "Loading MI2Mirna")
    mi2mirna = MI2Mirna.loadFromFile(fileurl + "/dbs/mirnas_mirbase.csv")

    print(datetime.datetime.now(), "Loading miranda interactions mm10")
    # mirandaDB_mm10 = MirandaRelDB.loadFromFile(filepath=args.obodir + "/mm10_interactionsAllGenes.txt", symbol2ens=symbol2ensemblDB, org="mmu")
    # mirandaDB_hg38 = MirandaRelDB.loadFromFile(filepath=args.obodir + "/hg38_interactionsAllGenes.txt", org="hsa")
    # The miranda loaders above are disabled; the placeholders keep the
    # relDBs list below uniform.
    mirandaDB_mm10 = None
    mirandaDB_hg38 = None

    # Optional external interaction databases.
    recordsDB = None
    mirtarbaseDB = None
    dianaDB, celllInfos = None, None

    if args.load_mirecords:
        print(datetime.datetime.now(), "Loading miRecords")
        recordsDB = miRecordDB.loadFromFile(
            filelocation=fileurl + "/dbs/mirecords_v4.xlsx",
            normGeneSymbols=normGeneSymbols)

    if args.load_mirtarbase:
        print(datetime.datetime.now(), "Loading miRTarBase")
        mirtarbaseDB = MirTarBaseDB.loadFromFile(
            filepath=fileurl + "/dbs/miRTarBase.csv",
            normGeneSymbols=normGeneSymbols)

    if args.load_diana:
        print(datetime.datetime.now(), "Loading hsa_mmu.diana")
        dianaDB, celllInfos = DIANATarbaseDB.loadFromFile(
            fileurl + "/dbs/hsa_mmu.diana.csv",
            normGeneSymbols=normGeneSymbols)

    print(datetime.datetime.now(), "Loading PMID2PMC")
    pmid2pmcDB = None
    excludePMIDs = None

    if args.load_pmc:
        # PMIDs returned here are passed as excludeIDs to the PMID relation
        # loaders below.
        pmid2pmcDB = PMID2PMCDB.loadFromFile(pmcBase + '/pmc2pmid', PMC2PMID=True)
        excludePMIDs = pmid2pmcDB.getAllPMIDs()

        print("Got", len(excludePMIDs), "exclude PMIDs")
        if len(excludePMIDs) > 5:
            print(list(excludePMIDs)[:5])

    print(datetime.datetime.now(), "Finished PMID2PMC")

    print(datetime.datetime.now(), "Loading mirel")
    testRels = None  # TestRelLoader.loadFromFile(pmidBase + "/test_rels_4")

    print(datetime.datetime.now(), "Loading mirel PMID")
    mirelPMIDhsa = MiGenRelDB.loadFromFile(
        pmidBase + "/mirna_gene.hsa.pmid", ltype="mirna", rtype="gene",
        normGeneSymbols=normGeneSymbols, switchLR=True,
        excludeIDs=excludePMIDs)
    mirelPMIDmmu = MiGenRelDB.loadFromFile(
        pmidBase + "/mirna_gene.mmu.pmid", ltype="mirna", rtype="gene",
        normGeneSymbols=normGeneSymbols, switchLR=True,
        excludeIDs=excludePMIDs)

    print(datetime.datetime.now(), "Loading mirel PMC")
    mirelPMChsa = None
    mirelPMCmmu = None

    if args.load_pmc:
        mirelPMChsa = MiGenRelDB.loadFromFile(
            pmcBase + "/mirna_gene.hsa.pmid", ltype="mirna", rtype="gene",
            normGeneSymbols=normGeneSymbols, switchLR=True)
        mirelPMCmmu = MiGenRelDB.loadFromFile(
            pmcBase + "/mirna_gene.mmu.pmid", ltype="mirna", rtype="gene",
            normGeneSymbols=normGeneSymbols, switchLR=True)

    lncMirPMID = None  #MiGenRelDB.loadFromFile(pmidBase + "/lncrna_mirna.pmid", ltype="lncrna", rtype="mirna")
    geneLncPMID = None  #MiGenRelDB.loadFromFile(pmidBase + "/gene_lncrna.pmid", ltype="gene", rtype="lncrna")

    print(datetime.datetime.now(), "Finished mirel")

    print(datetime.datetime.now(), "Loading Dates")
    dateDB = PubmedDateDB.loadFromFile(pmidBase + "/allpmids.date")

    if args.load_pmc:
        pmc_dateDB = PubmedDateDB.loadFromFile(pmcBase + "/allpmc.date")
        dateDB.add_database(pmc_dateDB)

    print(datetime.datetime.now(), "Finished Dates")

    print(datetime.datetime.now(), "Loading mirWalk")
    mirWalkMMU3UTRDB = None  #MirWalkDB.loadFromFile('/mnt/c/ownCloud/data/miRExplore/mirwalk/mmu_miRWalk_3UTR.txt', org="mmu", bindSite="3UTR", normGeneSymbols=normGeneSymbols)
    print(datetime.datetime.now(), "Loading mirWalk finished")

    # Keep only the relation DBs that were actually loaded.
    candidateRelDBs = [
        recordsDB, mirtarbaseDB, dianaDB,
        mirelPMIDhsa, mirelPMIDmmu,
        mirelPMChsa, mirelPMCmmu,
        lncMirPMID, geneLncPMID,
        mirandaDB_mm10, mirWalkMMU3UTRDB,
    ]
    relDBs = [x for x in candidateRelDBs if x is not None]

    mirFeedback = feedbackDB(args.feedback)

    # Only documents that carry evidence for a relation need sentences and
    # context annotations.
    requiredDocuments = set()
    for relDB in relDBs:
        requiredDocuments.update(relDB.get_evidence_docids())

    print("Requiring", len(requiredDocuments), "documents")

    print(datetime.datetime.now(), "Loading sents")
    print(datetime.datetime.now(), "Loading sents PMID")
    sentDB = SentenceDB.loadFromFile(args.sentdir, pmidBase + "/pmid2sent",
                                     requiredIDs=requiredDocuments)

    if args.load_pmc:
        print(datetime.datetime.now(), "Loading sents PMC")
        sentDBPMC = SentenceDB.loadFromFile(args.sentdir_pmc,
                                            pmcBase + "/pmc2sent",
                                            requiredIDs=requiredDocuments)

        print(datetime.datetime.now(), "Merging sentence DBs")
        sentDB.add_database(sentDBPMC)

    print(datetime.datetime.now(), "Finished sents")

    # NOTE: pickle.load executes arbitrary code from the file; the cache files
    # must only ever be ones written by this function. The cache is also
    # re-used as-is, i.e. it is not rebuilt when requiredDocuments changes.
    allDBsPMID = None
    if os.path.isfile(pmidBase + "/dbs.pickle"):
        print(datetime.datetime.now(), "Loading pickle PMID")
        with open(pmidBase + "/dbs.pickle", 'rb') as fin:
            allDBsPMID = pickle.load(fin)

        pmid2go, pmid2disease, pmid2fma, pmid2cell, pmid2ncit = allDBsPMID[:5]
        print(datetime.datetime.now(), "Loading pickle PMID ended")

    allDBsPMC = None
    if os.path.isfile(pmcBase + "/dbs.pickle") and args.load_pmc:
        print(datetime.datetime.now(), "Loading pickle PMC")
        # Read the PMC cache from the PMC folder. Previously the PMID cache
        # was opened here, so the PMID DBs were later merged into themselves.
        with open(pmcBase + "/dbs.pickle", 'rb') as fin:
            allDBsPMC = pickle.load(fin)

        pmc2go, pmc2disease, pmc2fma, pmc2cell, pmc2ncit = allDBsPMC[:5]
        print(datetime.datetime.now(), "Loading pickle PMC ended")

    print(datetime.datetime.now(), "Loading ontologies")
    diseaseObo = GeneOntology(args.obodir + "/doid.obo")
    goObo = GeneOntology(args.obodir + "/go.obo")
    cellObo = GeneOntology(args.obodir + "/meta_cells.obo")
    ncitObo = GeneOntology(args.obodir + "/ncit.obo")
    fmaObo = GeneOntology(args.obodir + "/fma_obo.obo")
    print(datetime.datetime.now(), "Loading ontologies finished")

    if allDBsPMID is None:
        # No PMID cache: build the context DBs from the text files, then cache.
        print(datetime.datetime.now(), "Loading GO")
        pmid2go = PMID2XDB.loadFromFile(pmidBase + "/go.pmid", goObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading Disease")
        pmid2disease = PMID2XDB.loadFromFile(pmidBase + "/disease.pmid", diseaseObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading FMA")
        pmid2fma = PMID2XDB.loadFromFile(pmidBase + "/model_anatomy.pmid", fmaObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading cellline")
        pmid2cell = PMID2XDB.loadFromFile(pmidBase + "/celllines.pmid", cellObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading ncit")
        pmid2ncit = PMID2XDB.loadFromFile(pmidBase + "/ncit.pmid", ncitObo, requiredDocuments)

        allDBsPMID = (pmid2go, pmid2disease, pmid2fma, pmid2cell, pmid2ncit)

        print(datetime.datetime.now(), "Writing Pickle")
        with open(pmidBase + "/dbs.pickle", 'wb') as fout:
            pickle.dump(allDBsPMID, fout)
        print(datetime.datetime.now(), "Finished Writing Pickle")

    if allDBsPMC is None and args.load_pmc:
        # No PMC cache: build the PMC context DBs, then cache them.
        print(datetime.datetime.now(), "Loading GO")
        pmc2go = PMID2XDB.loadFromFile(pmcBase + "/go.pmid", goObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading Disease")
        pmc2disease = PMID2XDB.loadFromFile(pmcBase + "/disease.pmid", diseaseObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading FMA")
        pmc2fma = PMID2XDB.loadFromFile(pmcBase + "/model_anatomy.pmid", fmaObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading cellline")
        pmc2cell = PMID2XDB.loadFromFile(pmcBase + "/celllines.pmid", cellObo, requiredDocuments)

        print(datetime.datetime.now(), "Loading ncit")
        pmc2ncit = PMID2XDB.loadFromFile(pmcBase + "/ncit.pmid", ncitObo, requiredDocuments)

        allDBsPMC = (pmc2go, pmc2disease, pmc2fma, pmc2cell, pmc2ncit)

        print(datetime.datetime.now(), "Writing Pickle")
        with open(pmcBase + "/dbs.pickle", 'wb') as fout:
            pickle.dump(allDBsPMC, fout)
        print(datetime.datetime.now(), "Finished Writing Pickle")

    if args.load_pmc:
        # Fold the PMC context DBs into the PMID ones published as globals.
        print(datetime.datetime.now(), "Merging Context DBs")

        print(datetime.datetime.now(), "Merging Context GO")
        pmid2go.add_database(pmc2go)

        print(datetime.datetime.now(), "Merging Context DISEASE")
        pmid2disease.add_database(pmc2disease)

        print(datetime.datetime.now(), "Merging Context FMA")
        pmid2fma.add_database(pmc2fma)

        print(datetime.datetime.now(), "Merging Context CELL")
        pmid2cell.add_database(pmc2cell)

        print(datetime.datetime.now(), "Merging Context NCIT")
        pmid2ncit.add_database(pmc2ncit)

        print(datetime.datetime.now(), "Finished Merging Context DBs")

    if celllInfos is not None:
        # Cell annotations delivered with the DIANA DB are attached to the
        # cell-context DB under their document id.
        print(datetime.datetime.now(), "Adding CelllInfo Features")

        for celllInfo in celllInfos:
            pmid2cell.docid2info[celllInfo['docid']].append(celllInfo)

        print(datetime.datetime.now(), "Finished Adding CelllInfo Features")

    print(datetime.datetime.now(), "Loading Features")
    #rfDB = RFamDB.loadFromFile(fileurl + "/dbs/rfam.regions.mirexplore")
    #featureViewerMMU = FeatureViewer('mmu', args.obodir, rfamDB=rfDB)
    #featureViewerHSA = FeatureViewer('hsa', args.obodir, rfamDB=rfDB)
    print(datetime.datetime.now(), "Loading Features finished")

    print(datetime.datetime.now(), "Loading finished")
import os, sys sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../") sys.path.insert(0, "/mnt/f/dev/git/NERtoolkit/") from collections import defaultdict from synonymes.GeneOntology import GeneOntology from synonymes.Synonym import Synonym from synonymes.SynonymUtils import handleCommonExcludeWords from utils.idutils import dataDir, loadExludeWords, printToFile, speciesName2TaxID infile = sys.argv[1] # dataDir + "miRExplore/doid.obo" outfile = sys.argv[2] #"/mnt/d/dev/data/pmid_jun2020/synonyms/disease.syn" celloObo = GeneOntology(infile) ignoreTerms = set() ignoreTerms.add("DOID:4") print("Total terms:", len(celloObo.dTerms), "Ignore terms", len(ignoreTerms)) vAllSyns = [] for cellID in celloObo.dTerms: oboNode = celloObo.dTerms[cellID] oboID = oboNode.id oboName = oboNode.name if oboID in ignoreTerms:
import os, sys sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../") from collections import defaultdict from synonymes.Synonym import Synonym from synonymes.SynonymUtils import handleCommonExcludeWords from utils.idutils import dataDir, loadExludeWords, printToFile, speciesName2TaxID from synonymes.GeneOntology import GeneOntology bodypartsObo = GeneOntology( dataDir + "miRExplore/foundational_model_anatomy/fma_obo.obo") vAllSyns = [] for cellID in bodypartsObo.dTerms: oboNode = bodypartsObo.dTerms[cellID] oboID = oboNode.id oboName = oboNode.name oboSyns = oboNode.synonym oboRels = oboNode.is_a newSyn = Synonym(oboID) newSyn.addSyn(oboName) aName = oboName.split(' ') if len(aName) > 1 and len(aName) < 5: acro = ""
from synonymes.GeneOntology import GeneOntology from utils.tmutils import normalize_gene_names sys.path.insert(0, str(os.path.dirname("/mnt/d/dev/git/poreSTAT/"))) from porestat.utils.DataFrame import DataFrame, DataRow, ExportTYPE from synonymes.mirnaID import miRNA, miRNAPART, miRNACOMPARISONLEVEL from textdb.makeNetworkView import DataBasePlotter from utils.cytoscape_grapher import CytoscapeGrapher import matplotlib.pyplot as plt from natsort import natsorted if __name__ == '__main__': cellObo = GeneOntology( "/mnt/d/owncloud/data/miRExplore/obodir/meta_cells.obo") cellTypeName2Terms = { "EC": ["META:52"], "MC": ["META:148", "META:99"], "FC": ["CL:0000891"], "SMC": ["META:83"], } cellType2AccTerms = {} for cellT in cellTypeName2Terms: cellType2AccTerms[cellT] = set() for et in cellTypeName2Terms[cellT]:
if len(line) > 0: accept_pmids.add(line) #resultBase = dataDir + "/miRExplore/textmine/results_pmc/" resultBase = args.resultdir oboSyns = SynfileMap(resultBase + "/synfile.map") oboSyns.loadSynFiles((args.mine_path, args.datadir)) allfiles = glob.glob(resultBase + "/*.index") allfileIDs = [os.path.basename(x).replace(".index", "") for x in allfiles] allfileIDs = sorted(allfileIDs, reverse=True) #allfileIDs = [894] celloObo = GeneOntology(args.obo.name) def getTerm(synid, obo): if synid in obo.dTerms: return obo.getID(synid) synid = synid.replace('_', ':', 1) return obo.getID(synid) def analyseFile(splitFileIDs, env): fileCoocs = [] for splitFileID in splitFileIDs:
from collections import defaultdict from synonymes.GeneOntology import GeneOntology from synonymes.Synonym import Synonym from synonymes.SynonymUtils import handleCommonExcludeWords from utils.idutils import dataDir, loadExludeWords, printToFile, speciesName2TaxID import math celloObo = GeneOntology(dataDir + "miRExplore/cellosaurus/cellosaurus.obo") tax2cells = defaultdict(set) allowedTaxIDs = set([str(speciesName2TaxID[x]) for x in speciesName2TaxID]) xref2cello = defaultdict(set) considerXRefs = ['CL', 'CLO', 'DOID', 'UBERON'] for cellID in celloObo.dTerms: oboNode = celloObo.dTerms[cellID] oboXRefs = oboNode.xref taxID = {'all'} if oboXRefs != None: for xref in oboXRefs: if xref.startswith('NCBI_TaxID'): newTaxID = xref.split(' ')[0].split(':')[1] if newTaxID in allowedTaxIDs: taxID.add(newTaxID)
from collections import defaultdict
import sys, os
# Make the project checkout importable before the project-local imports below.
sys.path.insert(0, str(os.path.dirname("/mnt/d/dev/git/miRExplore/python/")))
from synonymes.GeneOntology import GeneOntology
from synonymes.Synonym import Synonym
from synonymes.SynonymUtils import handleCommonExcludeWords
from utils.idutils import dataDir, loadExludeWords, printToFile, speciesName2TaxID

# The OBO file to process is given as the first command line argument.
filepath = sys.argv[1]
fileObo = GeneOntology(filepath)

# NOTE(review): namespace2syn and allowedTaxIDs are not used in the visible
# part of this script - presumably consumed further down; confirm.
namespace2syn = defaultdict(set)
# String form of every value in speciesName2TaxID.
allowedTaxIDs = set([str(speciesName2TaxID[x]) for x in speciesName2TaxID])

# Collect every term node of the ontology.
allNodes = []
for cellID in fileObo.dTerms:
    oboNode = fileObo.dTerms[cellID]
    allNodes.append(oboNode)

# Exclude words with all four category switches turned off.
globalKeywordExcludes = loadExludeWords(common=False, cell_co=False, disease=False, generic=False)

# Debug output: report every key whose value contains 'membrane'.
for x in globalKeywordExcludes:
    if 'membrane' in globalKeywordExcludes[x]:
        print("Membrane: " + x)