def _get_relationships(uri, system):
    """Parses a list of `Relationship` objects.

    Args:
        uri: a URI for the ontology file on the local filesystem
        system: the target system

    Returns:
        yields relationships
    """
    ontology = Ontology(uri.path)

    # child to parent relationships
    for term in ontology:
        for child in term.children:
            yield _to_relationship(child, "subsumes", term, system)

    # parent to child relationships
    for term in ontology:
        for parent in term.parents:
            yield _to_relationship(parent, "specializes", term, system)

    # scoped relationships from the remaining term annotations
    for term in ontology:
        for scope, references in term.other.items():
            if scope in _SCOPE_TO_EQUIVALENCE:
                relationship = _to_equivalence_from_scope(scope)
                for reference in references:
                    yield _to_relationship(term, relationship,
                                           ontology[reference], system)
            if scope in _SCOPE_TO_INVERSE_SCOPE:
                inverse = _to_inverse_scope(scope)
                relationship = _to_equivalence_from_scope(inverse)
                for reference in references:
                    yield _to_relationship(ontology[reference], relationship,
                                           term, system)
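# A self-contained sketch of the same child/parent traversal against the
# modern pronto (>= 2.0) API, where `Term.children`/`Term.parents` became
# `subclasses()`/`superclasses()`. The file path is a placeholder, and the
# plain tuples stand in for the `_to_relationship` helper, which is not shown.
from pronto import Ontology

def iter_hierarchy_edges(path):
    ontology = Ontology(path)
    for term in ontology.terms():
        for child in term.subclasses(with_self=False, distance=1):
            yield (child.id, "subsumes", term.id)
        for parent in term.superclasses(with_self=False, distance=1):
            yield (parent.id, "specializes", term.id)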
def main(data_directory):
    input_endpoint_path = data_directory / INPUT_ENDPOINT_FILE
    input_ontology_path = data_directory / INPUT_ONTOLOGY_FILE
    output_path = data_directory / OUTPUT_FILE
    prechecks(input_endpoint_path, input_ontology_path, output_path)

    endpoint_doids, endpoint_mesh = map_endpoint_doids_mesh(input_endpoint_path)
    all_endpoints = set(endpoint_doids).union(endpoint_mesh)

    logger.info("Parsing ontology file to map EFO->[]DOIDS")
    path = str(input_ontology_path)  # 'Ontology' takes only str as input
    ontology = Ontology(path)
    efo_doids = map_efo_doids(ontology)

    endpoint_efos = map_endpoint_efos(endpoint_doids, efo_doids)
    endpoint_refs = get_endpoint_refs(all_endpoints, ontology,
                                      endpoint_efos, endpoint_mesh)

    # Merge all the references into one coherent data structure
    out = merge(endpoint_doids, endpoint_mesh, endpoint_refs)

    logger.info(f"Writing endpoint refs to file {output_path}")
    with open(output_path, "x") as f:
        json.dump(out, f)

    logger.info("Done.")
def validate(self, mzqc: MzQcFile):
    """Semantic validation of the JSON file."""
    # Load the mzqc file specific ontologies
    cvs: Dict[str, TermList] = dict()
    for cv in mzqc.controlled_vocabularies:
        try:
            cvs[cv.ref] = Ontology(cv.uri, False)
        except Exception:
            raise SemanticError(f'Failed to load cv {cv.name} from {cv.uri}. '
                                f'Does {cv.ref} exist?')

    # For all cv terms involved:
    for cv_parameter in self._get_cv_parameters(mzqc):

        # Verify that cvRefs are valid.
        if cv_parameter.cvRef not in cvs:
            raise SemanticError(f'Unknown CV reference <{cv_parameter.cvRef}> in '
                                f'element `{str(type(cv_parameter))}`')

        # Verify that the term exists in the CV.
        cv_term = cvs[cv_parameter.cvRef].get(cv_parameter.accession)
        if cv_term is None:
            raise SemanticError(f'Term {cv_parameter.name} not found in '
                                f'CV <{cv_parameter.cvRef}>')

        # Verify that the term name is correct.
        elif cv_parameter.name != cv_term.name:
            raise SemanticError(
                f'Incorrect name for CV term {cv_parameter.accession}: '
                f'"{cv_parameter.name}" != "{cv_term.name}"')

    # Regarding metadata, verify that input files are consistent and unique.
    self._inputFileConsistency(mzqc.run_qualities)
    self._inputFileConsistency(mzqc.set_qualities)

    # For all metrics (which are based on a cv type), verify they come from
    # the QC CV.
    if "Proteomics Standards Initiative Quality Control Ontology" not in \
            [cv.name for cv in cvs.values()]:
        raise SemanticError('Quality Control Ontology missing!')

    keys = [ref for ref, cv in cvs.items()
            if cv.name == "Proteomics Standards Initiative Quality Control Ontology"]
    if len(keys) != 1:
        raise SemanticError('More than one QC CV.')

    qc_ref = keys[0]
    metric_cvs: List[Term] = cvs[qc_ref]["QC:4000001"].rchildren()

    for run_or_set_quality in chain(mzqc.run_qualities, mzqc.set_qualities):
        # Verify that quality metrics are unique within a run/setQuality.
        accessions: Set[str] = set()
        for quality_metric in run_or_set_quality.quality_metrics:
            if quality_metric.accession in accessions:
                raise ValidationError(f'Duplicate quality metric: '
                                      f'accession = {quality_metric.accession}')
            accessions.add(quality_metric.accession)

            # Verify that quality_metric actually is of metric type/relationship.
            cv_term = cvs[quality_metric.cvRef].get(quality_metric.accession)
            if cv_term is None or cv_term not in metric_cvs:
                raise SemanticError('Non-metric CV used in metric context.')
def extract_concepts_from_ontologies(ontologies):
    """Extracts concepts from ontologies, given a list of ontology files."""
    all_concepts = []
    for ontology in ontologies:
        concepts = [
            split_by_camel_case(term)
            for term in Ontology(ontology).terms.keys()
        ]
        all_concepts.extend(concepts)
    return all_concepts
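# `split_by_camel_case` is assumed above but not shown; a minimal
# regex-based sketch that splits ontology term names such as
# "PizzaTopping" into "Pizza Topping":
import re

def split_by_camel_case(term):
    return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?![a-z])|\d+", term))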
def load_mondo_graph(lan='en', use_cache=True):
    fn_cache = os.path.join(PATH_CACHE, f'mondo-{lan}.gfx')
    if use_cache and os.path.isfile(fn_cache):
        return MondoGraph.load(fn_cache)
    fn_obo = os.path.join(PATH_FILES, f'mondo-{lan}.obo')
    mondo = Ontology(fn_obo)
    gfx = MondoGraph(mondo)
    gfx.save(fn_cache)
    return gfx
def load_hpo_graph(lan='en', use_cache=True):
    fn_cache = os.path.join(PATH_CACHE, f'hp-{lan}.gfx')
    if use_cache and os.path.isfile(fn_cache):
        return HPOGraph.load(fn_cache)
    fn_obo = os.path.join(PATH_FILES, f'hp-{lan}.obo')
    hpo = Ontology(fn_obo)
    gfx = HPOGraph(hpo)
    gfx.save(fn_cache)
    return gfx
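# The two loaders above differ only in the file prefix and the graph class;
# a generic sketch (helper name assumed, reusing the module's PATH_CACHE and
# PATH_FILES constants) that both could delegate to:
def _load_graph(prefix, graph_cls, lan='en', use_cache=True):
    fn_cache = os.path.join(PATH_CACHE, f'{prefix}-{lan}.gfx')
    if use_cache and os.path.isfile(fn_cache):
        return graph_cls.load(fn_cache)
    gfx = graph_cls(Ontology(os.path.join(PATH_FILES, f'{prefix}-{lan}.obo')))
    gfx.save(fn_cache)
    return gfx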
def _multiparse(multi_in, return_list):
    # multi_in bundles (filepath, parsers-by-extension, ontologies-by-extension)
    print('Parsing file: {}'.format(multi_in[0]))
    ext = multi_in[0].split(os.path.extsep)[-1]
    parser = multi_in[1][ext]
    ont = multi_in[2][ext]
    p = parser(multi_in[0], ont).meta_isa
    return_list.append(p)
    return p
def _multiparse(filepath, metalist, win):
    dirname = os.path.dirname(os.path.realpath(__file__))
    if not any(x in sys.argv for x in ('-h', '--help', '--version')):
        _ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
        _ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
        # fold the MS terms into the imaging ontology
        _ims.terms.update(_ms.terms)
    else:
        _ms, _ims = None, None

    PARSERS = {'mzML': mzml.mzMLmeta, 'imzML': mzml.imzMLmeta}
    ONTOLOGIES = {'mzML': _ms, 'imzML': _ims}

    print('Parsing file: {}'.format(filepath))
    ext = filepath.split(os.path.extsep)[-1]
    parser = PARSERS[ext]
    ont = ONTOLOGIES[ext]
    meta = parser(filepath, ont).meta
    metalist.append(meta)
def expand_ontologies(self, to_strings=False):
    self = copy(self)
    for name, (ontology_path, id_mapper) in self.ontologies_data.items():
        ontology = Ontology(ontology_path)
        converter = (lambda entry: entry.name) if to_strings else identity
        self[name] = Series(
            tuple({
                converter(ontology[id_mapper(ontology_id)])
                for ontology_id in row.mapped_ontology_terms
                if id_mapper(ontology_id) in ontology
            })
            for row in self.itertuples()
        ).values
    return MetaSRA(self)
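# `identity` is assumed above; the trivial sketch (it keeps the pronto Term
# objects as-is when `to_strings` is False):
def identity(x):
    return x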
def _multiparse(multi_in):
    # multi_in bundles (filepath, parsers-by-extension, ontologies-by-extension)
    print('Parsing file: {}'.format(multi_in[0]))
    ext = multi_in[0].split(os.path.extsep)[-1]
    parser = multi_in[1][ext]
    ont = multi_in[2][ext]
    p = parser(multi_in[0], ont).meta_isa
    return p
def get_ont(self):
    """Return an Ontology object that is ready to use."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        try:
            return Ontology(self.ontology_link)
        except ValueError:
            # Failed to parse
            raise exceptions.FailToParseOntologyException()
        except Exception:
            # `from None` suppresses the chained exception context.
            raise exceptions.NoInternetConnectionException() from None
def convert_to_json(input="GFOP.owl", output="GFOP.json"):
    # create a root to bundle everything
    root: Node = Node("GFOP")
    nodes = {}

    # read owl file and cache all nodes in a dict{name: node}
    obo = Ontology(input)
    for term in obo.terms():
        term_id = term.id
        name = term.name
        # find parents at distance 1 (excluding self)
        parent_terms = term.superclasses(with_self=False, distance=1).to_set()
        if not parent_terms:
            # no parent: will be attached to the artificial root later
            nodes[name] = Node(name, id=term_id)
        else:
            # currently only uses one parent
            parent = parent_terms.pop()
            nodes[name] = Node(name, id=term_id, parent_id=parent.id,
                               parent_name=parent.name)

    # link all nodes to their parents
    for key, node in nodes.items():
        if key != root.name:
            try:
                # find parent in cached nodes and set to node
                node.parent = nodes[node.parent_name]
            except AttributeError:
                # no parent - add to root
                node.parent = root

    # generate json string
    exporter = JsonExporter(indent=2, sort_keys=True)
    json_str = exporter.export(root)

    # print json and tree for debugging
    print(json_str)
    for pre, _, node in RenderTree(root):
        print("%s%s" % (pre, node.name))

    # export to json file
    print("Writing to {}".format(output))
    with open(output, "w") as file:
        print(json_str, file=file)
def extract_therapeutic_areas_from_owl() -> pd.DataFrame:
    """Parses all EFO IDs and their therapeutic areas from the EFO OTAR SLIM
    OWL file and returns them as a dataframe."""
    owl_url = fetch_otar_owl_from_github('latest')
    efo_terms = Ontology(owl_url, timeout=10).terms()

    owl_parsed = []
    for term in efo_terms:
        # The TAs are extracted by iterating through the ancestors of a term
        # and looking them up in THERAPEUTIC_AREAS.
        therapeutic_areas = []
        for ancestor in term.superclasses():
            ancestor_id = normalise_ontology_identifier(ancestor.id)
            if ancestor_id in THERAPEUTIC_AREAS['id']:
                therapeutic_areas.append(ancestor_id)
        efo_id = normalise_ontology_identifier(term.id)
        owl_parsed.append((efo_id, therapeutic_areas))

    return pd.DataFrame(owl_parsed, columns=['efo_id', 'therapeutic_areas'])
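# `normalise_ontology_identifier` is assumed above but not shown; a plausible
# sketch, given that terms parsed from OWL carry full IRIs such as
# "http://www.ebi.ac.uk/efo/EFO_0000616" while the lookup table presumably
# stores short forms like "EFO_0000616":
def normalise_ontology_identifier(identifier: str) -> str:
    return identifier.rsplit('/', 1)[-1].replace(':', '_')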
def extractHypernymsFromOntology(ontology):
    ont = Ontology(ontology)
    allConcepts = []
    dictelem = {}
    for term in ont:
        allConcepts.append(term)
        if term.children:
            # str(term) looks like '<Id: name>'; keep the bare id without '<'
            term_id = str(term).split(":")[0][1:]
            dictelem[term_id] = ",".join(
                str(child).split(":")[0][1:] for child in term.children)

    # camel-case split both the parent ids and their child ids
    finalDict = {}
    for elem in dictelem:
        newelem = camel_case_split(elem)
        ls = dictelem[elem].split(",")
        finalDict[newelem] = ",".join(camel_case_split(el) for el in ls)

    hypernymsList = []
    for elem in finalDict:
        hypernymsList.extend((elem, val) for val in finalDict[elem].split(","))
    return (hypernymsList, allConcepts)
def get_relationships(self):
    """Parses a list of `Relationship` objects.

    Returns:
        yields `Relationship`s in JSON form
    """
    ontology = Ontology(self.uri.path)

    # child to parent relationships
    for term in ontology:
        for child in term.children:
            yield _to_relationship(child, "subsumes", term)

    # parent to child relationships
    for term in ontology:
        for parent in term.parents:
            yield _to_relationship(parent, "specializes", term)

    # alt_id relationships
    if not self.skip_alt_ids:
        for term in ontology:
            for other, values in term.other.items():
                if other == 'alt_id':
                    for value in values:
                        target = Term(id=value, name=term.name, desc=term.desc)
                        yield _to_relationship(term, "equal", target)

    # synonym relationships
    if not self.skip_synonyms:
        for term in ontology:
            for synonym in term.synonyms:
                target = Term(id=term.id, name=synonym.desc, desc=term.desc)
                equivalence = _to_equivalence_from_scope(synonym.scope)
                yield _to_relationship(term, equivalence, target)
def createSubsetFor(self, classes_in):
    self.classes_in = set(classes_in)
    print(len(classes_in))

    with open("classes_in.txt", 'w') as outfile:
        for c in self.classes_in:
            outfile.writelines(c + "\n")

    rw = robot_wrapper.RobotWrapper(
        robotcmd='/Users/hastingj/Work/Onto/robot/robot')
    # get_ontology_cmd = 'curl -L http://purl.obolibrary.org/obo/chebi.obo > chebi.obo'
    # rw.__executeCommand__(get_ontology_cmd)

    extract_cmd = [
        rw.robotcmd,
        "extract --method MIREOT",
        "--input chebi.obo",
        "--lower-terms classes_in.txt",
        "--intermediates minimal",
        "--output chebi-slim.obo"
    ]
    rw.__executeCommand__(" ".join(extract_cmd))

    self.chebislim = Ontology("chebi-slim.obo")
def ontologyExtractor(ontName):
    '''Extracts raw (property, entity) RDF triples from an ontology, given
    its file name.'''
    ont = Ontology(ontName)
    ontology = []
    for term in list(ont.terms()):
        subclasses = list(term.subclasses())
        ontology.extend((subclass.name, "subclass_of", term.name)
                        for subclass in subclasses)
    # deduplicate relations and drop self-references
    ontology = {relation for relation in ontology if relation[0] != relation[-1]}
    # keep only relations that touch the "Pizza" concept
    ontology = [rel for rel in ontology if rel[0] == "Pizza" or rel[-1] == "Pizza"]
    # rewrite as (hypernym|hyponym, entity) pairs
    ontology = [("hypernym", rel[-1]) if rel[0] == "Pizza"
                else ("hyponym", rel[0] + " Pizza") for rel in ontology]
    return ontology
import os
import sys
import warnings

from pronto import Ontology

import mzml2isa
import mzml2isa.isa as isa
import mzml2isa.mzml as mzml
from mzml2isa.versionutils import longest_substring

_PARSERS = {'mzML': mzml.mzMLmeta, 'imzML': mzml.imzMLmeta}

# change the ontology and start extracting imaging specific metadata
warnings.simplefilter('ignore')
dirname = os.path.dirname(os.path.realpath(__file__))

if not any(x in sys.argv for x in ('-h', '--help', '--version')):
    _ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
    _ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
    # fold the MS terms into the imaging ontology
    _ims.terms.update(_ms.terms)
else:
    _ms, _ims = None, None

_ONTOLOGIES = {'mzML': _ms, 'imzML': _ims}

del dirname


def _multiparse(filepath):
    print('Parsing file: {}'.format(filepath))
    parser = _PARSERS[filepath.split(os.path.extsep)[-1]]
import json
import os
from collections import Counter

import numpy as np
import torch
from pronto import Ontology
from scipy import spatial
from transformers import AutoTokenizer, AutoModel

# File paths need to be set here.
# They are not passed via argument to the program.
_dev_path = "BioNLP-OST-2019_BB-norm_dev/"
_train_path = "BioNLP-OST-2019_BB-norm_train/"
_test_path = "BioNLP-OST-2019_BB-norm_test/"
_obo_base = Ontology("OntoBiotope_BioNLP-OST-2019.obo")
_train_set_output_folder = "train_results/"
_dev_set_output_folder = "dev_results/"
_test_set_output_folder = "test_results/"

# The [:-2] at the end excludes the LICENCE and README files.
_dev_files = sorted(list(os.walk(_dev_path))[0][2])[:-2]
_train_files = sorted(list(os.walk(_train_path))[0][2])[:-2]
_test_files = sorted(list(os.walk(_test_path))[0][2])[:-2]

###########################################################################
#                            DATA STRUCTURES                              #
###########################################################################
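# The next snippet calls `get_term2id`, which is not shown; a minimal sketch
# consistent with its call sites (a name->id mapping plus the list of term
# names), assuming a pronto >= 2 Ontology:
def get_term2id(ontology):
    term2id = {term.name: term.id for term in ontology.terms() if term.name}
    return term2id, list(term2id)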
efo = {}
efo_terms = []
efo_term2id = {}
with open('ontologies/efo.obo') as efo_file:
    efo_file = efo_file.read()
line_iterator = iter(efo_file.splitlines())
line = next(line_iterator, None)
while line is not None:
    if line == '[Term]':
        curr_id = next(line_iterator)[4:]        # strip the 'id: ' prefix
        if curr_id.startswith('EFO'):
            curr_name = next(line_iterator)[6:]  # strip the 'name: ' prefix
            efo_terms.append(curr_name)
            efo[curr_id] = curr_name
            efo_term2id[curr_name] = curr_id
    line = next(line_iterator, None)

cl = Ontology('ontologies/cl-basic.obo')
cl_term2id, cl_terms = get_term2id(cl)
cl_dict = {
    'ontology': cl,
    'terms': cl_terms,
    'terms_lower': [t.lower() for t in cl_terms],
    'term2id': cl_term2id,
    'column': 'cell_type'
}

uberon = Ontology('ontologies/uberon-basic.obo')
uberon_term2id, uberon_terms = get_term2id(uberon)
biomaterial2onto = {
    'primary cell': cl_dict,
    'primary cell culture': cl_dict,
    'primary tissue': {
        'ontology': uberon,
import csv
import string

import spacy
from pronto import Ontology

onto_path = '/content/drive/My Drive/bio_files/OntoBiotope_BioNLP-OST-2019 (1).obo'
onto = Ontology(onto_path)

# assumed: `nlp` is a loaded spaCy pipeline (the model name is a guess; the
# original excerpt uses `nlp` without defining it)
nlp = spacy.load("en_core_web_sm")


def remove_stopwords(sentence):
    non_stop_words = [word.text for word in sentence if not word.is_stop]
    return nlp(' '.join(non_stop_words))


def remove_non_ascii(text):
    return nlp(' '.join([
        token.text for token in text
        if all(letter in string.ascii_letters for letter in token.text)
    ]))


def load_entities(entities_loc):
    names = dict()
    descriptions = dict()
    test = []
    entities = []
    with open(entities_loc, 'r') as csvfile:
        csvreader = csv.reader(csvfile, delimiter="\t")
        for row in csvreader:
            qid = row[0]
            name = row[1]
from pronto import Ontology

cl = Ontology("CSO.owl")
print(cl)
def to_atomese(owlfile):
    onto = Ontology(owlfile)
    basename = owlfile.split('.')[0]
    with open(basename + '.scm', 'w') as output:
        import_terms(onto, output)
        output.write(import_meta(onto, basename))
import pandas as pd
from pronto import Ontology

cl = Ontology("http://purl.obolibrary.org/obo/cl.obo")

# for ARGs (ARO for antibiotic resistance ontology -- aro.owl from
# https://card.mcmaster.ca/download)
aro = Ontology.from_obo_library("aro.owl")

# exploring the ontology
cf = aro['confers_resistance_to_antibiotic']
t = aro['ARO:1000001']
list(t.objects(cf))
list(t.superclasses())
list(t.subclasses())

# change format to obo
with open('aro.obo', 'wb') as f:
    aro.dump(f, format='obo')

# find leaf terms
aro = Ontology("aro.obo")
for term in aro.terms():
    if term.is_leaf():
        print(term.id)

# load resfinder sequences for matching; the file appears to carry a header
# row ('#Aminoglycoside'), so header=None would break the lookup below
resfinder_seq = pd.read_csv("resfinder.csv", sep=" ")

aro2seq = {}
for a in set(resfinder_seq['#Aminoglycoside']):
    if a not in aro:
        continue
    t = aro[a]
def full_parse(in_dir, out_dir, study_identifier,
               usermeta=None, split=True, merge=False,
               verbose=False, multip=False):
    """Parses every study from *in_dir* and then creates ISA files.

    A new folder is created in the out directory bearing the name of the
    study identifier.

    :param str in_dir: path to directory containing studies
    :param str out_dir: path to out directory
    :param str study_identifier: name of the study (directory to create)
    """
    dirname = os.path.dirname(os.path.realpath(__file__))
    if not any(x in sys.argv for x in ('-h', '--help', '--version')):
        ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
        ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
        # fold the MS terms into the imaging ontology
        ims.terms.update(ms.terms)
    else:
        ms, ims = None, None

    PARSERS = {'mzML': mzml.mzMLmeta, 'imzML': mzml.imzMLmeta}
    ONTOLOGIES = {'mzML': ms, 'imzML': ims}

    # collect the mzML files, unpacking tar/zip archives if needed
    if os.path.isfile(in_dir) and tarfile.is_tarfile(in_dir):
        compr = True
        mzml_files = compr_extract(in_dir, "tar")
    elif os.path.isfile(in_dir) and zipfile.is_zipfile(in_dir):
        compr = True
        mzml_files = compr_extract(in_dir, "zip")
    else:
        compr = False
        mzml_path = os.path.join(in_dir, "*mzML")
        if verbose:
            print(mzml_path)
        mzml_files = [mzML for mzML in glob.glob(mzml_path)]

    manager = Manager()
    metalist = manager.list()

    if mzml_files:
        # store the first mzml_files extension
        if compr:
            ext1 = mzml_files[0].name.split(os.path.extsep)[-1]
        else:
            ext1 = mzml_files[0].split(os.path.extsep)[-1]

        if multip:
            jobs = []
            for i in mzml_files:
                p = Process(target=_multiparse, args=(i, metalist))
                jobs.append(p)
                p.start()
            for proc in jobs:
                proc.join()

        # get meta information for all files
        elif not verbose:
            pbar = pb.ProgressBar(widgets=[
                'Parsing {:8}: '.format(study_identifier),
                pb.FormatLabel('%(value)4d'), '/',
                '%4d' % len(mzml_files),
                pb.Bar(marker=MARKER, left=" |", right="| "),
                pb.ETA()
            ])
            for i in pbar(mzml_files):
                if compr:
                    ext = i.name.split(os.path.extsep)[-1]
                else:
                    ext = i.split(os.path.extsep)[-1]
                parser = PARSERS[ext]
                ont = ONTOLOGIES[ext]
                metalist.append(parser(i, ont).meta)

        else:
            for i in mzml_files:
                print("Parsing file: {}".format(i))
                if compr:
                    ext = i.name.split(os.path.extsep)[-1]
                else:
                    ext = i.split(os.path.extsep)[-1]
                parser = PARSERS[ext]
                ont = ONTOLOGIES[ext]
                metalist.append(parser(i, ont).meta)

        if merge and ext1 == 'imzML':
            if verbose:
                print('Attempting to merge profile and centroid scans')
            metalist = merge_spectra(metalist)

    # update isa-tab file
    if metalist:
        if verbose:
            print("Parsing mzML meta information into ISA-Tab structure")
        isa_tab_create = isa.ISA_Tab(out_dir, study_identifier,
                                     usermeta or {}).write(metalist, ext1, split)
    else:
        warnings.warn("No files were found in {}.".format(in_dir), UserWarning)
from pronto import Ontology

from restapi.config import IMPORT_PATH
from restapi.connectors.neo4j.parser import NodeDump, RelationDump

nodes = NodeDump(
    "HPO",
    fields=["hpo_id:string", "label:string", "description:string"])
relations = RelationDump(
    "HPO", "IS_CHILD_OF", "HPO",
    fields=["hpo_id", "hpo_id"],
    ignore_indexes=True)

cl = Ontology(f"{IMPORT_PATH}/hp.obo")
for hpo in cl:
    nodes.dump(cl[hpo].id, cl[hpo].name, cl[hpo].definition or "N/A")
    for s in list(cl[hpo].subclasses(distance=1)):  # type: ignore
        if cl[hpo].id != s.id:
            relations.dump(cl[hpo].id, s.id)

nodes.store()
relations.store()
def build_vocabularies(dirname='datapackage'):
    edam_onto = Ontology('inputdata/edam.obo')
    # Hacked up Uberon so it will load
    uberon_onto = Ontology('inputdata/human-view.obo')
    obi_onto = Ontology('inputdata/obi.obo')

    fieldnames = ['id', 'name', 'description', 'synonyms']

    with open(dirname + '/data_type.tsv', 'w') as data_type_file:
        data_type_writer = csv.DictWriter(data_type_file,
                                          fieldnames=fieldnames,
                                          delimiter='\t')
        data_type_writer.writeheader()
        for v in set(i for i in edam_types['data_types'].values()
                     if i is not None):
            dt_id = 'http://edamontology.org/' + v.replace(':', '_')
            dt_term = edam_onto.get(dt_id)
            dt_name = dt_term.name
            dt_def = dt_term.definition
            syns = ''
            for s in dt_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            data_type = {
                'id': v,
                'name': dt_name,
                'description': dt_def,
                # Synonyms in EDAM don't provide references
                # 'synonyms': syns}
                'synonyms': None
            }
            data_type_writer.writerow(data_type)

    with open(dirname + '/file_format.tsv', 'w') as file_format_file:
        file_format_writer = csv.DictWriter(file_format_file,
                                            fieldnames=fieldnames,
                                            delimiter='\t')
        file_format_writer.writeheader()
        for v in set(i for i in edam_types['file_formats'].values()
                     if i is not None):
            ff_id = 'http://edamontology.org/' + v.replace(':', '_')
            ff_term = edam_onto.get(ff_id)
            ff_name = ff_term.name
            ff_def = ff_term.definition
            syns = ''
            for s in ff_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            file_format = {
                'id': v,
                'name': ff_name,
                'description': ff_def,
                # Synonyms in EDAM don't provide references
                # 'synonyms': syns}
                'synonyms': None
            }
            file_format_writer.writerow(file_format)

    with open(dirname + '/assay_type.tsv', 'w') as assay_type_file:
        assay_type_writer = csv.DictWriter(assay_type_file,
                                           fieldnames=fieldnames,
                                           delimiter='\t')
        assay_type_writer.writeheader()
        for at_id in set(i for i in assay_types.values() if i is not None):
            at_term = obi_onto.get(at_id)
            at_name = at_term.name
            at_def = at_term.definition
            syns = ''
            for s in at_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            assay_type = {
                'id': at_id,
                'name': at_name,
                'description': at_def,
                # Synonyms semi-broken
                # 'synonyms': syns}
                'synonyms': None
            }
            assay_type_writer.writerow(assay_type)

    with open(dirname + '/anatomy.tsv', 'w') as anatomy_file:
        anatomy_writer = csv.DictWriter(anatomy_file,
                                        fieldnames=fieldnames,
                                        delimiter='\t')
        anatomy_writer.writeheader()
        for an_id in set(i for i in anatomy_dict.values() if i is not None):
            an_term = uberon_onto.get(an_id)
            an_name = an_term.name
            an_def = an_term.definition
            syns = ''
            for s in an_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            anatomy = {
                'id': an_id,
                'name': an_name,
                'description': an_def,
                # Synonyms semi-broken
                # 'synonyms': syns}
                'synonyms': None
            }
            anatomy_writer.writerow(anatomy)
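# The '|'-joined synonym block above repeats four times; a small helper
# (name assumed) capturing the same join-or-None behaviour:
def join_synonyms(term):
    syns = '|'.join(s.description for s in term.synonyms)
    return syns or None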
def handle(self, *args, **options):
    if options['ontology'] == 'HP':
        purl = app_settings.HPO_PURL
    elif options['ontology'] == 'MONDO':
        purl = app_settings.MONDO_PURL
    elif options['ontology'] == 'ONCOTREE':
        purl = app_settings.NCIT_PURL

    logger.info('Downloading {0}...'.format(purl))
    data = Ontology(purl, timeout=10)
    version = data.meta['data-version'][0]

    version_obj, version_created = models.Ontology.objects.get_or_create(
        type=getattr(choices.ONTOLOGY, options['ontology']),
        label=version,
    )

    if not version_created:
        logger.info('Version: {0} already available'.format(version))
    else:
        logger.info('Adding Version: {0}'.format(version))
        logger.info('Adding Terms...')
        relations = []
        for term in data:
            created_by = term.other.get('created_by', None)
            created = term.other.get('creation_date', None)
            alternate_ids = [
                str(self.extract_id(alt_term))
                for alt_term in term.other.get('alt_id', [])
            ]

            try:
                term_obj = models.Term.objects.create(
                    ontology=version_obj,
                    identifier=self.extract_id(term.id),
                    label=term.name if term.name else "",
                    description=term.desc if term.desc else "",
                    created_by=created_by[0] if created_by else "",
                    created=created[0] if created else "",
                    alternate_ids=",".join(alternate_ids),
                )
            except Exception as error:
                msg = '{0} could not be added! Error: {1}'.format(term.id, error)
                logger.error(msg)
                raise Exception(msg)

            # Create Synonym objects
            for synonym in term.synonyms:
                if synonym:
                    models.Synonym.objects.get_or_create(
                        term=term_obj,
                        description=synonym.desc if synonym.desc else "",
                        # NOTE: pronto always returns an array
                        scope=getattr(choices.SYNONYM_SCOPES, synonym.scope),
                    )

            # Create Relationship objects
            for key, value in term.relations.items():
                key = str(key).replace("Relationship('", "").replace("')", "")
                type_obj, type_created = models.RelationshipType.objects.get_or_create(
                    label=key)
                for related_term in value:
                    # NOTE: We want to store both source and id; a few
                    # records don't conform.
                    if related_term.id.startswith(
                            'https://rarediseases.info.nih.gov/diseases/'):
                        logger.warning(
                            'Relationship {0} format does not conform!'.format(
                                related_term.id))
                    else:
                        relations.append({
                            "type": type_obj,
                            "term": term_obj,
                            # NOTE: This is to handle MONDO terms w/ source
                            "related_term": self.extract_id(
                                related_term.id.strip().split()[0])
                        })

            # Create CrossReference objects
            for xref in term.other.get('xref', []):
                if xref:
                    xref_data = xref.split(':', 1)
                    # NOTE: We want to store both source and id; there are
                    # 3 records that don't conform.
                    if len(xref_data) != 2 or xref_data[0].upper() == 'HTTP':
                        logger.warning(
                            'CrossReference: {0} format not supported!'.format(xref))
                    else:
                        models.CrossReference.objects.get_or_create(
                            term=term_obj,
                            source=xref_data[0].split()[0].upper(),
                            source_value=xref_data[1].split()[0],
                        )

        # Create is_a/can_be relationships
        logger.info('Building Relationships...')
        for relation in relations:
            try:
                models.Relationship.objects.create(
                    type=relation['type'],
                    term=relation['term'],
                    related_term=models.Term.objects.get(
                        identifier=relation['related_term'],
                        ontology=version_obj,
                    ),
                )
            except Exception as error:
                logger.warning(
                    'Relationship: {0} was not added. Error: {1}'.format(
                        relation, error))
import sqlite3

from pronto import Ontology
from Bio.SeqUtils import CheckSum
from Bio import SeqIO

faa_path = snakemake.input["faa_file"]
goa_path = snakemake.input["goa"]
go = Ontology(snakemake.input["go_obo"])
go_annotations = open(snakemake.output["go_annotations"], 'w')
uniparcdb = snakemake.input["uniparcdb"]

conn = sqlite3.connect(goa_path)
cursor = conn.cursor()

sqlatt = f'attach database "{uniparcdb}" as uniparc;'
cursor.execute(sqlatt,)

# 1. retrieve uniprot accession from exact match (hash)
# 2. retrieve GO annotations
for record in SeqIO.parse(faa_path, "fasta"):
    checksum = CheckSum.seguid(record.seq)
    sqlq = 'select * from uniparc.uniparc_accession where sequence_hash="%s"' % checksum
    uniparc_id = cursor.execute(sqlq,).fetchall()[0][0]
    print("uid", uniparc_id)
def main():
    mondo = Ontology('mondo.obo')
    dic = extract_omim(mondo)
    with open('omim_index.json', 'w') as fp:
        json.dump(dic, fp, indent=2)
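# `extract_omim` is assumed above but not shown; a plausible sketch that
# indexes MONDO terms by their OMIM cross-references, assuming a
# pronto >= 2 Ontology:
def extract_omim(mondo):
    index = {}
    for term in mondo.terms():
        for xref in term.xrefs:
            if xref.id.startswith('OMIM:'):
                index.setdefault(xref.id, []).append(term.id)
    return index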