import glob
import re

import oboparser
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache


def load_ontologies():
    """Load all of the ontologies into a single dictionary keyed by ontology id."""
    # A dictionary mapping HBP ontology ids to their parsed .obo ontology terms.
    big_onto = {}
    mcc = MouseConnectivityCache()
    aba_onto = mcc.get_ontology()
    # onto_root is assumed to be a module-level path to the ontology files.
    file_name_list = [f for f in glob.glob(onto_root + "*.robo")]
    file_name_list.extend([f for f in glob.glob(onto_root + "*.obo")])
    for fn in file_name_list:
        for o in oboparser.parse(fn):
            if 'synonym' in o:
                for s in o['synonym']:
                    if "BROAD ACRONYM" in s:
                        acro = re.search(r"\w+", s).group()
                        o['acronym'] = acro
            if 'id' in o:
                big_onto[o['id']] = o
    # Fill in acronyms for ABA regions from the Allen Brain Atlas ontology.
    for k in big_onto.keys():
        if 'ABA' in k:
            new_o = big_onto[k]
            aba_id = int(k[11:])
            new_o['acronym'] = aba_onto[aba_id]['acronym'].item()
            big_onto[k] = new_o
    return big_onto

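# A minimal usage sketch for load_ontologies() above. The specific ontology id
# shown is hypothetical (an 11-character prefix followed by digits, matching the
# int(k[11:]) slicing); the 'acronym' key is set by the function itself.
big_onto = load_ontologies()
term = big_onto.get('ABA_REGION:0000315')  # hypothetical key
if term is not None:
    print('{} {}'.format(term['name'], term.get('acronym')))
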
def __init__(self):
    self.onto_id2layer_numbers = {}  # 'HBP_LAYER:0000123' -> [123]

    def onto_id2layer_number(o):
        # 'HBP_LAYER:' is 10 characters, so the remainder is the layer number.
        return int(o[10:])

    for o in oboparser.parse(onto_root + 'hbp_layer_ontology.robo'):
        if 'id' in o:
            onto_id = o['id']  # e.g. HBP_LAYER:0000001
            assert len(onto_id) == 17, 'invalid onto_id: {} in {}'.format(onto_id, o)
            if onto_id2layer_number(onto_id) < 8:
                # HBP_LAYER:0000001 to 7: just resolve to the layer number
                self.onto_id2layer_numbers[onto_id] = [onto_id2layer_number(onto_id)]
            elif 'union_of' in o:
                # collect the layer numbers of all 'union_of' ids
                self.onto_id2layer_numbers[onto_id] = [
                    onto_id2layer_number(u) for u in o['union_of']
                ]
            elif 'is_a' in o:
                # e.g. L5a is_a L5; to simplify, resolve to L5
                # not happy with this - SJT
                self.onto_id2layer_numbers[onto_id] = [
                    onto_id2layer_number(onto_id),
                    onto_id2layer_number(o['is_a'][0]),
                ]
            else:
                raise Exception('invalid layer entry: {}'.format(o))

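# A quick, self-contained check of the id-slicing convention used above:
# o[10:] drops the 10-character 'HBP_LAYER:' prefix, leaving the padded number.
def onto_id2layer_number(o):
    return int(o[10:])

assert onto_id2layer_number('HBP_LAYER:0000005') == 5
assert onto_id2layer_number('HBP_LAYER:0000123') == 123
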
def test_parse(self):
    obo_file = cfg['onto_root'] + 'hbp_layer_ontology.robo'
    obo = list(oboparser.parse(obo_file))
    self.assertTrue(len(obo) > 10)
    print(obo[0])

def get_cell_names(obo_onto_file):
    cell_names = {}
    for stanza in oboparser.parse(obo_onto_file):
        if stanza['@type'] != "Term":
            continue
        term_id = stanza["id"]
        if term_id == '':
            continue
        name = stanza["name"]
        synonyms = []
        for s in stanza["synonym"]:
            # SYNONOYM_NAME is assumed to be a module-level helper (likely a
            # regex match) that extracts the quoted synonym text.
            synonyms.append(SYNONOYM_NAME(s).group(1))
        cell_names[term_id] = (name, synonyms)
    return cell_names

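# Minimal usage sketch for get_cell_names(); the .obo file name is hypothetical.
# Each value is a (name, synonyms) tuple keyed by term id, as built above.
cell_names = get_cell_names('cell_ontology.obo')  # hypothetical path
for term_id, (name, synonyms) in cell_names.items():
    print('{}: {} {}'.format(term_id, name, synonyms))
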
def main(parser):
    # Initialize logging
    twiggy_setup()

    # Parse our ontology file using our oboparser
    disease_ontology = oboparser.parse(parser.input_obo_file, parser.typedefs)

    # Load the OBO file into neo4j. This is performed in two steps: first the
    # base nodes are loaded into the neo4j database, then the relationships
    # between them are created.
    gdb = GraphDatabase(parser.neo4j_server_address)
    nodeMapping = load_ontology_to_neo4jdb(gdb, disease_ontology)
    create_term_relationships(gdb, nodeMapping)
    create_root_node_index(parser.root_node, gdb)

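# A hedged sketch of the node-loading step described above, assuming the
# neo4jrestclient GraphDatabase API (gdb.nodes.create). The real
# load_ontology_to_neo4jdb may differ; this only illustrates the
# term-id -> node mapping handed to the relationship pass.
def load_ontology_to_neo4jdb_sketch(gdb, ontology):
    node_mapping = {}
    for term in ontology.get_terms():
        node = gdb.nodes.create(term_id=term.id, name=term.name)
        node_mapping[term.id] = node
    return node_mapping
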
def _indexOntologies(elasticsearch_host, ontology_file, ontology_type):
    failed_ontologies = []
    documents = []
    updated_ontologies = {}
    try:
        for term in oboparser.parse(ontology_file, ['is_a', 'part_of']):
            try:
                if term.obsolete:
                    continue
                document = _getDocumentFromOntology(term)
                document['type'] = ontology_type
                retval = _indexDocument(elasticsearch_host, document, index,
                                        'term', document['term_id'])
                if retval is None:
                    raise Exception(str(retval))
            except Exception as err:
                failed_ontologies.append({'name': term.id, 'error': str(err)})
                raise err
    except Exception as err:
        raise err

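# Hedged sketch of the document shape _getDocumentFromOntology is assumed to
# produce, inferred from the term attributes used across these snippets
# (term.id, term.name, term.definition, term.synonyms); the real helper may differ.
def _getDocumentFromOntology_sketch(term):
    return {
        'term_id': term.id,
        'name': term.name,
        'definition': term.definition,
        'synonyms': [s[0] for s in term.synonyms],
    }
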
import csv
import getopt
import sys
import urllib

import oboparser


def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.error, msg:
            raise Usage(msg)
        do_filename = 'HumanDO.obo'
        do_file = open(do_filename, 'w')
        do_csv = csv.writer(open('HumanDO.csv', 'w'))
        print "Downloading HumanDO file..."
        do_svn = urllib.urlopen('http://diseaseontology.svn.sourceforge.net/svnroot/diseaseontology/trunk/HumanDO.obo')
        do_text = do_svn.read()
        print "Writing to file..."
        do_file.write(do_text)
        do_file.close()
        print "Parsing data..."
        do_data = oboparser.parse(do_filename, ['is_a'])
        for term in do_data.get_terms():
            do_csv.writerow([
                term.id,
                term.name,
                term.definition,
                str([synonym[0] for synonym in term.synonyms
                     if synonym[1] == 'EXACT']).strip("[]"),
                str([relationship[2]
                     for relationship in term.relationships]).strip("[]"),
            ])
    except Usage, err:
        # The snippet's outer try was truncated; this handler follows the
        # conventional Usage pattern and is an assumption.
        print >> sys.stderr, err
        return 2

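# A tiny, self-contained restatement of the synonym filter used above, assuming
# term.synonyms holds (text, scope) tuples as the comprehension implies.
synonyms = [('malignant neoplasm', 'EXACT'), ('tumour', 'BROAD')]
exact = [text for text, scope in synonyms if scope == 'EXACT']
assert exact == ['malignant neoplasm']
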
import os

import lucene
import oboparser


def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    # Handle a little bit of lucene setup.
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    filename, _ext = os.path.splitext(os.path.basename(oboFile))
    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index the same file twice or to index two files with the same name')
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue
        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id,
                              lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name,
                              lucene.Field.Store.YES, lucene.Field.Index.ANALYZED, 4.0)
        # The definition text frequently contains URLs or other hyperlinks that
        # could match queries we do not want, errantly increasing the field's
        # score, so we strip the hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES, lucene.Field.Index.ANALYZED, 0.4)
        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all
        # represented as lists in our Ontology object and need to be entered
        # one at a time.
        add_fields_to_document(doc, "synonym", [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "xref",
                               [replace_xref_identifier(x, xref_map) for x in term.xrefs],
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "relationship",
                               [" ".join(list(x)) for x in list(term.relationships)],
                               lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()

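# Hedged sketch of the add_field_to_document / add_fields_to_document helpers
# used above, inferred from their call sites (doc, field name, value, store,
# index, optional boost) and assuming the PyLucene 3.x Field API; the real
# helpers may differ.
def add_field_to_document(doc, field_name, value, store, index, boost=1.0):
    field = lucene.Field(field_name, value or "", store, index)
    field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, field_name, values, store, index, boost=1.0):
    for value in values:
        add_field_to_document(doc, field_name, value, store, index, boost)
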
import csv
import sys

import oboparser

filename = sys.argv[1]
reader = csv.reader(open(filename, 'r'))
writer = csv.writer(open(filename.replace('.csv', '-tagged.csv'), 'w'))
cases = {}

do_filename = 'HumanDO.obo'
do_data = oboparser.parse(do_filename, ['is_a'])
print do_data.get_terms()

for row in reader:
    case_title = row[0].strip('" ')
    print case_title.lower(), do_data.get_term(case_title.lower())