Example #1
def load_ontologies():
    """Loads all of the ontologies into a nice dictionary data structure"""

    # Maps HBP ontology IDs to their parsed .obo/.robo term dictionaries
    big_onto = {}
    mcc = MouseConnectivityCache()
    aba_onto = mcc.get_ontology()

    file_name_list = glob.glob(onto_root + "*.robo")
    file_name_list.extend(glob.glob(onto_root + "*.obo"))
    for fn in file_name_list:
        for o in oboparser.parse(fn):
            if 'synonym' in o:
                for s in o['synonym']:
                    if "BROAD ACRONYM" in s:
                        acro = re.search(r"\w+", s).group()
                        o['acronym'] = acro
            if 'id' in o:
                big_onto[o['id']] = o


    # Attach Allen Brain Atlas acronyms to the ABA cross-referenced terms
    for k in big_onto.keys():
        if 'ABA' in k:
            new_o = big_onto[k]
            aba_id = int(k[11:])  # numeric part of the ABA ontology ID
            new_o['acronym'] = aba_onto[aba_id]['acronym'].item()
            big_onto[k] = new_o
    return big_onto
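The snippet above relies on module-level names that are not shown (glob, re, oboparser, onto_root, and MouseConnectivityCache). A minimal sketch of that assumed setup, with onto_root as a placeholder path:

# Assumed module-level setup for load_ontologies(); onto_root is a placeholder path.
import glob
import re

import oboparser
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache

# Directory that holds the *.obo / *.robo ontology files
onto_root = '/path/to/ontologies/'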
Example #2
    def __init__(self):
        self.onto_id2layer_numbers = {}

        # 'HBP_LAYER:0000123' -> 123
        def onto_id2layer_number(o):
            return int(o[10:])

        for o in oboparser.parse(onto_root + 'hbp_layer_ontology.robo'):
            if 'id' in o:
                onto_id = o['id']  # HBP_LAYER:0000001
                assert len(onto_id) == 17, 'invalid onto_id: {} in {}'.format(
                    onto_id, o)
                if onto_id2layer_number(onto_id) < 8:  # HBP_LAYER:0000001 to 7
                    # just resolve to the layer number
                    self.onto_id2layer_numbers[onto_id] = [
                        onto_id2layer_number(onto_id)
                    ]
                elif 'union_of' in o:
                    # collect the layer numbers of all 'union_of' IDs
                    self.onto_id2layer_numbers[onto_id] = [
                        onto_id2layer_number(u) for u in o['union_of']
                    ]
                elif 'is_a' in o:  # e.g. L5a is_a L5
                    # to simplify, resolve to L5
                    # not happy with this - SJT
                    self.onto_id2layer_numbers[onto_id] = [
                        onto_id2layer_number(onto_id),
                        onto_id2layer_number(o['is_a'][0])
                    ]
                else:
                    raise Exception('invalid layer entry: {}'.format(o))
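The surrounding class is not shown; as a usage sketch (the class name below is illustrative, not from the original code), the mapping built in __init__ can be queried like this:

# Hypothetical usage of the mapping built above; 'LayerResolver' is a
# placeholder name for the class that owns this __init__.
resolver = LayerResolver()
# Plain layer terms (HBP_LAYER:0000001 to 7) resolve to a single layer number.
print(resolver.onto_id2layer_numbers['HBP_LAYER:0000001'])  # [1]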
Example #3
    def test_parse(self):
        obo_file = cfg['onto_root'] + 'hbp_layer_ontology.robo'
        obo = list(oboparser.parse(obo_file))
        self.assertTrue(len(obo) > 10)
        # for o in obo:
        #     print(o)
        print(obo[0])
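The test method above is extracted from a test class that is not shown; a minimal harness under that assumption (the class name and cfg dict below are placeholders) could look like:

# Minimal unittest harness assumed around the test above; 'OboParserTestCase'
# and the cfg values are placeholders, not names from the original project.
import unittest

import oboparser

cfg = {'onto_root': '/path/to/ontologies/'}

class OboParserTestCase(unittest.TestCase):
    def test_parse(self):
        obo = list(oboparser.parse(cfg['onto_root'] + 'hbp_layer_ontology.robo'))
        self.assertTrue(len(obo) > 10)

if __name__ == '__main__':
    unittest.main()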
Example #5
def get_cell_names(obo_onto_file):
    """Map each term ID to its (name, synonyms) pair from an OBO ontology file."""
    cell_names = {}
    for stanza in oboparser.parse(obo_onto_file):
        if stanza['@type'] != "Term": continue
        id = stanza["id"]
        if id == '': continue
        name = stanza["name"]
        synonyms = []
        for s in stanza["synonym"]:
            synonyms.append(SYNONOYM_NAME(s).group(1))
        cell_names[id] = (name, synonyms)
    return cell_names
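SYNONOYM_NAME is defined elsewhere in the original module; judging by the call SYNONOYM_NAME(s).group(1), it behaves like a regex search over OBO synonym strings such as '"some cell name" EXACT []'. A hypothetical stand-in consistent with that usage:

# Hypothetical stand-in for the SYNONOYM_NAME helper used above: calling it on an
# OBO synonym value like '"some cell name" EXACT []' returns a match whose
# group(1) is the quoted name.
import re

SYNONOYM_NAME = re.compile(r'"([^"]*)"').search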
Example #8
def main(parser):
    # Initialize logging
    twiggy_setup()

    # Parse our ontology file using our oboparser
    disease_ontology = oboparser.parse(parser.input_obo_file, parser.typedefs)

    # Load the OBO file into neo4j.
    # This will be performed in two steps, first the base nodes will
    # be loaded into the neo4j database followed by the relationships.
    gdb = GraphDatabase(parser.neo4j_server_address)
    nodeMapping = load_ontology_to_neo4jdb(gdb, disease_ontology)
    create_term_relationships(gdb, nodeMapping)

    create_root_node_index(parser.root_node, gdb)
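load_ontology_to_neo4jdb, create_term_relationships, create_root_node_index, and twiggy_setup come from the surrounding project and are not shown. A rough sketch of the node-loading step, assuming neo4jrestclient's node API and term objects exposing id and name attributes (as used in the other examples):

# Hypothetical sketch of the node-loading helper, assuming neo4jrestclient's
# gdb.nodes.create() API and terms exposing .id and .name.
def load_ontology_to_neo4jdb(gdb, ontology):
    node_mapping = {}
    for term in ontology.get_terms():
        node = gdb.nodes.create(term_id=term.id, name=term.name)
        node_mapping[term.id] = node
    return node_mapping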
Example #9
def _indexOntologies(elasticsearch_host, ontology_file, ontology_type):
    failed_ontologies = []
    documents = []
    updated_ontologies = {}
    try:
        for term in oboparser.parse(ontology_file, ['is_a', 'part_of']):
            try:
                if term.obsolete:
                    continue
                document = _getDocumentFromOntology(term)
                document['type'] = ontology_type
                # 'index' is presumably the Elasticsearch index name defined elsewhere in the module
                retval = _indexDocument(elasticsearch_host, document, index, 'term', document['term_id'])
                if retval is None:
                    raise Exception(str(retval))
            except Exception as err:
                failed_ontologies.append({'name': term.id, 'error': str(err)})
                raise err
    except Exception as err:
        raise err
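_getDocumentFromOntology and _indexDocument are defined elsewhere in the module; a plausible sketch of the document builder, reusing the term attributes seen in the other examples (only 'term_id' is a field name taken from the code above):

# Hypothetical sketch of the document builder used above; field names other
# than 'term_id' are assumptions.
def _getDocumentFromOntology(term):
    return {
        'term_id': term.id,
        'name': term.name,
        'definition': term.definition,
        'synonyms': [s[0] for s in term.synonyms if s],
    }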
Example #10
 def __init__(self):
     self.onto_id2layer_numbers = {}
     # 'HBP_LAYER:0000123' -> 123
     def onto_id2layer_number(o):
         return int(o[10:])
     for o in oboparser.parse(onto_root + 'hbp_layer_ontology.robo'):
         if 'id' in o:
             onto_id = o['id'] # HBP_LAYER:0000001
             assert len(onto_id) == 17, 'invalid onto_id: {} in {}'.format(onto_id, o)
             if onto_id2layer_number(onto_id) < 8: # HBP_LAYER:0000001 to 7
                 # just resolve to the layer number
                 self.onto_id2layer_numbers[onto_id] = [onto_id2layer_number(onto_id)]
             elif 'union_of' in o:
                 # collect the layer numbers of all 'union_of' IDs
                 self.onto_id2layer_numbers[onto_id] = [onto_id2layer_number(u) for u in o['union_of']]
             elif 'is_a' in o: # e.g. L5a is_a L5
                 # to simplify, resolve to L5
                 # not happy with this - SJT
                 self.onto_id2layer_numbers[onto_id] = [onto_id2layer_number(onto_id), onto_id2layer_number(o['is_a'][0])]
             else:
                 raise Exception('invalid layer entry: {}'.format(o))
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.error as msg:
            raise Usage(msg)
        do_filename = 'HumanDO.obo'
        do_file = open(do_filename, 'w')
        do_csv = csv.writer(open('HumanDO.csv', 'w'))
        print "Downloading HumanDO file..."
        do_svn = urllib.urlopen('http://diseaseontology.svn.sourceforge.net/svnroot/diseaseontology/trunk/HumanDO.obo')
        do_text = do_svn.read()
        print "Writing to file..."
        do_file.write(do_text)
        do_file.close()
        print "Parsing data..."
        do_data = oboparser.parse(do_filename, ['is_a'])
        for term in do_data.get_terms():
            do_csv.writerow([
                term.id,
                term.name,
                term.definition,
                str([synonym[0] for synonym in term.synonyms if synonym[1] == 'EXACT']).strip("[]"),
                str([relationship[2] for relationship in term.relationships]).strip("[]"),
            ])
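The Usage exception raised above is not part of the snippet; in the conventional main()/getopt boilerplate this code follows, it is a small wrapper around an error message:

# Hypothetical Usage exception assumed by the snippet above, following the
# common getopt command-line boilerplate.
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg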
Example #12
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error: attempted to index the same file twice, or two files with the same name'
        )

    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # The definition text frequently contains URLs or other hyperlinks that could produce
        # query hits we do not want, errantly inflating the score of the field.
        # We strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all represented as lists
        # in our Ontology object and need to be entered one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
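add_field_to_document, add_fields_to_document, and strip_urls_from_text are helpers from the surrounding module; a sketch of how they might look against the same PyLucene 3.x Field/Document API used above (the URL regex is an assumption):

# Hypothetical helpers matching the calls above, written against the old
# PyLucene 3.x Field/Document API; the URL regex is an assumption.
import re

def add_field_to_document(doc, name, value, store, index, boost=None):
    field = lucene.Field(name, value or "", store, index)
    if boost is not None:
        field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    for value in values or []:
        add_field_to_document(doc, name, value, store, index, boost)

def strip_urls_from_text(text):
    return re.sub(r'https?://\S+', '', text or "")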
Example #13
import csv
import sys
import oboparser

filename = sys.argv[1]
reader = csv.reader(open(filename, 'r'))
writer = csv.writer(open(filename.replace('.csv', '-tagged.csv'), 'w'))
cases = {}
do_filename = 'HumanDO.obo'
do_data = oboparser.parse(do_filename, ['is_a'])
print(do_data.get_terms())
for row in reader:
    case_title = row[0].strip('" ')
    print(case_title.lower(), do_data.get_term(case_title.lower()))