def setUp(self):
    """Build the shared fixture: identifiers, one sample record and the association IRI."""
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    # A single sample record, kept as a flat tuple of string fields.
    self.test_set_1 = (
        'MGI:1920145',
        'Setd5',
        'WTSI',
        'MEFW',
        'male',
        'heterozygote',
        'MGI:4432631',
        'Setd5<tm1a(EUCOMM)Wtsi>',
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',
        'C57BL/6N',
        'MGP',
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',
        'MGP_001',
        'MGP_XRY_001',
        'X-ray',
        'IMPC_XRY_008_001',
        'Number of ribs right',
        'MP:0005390',
        'skeleton phenotype',
        'MP:0000480',
        'increased rib number',
        '1.637023E-010',
        '',
        '8.885439E-007',
        'Wilcoxon rank sum test with continuity correction',
        'IMPC',
    )

    # Fixed stand-in curies for the tests; outside of the tests these are
    # generated within _add_evidence() and _add_study_provenance().
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # Expand the association curie into an IRI for checking sparql output.
    resolver = CurieUtil(curie_map.get())
    self.assoc_iri = URIRef(resolver.get_uri(self.assoc_curie))
def setUp(self):
    """Prepare identifiers, the sample record and the association IRI used by the tests."""
    sample_record = (
        'MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male', 'heterozygote',
        'MGI:4432631', 'Setd5<tm1a(EUCOMM)Wtsi>',
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965', 'C57BL/6N', 'MGP',
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline', 'MGP_001', 'MGP_XRY_001', 'X-ray',
        'IMPC_XRY_008_001', 'Number of ribs right',
        'MP:0005390', 'skeleton phenotype',
        'MP:0000480', 'increased rib number',
        '1.637023E-010', '', '8.885439E-007',
        'Wilcoxon rank sum test with continuity correction', 'IMPC',
    )

    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'
    self.test_set_1 = sample_record

    # Test curies are hard-coded here; they are otherwise generated
    # within _add_evidence() and _add_study_provenance().
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRI of the association, for testing sparql output.
    prefix_map = curie_map.get()
    assoc_uri = CurieUtil(prefix_map).get_uri(self.assoc_curie)
    self.assoc_iri = URIRef(assoc_uri)
def test_addGenotype(self):
    """addGenotype() must leave an rdfs:label triple for the genotype in the graph."""
    genotype_curie = 'MGI:5515892'
    genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
    resolver = CurieUtil(self.curie_map)

    self.genotype.addGenotype(genotype_curie, genotype_label)

    expected_triple = (
        URIRef(resolver.get_uri(genotype_curie)),
        RDFS['label'],
        Literal(genotype_label),
    )
    self.assertTrue(expected_triple in self.genotype.graph)
def setUp(self):
    """Create a fresh graph, a Genotype bound to it, and the category IRIs the tests compare against."""
    self.graph = RDFGraph()
    self.curie_map = curie_map.get()
    self.genotype = Genotype(self.graph)
    self.cutil = CurieUtil(self.curie_map)

    # Expand the vocabulary terms used by the assertions into full IRIs.
    expand = self.cutil.get_uri
    self.test_cat_pred = expand(blv.terms['category'])
    self.test_cat_genotype_category = expand(blv.terms['Genotype'])
    self.test_cat_background_category = expand(
        blv.terms['PopulationOfIndividualOrganisms'])
def test_addGenotype(self):
    """Add one genotype and verify that its label triple is present in the graph."""
    from rdflib.namespace import RDFS, URIRef
    from rdflib import Literal
    from dipper.utils.CurieUtil import CurieUtil

    resolver = CurieUtil(self.curie_map)
    genotype_curie = 'MGI:5515892'
    genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'

    self.genotype.addGenotype(genotype_curie, genotype_label)

    subject = URIRef(resolver.get_uri(genotype_curie))
    label_triple = (subject, RDFS['label'], Literal(genotype_label))
    self.assertTrue(label_triple in self.genotype.graph)
def test_addGenotype(self):
    """The graph must hold (genotype IRI, rdfs:label, label) after addGenotype()."""
    from rdflib.namespace import RDFS, URIRef
    from rdflib import Literal
    from dipper.utils.CurieUtil import CurieUtil

    to_uri = CurieUtil(self.curie_map).get_uri
    curie, text = (
        'MGI:5515892',
        'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]',
    )

    self.genotype.addGenotype(curie, text)

    found = (
        URIRef(to_uri(curie)), RDFS['label'], Literal(text)
    ) in self.genotype.graph
    self.assertTrue(found)
def setUp(self):
    """Create an empty graph, a curie resolver and the IRIs used to build test triples."""
    self.graph = RDFGraph()
    self.cutil = CurieUtil(curie_map.get())
    expand = self.cutil.get_uri

    # stuff to make test triples
    self.test_cat_subj = "http://www.google.com"

    # predicates
    self.test_cat_default_pred = expand("biolink:category")
    self.test_cat_nondefault_pred = expand("rdf:type")

    # categories
    self.test_cat_default_category = expand("biolink:NamedThing")
    self.test_cat_nondefault_category = expand("biolink:Gene")

    # rdf terms
    self.test_cat_type = expand("rdf:type")
    self.test_cat_class = expand("rdf:class")
def setUp(self):
    """Create a Model over a new graph plus the subject, predicates and literals used in test triples."""
    self.model = Model(RDFGraph())
    self.cutil = CurieUtil(curie_map.get())
    expand = self.cutil.get_uri

    # stuff to make test triples
    self.test_cat_subj_curie = "MGI:1234"
    self.test_cat_subj = expand("MGI:1234")
    self.test_cat_default_pred = expand("biolink:category")
    self.test_named_indiv = expand("owl:NamedIndividual")

    # label predicate and value
    self.test_label_pred = expand("rdfs:label")
    self.test_label = "some label"

    # comment predicate and value
    self.test_comment_IRI = expand("rdfs:comment")
    self.test_comment = 'bonus eruptus'
def test_associations(self):
    """
    Check the variant/disease association built from the sample input.

    The graph is expected to contain:
        CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance
    and an association node (A CGD:AssociationID) with:
        OBO:RO_0002558 Traceable Author Statement (ECO:0000033)
        dc:source PMID:20498393
        has_environment CGD:DrugID
        OBAN:association_has_subject CGD:VariantID
        OBAN:association_has_object_property has_phenotype
        OBAN:association_has_object CGD:DiseaseInstance
    """
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    resolver = CurieUtil(self.curie_map)
    query_runner = TestUtils(self.cgd.graph)
    self.cgd.load_bindings()

    evidence_uri = URIRef(resolver.get_uri('OBO:ECO_0000033'))

    # Adjacent literals are joined by the parser into a single template;
    # the doubled braces survive .format() as literal SPARQL braces.
    query_template = (
        ' SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence'
        ' WHERE {{'
        ' ?variant OBO:RO_0002200 ?diseaseInd .'
        ' ?vdannot a OBAN:association ;'
        ' OBO:RO_0002558 ?evidence ;'
        ' dc:source ?source ;'
        ' <{0}> ?drug ;'
        ' OBAN:association_has_object ?diseaseInd ;'
        ' OBAN:association_has_object_property OBO:RO_0002200 ;'
        ' OBAN:association_has_subject ?variant .'
        ' }} '
    )
    sparql_query = query_template.format(self.relationship_uri)

    # Expected Results: a single row, columns in SELECT order
    expected_row = [
        self.disease_ind_uri,
        self.variant_uri,
        self.drug_uri,
        self.vd_annot_uri,
        self.source_uri,
        evidence_uri,
    ]

    # Query graph
    actual_rows = query_runner.query_graph(sparql_query)
    self.assertEqual([expected_row], actual_rows)
def _replace_entity(graph, old_id, new_id, bindings=None, is_property=False):
    """
    Replace one identifier with another throughout a graph.

    Both identifiers are expanded with CurieUtil.get_uri() and wrapped as
    URIRef before the SPARQL updates are built.

    When is_property is False the old IRI is replaced wherever it occurs as
    a subject and wherever it occurs as an object (two updates).  Otherwise
    it is replaced wherever it occurs as a predicate (one update).

    :param graph: rdflib graph object, updated in place
    :param old_id: String identifier to be replaced
    :param new_id: String identifier that replaces the old one
    :param bindings: Dict, dictionary of namespace prefixes, handed to
                     graph.update() as its third positional argument;
                     None means an empty dict
    :param is_property: Boolean, is the id a property/predicate rather
                        than a class or individual
    :return: None
    """
    # A fresh dict per call; a dict literal as the default would be shared
    # between all calls.
    if bindings is None:
        bindings = {}

    cu = CurieUtil(curie_map.get())
    old_uri = URIRef(cu.get_uri(old_id))
    new_uri = URIRef(cu.get_uri(new_id))

    if is_property is False:
        # (pattern matching the old IRI, same pattern with the new IRI)
        rewrites = [
            ("<{0}> ?pred ?obj".format(old_uri),
             "<{0}> ?pred ?obj".format(new_uri)),
            ("?sub ?pred <{0}>".format(old_uri),
             "?sub ?pred <{0}>".format(new_uri)),
        ]
    else:
        # The WHERE clause used to contain "{?obj}", which str.format()
        # reads as a replacement field and rejects with a KeyError; the
        # object variable has to be written as plain ?obj.
        rewrites = [
            ("?sub <{0}> ?obj".format(old_uri),
             "?sub <{0}> ?obj".format(new_uri)),
        ]

    for old_pattern, new_pattern in rewrites:
        sparql_update = """
            DELETE {{ {0} }}
            INSERT {{ {1} }}
            WHERE {{ {0} }}
        """.format(old_pattern, new_pattern)
        graph.update(sparql_update, 'sparql', bindings)
def setUp(self):
    """
    Load one sample row into a CGD source and precompute the IRIs and
    labels that the tests compare against.
    """
    self.curie_map = curie_map.get()
    resolver = CurieUtil(self.curie_map)

    def as_ref(curie):
        # curie string -> rdflib URIRef
        return URIRef(resolver.get_uri(curie))

    # Fake credentials as these tests do not require a database connection
    database = 'foo'
    user = '******'
    password = '******'
    self.cgd = CGD(database, user, password)

    sample_row = (
        387, 'MLH1 any mutation', 13, 'Adenocarcinoma', None, 'Colon',
        'no response', 1, '5FU-based adjuvant therapy', 'late trials',
        '20498393')
    test_data = (sample_row,)
    self.cgd.add_disease_drug_variant_to_graph(test_data)

    (variant_key, variant_label, diagnoses_key, diagnoses,
     specific_diagnosis, organ, relationship, drug_key, drug,
     therapy_status, pubmed_id) = test_data[0]

    # Identifiers derived from the sample row
    source_id = "PMID:{0}".format(pubmed_id)
    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
    disease_id = self.cgd.make_cgd_id(
        'disease{0}{1}'.format(diagnoses_key, diagnoses))
    relationship_id = "RO:has_environment"
    disease_quality = "CGD:{0}".format(relationship).replace(" ", "_")
    has_quality_property = "BFO:0000159"  # not used further in this fixture
    drug_id = self.cgd.make_cgd_id('drug{0}'.format(drug_key))
    disease_instance_id = self.cgd.make_cgd_id(
        'phenotype{0}{1}{2}'.format(diagnoses, variant_key, relationship))
    variant_disease_annot = self.cgd.make_cgd_id(
        "assoc{0}{1}".format(variant_key, diagnoses))

    # Set up URIs
    self.source_uri = as_ref(source_id)
    self.variant_uri = as_ref(variant_id)
    self.disease_uri = as_ref(disease_id)
    self.disease_ind_uri = as_ref(disease_instance_id)
    self.relationship_uri = as_ref(relationship_id)
    self.drug_uri = as_ref(drug_id)
    self.vd_annot_uri = as_ref(variant_disease_annot)
    self.disease_quality_uri = as_ref(disease_quality)

    # Labels
    self.variant_label = variant_label
    self.disease_label = diagnoses
    self.disease_instance_label = "{0} with {1} to therapy".format(
        diagnoses, relationship)
    self.drug_label = drug
def hpo_to_tree(cls, hpo_terms, hpo_graph, tree, path):
    """
    Recursively grow the nested dict `tree` below the term `cls`.

    On the first visit of a term, hpo_terms[cls] receives its label, the
    number of rdfs:subClassOf objects it has in hpo_graph ('parents') and
    the result of get_lay_person() ('lay_person').  Every subject that is
    an rdfs:subClassOf of `cls` is then added as an empty dict under the
    node for `cls` and expanded in turn.

    :param cls: curie of the term to expand
    :param hpo_terms: dict of per-term details, filled in place
    :param hpo_graph: graph that is queried for labels and subclass links
    :param tree: nested dict keyed by curie, filled in place
    :param path: list of curies leading from the root of `tree` to the
                 parent of `cls`; it is copied, not modified
    """
    lineage = copy.copy(path)
    lineage.append(cls)
    resolver = CurieUtil(curie_map.get())

    def as_ref(curie):
        # curie string -> rdflib URIRef
        return URIRef(resolver.get_uri(curie))

    if cls not in hpo_terms:
        details = {'label': hpo_graph.label(as_ref(cls))}
        hpo_terms[cls] = details
        superclasses = hpo_graph.objects(as_ref(cls), RDFS.subClassOf)
        details['parents'] = len(list(superclasses))
        details["lay_person"] = get_lay_person(cls, hpo_graph)

    # Traverse the tree to get to the input class
    node = tree
    for step in lineage:
        node = node[step]

    for sub_class in hpo_graph.subjects(RDFS.subClassOf, as_ref(cls)):
        child = resolver.get_curie(sub_class).replace("OBO:HP_", "HP:")
        node[child] = {}
        hpo_to_tree(child, hpo_terms, hpo_graph, tree, lineage)
def hpo_to_tree(cls, hpo_terms, hpo_graph, tree, path):
    """
    Expand `tree` with the subclasses of `cls`, depth first.

    hpo_terms gains an entry for `cls` (keys 'label', 'parents' and
    'lay_person') if it has none yet.  Each subclass found in hpo_graph is
    inserted as an empty dict beneath `cls` in `tree` and then expanded by
    a recursive call.  `path` holds the curies from the root of `tree` down
    to the parent of `cls` and is left untouched.
    """
    tree_path = copy.copy(path) + [cls]
    curie_util = CurieUtil(curie_map.get())

    if cls not in hpo_terms:
        label = hpo_graph.label(URIRef(curie_util.get_uri(cls)))
        hpo_terms[cls] = {'label': label}

        parent_count = 0
        for _ in hpo_graph.objects(
                URIRef(curie_util.get_uri(cls)), RDFS.subClassOf):
            parent_count += 1
        hpo_terms[cls]['parents'] = parent_count

        hpo_terms[cls]["lay_person"] = get_lay_person(cls, hpo_graph)

    # Traverse the tree to get to the input class
    root_key, deeper_keys = tree_path[0], tree_path[1:]
    position = tree[root_key]
    for key in deeper_keys:
        position = position[key]

    class_ref = URIRef(curie_util.get_uri(tree_path[-1]))
    for sub_class in hpo_graph.subjects(RDFS.subClassOf, class_ref):
        sub_curie = curie_util.get_curie(sub_class)
        sub_curie = sub_curie.replace("OBO:HP_", "HP:")
        position[sub_curie] = {}
        hpo_to_tree(sub_curie, hpo_terms, hpo_graph, tree, tree_path)
def __init__(self, curie_map, materialize_bnodes=False):
    """
    Keep the prefix map, build a CurieUtil from it and remember the
    materialize_bnodes flag.

    :param curie_map: prefix map, stored as-is and handed to CurieUtil
    :param materialize_bnodes: stored unchanged as self.nobnodes
    """
    self.curie_map, self.cu = curie_map, CurieUtil(curie_map)
    self.nobnodes = materialize_bnodes
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # Shared by all instances: the curie resolver and the curie_map module.
    curie_util = CurieUtil(curie_map.get())
    curie_map = curie_map

    def __init__(self, are_bnodes_skized=True):
        """
        :param are_bnodes_skized: when True, identifiers starting with an
            underscore are skolemized into IRIs instead of becoming BNodes
        """
        super().__init__()
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

    def addTriple(self, subject_id, predicate_id, obj,
                  object_is_literal=False, literal_type=None):
        """
        Add one triple, turning the given identifiers into nodes.

        :param subject_id: curie or iri of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: curie/iri of the object, or the literal value when
            object_is_literal is True
        :param object_is_literal: store obj as a Literal
        :param literal_type: curie or iri of the literal datatype, optional
        :return: None; nothing is added (only a warning is logged) when
            obj is None, or is an empty string for a non-literal object
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
        elif obj is not None and obj != '':
            self.add(
                (self._getNode(subject_id), self._getNode(predicate_id),
                 self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)
        return

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node identifier into an IRI under the curie map base.

        :param curie: str identifier starting with '_:' or '_'
        :return: URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_map.get_base())
        node = re.sub(r'rdflib/', '', node)  # drop the path segment rdflib adds
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode,
        or, when self.are_bnodes_skized is True, skolemizes it into an IRI.
        An id starting with http or ftp is used as an IRI directly.
        Anything else is expanded as a curie, and its prefix is bound on
        the graph if it is not bound yet.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
                 None when the curie cannot be expanded
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # replace the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))
        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.  namespaces() yields
                # (prefix, namespace) pairs, so the prefixes have to be
                # pulled out before a bare prefix can be looked up.
                prefix = curie.split(':')[0]
                bound_prefixes = {
                    bound for bound, _ in self.namespace_manager.namespaces()}
                if prefix not in bound_prefixes:
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """Bind every prefix of the curie map on this graph."""
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
def test_genome_build_chromosome_model(self):
    """
    Test modelling of genome, builds, and chromosomes
    Using test data set 2, and the function add_variant_info_to_graph()
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    query_runner = TestUtils(self.cgd.graph)
    resolver = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_ref(curie):
        # curie string -> rdflib URIRef
        return URIRef(resolver.get_uri(curie))

    # Labels substituted into the query, in placeholder order
    genome_label = "Human genome"
    chromosome_label = "chr9 (Human)"
    build_label = "hg19"
    chrom_build_label = "chr9 (hg19)"

    # Expected Results: one row, columns in SELECT order
    expected_results = [[
        as_ref(":9606genome"),
        as_ref("CHR:9606chr9"),
        as_ref("UCSC:hg19"),
        as_ref(":MONARCH_hg19chr9"),
    ]]

    # An older form of this query used to sit here inside a bare string
    # literal (a no-op statement); it has been dropped as dead code.
    # Adjacent literals below are joined by the parser into one template.
    query_template = (
        ' SELECT ?genome ?chromosome ?build ?chromOnBuild'
        ' WHERE {{'
        ' ?genome a owl:Class ;'
        ' rdfs:label "{0}" ;'
        ' rdfs:subClassOf OBO:SO_0001026 .'
        ' ?chromosome a owl:Class ;'
        ' rdfs:label "{1}" ;'
        ' rdfs:subClassOf OBO:SO_0000340 .'
        ' ?build a OBO:SO_0001505 ;'
        ' a ?genome ;'
        ' rdfs:label "{2}" ;'
        ' OBO:RO_0002162 OBO:NCBITaxon_9606 ;'
        ' OBO:RO_0002351 ?chromOnBuild .'
        ' ?chromOnBuild a ?chromosome ;'
        ' a OBO:SO_0000340 ;'
        ' rdfs:label "{3}" ;'
        ' OBO:RO_0002350 ?build . \n'
        '}} '
    )
    sparql_query = query_template.format(
        genome_label, chromosome_label, build_label, chrom_build_label)

    # Query graph
    sparql_output = query_runner.query_graph(sparql_query)
    self.assertEqual(expected_results, sparql_output)
def test_variant_position_region_model(self):
    """
    Test modelling of variant positions on a transcript
    Using test data set 2, and the function add_variant_info_to_graph()

    We want to test the following triples:
        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 944
        CGD:BothStrandPositionID faldo:reference CGD:TranscriptID
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    query_runner = TestUtils(self.cgd.graph)
    resolver = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    # Unpack the whole record; only variant_key, transcript_id and bp_pos
    # are used below, the other names document the field order.
    (variant_key, variant_label, amino_acid_variant,
     amino_acid_position, transcript_id, transcript_priority,
     protein_variant_type, functional_impact, stop_gain_loss,
     transcript_gene, protein_variant_source, variant_gene,
     bp_pos, variant_cdna, cosmic_id, db_snp_id,
     genome_pos_start, genome_pos_end, ref_base, variant_base,
     primary_transcript_exons, primary_transcript_variant_sub_types,
     variant_type, chromosome, genome_build, build_version,
     build_date) = self.test_set_2[0]

    transcript_curie = self.cgd._make_transcript_curie(transcript_id)
    ccds_id = "35166.1"
    # The variant id is built as before; its value is not used below.
    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    # Expected Results: one row, columns in SELECT order
    expected_row = [
        URIRef(resolver.get_uri(":_{0}Region".format(transcript_curie))),
        URIRef(resolver.get_uri(":_{0}-{1}".format(ccds_id, bp_pos))),
        URIRef(resolver.get_uri(transcript_curie)),
    ]

    # Adjacent literals are joined by the parser into one template.
    query_template = (
        ' SELECT ?region ?bsPosition ?transcript'
        ' WHERE {{'
        ' ?region a faldo:Region ;'
        ' faldo:begin ?bsPosition ;'
        ' faldo:end ?bsPosition .'
        ' ?bsPosition a faldo:Position ;'
        ' faldo:position {0} ;'
        ' faldo:reference ?transcript . \n'
        '}} '
    )
    sparql_query = query_template.format(bp_pos)

    # Query graph
    sparql_output = query_runner.query_graph(sparql_query)
    self.assertEqual([expected_row], sparql_output)
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # Shared by all instances: the prefix map (a dict here) and a resolver
    # built from it.
    curie_map = curie_map.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    # NOTE(review): the path is relative, so it is resolved against the
    # working directory at the time this class is defined.
    with open('translationtable/GLOBAL_TERMS.yaml') as fh:
        globaltt = yaml.safe_load(fh)
    # reverse lookup: value -> key of the translation table
    globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, identifiers starting with an
            underscore are skolemized into IRIs instead of becoming BNodes
        :param identifier: passed through to ConjunctiveGraph
        """
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(
            self, subject_id, predicate_id, obj, object_is_literal=False,
            literal_type=None):
        """
        Add one triple, turning the given identifiers into nodes.

        :param subject_id: curie or iri of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: curie/iri of the object, or the literal value when
            object_is_literal is True
        :param object_is_literal: store obj as a Literal
        :param literal_type: curie or iri of the literal datatype, optional
        :return: None; nothing is added (only a warning is logged) when
            obj is None, or is an empty string for a non-literal object
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # magic number 2 here is "steps up the stack"
                try:
                    logger.warning(sys._getframe(2).f_code.co_name)
                except ValueError:
                    # the call stack is shallower than two frames; the
                    # diagnostic must not abort the caller
                    logger.warning("(calling frame not available)")
        elif obj is not None and obj != '':
            self.add((
                self._getNode(subject_id), self._getNode(predicate_id),
                self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)
        return

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node identifier into an IRI under the curie map base.

        :param curie: str identifier starting with '_:' or '_'
        :return: URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode,
        or, when self.are_bnodes_skized is True, skolemizes it into an IRI.
        An id starting with http or ftp is used as an IRI directly.
        Anything else is expanded as a curie, and its prefix is bound on
        the graph if it is not bound yet.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
                 None when the curie cannot be expanded
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))
        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.  namespaces() yields
                # (prefix, namespace) pairs, so the prefixes have to be
                # pulled out before a bare prefix can be looked up.
                prefix = curie.split(':')[0]
                bound_prefixes = {
                    bound for bound, _ in self.namespace_manager.namespaces()}
                if prefix not in bound_prefixes:
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """Bind every prefix of the curie map on this graph."""
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
def __init__(self, curie_map):
    """
    Keep the prefix map and build a CurieUtil from it.

    :param curie_map: prefix map, stored as-is and handed to CurieUtil
    """
    self.curie_map, self.cu = curie_map, CurieUtil(curie_map)
class GraphUtils:
    """
    Convenience methods for adding frequently used triples (classes,
    individuals, labels, synonyms, deprecations, properties, ...) to an
    rdflib graph from curie-style identifiers, and for writing a graph out.
    """

    # FIXME - i've duplicated relationships in Assoc and here -
    # pick one or the other and refactor
    # TODO - refactor using the getNode() method to clear out the
    # URIRef(cu.get_uri(<id>)) nonsense

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OWLRESTRICTION = OWL['Restriction']
    OWLPROP = OWL['ObjectProperty']
    OBJPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    PERSON = FOAF['Person']

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'clique_leader': 'MONARCH:cliqueLeader'
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'has_qualifier': 'GENO:0000580',
        'towards': 'RO:0002503',
        'has_subject': ':hasSubject',
        'has_object': ':hasObject',
        'has_predicate': ':hasPredicate',
        'is_about': 'IAO:0000136',
        'has_member': 'RO:0002351',
        'member_of': 'RO:0002350',
        'involved_in': 'RO:0002331',
        'enables': 'RO:0002327',
        'derives_from': 'RO:0001000',
        'part_of': 'BFO:0000050',
        'has_part': 'BFO:0000051',
        'mentions': 'IAO:0000142',
        'model_of': 'RO:0003301',
        'has_gene_product': 'RO:0002205',
        'existence_starts_at': 'UBERON:existence_starts_at',
        'existence_starts_during': 'RO:0002488',
        'existence_ends_at': 'UBERON:existence_ends_at',
        'existence_ends_during': 'RO:0002492',
        'starts_with': 'RO:0002224',
        'starts_during': 'RO:0002091',
        'ends_during': 'RO:0002093',
        'ends_with': 'RO:0002230',
        'occurs_in': 'BFO:0000066',
        'has_environment_qualifier': 'GENO:0000580',
        'has_begin_stage_qualifier': 'GENO:0000630',
        'has_end_stage_qualifier': 'GENO:0000631',
        'correlates_with': 'RO:0002610',
        'substance_that_treats': 'RO:0002606',
        'is_marker_for': 'RO:0002607',
        'contributes_to': 'RO:0002326',
        'has_origin': 'GENO:0000643',
        'has_author': 'ERO:0000232',
        'dc:source': 'dc:source',
        'dc:evidence': 'dc:evidence',
        'has_evidence': 'RO:0002558',
        'causally_upstream_of_or_within': 'RO:0002418'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
    }

    # All three property tables merged into one name -> curie lookup.
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, curie_map, materialize_bnodes=False):
        """
        :param curie_map: dict of prefix -> IRI base; the entry for the
            empty prefix '' is used as the BASE namespace by _getNode()
        :param materialize_bnodes: stored as self.nobnodes
        """
        self.curie_map = curie_map
        self.cu = CurieUtil(curie_map)  # TEC: what is cu really?
        self.nobnodes = materialize_bnodes

        return

    def addClassToGraph(self, g, id, label, type=None, description=None):
        """
        Add a node typed as owl:Class to the graph.

        Triples added:
        *(node, type, owl:Class), always
        *(node, label, literal(label)) if a label is given
        *if a type is added,
            then the node will be an OWL:subclassOf that the type
        *if a description is provided,
            it will also get added as a dc:description
        :param g: the graph
        :param id:
        :param label:
        :param type:
        :param description:
        :return: the graph
        """

        n = self.getNode(id)

        g.add((n, RDF['type'], self.OWLCLASS))
        if label is not None:
            g.add((n, RDFS['label'], Literal(label)))
        if type is not None:
            t = URIRef(self.cu.get_uri(type))
            g.add((n, self.SUBCLASS, t))
        if description is not None:
            g.add((n, DC['description'], Literal(description)))
        return g

    def addIndividualToGraph(self, g, id, label, type=None, description=None):
        """
        Add an individual to the graph.

        The node is typed with the given type, or with owl:NamedIndividual
        when no type is supplied; label and dc:description are added when
        they are not None.
        :return: the graph
        """
        n = self.getNode(id)

        if label is not None:
            g.add((n, RDFS['label'], Literal(label)))
        if type is not None:
            t = self.getNode(type)
            g.add((n, RDF['type'], t))
        else:
            g.add((n, RDF['type'], self.OWLIND))
        if description is not None:
            g.add((n, DC['description'], Literal(description)))
        return g

    def addOWLPropertyClassRestriction(
            self, g, class_id, property_id, property_value):
        """
        Make class_id a subclass of the restriction
        "property_id some property_value".
        """
        # make a blank node to hold the property restrictions
        # scrub the colons, they will make the ttl parsers choke
        nid = \
            '_'+re.sub(r':', '', property_id)+re.sub(r':', '', property_value)
        n = self.getNode(nid)
        g.add((n, RDF['type'], self.OWLRESTRICTION))
        g.add((n, OWL['onProperty'], self.getNode(property_id)))
        g.add((n, OWL['someValuesFrom'], self.getNode(property_value)))

        g.add((self.getNode(class_id), self.SUBCLASS, n))

        return

    def addEquivalentClass(self, g, id1, id2):
        """Add id1 owl:equivalentClass id2, if both ids map to nodes."""
        n1 = self.getNode(id1)
        n2 = self.getNode(id2)

        if n1 is not None and n2 is not None:
            g.add((n1, OWL['equivalentClass'], n2))

        return

    def addSameIndividual(self, g, id1, id2):
        """Add id1 owl:sameAs id2, if both ids map to nodes."""
        n1 = self.getNode(id1)
        n2 = self.getNode(id2)

        if n1 is not None and n2 is not None:
            g.add((n1, OWL['sameAs'], n2))

        return

    def addPerson(self, graph, person_id, person_label):
        """Type person_id as foaf:Person and label it if a label is given."""
        graph.add((self.getNode(person_id), RDF['type'], self.PERSON))
        if person_label is not None:
            graph.add(
                (self.getNode(person_id), RDFS['label'],
                 Literal(person_label)))
        return

    def addDeprecatedClass(self, g, oldid, newids=None):
        """
        Will mark the oldid as a deprecated class.
        if one newid is supplied, it will mark it as replaced by.
        if >1 newid is supplied, it will mark it with consider properties
        :param g:
        :param oldid: the class id to deprecate
        :param newids: the class idlist that is
                        the replacement(s) of the old class.  Not required.
        :return:
        """

        n1 = URIRef(self.cu.get_uri(oldid))
        g.add((n1, RDF['type'], self.OWLCLASS))

        self._addReplacementIds(g, oldid, newids)

        return

    def addDeprecatedIndividual(self, g, oldid, newids=None):
        """
        Will mark the oldid as a deprecated individual.
        if one newid is supplied, it will mark it as replaced by.
        if >1 newid is supplied, it will mark it with consider properties
        :param g:
        :param oldid: the individual id to deprecate
        :param newids: the individual idlist that is
                        the replacement(s) of the old individual.
                        Not required.
        :return:
        """

        n1 = URIRef(self.cu.get_uri(oldid))
        g.add((n1, RDF['type'], self.OWLIND))

        self._addReplacementIds(g, oldid, newids)

        return

    def _addReplacementIds(self, g, oldid, newids):
        """
        Flag oldid as owl:deprecated and point it at its replacements:
        'replaced_by' for exactly one new id, 'consider' for several.
        """
        consider = URIRef(self.cu.get_uri(self.properties['consider']))
        replaced_by = URIRef(self.cu.get_uri(self.properties['replaced_by']))

        n1 = URIRef(self.cu.get_uri(oldid))
        # The datatype has to be looked up by its name; indexing the
        # namespace with the builtin type `bool` does not give xsd:boolean.
        g.add((n1, OWL['deprecated'],
               Literal(True, datatype=XSD['boolean'])))

        if newids is not None:
            if len(newids) == 1:
                n = URIRef(self.cu.get_uri(newids[0]))
                g.add((n1, replaced_by, n))
            elif len(newids) > 0:
                for i in newids:
                    n = URIRef(self.cu.get_uri(i.strip()))
                    g.add((n1, consider, n))
        return

    def addSubclass(self, g, parentid, childid):
        """Add childid rdfs:subClassOf parentid."""
        p = URIRef(self.cu.get_uri(parentid))
        c = URIRef(self.cu.get_uri(childid))
        g.add((c, self.SUBCLASS, p))

        return

    def addType(self, graph, subject_id, type, type_is_literal=False):
        """Add subject_id rdf:type type (as a Literal if type_is_literal)."""
        # FIXME check this... i don't think a type should ever be a literal
        if type_is_literal is True:
            graph.add((self.getNode(subject_id), RDF['type'], Literal(type)))
        else:
            graph.add(
                (self.getNode(subject_id), RDF['type'], self.getNode(type)))
        return

    def addLabel(self, graph, subject_id, label):
        """Add subject_id rdfs:label label."""
        graph.add(
            (self.getNode(subject_id), RDFS['label'], Literal(label)))
        return

    def addSynonym(self, g, cid, synonym, synonym_type=None):
        """
        Add the synonym as a property of the class cid.
        Assume it is an exact synonym, unless otherwise specified
        :param g:
        :param cid: class id
        :param synonym: the literal synonym label
        :param synonym_type: the CURIE of the synonym type (not the URI)
        :return:
        """
        n = self.getNode(cid)
        if synonym_type is None:
            # default
            synonym_type = URIRef(
                self.cu.get_uri(self.properties['hasExactSynonym']))
        else:
            synonym_type = URIRef(self.cu.get_uri(synonym_type))

        g.add((n, synonym_type, Literal(synonym)))
        return

    def addDefinition(self, g, cid, definition):
        """Add the definition literal to cid; does nothing for None."""
        if definition is not None:
            n = self.getNode(cid)
            p = URIRef(self.cu.get_uri(self.properties['definition']))
            g.add((n, p, Literal(definition)))

        return

    def addXref(self, g, cid, xrefid, xref_as_literal=False):
        """Add a 'has_xref' triple from cid to xrefid."""
        self.addTriple(
            g, cid, self.properties['has_xref'], xrefid, xref_as_literal)
        return

    def addDepiction(self, g, subject_id, image_url):
        """Add subject_id foaf:depiction image_url (as a literal)."""
        g.add(
            (self.getNode(subject_id), FOAF['depiction'], Literal(image_url)))
        return

    def addComment(self, g, subject_id, comment):
        """Add the stripped comment as a literal under DC['comment']."""
        g.add(
            (self.getNode(subject_id), DC['comment'],
             Literal(comment.strip())))
        return

    def addDescription(self, g, subject_id, description):
        """Add the stripped description as a dc:description literal."""
        g.add(
            (self.getNode(subject_id), DC['description'],
             Literal(description.strip())))
        return

    def addPage(self, g, subject_id, page_url):
        """Add subject_id foaf:page page_url (as a literal)."""
        g.add(
            (self.getNode(subject_id), FOAF['page'], Literal(page_url)))
        return

    def addTitle(self, g, subject_id, title):
        """Add subject_id dc:title title."""
        g.add(
            (self.getNode(subject_id), DC['title'], Literal(title)))
        return

    def addMember(self, g, group_id, member_id):
        """Add group_id 'has_member' member_id."""
        self.addTriple(
            g, group_id, self.properties['has_member'], member_id)

    def addMemberOf(self, g, member_id, group_id):
        """Add member_id 'member_of' group_id."""
        self.addTriple(
            g, member_id, self.properties['member_of'], group_id)
        return

    def addInvolvedIn(self, g, member_id, group_id):
        """Add member_id 'involved_in' group_id."""
        self.addTriple(
            g, member_id, self.properties['involved_in'], group_id)

    def write(self, graph, fileformat=None, file=None):
        """
        a basic graph writer (to stdout) for any of the sources.
        this will write raw triples in rdfxml, unless specified.
        to write turtle, specify format='turtle'
        an optional file can be supplied instead of stdout
        :return: None
        """
        if fileformat is None:
            fileformat = 'rdfxml'
        if file is not None:
            # the context manager closes the file even if serialize raises
            with open(file, 'wb') as filewriter:
                logger.info("Writing triples in %s to %s", fileformat, file)
                graph.serialize(filewriter, format=fileformat)
        else:
            serialized = graph.serialize(format=fileformat)
            # rdflib hands back bytes here in the releases this was written
            # for; a str (as newer releases return) is printed unchanged
            if isinstance(serialized, bytes):
                serialized = serialized.decode()
            print(serialized)
        return

    def write_raw_triples(self, graph, file=None):
        """
        Write every triple of the graph on its own line, as subject,
        predicate and object joined by single spaces.
        an optional file can be supplied instead of stdout
        :return: None
        """
        filewriter = None
        if file is not None:
            filewriter = open(file, 'w')
            logger.info("Writing raw triples to %s", file)

        try:
            for (s, p, o) in graph:
                # file=None makes print fall back to stdout
                print(' '.join([s, p, o]), file=filewriter)
        finally:
            if filewriter is not None:
                filewriter.close()
        return

    def write_compact_triples(self, graph, file=None):
        """
        Will write out the raw triples,
        except it will replace the full uri with the curie prefix
        :param graph:
        :param file:
        :return:
        """
        # TODO
        return

    def _getNode(self, id, materialize_bnode):
        """
        This is a wrapper for creating a node with a given identifier.
        If an id starts with an underscore, it assigns it to a BNode,
        otherwise it creates it with a standard URIRef.
        Alternatively, if materialize_bnode is True,
        it will add any nodes that would have been blank into the BASE space.
        An id starting with a colon is placed in the BASE space as well.
        This will return None if it can't map the node properly.
        :param id:
        :param materialize_bnode:
        :return:
        """
        base = Namespace(self.curie_map.get(''))
        n = None
        if id is None:
            # re.match() below would raise a TypeError on None; honour the
            # documented "return None" contract instead
            logger.error("couldn't make URI for %s", id)
        elif re.match(r'^_', id):
            if materialize_bnode is True:
                n = base[id]
            else:
                # replace the leading underscore to make it cleaner
                n = BNode(re.sub(r'_', '', id, count=1))
        elif re.match(r'^\:', id):
            # do we need to remove embedded ID colons?
            n = base[re.sub(r':', '', id, count=1)]
        else:
            u = self.cu.get_uri(id)
            if u is not None:
                n = URIRef(u)
            else:
                logger.error("couldn't make URI for %s", id)
        return n

    def getNode(self, id, materialize_bnode=False):
        """Public entry point for _getNode(); see there for the rules."""
        return self._getNode(id, materialize_bnode)

    def addTriple(
            self, graph, subject_id, predicate_id, object,
            object_is_literal=False):
        """
        Add one triple, mapping subject and predicate (and the object,
        unless object_is_literal is True) through getNode().
        """
        if object_is_literal is True:
            graph.add(
                (self.getNode(subject_id), self.getNode(predicate_id),
                 Literal(object)))
        else:
            graph.add(
                (self.getNode(subject_id), self.getNode(predicate_id),
                 self.getNode(object)))
        return

    def loadObjectProperties(self, graph, op):
        """
        Given a graph, it will load the supplied object properties
        as owl['ObjectProperty'] types
        A convenience.
        Status: DEPRECATED.  See loadProperties().
        :param graph:
        :param op: a dictionary of object properties
        :return: None
        """
        self.loadProperties(graph, op, self.OBJPROP)
        return

    def loadProperties(self, graph, op, property_type):
        """
        Given a graph, it will load the supplied object properties
        as the given property_type.
        :param graph: a graph
        :param op: a dictionary of object properties
        :param property_type: one of OWL:(Annotation|Data|Object)Property
        :return: None
        """

        if property_type not in [self.OBJPROP, self.ANNOTPROP, self.DATAPROP]:
            logger.error(
                "bad property type assigned: %s, %s", property_type, op)
        else:
            for k in op:
                graph.add(
                    (self.getNode(op[k]), RDF['type'], property_type))
        return

    def loadAllProperties(self, graph):
        """
        A convenience to load all stored properties
        (object, data, and annotation) into the supplied graph.
        :param graph:
        :return:
        """

        self.loadProperties(graph, self.object_properties, self.OBJPROP)
        self.loadProperties(graph, self.annotation_properties, self.ANNOTPROP)
        self.loadProperties(graph, self.datatype_properties, self.DATAPROP)
        return

    def addOntologyDeclaration(self, graph, ontology_id):
        """Type ontology_id as owl:Ontology."""
        graph.add((self.getNode(ontology_id), RDF['type'], OWL['Ontology']))
        return

    def addOWLVersionIRI(self, graph, ontology_id, version_iri):
        """Add ontology_id owl:versionIRI version_iri."""
        graph.add(
            (self.getNode(ontology_id), OWL['versionIRI'],
             self.getNode(version_iri)))

        return

    def addOWLVersionInfo(self, graph, ontology_id, version_info):
        """Add ontology_id owl:versionInfo version_info (as a literal)."""
        graph.add(
            (self.getNode(ontology_id), OWL['versionInfo'],
             Literal(version_info)))
        return

    def makeLeader(self, graph, node_id):
        """
        Add an annotation property to the given ```node_id```
        to be the clique_leader.
        This is a monarchism.
        :param graph:
        :param node_id:
        :return:
        """

        # xsd:boolean looked up by name, see _addReplacementIds()
        self.addTriple(
            graph, node_id, self.annotation_properties['clique_leader'],
            Literal(True, datatype=XSD['boolean']), True)
        return
def process_catalog(self, limit=None):
    """
    Parse the raw GWAS catalog file and load its contents into the graph.

    For every usable data row this adds the publication reference, the SNP
    feature (location, taxon, variant types), its relationships to genes,
    and one SNP-to-trait association per mapped trait URI.  Rows whose
    strongest risk allele is a haplotype, is empty, or has an unrecognized
    form are skipped.

    :param limit: stop once more than this many non-empty data rows have
        been read; None means no limit.  Not applied in test mode.
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())
    # built once here rather than once per mapped trait
    cu = CurieUtil(curie_map.get())

    if self.testMode:  # set the graph to build
        g = self.testgraph
        # the test gene ids are only needed (and only read) in test mode
        test_gene_ids = {str(i) for i in self.test_ids['gene']}
    else:
        g = self.graph
        test_gene_ids = set()

    line_counter = 0
    geno = Genotype(g)
    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    tax_id = 'NCBITaxon:9606'  # hardcode
    genome_version = 'GRCh38'  # hardcode

    # build a hashmap of genomic location to identifiers,
    # to try to get the equivalences
    loc_to_id_hash = {}

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                continue
            line_counter += 1

            # raises ValueError if the row does not have exactly 36 columns
            (date_added_to_catalog, pubmed_num, first_author, pub_date,
             journal, link, study_name, disease_or_trait,
             initial_sample_description, replicate_sample_description,
             region, chrom_num, chrom_pos, reported_gene_nums,
             mapped_gene, upstream_gene_num, downstream_gene_num,
             snp_gene_nums, upstream_gene_distance,
             downstream_gene_distance, strongest_snp_risk_allele, snps,
             merged, snp_id_current, context, intergenic_flag,
             risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
             or_or_beta, confidence_interval_95,
             platform_with_snps_passing_qc, cnv_flag, mapped_trait,
             mapped_trait_uri) = row

            # in test mode, skip rows that mention none of the test genes
            if self.testMode and not (
                    test_gene_ids & set(re.split(r',', snp_gene_nums))):
                continue

            # Example row:
            # 06-May-2015  25917933  Zai CC  20-Nov-2014  J Psychiatr Res
            # http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores
            # in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals  NA
            # 10p11.22  10  32704340  C10orf68, CCDC7, ITGB1  CCDC7
            # rs7079041-A  rs7079041  0  7079041  intron  0  2E-6  5.698970

            has_location = chrom_num != '' and chrom_pos != ''
            if has_location:
                loc = 'chr' + str(chrom_num) + ':' + str(chrom_pos)
                loc_to_id_hash.setdefault(loc, set())
            else:
                loc = None

            allele = strongest_snp_risk_allele.strip()
            if re.search(r' x ', strongest_snp_risk_allele) \
                    or re.search(r',', strongest_snp_risk_allele):
                # TODO deal with haplotypes
                logger.warning(
                    "We can't deal with haplotypes yet: %s",
                    strongest_snp_risk_allele)
                continue
            elif re.match(r'rs|kgp', strongest_snp_risk_allele):
                # FIXME treating kgp ids as dbSNP ids isn't correct.  See
                # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                # They were created by Illumina for their genotyping
                # platform before some variants identified during the
                # pilot phase of the project had been assigned rs numbers.
                #
                # drop the alteration, i.e. everything from the first '-'
                # TODO add the alteration ([ATGC]) as variation on the snp
                rs_id = 'dbSNP:' + re.sub(r'-.*$', '', allele).strip()
            elif re.match(r'chr', strongest_snp_risk_allele):
                # like: chr10:106180121-G
                # Only the trailing "-<allele>" is removed.  Stripping from
                # the first '-' after building the id (as was done before)
                # reduced every such id to the same ':gwas'.
                position = re.sub(r'-[^-:]*$', '', allele)
                rs_id = ':gwas-' + position.replace(':', '-')
            elif allele == '':
                # FIXME still consider adding in the EFO terms
                # for what the study measured?
                continue
            else:
                logger.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            if loc is not None:
                loc_to_id_hash[loc].add(rs_id)

            pubmed_id = 'PMID:' + pubmed_num

            r = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            r.addRefToGraph(g)

            # create the chromosome
            chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

            # add the feature to the graph
            snp_description = None
            if risk_allele_frequency != '' and \
                    risk_allele_frequency != 'NR':
                snp_description = \
                    str(risk_allele_frequency) + \
                    ' [risk allele frequency]'

            f = Feature(
                rs_id, allele, Feature.types[r'SNP'], snp_description)
            if has_location:
                f.addFeatureStartLocation(chrom_pos, chrom_id)
                f.addFeatureEndLocation(chrom_pos, chrom_id)
            f.addFeatureToGraph(g)
            f.addTaxonToFeature(g, tax_id)
            # TODO consider adding allele frequency as property;
            # but would need background info to do that

            # also want to add other descriptive info about
            # the variant from the context
            for c in re.split(r';', context):
                cid = self._map_variant_type(c.strip())
                if cid is not None:
                    gu.addType(g, rs_id, cid)

            # add deprecation information
            # csv yields strings, so the flag is compared to '1'; comparing
            # to the integer 1 never matched and left this branch dead
            if merged.strip() == '1' and str(snp_id_current.strip()) != '':
                # get the current rs_id
                current_rs_id = 'dbSNP:'
                if not re.match(r'rs', snp_id_current):
                    current_rs_id += 'rs'
                current_rs_id += str(snp_id_current)
                # record the completed id; the per-location value is a set
                if loc is not None:
                    loc_to_id_hash[loc].add(current_rs_id)
                gu.addDeprecatedIndividual(g, rs_id, current_rs_id)
                # TODO check on this
                # should we add the annotations to the current
                # or orig?
                gu.makeLeader(g, current_rs_id)
            else:
                gu.makeLeader(g, rs_id)

            # add the feature as a sequence alteration
            # affecting various genes
            # note that intronic variations don't necessarily list
            # the genes such as for rs10448080  FIXME
            if snp_gene_nums != '':
                for s in re.split(r',', snp_gene_nums):
                    s = s.strip()
                    # still have to test for this,
                    # because sometimes there's a leading comma
                    if s != '':
                        gene_id = 'NCBIGene:' + s
                        geno.addAlleleOfGene(rs_id, gene_id)

            # add the up and downstream genes if they are available.
            # The snp is upstream of its downstream gene and vice versa;
            # each guard tests the column that is actually used, so an
            # empty column can no longer produce a bare 'NCBIGene:' object.
            if downstream_gene_num != '':
                downstream_gene_id = 'NCBIGene:' + downstream_gene_num
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties[r'upstream_of_sequence_of'],
                    downstream_gene_id)
            if upstream_gene_num != '':
                upstream_gene_id = 'NCBIGene:' + upstream_gene_num
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties['downstream_of_sequence_of'],
                    upstream_gene_id)

            # NOTE: description is built but not yet attached to anything;
            # see the commented-out set_description() call below
            description = 'A study of ' + disease_or_trait + \
                ' in ' + initial_sample_description
            if replicate_sample_description != '':
                description = ' '.join(
                    (description, 'with', replicate_sample_description))
            if platform_with_snps_passing_qc != '':
                description = ' '.join(
                    (description, 'on platform',
                     platform_with_snps_passing_qc))
            description = ' '.join((description, '(p=' + pvalue + ')'))

            # make associations to the EFO terms; there can be >1
            if mapped_trait_uri.strip() != '':
                for t in re.split(r',', mapped_trait_uri):
                    t = t.strip()
                    if t == '':
                        # stray leading/trailing comma: nothing to map
                        continue
                    tid = cu.get_curie(t)

                    assoc = G2PAssoc(
                        self.name, rs_id, tid,
                        gu.object_properties['contributes_to'])
                    assoc.add_source(pubmed_id)
                    # combinatorial evidence
                    # used in automatic assertion
                    eco_id = 'ECO:0000213'
                    assoc.add_evidence(eco_id)

                    # assoc.set_description(description)
                    # FIXME score should get added to provenance/study
                    # assoc.set_score(pvalue)
                    assoc.add_association_to_graph(g)

            if not self.testMode and \
                    (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)

    # loop through the location hash and report locations that carry
    # more than one snp id (candidates for being made equivalent)
    for location, snp_ids in loc_to_id_hash.items():
        if len(snp_ids) > 1:
            logger.info("%s has >1 snp id: %s", location, str(snp_ids))
class StreamedGraph(DipperGraph):
    """
    Stream rdf triples to file or stdout
    Assumes a downstream process will sort then uniquify triples

    Theoretically could support both ntriple, rdfxml formats, for now
    just support nt
    """

    curie_map = curimap.get()
    curie_util = CurieUtil(curie_map)

    # NOTE: the path is relative to the current working directory
    with open('translationtable/GLOBAL_TERMS.yaml') as fhandle:
        globaltt = yaml.safe_load(fhandle).copy()
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None,
                 file_handle=None, fmt='nt'):
        """
        :param are_bnodes_skized: when True, blank node curies are
            rewritten to skolem IRIs
        :param identifier: stored on the instance, not otherwise used here
        :param file_handle: writable handle for the triples; when None
            the triples are printed to stdout
        :param fmt: stored on the instance; only N-Triples is produced
        """
        self.are_bnodes_skized = are_bnodes_skized
        self.fmt = fmt
        self.file_handle = file_handle
        self.identifier = identifier

    def addTriple(self, subject_id, predicate_id, obj,
                  object_is_literal=None, literal_type=None):
        """
        Serialize one triple.

        :param subject_id: curie or IRI of the subject
        :param predicate_id: curie or IRI of the predicate
        :param obj: object curie/IRI, or the literal value
        :param object_is_literal: True/False to force the object kind;
            None to infer it from ``obj``
        :param literal_type: optional curie/IRI of the literal datatype
        :return: None
        """
        # try making an inference on the type of object if none is supplied
        if object_is_literal is None:
            # only a string can be a curie or an IRI; anything else
            # (numbers, None, ...) is handled as a literal instead of
            # crashing inside the regexp match
            object_is_literal = not (
                isinstance(obj, str) and (
                    self.curie_regexp.match(obj) is not None
                    or obj.split(':')[0].lower() in ('http', 'https', 'ftp')
                ))

        subject_iri = self._getnode(subject_id)
        predicate_iri = self._getnode(predicate_id)
        if not object_is_literal:
            obj = self._getnode(obj)

        if literal_type is not None:
            literal_type = self._getnode(literal_type)

        if obj is not None:
            self.serialize(subject_iri, predicate_iri, obj,
                           object_is_literal, literal_type)
        else:
            LOG.warning("Null value passed as object")

    def skolemizeBlankNode(self, curie):
        """
        :param curie: blank node curie such as ``_:abc``
        :return: str skolem IRI built from the base IRI and the local id
        """
        # get_base() is provided by the CurieUtil wrapper, not by the
        # curie mapping itself
        base_iri = StreamedGraph.curie_util.get_base()
        curie_id = curie.split(':')[1]
        skolem_iri = "{0}.wellknown/genid/{1}".format(base_iri, curie_id)
        return skolem_iri

    def serialize(self, subject_iri, predicate_iri, obj,
                  object_is_literal=False, literal_type=None):
        """
        Format one triple as an N-Triples line and write it out.

        :raises TypeError: for an untyped, non-string literal that is
            neither an int nor a float
        """
        if not object_is_literal:
            triple = "<{}> <{}> <{}> .".format(
                subject_iri, predicate_iri, obj)
        elif literal_type is not None:
            triple = '<{}> <{}> {}^^<{}> .'.format(
                subject_iri, predicate_iri,
                self._quote_encode(str(obj)), literal_type)
        elif isinstance(obj, str):
            triple = '<{}> <{}> {} .'.format(
                subject_iri, predicate_iri, self._quote_encode(obj))
        else:
            lit_type = self._getLiteralXSDType(obj)
            # test the looked-up datatype (not the ``type`` builtin, which
            # is never None) so unsupported values are rejected instead of
            # being written with a "^^<None>" datatype
            if lit_type is None:
                raise TypeError(
                    "Cannot determine type of {}".format(obj))
            triple = '<{}> <{}> "{}"^^<{}> .'.format(
                subject_iri, predicate_iri, obj, lit_type)

        if self.file_handle is None:
            print(triple)
        else:
            self.file_handle.write("{}\n".format(triple))

    def _getnode(self, curie):
        """
        Returns IRI, or blank node curie/iri depending on
        self.are_bnodes_skized setting
        :param curie: str id as curie or iri
        :return: str
        :raises TypeError: when the id is neither a blank node, an
            http/ftp IRI, nor a two-part curie
        """
        if re.match(r'^_:', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:
                node = curie
        elif re.match(r'^http|^ftp', curie):
            node = curie
        elif len(curie.split(':')) == 2:
            node = StreamedGraph.curie_util.get_uri(curie)
        else:
            raise TypeError("Cannot process curie {}".format(curie))
        return node

    def _getLiteralXSDType(self, literal):
        """
        This could be much more nuanced, but for now
        if a literal is not a str, determine if it's
        a xsd int or double
        :param literal:
        :return: str - xsd full iri, or None for unsupported types
        """
        if isinstance(literal, int):
            return self._getnode("xsd:integer")
        if isinstance(literal, float):
            return self._getnode("xsd:double")
        return None

    @staticmethod
    def _quote_encode(literal):
        """
        Copy of code in rdflib here:
        https://github.com/RDFLib/rdflib/blob/776b90be/
        rdflib/plugins/serializers/nt.py#L76
        :param literal: str
        :return: str, the escaped literal wrapped in double quotes
        """
        return '"%s"' % literal.replace('\\', '\\\\')\
            .replace('\n', '\\n')\
            .replace('"', '\\"')\
            .replace('\r', '\\r')
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, blank node ids are turned
            into skolem IRIs instead of BNodes
        :param identifier: passed through to ConjunctiveGraph
        """
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO', ):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(self, subject_id, predicate_id, obj,
                  object_is_literal=None, literal_type=None):
        """
        Add one triple to the graph.

        :param subject_id: curie or IRI of the subject
        :param predicate_id: curie or IRI of the predicate
        :param obj: object curie/IRI, or the literal value
        :param object_is_literal: True/False to force the object kind;
            None to infer it from ``obj``
        :param literal_type: optional curie/IRI of the literal datatype
        :return: None
        """
        # try making an inference on the type of object if none is supplied
        if object_is_literal is None:
            # only a string can be a curie or an IRI; anything else
            # (numbers, None, ...) is handled as a literal instead of
            # crashing inside the regexp match
            object_is_literal = not (
                isinstance(obj, str) and (
                    self.curie_regexp.match(obj) is not None
                    or obj.split(':')[0].lower() in ('http', 'https', 'ftp')
                ))

        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getnode(literal_type)
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj)))
            else:
                LOG.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # get a sense of where the None is comming from
                # magic number here is "steps up the call stack"
                for call in range(2, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call,
                        sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((
                self._getnode(subject_id),
                self._getnode(predicate_id),
                self._getnode(obj)))
        else:
            LOG.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        :param curie: blank node id, with or without the ``_:`` prefix
        :return: URIRef skolem IRI for the blank node
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, 1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getnode(self, curie):  # convention is lowercase names
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode,
        otherwise it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object, or None (after
            logging an error) when no IRI can be made for the curie
        """
        node = None
        # startswith() rather than curie[0] so an empty id is reported
        # through the error branch below instead of raising IndexError
        if curie.startswith('_'):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))

        # Check if curie string is actually an IRI
        elif curie.startswith(('http', 'ftp', 'jdbc')):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.
                # namespaces() yields (prefix, namespace) pairs, so compare
                # against the prefix of each pair; testing the bare string
                # against the pairs never matched and re-bound every time.
                prefix = curie.split(':')[0]
                already_bound = any(
                    prefix == bound_prefix
                    for bound_prefix, _ in
                    self.namespace_manager.namespaces())
                if not already_bound:
                    mapped_iri = self.curie_map[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                LOG.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """
        Results in the RDF @prefix directives for every ingest
        being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)

    def serialize(  # rdflib version
            self, destination=None, format='turtle', base=None,
            encoding=None):
        """
        Delegate to ConjunctiveGraph.serialize().

        NOTE(review): ``base`` and ``encoding`` are accepted but are not
        forwarded to rdflib -- confirm whether that is intentional.
        """
        return ConjunctiveGraph.serialize(self, destination, format)
class ModelTestCase(unittest.TestCase):
    """Tests for Model helper methods, run against a fresh RDFGraph."""

    def setUp(self):
        g = RDFGraph()
        self.model = Model(g)

        this_curie_map = curie_map.get()
        self.cutil = CurieUtil(this_curie_map)

        # stuff to make test triples
        self.test_cat_subj_curie = "MGI:1234"
        self.test_cat_subj = self.cutil.get_uri("MGI:1234")
        self.test_cat_default_pred = self.cutil.get_uri("biolink:category")
        self.test_named_indiv = self.cutil.get_uri("owl:NamedIndividual")
        self.test_label_pred = self.cutil.get_uri("rdfs:label")
        self.test_label = "some label"

        self.test_comment_IRI = self.cutil.get_uri("rdfs:comment")
        self.test_comment = 'bonus eruptus'

    def tearDown(self):
        # release the model built in setUp (setUp never creates self.graph)
        self.model = None

    def _triples(self, predicate_iri=None, obj=None):
        """List the graph's triples about the test subject that match."""
        predicate = None if predicate_iri is None else URIRef(predicate_iri)
        return list(self.model.graph.triples(
            (URIRef(self.test_cat_subj), predicate, obj)))

    def test_addIndividualToGraph_assign_label(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        label_triple = self._triples(self.test_label_pred)
        self.assertEqual(len(label_triple), 1, "method didn't assign label")
        self.assertEqual(
            str(label_triple[0][2]), self.test_label,
            "method didn't assign correct label")

    def test_addIndividualToGraph_assign_type_named_individual(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        triples = self._triples(obj=URIRef(self.test_named_indiv))
        self.assertEqual(
            len(triples), 1,
            "method didn't assign type as named individual")

    def test_addIndividualToGraph_assign_category(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label,
            ind_category=blv.terms['Genotype'])

        triples = self._triples(self.test_cat_default_pred)
        self.assertEqual(len(triples), 1, "method didn't assign category")

    def test_add_comment(self):
        self.model.addComment(self.test_cat_subj, self.test_comment)

        triples = self._triples(
            self.test_comment_IRI, Literal(self.test_comment))
        self.assertEqual(len(triples), 1, "method didn't assign comment")

    def test_add_comment_assign_subject_category(self):
        self.model.addComment(
            self.test_cat_subj, self.test_comment,
            subject_category=blv.terms['Genotype'])

        triples = self._triples(self.test_cat_default_pred)
        self.assertEqual(len(triples), 1, "method didn't assign category")
def test_missense_variant_cdna_model(self):
    """
    Test missense variant with cdna information
    Using test data set 2, and the function add_variant_info_to_graph()
    We want to test the following triples:

    CGD:VariantID is an instance of OBO:SO_0001059
    CGD:VariantID is an instance of OBO:SO_0001583
    CGD:VariantID has the label "ABL1 T315I missense mutation"
    CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:25
    CGD:VariantID has location (faldo:location) AminoAcidRegionID
    CGD:VariantID has location (faldo:location) CDNARegionID
    CGD:VariantID has location (faldo:location) ChromosomalRegionID
    CGD:VariantID OBO:GENO_reference_amino_acid "T"
    CGD:VariantID OBO:GENO_results_in_amino_acid_change "I"
    CGD:VariantID owl:sameAs dbSNP:rs121913459
    CGD:VariantID owl:sameAs COSMIC:12560
    CGD:VariantID RO:0002205 (transcribed_to) CCDS:35166.1

    CCDS:35166.1 is an instance of OBO:SO_0000233
    CCDS:35166.1 has the label "CCDS35166.1"
    CCDS:35166.1 OBO:RO_0002513 (translates_to) UniProtKB:P00519#P00519-1
    CCDS:35166.1 OBO:RO_0002513 (translates_to) NCBIProtein:NP_005148.2

    UniProtKB:P00519#P00519-1 owl:sameAs NCBIProtein:NP_005148.2
    UniProtKB:P00519#P00519-1 is an instance of OBO:SO_0000104 (polypeptide)
    UniProtKB:P00519#P00519-1 has the label "P00519#P00519-1"

    NCBIProtein:NP_005148.2 is an instance of OBO:SO_0000104 (polypeptide)
    NCBIProtein:NP_005148.2 has the label "NP_005148.2"
    """
    from dipper.utils.TestUtils import TestUtils

    # load the second test data set into the CGD graph
    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    # the first record of the data set must have exactly these 27 fields;
    # most of them are unpacked only for their position and are not used
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
     db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
     primary_transcript_exons, primary_transcript_variant_sub_types,
     variant_type, chromosome, genome_build, build_version,
     build_date) = self.test_set_2[0]

    gene_id = self.cgd.gene_map[transcript_gene]

    # hard-coded expectations for the record under test
    ref_amino_acid = "T"
    altered_amino_acid = "I"
    # NOTE(review): the docstring names dbSNP:rs121913459 (with "rs")
    db_snp_curie = "dbSNP:121913459"
    cosmic_curie = "COSMIC:12560"
    uniprot_curie = "UniProtKB:P00519#P00519-1"
    uniprot_id = "P00519#P00519-1"
    refseq_curie = "NCBIProtein:NP_005148.2"
    transcript_curie = "CCDS:35166.1"
    ccds_id = "35166.1"
    position = 315
    chromosome_curie = "hg19chr9"

    # identifiers for the variant and for its region/coordinate nodes
    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
    aa_region_id = ":_{0}{1}{2}Region".format(position, position, uniprot_curie)
    cdna_region_id = ":_{0}Region".format(transcript_curie)
    chr_region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    aa_coord_id = ":_{0}-{1}".format(uniprot_id, position)
    cdna_coord_id = ":_{0}-{1}".format(ccds_id, bp_pos)
    # chr_coord_id = "CHR:{0}-{1}".format(chromosome_curie, genome_pos_start)
    chr_coord_id = ":_{0}-{1}".format(chromosome_curie, genome_pos_start)

    # expand every identifier to the IRI form compared with the results
    # NOTE(review): variant_uri is computed but never used below
    variant_uri = URIRef(cu.get_uri(variant_id))
    transcript_uri = URIRef(cu.get_uri(transcript_curie))
    gene_uri = URIRef(cu.get_uri(gene_id))
    db_snp_uri = URIRef(cu.get_uri(db_snp_curie))
    cosmic_uri = URIRef(cu.get_uri(cosmic_curie))
    uniprot_uri = URIRef(cu.get_uri(uniprot_curie))
    refseq_uri = URIRef(cu.get_uri(refseq_curie))
    aa_region_uri = URIRef(cu.get_uri(aa_region_id))
    cdna_region_uri = URIRef(cu.get_uri(cdna_region_id))
    chr_region_uri = URIRef(cu.get_uri(chr_region_id))
    aa_coord_uri = URIRef(cu.get_uri(aa_coord_id))
    cdna_coord_uri = URIRef(cu.get_uri(cdna_coord_id))
    chr_coord_uri = URIRef(cu.get_uri(chr_coord_id))

    # doubled braces are literal braces; {0}..{8} are filled by format().
    # NOTE(review): the ?uniprot label is hard-coded as "P00519-1" while
    # the docstring lists "P00519#P00519-1"; "?cosmic owl:sameAs ?dbSNP"
    # is stated twice.
    sparql_query = """
        SELECT ?cosmic ?gene ?aaRegion ?cdnaRegion ?chrRegion
               ?dbSNP ?transcript ?uniprot ?refseq
               ?aaCoord ?cdnaCoord ?chrCoord
        WHERE {{
            ?cosmic a OBO:SO_0001059;
                a OBO:SO_0001583 ;
                OBO:GENO_0000408 ?gene ;
                faldo:location ?aaRegion ;
                faldo:location ?cdnaRegion ;
                faldo:location ?chrRegion ;
                OBO:GENO_reference_amino_acid "{0}" ;
                OBO:GENO_reference_nucleotide "{1}" ;
                OBO:GENO_altered_nucleotide "{2}" ;
                OBO:GENO_results_in_amino_acid_change "{3}" ;
                owl:sameAs ?dbSNP ;
                RO:0002205 ?transcript .

            ?cosmic owl:sameAs ?dbSNP .

            ?transcript a OBO:SO_0000233 ;
                rdfs:label "{4}" ;
                OBO:RO_0002513 ?uniprot ;
                OBO:RO_0002513 ?refseq .

            ?uniprot a OBO:SO_0000104 ;
                rdfs:label "P00519-1" .

            ?refseq a OBO:SO_0000104 ;
                rdfs:label "NP_005148.2" .

            ?refseq owl:sameAs ?uniprot .

            ?aaRegion faldo:begin ?aaCoord .
            ?cdnaRegion faldo:begin ?cdnaCoord .
            ?chrRegion faldo:begin ?chrCoord .

            ?aaCoord faldo:position {5} .
            ?cdnaCoord faldo:position {6} .
            ?chrCoord faldo:position {7} .

            ?dbSNP rdfs:label "{8}" .
        }}
        """.format(ref_amino_acid, ref_base, variant_base,
                   altered_amino_acid, transcript_id, position, bp_pos,
                   genome_pos_start, db_snp_id)

    # Expected Results: one row, in the order of the SELECT variables
    # NOTE(review): the first SELECT variable (?cosmic) is bound to the
    # subject typed as the variant, yet the expected value is cosmic_uri
    # rather than variant_uri -- confirm which is intended
    expected_results = [[
        cosmic_uri, gene_uri, aa_region_uri, cdna_region_uri,
        chr_region_uri, db_snp_uri, transcript_uri, uniprot_uri,
        refseq_uri, aa_coord_uri, cdna_coord_uri, chr_coord_uri
    ]]
    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_missense_variant_protein_model(self):
    """
    Check the triples produced for a missense variant that only carries
    protein information (test data set 1, via add_variant_info_to_graph()).

    Triples under test:
    CGD:VariantID is an instance of OBO:SO_0001059
    CGD:VariantID is an instance of OBO:SO_0001583
    CGD:VariantID has the label "CSF3R Q741X missense mutation"
    CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:1441
    CGD:VariantID has location (faldo:location) CGD:RegionID
    CGD:VariantID OBO:GENO_reference_amino_acid "Q"
    CGD:VariantID OBO:GENO_results_in_amino_acid_change "X"
    CGD:VariantID RO:0002205 CCDS:413.1
    CCDS:413.1 is an instance of OBO:GENO_primary
    CCDS:413.1 has the label "CCDS413.1"
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_1)

    # query helper over the populated graph, plus the prefix bindings
    graph_checker = TestUtils(self.cgd.graph)
    curie_util = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    # the first eleven fields of the first record; only four are needed
    leading_fields = self.test_set_1[0][0:11]
    (variant_key, variant_label, _, _, transcript_id, _, _, _, _,
     transcript_gene, _) = leading_fields

    gene_id = self.cgd.gene_map[transcript_gene]

    # fixed expectations for this record
    reference_residue = "Q"
    changed_residue = "X"
    position = 741
    uniprot_curie = "UniProtKB:Q99062#Q99062-1"
    transcript = "CCDS:413.1"

    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
    region_id = ":_{0}{1}{2}Region".format(position, position, uniprot_curie)

    # expected result row, in SELECT order: variant, gene, region, transcript
    variant_uri = URIRef(curie_util.get_uri(variant_id))
    transcript_uri = URIRef(curie_util.get_uri(transcript))
    gene_uri = URIRef(curie_util.get_uri(gene_id))
    region_uri = URIRef(curie_util.get_uri(region_id))
    expected_results = [[variant_uri, gene_uri, region_uri, transcript_uri]]

    query_template = """
        SELECT ?variant ?gene ?region ?transcript
        WHERE {{
            ?variant a OBO:SO_0001059;
                a OBO:SO_0001583 ;
                rdfs:label "{0}" ;
                OBO:GENO_0000408 ?gene ;
                faldo:location ?region ;
                OBO:GENO_reference_amino_acid "{1}" ;
                OBO:GENO_results_in_amino_acid_change "{2}" ;
                RO:0002205 ?transcript .

            ?transcript a OBO:SO_0000233 ;
                rdfs:label "{3}" .
        }}
        """
    sparql_query = query_template.format(
        variant_label, reference_residue, changed_residue, transcript_id)

    # Query graph
    sparql_output = graph_checker.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_chromosome_position_model(self):
    """
    Check how genomic positions are modelled: one faldo:Region whose
    begin and end positions both reference the chromosome.
    Uses test data set 2 and add_variant_info_to_graph().
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # query helper over the populated graph, plus the prefix bindings
    graph_checker = TestUtils(self.cgd.graph)
    curie_util = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    # the first record must have exactly 27 fields; five are needed here
    first_record = self.test_set_2[0]
    (variant_key, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
     genome_pos_start, genome_pos_end, _, _, _, _, _,
     chromosome, genome_build, _, _) = first_record

    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    # identifiers of the region, its two end points and the chromosome
    chromosome_curie = ":MONARCH_hg19chr9"
    region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)
    start_id = ":_hg19chr9-{0}".format(genome_pos_start)
    end_id = ":_hg19chr9-{0}".format(genome_pos_end)

    # expected result row, in SELECT order
    region_uri = URIRef(curie_util.get_uri(region_id))
    start_uri = URIRef(curie_util.get_uri(start_id))
    end_uri = URIRef(curie_util.get_uri(end_id))
    chromosome_uri = URIRef(curie_util.get_uri(chromosome_curie))
    expected_results = [[region_uri, start_uri, end_uri, chromosome_uri]]

    query_template = """
        SELECT ?region ?startPosition ?endPosition ?chromosome
        WHERE {{
            ?region a faldo:Region ;
                faldo:begin ?startPosition ;
                faldo:end ?endPosition .

            ?startPosition a faldo:Position ;
                faldo:position {0} ;
                faldo:reference ?chromosome .

            ?endPosition a faldo:Position ;
                faldo:position {1} ;
                faldo:reference ?chromosome .
        }}
        """
    sparql_query = query_template.format(genome_pos_start, genome_pos_end)

    # Query graph
    sparql_output = graph_checker.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def setUp(self):
    """Create the fixtures shared by the tests: one data row and curies."""
    self.test_util = TestUtils()

    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    # One data row; the trailing comment on each value names its column.
    self.test_set_1 = (
        'MGI:1920145',               # 01 marker_accession_id
        'Setd5',                     # 02 marker_symbol
        'WTSI',                      # 03 phenotyping_center
        'MEFW',                      # 04 colony_raw
        'male',                      # 05 sex
        'heterozygote',              # 06 zygosity
        'MGI:4432631',               # 07 allele_accession_id
        'Setd5<tm1a(EUCOMM)Wtsi>',   # 08 allele_symbol
        # 09 allele_name
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',               # 10 strain_accession_id
        'C57BL/6N',                  # 11 strain_name
        'MGP',                       # 12 project_name
        # 13 project_fullname
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',       # 14 pipeline_name
        'MGP_001',                   # 15 pipeline_stable_id
        'MGP_XRY_001',               # 16 procedure_stable_id
        'X-ray',                     # 17 procedure_name
        'IMPC_XRY_008_001',          # 18 parameter_stable_id
        'Number of ribs right',      # 19 parameter_name
        'MP:0005390',                # 20 top_level_mp_term_id
        'skeleton phenotype',        # 21 top_level_mp_term_name
        'MP:0000480',                # 22 mp_term_id
        'increased rib number',      # 23 mp_term_name
        '1.637023E-010',             # 24 p_value
        '',                          # 25 percentage_change
        '8.885439E-007',             # 26 effect_size
        # 27 statistical_method
        'Wilcoxon rank sum test with continuity correction',
        'IMPC',                      # 28 resource_name
    )

    # Test curies; these are otherwise generated within
    # _add_evidence() and _add_study_provenance().
    # These blank nodes are hardcoded as NOT Skolemized ...
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRIs for testing sparql output
    prefix_expander = CurieUtil(curie_map.get())
    self.assoc_iri = URIRef(prefix_expander.get_uri(self.assoc_curie))
class GenotypeTestCase(unittest.TestCase):
    """Tests for the Genotype model helper, run against a fresh RDFGraph."""

    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()
        self.genotype = Genotype(self.graph)
        self.cutil = CurieUtil(self.curie_map)
        self.test_cat_pred = self.cutil.get_uri(blv.terms['category'])
        self.test_cat_genotype_category = self.cutil.get_uri(
            blv.terms['Genotype'])
        self.test_cat_background_category = self.cutil.get_uri(
            blv.terms['PopulationOfIndividualOrganisms'])

    def tearDown(self):
        self.genotype = None

    def _category_triples(self, curie, category_iri):
        """List the (curie, category predicate, category_iri) triples."""
        return list(self.graph.triples((
            URIRef(self.cutil.get_uri(curie)),
            URIRef(self.test_cat_pred),
            URIRef(category_iri))))

    def test_addGenotype(self):
        gid = 'MGI:5515892'
        label = \
            'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
        self.genotype.addGenotype(gid, label)
        self.assertIn(
            (URIRef(self.cutil.get_uri(gid)), RDFS['label'], Literal(label)),
            self.genotype.graph)

    def test_addGenomicBackgroundToGenotype_adds_genotype(self):
        """
        test that addGenomicBackgroundToGenotype() records the genotype
        with its category
        """
        genotype_id = "GENO:0000002"
        background_id = "GENO:0000002"  # no idea what a good example background ID is
        self.genotype.addGenomicBackgroundToGenotype(
            background_id=background_id, genotype_id=genotype_id)

        geno_triples = self._category_triples(
            genotype_id, self.test_cat_genotype_category)
        # this test previously collected the triples without checking them
        self.assertEqual(
            len(geno_triples), 1,
            "addTriples() didn't make exactly 1 genotype category triple")

    def test_addGenomicBackgroundToGenotype_adds_categories(self):
        """
        test that addGenomicBackgroundToGenotype() correctly assigns
        subject/object category
        """
        genotype_id = "GENO:0000002"
        background_id = "GENO:0000002"  # no idea what a good example background ID is
        self.genotype.addGenomicBackgroundToGenotype(
            background_id=background_id, genotype_id=genotype_id)

        geno_triples = self._category_triples(
            genotype_id, self.test_cat_genotype_category)
        self.assertEqual(
            len(geno_triples), 1,
            "addTriples() didn't make exactly 1 genotype category triple")
        self.assertEqual(
            geno_triples[0][2], URIRef(self.test_cat_genotype_category),
            "addTriples() didn't assign the right genotype category")

        background_triples = self._category_triples(
            background_id, self.test_cat_background_category)
        self.assertEqual(
            len(background_triples), 1,
            "addTriples() didn't make exactly 1 background category triple")
        self.assertEqual(
            background_triples[0][2],
            URIRef(self.test_cat_background_category),
            "addTriples() didn't assign the right background category")

    # TODO: add a test for addParts().  An earlier commented-out draft here
    # was a copy of the method body rather than a test and did not compile.
def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                   efo_ontology, pubmed_id,
                                   description=None):
    """
    Associate a variant with each of its mapped traits.

    For every trait, EFO terms found under the disease or phenotype root
    of ``efo_ontology`` are first added to the graph as classes; then the
    publication reference and a variant-to-trait association
    ("contributes_to") are added.

    :param variant_id: curie of the variant
    :param mapped_trait_uri: comma-separated trait IRIs; may be blank,
        in which case nothing is added
    :param efo_ontology: graph that is queried with SPARQL for the
        trait's label and ancestry
    :param pubmed_id: PubMed number, without the ``PMID:`` prefix
    :param description: optional description set on each association
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    if mapped_trait_uri.strip() == '':
        return

    # loop invariants, built once instead of once per trait
    cu = CurieUtil(curie_map.get())
    pubmed_curie = 'PMID:' + pubmed_id
    # (EFO root to look under, parent class given to addClassToGraph);
    # the first pair was labelled as the disease query, the second as the
    # phenotype query
    efo_roots = (
        ('EFO:0000408', 'DOID:4'),
        ('EFO:0000651', 'UPHENO:0001001'),
    )
    label_query = """
        SELECT ?trait
        WHERE {{
            {0} rdfs:subClassOf+ {1} .
            {0} rdfs:label ?trait .
        }}
    """

    # make associations to the EFO terms; there can be >1
    for trait in mapped_trait_uri.split(','):
        trait = trait.strip()
        if trait == '':
            # a stray comma leaves an empty entry: nothing to associate
            continue
        trait_id = cu.get_curie(trait)

        for efo_root, parent_id in efo_roots:
            # materialise the result once; it is both tested and indexed
            labels = list(
                efo_ontology.query(label_query.format(trait_id, efo_root)))
            if labels and re.match(r'^EFO', trait_id):
                model.addClassToGraph(trait_id, labels[0][0], parent_id)

        ref = Reference(
            g, pubmed_curie, Reference.ref_types['journal_article'])
        ref.addRefToGraph()

        assoc = G2PAssoc(
            g, self.name, variant_id, trait_id,
            model.object_properties['contributes_to'])
        assoc.add_source(pubmed_curie)
        # combinatorial evidence
        # used in automatic assertion
        eco_id = 'ECO:0000213'
        assoc.add_evidence(eco_id)

        if description is not None:
            assoc.set_description(description)

        # FIXME score should get added to provenance/study
        # assoc.set_score(pvalue)
        assoc.add_association_to_graph()
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # prefix -> IRI map and resolver shared by every instance
    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # reverse lookup: value -> key of the translation table
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, ids starting with '_' are
            turned into IRIs (skolemized) instead of rdflib BNodes
        :param identifier: passed through to ConjunctiveGraph
        """
        # print("in RDFGraph with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized
        # curie prefixes seen by _getnode(); bound to the graph in serialize()
        self.prefixes = set()

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO',):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

    def _make_category_triple(
            self, subject, category, predicate=blv.terms['category']
    ):
        """
        add a triple to capture subject or object category (in CURIE form)
        that was passed to addTriple()

        Failures are logged as a warning rather than raised, so one bad
        category does not abort the ingest.

        :param subject: curie or IRI of the thing being categorised
        :param category: curie or IRI of the category
        :param predicate: curie or IRI of the linking predicate
        """
        try:
            self.add((
                self._getnode(subject),
                self._getnode(predicate),
                self._getnode(category)))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except:`` so
            # KeyboardInterrupt / SystemExit still propagate.
            LOG.warning(
                "Problem adding triple in _makeCategoryTriple for "
                "subj: %s pred: %s obj(category): %s",
                subject, predicate, category)

    def _is_literal(self, thing):
        """
        make inference on type (literal or CURIE)

        Anything matching ``self.curie_regexp`` or beginning with an
        http/https/ftp scheme is treated as a resource.
        NOTE(review): ``curie_regexp`` is not defined in this class; it is
        presumably inherited -- confirm.

        return: logical
        """
        looks_like_curie = self.curie_regexp.match(thing) is not None
        looks_like_url = \
            thing.split(':')[0].lower() in ('http', 'https', 'ftp')
        return not (looks_like_curie or looks_like_url)

    def addTriple(
            self,
            subject_id,
            predicate_id,
            obj,
            object_is_literal=None,
            literal_type=None,
            subject_category=None,
            object_category=None
    ):
        """
        Add one triple, plus optional category triples, to the graph.

        :param subject_id: curie or IRI of the subject
        :param predicate_id: curie or IRI of the predicate
        :param obj: curie/IRI of the object, or the literal value
        :param object_is_literal: True/False to force the interpretation of
            ``obj``; None lets _is_literal() decide
        :param literal_type: curie or IRI of the literal's datatype
        :param subject_category: category for the subject, if any
        :param object_category: category for the object; only used when
            the object is a resource
        """
        if object_is_literal is None:
            object_is_literal = self._is_literal(obj)

        # add triples for subject category info
        if subject_category is not None:
            self._make_category_triple(subject_id, subject_category)

        # add triples for obj category info, if obj is not a literal
        if not object_is_literal:
            if object_category is not None:
                self._make_category_triple(obj, object_category)
        elif object_category is not None:
            # emit warning if object category is given for a literal
            LOG.warning("I was given a category %s for obj: %s, "
                        "which seems to be a literal!",
                        object_category, obj)

        if object_is_literal is True:
            if isinstance(obj, str):
                # reduce any ws to a space.  re.sub() returns a new string;
                # the result has to be kept or nothing is normalised.
                obj = re.sub(r'[\t\n\r\f\v]+', ' ', obj)

            if literal_type is not None and obj is not None \
                    and obj not in ("", " "):
                literal_type_iri = self._getnode(literal_type)
                self.add((
                    self._getnode(subject_id),
                    self._getnode(predicate_id),
                    Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                # could attempt to infer a type here but there is no use case
                self.add((
                    self._getnode(subject_id),
                    self._getnode(predicate_id),
                    Literal(obj)))
            else:
                LOG.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # get a sense of where the None is comming from
                # magic number here is "steps up the call stack"
                # TODO there may be easier/ideomatic ways to do this now
                for call in range(2, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call,
                        sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((
                self._getnode(subject_id),
                self._getnode(predicate_id),
                self._getnode(obj)))
        else:
            LOG.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into an IRI under the 'BNODE' prefix.

        :param curie: id starting with '_:' or '_'
        :return: rdflib URIRef
        """
        # count is given by keyword: passing it positionally is deprecated
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        return URIRef(self.curie_map['BNODE'] + stripped_id)

    def _getnode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode,
        otherwise it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
            or None when the curie can not be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' \
                or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                self.prefixes.add(prefix)
            else:
                LOG.error("couldn't make URI for %s", curie)
                # get a sense of where the CURIE-ish? thing is comming from
                # magic number here is "steps up the call stack"
                for call in range(3, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call,
                        sys._getframe(call).f_code.co_name)
        return node

    def bind_all_namespaces(self):
        """
        Results in the RDF @prefix directives for every ingest
        being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former. (too bad there is no multiple dispatch)
    # rdflib version
    def serialize(
            self, destination=None, format='turtle', base=None, encoding=None
    ):
        """
        Bind every prefix seen so far, then delegate to rdflib.

        NOTE(review): ``base`` and ``encoding`` are accepted for signature
        compatibility but are not forwarded to ConjunctiveGraph.serialize();
        left as is because callers may rely on it -- confirm before changing.

        :param destination: passed to ConjunctiveGraph.serialize()
        :param format: passed to ConjunctiveGraph.serialize()
        :return: whatever ConjunctiveGraph.serialize() returns
        """
        for prefix in self.prefixes:
            mapped_iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(mapped_iri))
        return ConjunctiveGraph.serialize(self, destination, format)
def __init__(self, curie_map, materialize_bnodes=False):
    """
    Store the curie map, wrap it in a CurieUtil and remember the
    blank-node setting.

    :param curie_map: prefix map, kept as ``self.curie_map`` and handed
        to CurieUtil
    :param materialize_bnodes: kept as ``self.nobnodes``
    """
    self.curie_map = curie_map
    # TEC: what is cu really?
    self.cu = CurieUtil(curie_map)
    self.nobnodes = materialize_bnodes
def test_amino_acid_position_region_model(self):
    """
    Check how amino acid positions are modelled.

    Loads test data set 1 through add_variant_info_to_graph() and
    expects these triples:
    CGD:RegionID is an instance of faldo:Region
    CGD:RegionID faldo:begin BothStrandPositionID
    CGD:RegionID faldo:end BothStrandPositionID

    CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
    CGD:BothStrandPositionID is an instance of faldo:Position
    CGD:BothStrandPositionID faldo:position 741
    CGD:BothStrandPositionID faldo:reference UniProtID
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_1)

    # Query helper over the populated graph, plus curie expansion
    graph_tester = TestUtils(self.cgd.graph)
    curie_util = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    # First record of the fixture; the slice must hold exactly 11 fields
    first_record = self.test_set_1[0][0:11]
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source) = first_record

    position = 741
    variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
    uniprot_id = "Q99062#Q99062-1"
    uniprot_curie = "UniProtKB:Q99062#Q99062-1"

    region_id = ":_{0}{1}{2}Region".format(
        position, position, uniprot_curie)
    both_strand_id = ":_{0}-{1}".format(uniprot_id, position)

    # Expected Results: one row of (region, position, protein)
    expected_results = [[
        URIRef(curie_util.get_uri(region_id)),
        URIRef(curie_util.get_uri(both_strand_id)),
        URIRef(curie_util.get_uri(uniprot_curie)),
    ]]

    query_template = (
        " SELECT ?region ?bsPosition ?protein WHERE {{ "
        "?region a faldo:Region ; "
        "faldo:begin ?bsPosition ; "
        "faldo:end ?bsPosition . "
        "?bsPosition a faldo:Position ; "
        "faldo:position {0} ; "
        "faldo:reference ?protein . }} ")
    sparql_query = query_template.format(position)

    # Query graph
    actual_results = graph_tester.query_graph(sparql_query)

    self.assertEqual(expected_results, actual_results)
class RDFGraphTestCase(unittest.TestCase): def setUp(self): self.graph = RDFGraph() this_curie_map = curie_map.get() self.cutil = CurieUtil(this_curie_map) # stuff to make test triples self.test_cat_subj = "http://www.google.com" self.test_cat_default_pred = self.cutil.get_uri("biolink:category") self.test_cat_nondefault_pred = self.cutil.get_uri("rdf:type") self.test_cat_default_category = self.cutil.get_uri( "biolink:NamedThing") self.test_cat_nondefault_category = self.cutil.get_uri("biolink:Gene") self.test_cat_type = self.cutil.get_uri("rdf:type") self.test_cat_class = self.cutil.get_uri("rdf:class") def tearDown(self): self.graph = None def test_add_triple_makes_triple(self): """ test that addTriple() makes at least one triple """ self.graph.addTriple(subject_id=self.test_cat_subj, predicate_id="rdf:type", obj="rdf:class") self.assertTrue( len(self.graph) > 0, "addTriples() didn't make >=1 triple") def test_add_triple_subject_category_assignment(self): """ test that addTriple() correctly assigns subject category """ self.graph.addTriple( subject_id=self.test_cat_subj, predicate_id="rdf:comment", obj="website", subject_category=self.test_cat_nondefault_category) triples = list( self.graph.triples((URIRef(self.test_cat_subj), URIRef(self.test_cat_default_pred), None))) self.assertEqual( len(triples), 1, "addTriples() didn't make exactly one triple subject category") self.assertEqual( triples[0][2], URIRef(self.test_cat_nondefault_category), "addTriples() didn't assign the right triple subject category") def test_add_triple_object_category_assignment(self): """ test that addTriple() correctly assigns object category """ self.graph.addTriple(subject_id=self.test_cat_subj, predicate_id=self.test_cat_type, obj=self.test_cat_class, object_category=self.test_cat_nondefault_category) triples = list( self.graph.triples((URIRef(self.test_cat_class), URIRef(self.test_cat_default_pred), None))) self.assertEqual( len(triples), 1, "addTriples() didn't make exactly one 
triple object category") self.assertEqual( triples[0][2], URIRef(self.test_cat_nondefault_category), "addTriples() didn't assign the right triple object category") def read_graph_from_turtle_file(self, f): """ This will read the specified file into a graph. A simple parsing test. :param f: :return: """ vg = RDFGraph() p = os.path.abspath(f) logger.info("Testing reading turtle file from %s", p) vg.parse(f, format="turtle") logger.info('Found %s graph nodes in %s', len(vg), p) self.assertTrue(len(vg) > 0, "No nodes found in " + p) return def read_graph_into_owl(self, f): """ test if the ttl can be parsed by owlparser this expects owltools to be accessible from commandline :param f: file of ttl :return: """ import subprocess from subprocess import check_call status = check_call(["owltools", f], stderr=subprocess.STDOUT) # returns zero is success! if status != 0: logger.error('finished verifying with owltools with status %s', status) self.assertTrue(status == 0) return def test_make_category_triple_default(self): """ test that method adds category triple to graph correctly (default pred and obj) """ self.graph._make_category_triple(self.test_cat_subj) triples = list(self.graph.triples((None, None, None))) self.assertEqual(len(triples), 1, "method didn't make exactly one triple") self.assertEqual(triples[0][0], URIRef(self.test_cat_subj), "didn't assign correct subject") self.assertEqual(triples[0][1], URIRef(self.test_cat_default_pred), "didn't assign correct predicate") self.assertEqual(triples[0][2], URIRef(self.test_cat_default_category), "didn't assign correct category") def test_make_category_triple_non_default_category(self): """ test that method adds category triple to graph correctly """ self.graph._make_category_triple(self.test_cat_subj, self.test_cat_nondefault_category) triples = list(self.graph.triples((None, None, None))) self.assertEqual(len(triples), 1, "method didn't make exactly one triple") self.assertEqual(URIRef(self.test_cat_nondefault_category), 
triples[0][2], "didn't assign correct (non-default) category") def test_make_category_triple_non_default_pred(self): """ test that method adds category triple to graph correctly (non default pred) """ self.graph._make_category_triple( self.test_cat_subj, self.test_cat_default_category, predicate=self.test_cat_nondefault_pred) triples = list(self.graph.triples((None, None, None))) self.assertEqual(len(triples), 1, "method didn't make exactly one triple") self.assertEqual(URIRef(self.test_cat_nondefault_pred), triples[0][1], "didn't assign correct (non-default) category") def test_make_category_triple_category_none_should_emit_named_thing(self): """ test that method adds category triple to graph correctly (default pred and obj) """ self.graph._make_category_triple(self.test_cat_subj, category=None) triples = list(self.graph.triples((None, None, None))) self.assertEqual(len(triples), 1, "method didn't make exactly one triple") self.assertEqual(URIRef(self.test_cat_default_category), triples[0][2], "didn't assign correct default category") def test_is_literal(self): """ test that method infers type (either literal or CURIE) correctly """ self.assertTrue(self.graph._is_literal("1")) self.assertTrue(not self.graph._is_literal("foo:bar")) self.assertTrue(not self.graph._is_literal("http://www.zombo.com/")) self.assertTrue(not self.graph._is_literal("https://www.zombo.com/")) self.assertTrue( not self.graph._is_literal("ftp://ftp.1000genomes.ebi.ac.uk/"))