Ejemplo n.º 1
0
    def setUp(self):
        """
        Build the fixture values shared by the tests in this case.

        Stores an association curie, an evidence code id, one sample
        record (a flat tuple of strings), placeholder study/evidence
        curies, and the IRI that the association curie expands to.
        """
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One sample input record, one field per line.
        self.test_set_1 = (
            'MGI:1920145',
            'Setd5',
            'WTSI',
            'MEFW',
            'male',
            'heterozygote',
            'MGI:4432631',
            'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',
            'C57BL/6N',
            'MGP',
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',
            'MGP_001',
            'MGP_XRY_001',
            'X-ray',
            'IMPC_XRY_008_001',
            'Number of ribs right',
            'MP:0005390',
            'skeleton phenotype',
            'MP:0000480',
            'increased rib number',
            '1.637023E-010',
            '',
            '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction',
            'IMPC',
        )

        # Test curies; outside the tests these are generated
        # within _add_evidence() and _add_study_provenance()
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI for testing sparql output
        expander = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
Ejemplo n.º 2
0
    def setUp(self):
        """
        Prepare the curies, sample record and IRI used by the tests.

        The sample record is assembled from three consecutive groups of
        string fields and stored as a single flat tuple.
        """
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        leading_fields = (
            'MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male', 'heterozygote',
            'MGI:4432631', 'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965', 'C57BL/6N', 'MGP',
        )
        middle_fields = (
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline', 'MGP_001', 'MGP_XRY_001', 'X-ray',
            'IMPC_XRY_008_001', 'Number of ribs right', 'MP:0005390',
            'skeleton phenotype', 'MP:0000480', 'increased rib number',
        )
        trailing_fields = (
            '1.637023E-010', '', '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction', 'IMPC',
        )
        self.test_set_1 = leading_fields + middle_fields + trailing_fields

        # Stand-in curies: in production code these come from
        # _add_evidence() and _add_study_provenance()
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI for testing sparql output
        assoc_uri = CurieUtil(curie_map.get()).get_uri(self.assoc_curie)
        self.assoc_iri = URIRef(assoc_uri)
Ejemplo n.º 3
0
 def test_addGenotype(self):
     """Check that addGenotype() puts the genotype's rdfs:label in the graph."""
     curie_util = CurieUtil(self.curie_map)
     genotype_curie = 'MGI:5515892'
     genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'

     self.genotype.addGenotype(genotype_curie, genotype_label)

     # The triple we expect addGenotype() to have written
     expected_triple = (
         URIRef(curie_util.get_uri(genotype_curie)),
         RDFS['label'],
         Literal(genotype_label),
     )
     self.assertTrue(expected_triple in self.genotype.graph)
Ejemplo n.º 4
0
 def setUp(self):
     """
     Create a fresh graph, a Genotype model on top of it, and the
     category IRIs (looked up through blv.terms) used by the tests.
     """
     self.graph = RDFGraph()
     self.curie_map = curie_map.get()
     self.genotype = Genotype(self.graph)
     self.cutil = CurieUtil(self.curie_map)

     # Expand each term to its full IRI
     to_iri = self.cutil.get_uri
     self.test_cat_pred = to_iri(blv.terms['category'])
     self.test_cat_genotype_category = to_iri(blv.terms['Genotype'])
     self.test_cat_background_category = to_iri(
         blv.terms['PopulationOfIndividualOrganisms'])
Ejemplo n.º 5
0
 def test_addGenotype(self):
     """Check that addGenotype() stores the label triple for the genotype."""
     from rdflib.namespace import RDFS, URIRef
     from rdflib import Literal
     from dipper.utils.CurieUtil import CurieUtil

     expander = CurieUtil(self.curie_map)
     genotype_id = 'MGI:5515892'
     label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
     self.genotype.addGenotype(genotype_id, label)

     subject = URIRef(expander.get_uri(genotype_id))
     label_triple = (subject, RDFS['label'], Literal(label))
     self.assertTrue(label_triple in self.genotype.graph)
Ejemplo n.º 6
0
 def test_addGenotype(self):
     """
     Add one genotype and verify that the graph then contains
     (genotype IRI, rdfs:label, label literal).
     """
     from rdflib.namespace import RDFS, URIRef
     from rdflib import Literal
     from dipper.utils.CurieUtil import CurieUtil

     curie_expander = CurieUtil(self.curie_map)
     genotype_curie, genotype_label = (
         'MGI:5515892',
         'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]',
     )
     self.genotype.addGenotype(genotype_curie, genotype_label)

     genotype_iri = URIRef(curie_expander.get_uri(genotype_curie))
     self.assertTrue(
         (genotype_iri, RDFS['label'], Literal(genotype_label))
         in self.genotype.graph)
Ejemplo n.º 7
0
    def setUp(self):
        """
        Create an empty RDFGraph, a curie expander, and the subject,
        predicate and category IRIs the tests build triples from.
        """
        self.graph = RDFGraph()
        self.cutil = CurieUtil(curie_map.get())
        expand = self.cutil.get_uri

        # stuff to make test triples
        self.test_cat_subj = "http://www.google.com"
        self.test_cat_default_pred = expand("biolink:category")
        self.test_cat_nondefault_pred = expand("rdf:type")
        self.test_cat_default_category = expand("biolink:NamedThing")
        self.test_cat_nondefault_category = expand("biolink:Gene")
        self.test_cat_type = expand("rdf:type")
        self.test_cat_class = expand("rdf:class")
Ejemplo n.º 8
0
    def setUp(self):
        """
        Create a Model over a fresh RDFGraph, plus the curies, IRIs and
        literal values that the tests use to build and check triples.
        """
        self.model = Model(RDFGraph())
        self.cutil = CurieUtil(curie_map.get())

        # stuff to make test triples
        self.test_cat_subj_curie = "MGI:1234"
        self.test_label = "some label"
        self.test_comment = 'bonus eruptus'

        # attribute name -> curie whose expanded IRI is stored on self
        iri_fixtures = (
            ('test_cat_subj', "MGI:1234"),
            ('test_cat_default_pred', "biolink:category"),
            ('test_named_indiv', "owl:NamedIndividual"),
            ('test_label_pred', "rdfs:label"),
            ('test_comment_IRI', "rdfs:comment"),
        )
        for attr_name, curie in iri_fixtures:
            setattr(self, attr_name, self.cutil.get_uri(curie))
Ejemplo n.º 9
0
    def test_associations(self):
        """
        Check the association triples produced from the fixture's sample input.

        The graph is expected to contain:
        CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance

        A CGD:AssociationID OBO:RO_0002558 Traceable Author Statement (ECO:0000033)
        A CGD:AssociationID dc:source PMID:20498393
        A CGD:AssociationID has_environment CGD:DrugID
        A CGD:AssociationID OBAN:association_has_subject CGD:VariantID
        A CGD:AssociationID OBAN:association_has_object_property has_phenotype
        A CGD:AssociationID OBAN:association_has_object CGD:DiseaseInstance
        """
        from dipper.utils.TestUtils import TestUtils

        # Make testutils object and load bindings
        curie_util = CurieUtil(self.curie_map)
        graph_tester = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()
        evidence_uri = URIRef(curie_util.get_uri('OBO:ECO_0000033'))

        # {0} is filled with the relationship IRI held by the fixture
        query_template = """
                       SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence
                       WHERE {{
                           ?variant OBO:RO_0002200 ?diseaseInd .

                           ?vdannot a OBAN:association ;
                               OBO:RO_0002558 ?evidence ;
                               dc:source ?source ;
                               <{0}> ?drug ;
                               OBAN:association_has_object ?diseaseInd ;
                               OBAN:association_has_object_property OBO:RO_0002200 ;
                               OBAN:association_has_subject ?variant .
                       }}
                       """
        sparql_query = query_template.format(self.relationship_uri)

        # A single result row, in the order of the SELECT variables
        expected_row = [
            self.disease_ind_uri,
            self.variant_uri,
            self.drug_uri,
            self.vd_annot_uri,
            self.source_uri,
            evidence_uri,
        ]
        expected_results = [expected_row]

        # Query graph
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 10
0
    def _replace_entity(graph, old_id, new_id, bindings=None,
                        is_property=False):
        """
        Replace one identifier with another throughout a graph.

        Both ids are expanded with CurieUtil and wrapped in URIRef, then
        SPARQL DELETE/INSERT updates rewrite every matching triple.
        When is_property is False the id is replaced in subject and in
        object position; otherwise it is replaced in predicate position.

        NOTE(review): there is no self/cls parameter, so this is
        presumably decorated as a staticmethod outside this view.

        :param graph: rdflib graph object; its update() method is called
        :param old_id: String curie or IRI to be replaced
        :param new_id: String curie or IRI to replace the old id
        :param bindings: Dict of namespace prefixes, passed to
                         graph.update() as its third positional argument;
                         defaults to an empty dict
        :param is_property: Boolean, is the id a property/predicate rather
                            than a class or individual
        :return: None
        """
        # Avoid a shared mutable default argument
        if bindings is None:
            bindings = {}

        cu = CurieUtil(curie_map.get())
        old_uri = URIRef(cu.get_uri(old_id))
        new_uri = URIRef(cu.get_uri(new_id))

        if is_property is False:
            # The entity can occur as subject or as object: rewrite both
            triple_patterns = ('<{0}> ?pred ?obj', '?sub ?pred <{0}>')
        else:
            # The original WHERE clause contained a stray "{?obj}" here,
            # which made str.format() raise KeyError for every property
            triple_patterns = ('?sub <{0}> ?obj',)

        for pattern in triple_patterns:
            old_triple = pattern.format(old_uri)
            new_triple = pattern.format(new_uri)
            sparql_update = (
                'DELETE {{ {0} }}\n'
                'INSERT {{ {1} }}\n'
                'WHERE {{ {0} }}'
            ).format(old_triple, new_triple)
            graph.update(sparql_update, 'sparql', bindings)
Ejemplo n.º 11
0
    def setUp(self):
        """
        Load one sample row into a CGD graph and precompute the IRIs and
        labels that the tests compare against.
        """
        self.curie_map = curie_map.get()
        curie_util = CurieUtil(self.curie_map)

        # Fake credentials as these tests do not require a database connection
        self.cgd = CGD('foo', '******', '******')

        sample_row = (387, 'MLH1 any mutation', 13, 'Adenocarcinoma',
                      None, 'Colon', 'no response', 1,
                      '5FU-based adjuvant therapy', 'late trials', '20498393')
        self.cgd.add_disease_drug_variant_to_graph((sample_row,))

        # Fields prefixed with "_" are not needed by this fixture
        (variant_key, variant_label, diagnoses_key, diagnoses,
         _specific_diagnosis, _organ, relationship,
         drug_key, drug, _therapy_status, pubmed_id) = sample_row

        # Curies for every node of interest
        make_id = self.cgd.make_cgd_id
        source_id = "PMID:{0}".format(pubmed_id)
        variant_id = make_id('variant{0}'.format(variant_key))
        disease_id = make_id('disease{0}{1}'.format(diagnoses_key, diagnoses))
        relationship_id = "RO:has_environment"
        disease_quality = ("CGD:{0}".format(relationship)).replace(" ", "_")
        drug_id = make_id('drug{0}'.format(drug_key))
        disease_instance_id = make_id(
            'phenotype{0}{1}{2}'.format(diagnoses, variant_key, relationship))
        variant_disease_annot = make_id(
            "assoc{0}{1}".format(variant_key, diagnoses))

        # Set up URIs
        def as_uri(curie):
            return URIRef(curie_util.get_uri(curie))

        self.source_uri = as_uri(source_id)
        self.variant_uri = as_uri(variant_id)
        self.disease_uri = as_uri(disease_id)
        self.disease_ind_uri = as_uri(disease_instance_id)
        self.relationship_uri = as_uri(relationship_id)
        self.drug_uri = as_uri(drug_id)
        self.vd_annot_uri = as_uri(variant_disease_annot)
        self.disease_quality_uri = as_uri(disease_quality)

        # Labels
        self.variant_label = variant_label
        self.disease_label = diagnoses
        self.disease_instance_label = "{0} with {1} to therapy".format(
            diagnoses, relationship)
        self.drug_label = drug
Ejemplo n.º 12
0
def hpo_to_tree(cls, hpo_terms, hpo_graph, tree, path):
    """
    Recursively add the subclasses of ``cls`` to a nested-dict tree.

    :param cls: curie of the class being visited
    :param hpo_terms: dict, updated in place; each newly seen class gets
                      its 'label', 'parents' (number of rdfs:subClassOf
                      objects) and 'lay_person' values
    :param hpo_graph: graph queried through label(), objects(), subjects()
    :param tree: nested dict keyed by curie, updated in place
    :param path: list of curies leading from the tree root down to the
                 parent of ``cls``; it is copied, not modified
    :return: None
    """
    tree_path = copy.copy(path)
    tree_path.append(cls)
    curie_util = CurieUtil(curie_map.get())

    if cls not in hpo_terms:
        cls_iri = URIRef(curie_util.get_uri(cls))
        hpo_terms[cls] = {
            'label': hpo_graph.label(cls_iri),
            'parents': len(list(hpo_graph.objects(cls_iri, RDFS.subClassOf))),
            "lay_person": get_lay_person(cls, hpo_graph),
        }

    # Traverse the tree to get to the input class
    steps = iter(tree_path)
    branch = tree[next(steps)]
    for term in steps:
        branch = branch[term]

    # Register every direct subclass under this class, then descend into it
    current_iri = URIRef(curie_util.get_uri(tree_path[-1]))
    for sub_class in hpo_graph.subjects(RDFS.subClassOf, current_iri):
        child_curie = curie_util.get_curie(sub_class).replace("OBO:HP_", "HP:")
        branch[child_curie] = {}
        hpo_to_tree(child_curie, hpo_terms, hpo_graph, tree, tree_path)
Ejemplo n.º 13
0
def hpo_to_tree(cls, hpo_terms, hpo_graph, tree, path):
    """
    Walk the subclass hierarchy below ``cls`` and mirror it in ``tree``.

    For a class not yet present in ``hpo_terms`` an entry is created with
    its label, its number of rdfs:subClassOf parents and its lay-person
    value.  Each direct subclass is then added as an empty dict under the
    node reached by following ``path`` + ``cls`` through ``tree``, and the
    function recurses into it.

    :param cls: curie of the class being visited
    :param hpo_terms: dict of per-term details, updated in place
    :param hpo_graph: graph providing label(), objects() and subjects()
    :param tree: nested dict keyed by curie, updated in place
    :param path: list of curies above ``cls``; left unmodified
    :return: None
    """
    tree_path = copy.copy(path)
    tree_path.append(cls)
    curie_util = CurieUtil(curie_map.get())

    if cls not in hpo_terms:
        class_iri = URIRef(curie_util.get_uri(cls))
        term_info = {'label': hpo_graph.label(class_iri)}
        hpo_terms[cls] = term_info
        parent_nodes = list(hpo_graph.objects(class_iri, RDFS.subClassOf))
        term_info['parents'] = len(parent_nodes)
        term_info["lay_person"] = get_lay_person(cls, hpo_graph)

    # Traverse the tree to get to the input class
    node = tree
    for term in tree_path:
        node = node[term]

    leaf_iri = URIRef(curie_util.get_uri(tree_path[-1]))
    for sub_class in hpo_graph.subjects(RDFS.subClassOf, leaf_iri):
        # Report HPO subclasses with the short "HP:" prefix
        curie = curie_util.get_curie(sub_class).replace("OBO:HP_", "HP:")
        node[curie] = {}
        hpo_to_tree(curie, hpo_terms, hpo_graph, tree, tree_path)
Ejemplo n.º 14
0
 def __init__(self, curie_map, materialize_bnodes=False):
     """
     Keep the curie map, a CurieUtil built from it, and the blank-node flag.

     :param curie_map: mapping handed to CurieUtil
     :param materialize_bnodes: stored unchanged as self.nobnodes
     """
     self.cu = CurieUtil(curie_map)
     self.curie_map = curie_map
     self.nobnodes = materialize_bnodes
Ejemplo n.º 15
0
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLib's ConjunctiveGraph.
    The goal of this class is to wrap the creation
    of triples and manage creation of URIRef,
    BNodes, and literals from an input curie
    """

    # Shared by all instances: curie expander and the curie_map module
    curie_util = CurieUtil(curie_map.get())
    curie_map = curie_map

    def __init__(self, are_bnodes_skized=True):
        """
        :param are_bnodes_skized: bool, when True blank-node ids are
                                  skolemized into IRIs by _getNode()
        """
        super().__init__()
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=False,
                  literal_type=None):
        """
        Add one triple to the graph, creating the nodes from curies/IRIs.

        Nothing is added, and a warning is logged, when obj is None
        (or, for a non-literal object, an empty string).

        :param subject_id: str curie or IRI of the subject
        :param predicate_id: str curie or IRI of the predicate
        :param obj: curie/IRI of the object, or the literal value when
                    object_is_literal is True
        :param object_is_literal: bool, obj is treated as a literal only
                                  when this is exactly True
        :param literal_type: optional curie/IRI of the literal's datatype
        :return: None
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                # Logger.warn() is a deprecated alias; use warning()
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
        elif obj is not None and obj != '':
            self.add((self._getNode(subject_id), self._getNode(predicate_id),
                      self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)
        return

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank-node id into a skolemized URIRef.

        :param curie: str blank-node id; a leading '_:' or '_' is removed
        :return: URIRef built on the base returned by curie_map.get_base()
        """
        # count is passed by keyword: passing it positionally is deprecated
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_map.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or BNode object
        from a curie or iri given as a string.

        If an id starts with an underscore, it is turned into a BNode, or,
        when self.are_bnodes_skized is True, into a skolemized URIRef.
        Ids starting with 'http' or 'ftp' are used as IRIs directly.
        Anything else is expanded as a curie, and its prefix is bound
        on the graph.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object, or None (after
                 logging an error) when the curie cannot be expanded
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # replace the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))
        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                # NOTE(review): namespaces() presumably yields
                # (prefix, namespace) pairs, in which case this test on a
                # bare prefix is always true and bind() runs every time -
                # confirm before changing, it may affect bound prefixes
                if prefix not in self.namespace_manager.namespaces():
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """Bind every prefix of the curie map to its namespace on this graph."""
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
Ejemplo n.º 16
0
    def test_genome_build_chromosome_model(self):
        """
        Test modelling of genome, builds, and chromosomes
        Using test data set 2, and the function add_variant_info_to_graph()
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Make testutils object and load bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Labels matched by the query
        genome_label = "Human genome"
        chromosome_label = "chr9 (Human)"
        build_label = "hg19"
        chrom_build_label = "chr9 (hg19)"

        # Expected results: one row, in the order of the SELECT variables
        expected_curies = (
            ":9606genome",        # ?genome
            "CHR:9606chr9",       # ?chromosome
            "UCSC:hg19",          # ?build
            ":MONARCH_hg19chr9",  # ?chromOnBuild
        )
        expected_results = [
            [URIRef(curie_util.get_uri(curie)) for curie in expected_curies]
        ]

        query_template = """
                       SELECT ?genome ?chromosome ?build ?chromOnBuild
                       WHERE {{
                           ?genome a owl:Class ;
                               rdfs:label "{0}" ;
                               rdfs:subClassOf OBO:SO_0001026 .

                           ?chromosome a owl:Class ;
                               rdfs:label "{1}" ;
                               rdfs:subClassOf OBO:SO_0000340 .

                           ?build a OBO:SO_0001505 ;
                               a ?genome ;
                               rdfs:label "{2}" ;
                               OBO:RO_0002162 OBO:NCBITaxon_9606 ;
                               OBO:RO_0002351 ?chromOnBuild .

                           ?chromOnBuild a ?chromosome ;
                               a OBO:SO_0000340 ;
                               rdfs:label "{3}" ;
                               OBO:RO_0002350 ?build .
                       }}
                       """
        sparql_query = query_template.format(
            genome_label, chromosome_label, build_label, chrom_build_label)

        # Query graph
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 17
0
    def test_variant_position_region_model(self):
        """
        Test modelling of variant positions on a transcript
        Using test data set 2, and the function add_variant_info_to_graph()
        We want to test the following triples:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 944
        CGD:BothStrandPositionID faldo:reference CGD:TranscriptID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Make testutils object and load bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Unpack the whole first row; only variant_key, transcript_id
        # and bp_pos are used below
        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        transcript_curie = self.cgd._make_transcript_curie(transcript_id)
        ccds_id = "35166.1"
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        # Expected Results
        expected_curies = (
            ":_{0}Region".format(transcript_curie),   # ?region
            ":_{0}-{1}".format(ccds_id, bp_pos),      # ?bsPosition
            transcript_curie,                         # ?transcript
        )
        expected_results = [
            [URIRef(curie_util.get_uri(curie)) for curie in expected_curies]
        ]

        query_template = """
                       SELECT ?region ?bsPosition ?transcript
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?transcript .
                       }}
                       """
        sparql_query = query_template.format(bp_pos)

        # Query graph
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 18
0
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLib's ConjunctiveGraph.
    The goal of this class is to wrap the creation
    of triples and manage creation of URIRef,
    BNodes, and literals from an input curie
    """

    # Shared by all instances: the curie map dict and its expander
    curie_map = curie_map.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    # (read once, when the class body is executed; the path is relative
    # to the current working directory)
    with open('translationtable/GLOBAL_TERMS.yaml') as fh:
        globaltt = yaml.safe_load(fh)
        # reverse lookup: value -> key of the translation table
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: bool, when True blank-node ids are
                                  skolemized into IRIs by _getNode()
        :param identifier: passed through to the parent constructor
        """
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(self, subject_id, predicate_id, obj,
                  object_is_literal=False, literal_type=None):
        """
        Add one triple to the graph, creating the nodes from curies/IRIs.

        Nothing is added, and a warning is logged, when obj is None
        (or, for a non-literal object, an empty string).

        :param subject_id: str curie or IRI of the subject
        :param predicate_id: str curie or IRI of the predicate
        :param obj: curie/IRI of the object, or the literal value when
                    object_is_literal is True
        :param object_is_literal: bool, obj is treated as a literal only
                                  when this is exactly True
        :param literal_type: optional curie/IRI of the literal's datatype
        :return: None
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # magic number 2 here is "steps up the stack"; this call
                # must stay directly in this method to keep that depth
                logger.warning(sys._getframe(2).f_code.co_name)
        elif obj is not None and obj != '':
            self.add((
                self._getNode(subject_id), self._getNode(predicate_id),
                self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)
        return

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank-node id into a skolemized URIRef.

        :param curie: str blank-node id; a leading '_:' or '_' is removed
        :return: URIRef built on the base returned by curie_util.get_base()
        """
        # count is passed by keyword: passing it positionally is deprecated
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or BNode object
        from a curie or iri given as a string.

        If an id starts with an underscore, it is turned into a BNode, or,
        when self.are_bnodes_skized is True, into a skolemized URIRef.
        Ids starting with 'http' or 'ftp' are used as IRIs directly.
        Anything else is expanded as a curie, and its prefix is bound
        on the graph.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object, or None (after
                 logging an error) when the curie cannot be expanded
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                # NOTE(review): namespaces() presumably yields
                # (prefix, namespace) pairs, in which case this test on a
                # bare prefix is always true and bind() runs every time -
                # confirm before changing, it may affect bound prefixes
                if prefix not in self.namespace_manager.namespaces():
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """Bind every prefix of the curie map to its namespace on this graph."""
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
Ejemplo n.º 19
0
    def __init__(self, curie_map):
        """
        Keep the curie map and a CurieUtil built from it.

        :param curie_map: mapping handed to CurieUtil
        """
        self.cu = CurieUtil(curie_map)
        self.curie_map = curie_map
Ejemplo n.º 20
0
class GraphUtils:
    """
    Convenience methods for adding common OWL/RDFS/DC/FOAF triples to a
    graph, addressing nodes by curie rather than by full IRI.

    The graph to modify is always passed in explicitly; an instance only
    holds the curie map and the CurieUtil built from it.
    """

    # FIXME - i've duplicated relationships in Assoc and here -
    #         pick one or the other and refactor
    # TODO -  refactor using the getNode() method to clear out the
    #         URIRef(cu.get_uri(<id>)) nonsense

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OWLRESTRICTION = OWL['Restriction']
    OWLPROP = OWL['ObjectProperty']
    OBJPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']
    SUBCLASS = RDFS['subClassOf']
    PERSON = FOAF['Person']

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'clique_leader': 'MONARCH:cliqueLeader'
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'has_qualifier': 'GENO:0000580',
        'towards': 'RO:0002503',
        'has_subject': ':hasSubject',
        'has_object': ':hasObject',
        'has_predicate': ':hasPredicate',
        'is_about': 'IAO:0000136',
        'has_member': 'RO:0002351',
        'member_of': 'RO:0002350',
        'involved_in': 'RO:0002331',
        'enables': 'RO:0002327',
        'derives_from': 'RO:0001000',
        'part_of': 'BFO:0000050',
        'has_part': 'BFO:0000051',
        'mentions': 'IAO:0000142',
        'model_of': 'RO:0003301',
        'has_gene_product': 'RO:0002205',
        'existence_starts_at': 'UBERON:existence_starts_at',
        'existence_starts_during': 'RO:0002488',
        'existence_ends_at': 'UBERON:existence_ends_at',
        'existence_ends_during': 'RO:0002492',
        'starts_with': 'RO:0002224',
        'starts_during': 'RO:0002091',
        'ends_during': 'RO:0002093',
        'ends_with': 'RO:0002230',
        'occurs_in': 'BFO:0000066',
        'has_environment_qualifier': 'GENO:0000580',
        'has_begin_stage_qualifier': 'GENO:0000630',
        'has_end_stage_qualifier': 'GENO:0000631',
        'correlates_with': 'RO:0002610',
        'substance_that_treats': 'RO:0002606',
        'is_marker_for': 'RO:0002607',
        'contributes_to': 'RO:0002326',
        'has_origin': 'GENO:0000643',
        'has_author': 'ERO:0000232',
        'dc:source': 'dc:source',
        'dc:evidence': 'dc:evidence',
        'has_evidence': 'RO:0002558',
        'causally_upstream_of_or_within': 'RO:0002418'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
    }

    # all three property tables merged into a single lookup
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, curie_map, materialize_bnodes=False):
        """
        :param curie_map: mapping of curie prefix to IRI;
                          the '' key is read as the base namespace
        :param materialize_bnodes: stored as self.nobnodes; it is not read
                                   by any method of this class
        """
        self.curie_map = curie_map
        self.cu = CurieUtil(curie_map)         # TEC: what is cu really?
        self.nobnodes = materialize_bnodes
        return

    def addClassToGraph(self, g, id, label, type=None, description=None):
        """
        Any node added to the graph will get at least 1 triple:
        *(node, type, owl:Class)
        and, depending on what is supplied:
        *(node, label, literal(label)) if a label is given
        *if a type is added,
            then the node will be an OWL:subclassOf that the type
        *if a description is provided,
            it will also get added as a dc:description
        :param g: the graph to add to
        :param id: curie of the class
        :param label:
        :param type: curie of the superclass
        :param description:
        :return: the graph

        """

        n = self.getNode(id)

        g.add((n, RDF['type'], self.OWLCLASS))
        if label is not None:
            g.add((n, RDFS['label'], Literal(label)))
        if type is not None:
            t = URIRef(self.cu.get_uri(type))
            g.add((n, self.SUBCLASS, t))
        if description is not None:
            g.add((n, DC['description'], Literal(description)))
        return g

    def addIndividualToGraph(self, g, id, label, type=None, description=None):
        """
        Add a node as an individual.  It is typed with the given type,
        or as an owl:NamedIndividual when no type is supplied;
        label and description are added when they are not None.
        :param g: the graph to add to
        :param id: curie of the individual
        :param label:
        :param type: curie of the individual's type
        :param description:
        :return: the graph

        """
        n = self.getNode(id)

        if label is not None:
            g.add((n, RDFS['label'], Literal(label)))
        if type is not None:
            t = self.getNode(type)
            g.add((n, RDF['type'], t))
        else:
            g.add((n, RDF['type'], self.OWLIND))
        if description is not None:
            g.add((n, DC['description'], Literal(description)))
        return g

    def addOWLPropertyClassRestriction(
            self, g, class_id, property_id, property_value):
        """
        Make class_id a subclass of the restriction
        "property_id some property_value".
        :param g:
        :param class_id:
        :param property_id:
        :param property_value:
        :return: None

        """

        # make a blank node to hold the property restrictions
        # scrub the colons, they will make the ttl parsers choke
        nid = \
            '_'+re.sub(r':', '', property_id)+re.sub(r':', '', property_value)
        n = self.getNode(nid)

        g.add((n, RDF['type'], self.OWLRESTRICTION))
        g.add((n, OWL['onProperty'], self.getNode(property_id)))
        g.add((n, OWL['someValuesFrom'], self.getNode(property_value)))

        g.add((self.getNode(class_id), self.SUBCLASS, n))

        return

    def addEquivalentClass(self, g, id1, id2):
        """
        Add (id1, owl:equivalentClass, id2);
        nothing is added when either id cannot be resolved to a node.
        """
        n1 = self.getNode(id1)
        n2 = self.getNode(id2)

        if n1 is not None and n2 is not None:
            g.add((n1, OWL['equivalentClass'], n2))

        return

    def addSameIndividual(self, g, id1, id2):
        """
        Add (id1, owl:sameAs, id2);
        nothing is added when either id cannot be resolved to a node.
        """
        n1 = self.getNode(id1)
        n2 = self.getNode(id2)

        if n1 is not None and n2 is not None:
            g.add((n1, OWL['sameAs'], n2))

        return

    def addPerson(self, graph, person_id, person_label):
        """
        Type person_id as a foaf:Person and, if given, add its label.
        """
        graph.add((self.getNode(person_id), RDF['type'], self.PERSON))
        if person_label is not None:
            graph.add(
                (self.getNode(person_id), RDFS['label'],
                 Literal(person_label)))
        return

    def addDeprecatedClass(self, g, oldid, newids=None):
        """
        Will mark the oldid as a deprecated class.
        if one newid is supplied, it will mark it as replaced by.
        if >1 newid is supplied, it will mark it with consider properties
        :param g:
        :param oldid: the class id to deprecate
        :param newids: the class idlist that is
                       the replacement(s) of the old class.  Not required.
        :return:

        """

        n1 = URIRef(self.cu.get_uri(oldid))
        g.add((n1, RDF['type'], self.OWLCLASS))

        self._addReplacementIds(g, oldid, newids)

        return

    def addDeprecatedIndividual(self, g, oldid, newids=None):
        """
        Will mark the oldid as a deprecated individual.
        if one newid is supplied, it will mark it as replaced by.
        if >1 newid is supplied, it will mark it with consider properties
        :param g:
        :param oldid: the individual id to deprecate
        :param newids: the individual idlist that is the replacement(s) of
                       the old individual.  Not required.
        :return:

        """

        n1 = URIRef(self.cu.get_uri(oldid))
        g.add((n1, RDF['type'], self.OWLIND))

        self._addReplacementIds(g, oldid, newids)

        return

    def _addReplacementIds(self, g, oldid, newids):
        """
        Flag oldid as owl:deprecated and point it at its replacement(s):
        a single replacement is linked with replaced_by,
        several replacements are each linked with consider.
        :param g:
        :param oldid: curie of the deprecated node
        :param newids: list of replacement curies, a single curie as a str,
                       or None
        :return: None

        """
        consider = URIRef(self.cu.get_uri(self.properties['consider']))
        replaced_by = URIRef(self.cu.get_uri(self.properties['replaced_by']))

        n1 = URIRef(self.cu.get_uri(oldid))
        g.add((n1, OWL['deprecated'], Literal(True, datatype=XSD['boolean'])))

        if newids is not None:
            # a bare string is one id, not a sequence of one-character ids
            if isinstance(newids, str):
                newids = [newids]
            if len(newids) == 1:
                n = URIRef(self.cu.get_uri(newids[0].strip()))
                g.add((n1, replaced_by, n))
            else:
                for i in newids:
                    n = URIRef(self.cu.get_uri(i.strip()))
                    g.add((n1, consider, n))
        return

    def addSubclass(self, g, parentid, childid):
        """
        Add (childid, rdfs:subClassOf, parentid).
        """
        p = URIRef(self.cu.get_uri(parentid))
        c = URIRef(self.cu.get_uri(childid))
        g.add((c, self.SUBCLASS, p))

        return

    def addType(self, graph, subject_id, type, type_is_literal=False):
        """
        Add an rdf:type triple for subject_id; the type is resolved as a
        node unless type_is_literal is True.
        """
        # FIXME check this... i don't think a type should ever be a literal
        if type_is_literal is True:
            graph.add((self.getNode(subject_id), RDF['type'], Literal(type)))
        else:
            graph.add(
                (self.getNode(subject_id), RDF['type'], self.getNode(type)))
        return

    def addLabel(self, graph, subject_id, label):
        """
        Add label as the rdfs:label of subject_id.
        """
        graph.add(
            (self.getNode(subject_id), RDFS['label'], Literal(label)))
        return

    def addSynonym(self, g, cid, synonym, synonym_type=None):
        """
        Add the synonym as a property of the class cid.
        Assume it is an exact synonym, unless otherwise specified
        :param g:
        :param cid: class id
        :param synonym: the literal synonym label
        :param synonym_type: the CURIE of the synonym type (not the URI)
        :return:

        """
        n = self.getNode(cid)
        if synonym_type is None:
            # default
            synonym_type = URIRef(
                self.cu.get_uri(self.properties['hasExactSynonym']))
        else:
            synonym_type = URIRef(self.cu.get_uri(synonym_type))

        g.add((n, synonym_type, Literal(synonym)))
        return

    def addDefinition(self, g, cid, definition):
        """
        Add the definition literal to cid; a None definition is ignored.
        """
        if definition is not None:
            n = self.getNode(cid)
            p = URIRef(self.cu.get_uri(self.properties['definition']))
            g.add((n, p, Literal(definition)))

        return

    def addXref(self, g, cid, xrefid, xref_as_literal=False):
        """
        Add xrefid as a has_xref of cid, as a node or as a literal.
        """
        self.addTriple(
            g, cid, self.properties['has_xref'], xrefid, xref_as_literal)
        return

    def addDepiction(self, g, subject_id, image_url):
        """
        Add image_url (as a literal) as the foaf:depiction of subject_id.
        """
        g.add(
            (self.getNode(subject_id), FOAF['depiction'], Literal(image_url)))
        return

    def addComment(self, g, subject_id, comment):
        """
        Add the whitespace-stripped comment as a dc:comment of subject_id.
        """
        g.add(
            (self.getNode(subject_id), DC['comment'],
             Literal(comment.strip())))
        return

    def addDescription(self, g, subject_id, description):
        """
        Add the whitespace-stripped description as a dc:description
        of subject_id.
        """
        g.add(
            (self.getNode(subject_id), DC['description'],
             Literal(description.strip())))
        return

    def addPage(self, g, subject_id, page_url):
        """
        Add page_url (as a literal) as the foaf:page of subject_id.
        """
        g.add(
            (self.getNode(subject_id), FOAF['page'], Literal(page_url)))
        return

    def addTitle(self, g, subject_id, title):
        """
        Add title as the dc:title of subject_id.
        """
        g.add(
            (self.getNode(subject_id), DC['title'], Literal(title)))
        return

    def addMember(self, g, group_id, member_id):
        """
        Add (group_id, has_member, member_id).
        """
        self.addTriple(
            g, group_id, self.properties['has_member'], member_id)
        return

    def addMemberOf(self, g, member_id, group_id):
        """
        Add (member_id, member_of, group_id).
        """
        self.addTriple(
            g, member_id, self.properties['member_of'], group_id)
        return

    def addInvolvedIn(self, g, member_id, group_id):
        """
        Add (member_id, involved_in, group_id).
        """
        self.addTriple(
            g, member_id, self.properties['involved_in'], group_id)
        return

    def write(self, graph, fileformat=None, file=None):
        """
         a basic graph writer (to stdout) for any of the sources.
         this will write raw triples in rdfxml, unless specified.
         to write turtle, specify fileformat='turtle'
         an optional file can be supplied instead of stdout
        :param graph: the graph to serialize
        :param fileformat: serialization format name, 'rdfxml' by default
        :param file: path of the output file; None prints to stdout
        :return: None

        """
        if fileformat is None:
            fileformat = 'rdfxml'
        if file is not None:
            logger.info("Writing triples in %s to %s", fileformat, file)
            # the context manager closes the file even if serializing fails
            with open(file, 'wb') as filewriter:
                graph.serialize(filewriter, format=fileformat)
        else:
            serialized = graph.serialize(format=fileformat)
            # serialize() hands back bytes or str depending on the
            # rdflib version in use; only bytes need decoding
            if isinstance(serialized, bytes):
                serialized = serialized.decode()
            print(serialized)
        return

    def write_raw_triples(self, graph, file=None):
        """
         Write every triple of the graph on its own line, as the
         space-separated subject, predicate and object (to stdout).
         an optional file can be supplied instead of stdout
        :param graph: the graph whose triples are written
        :param file: path of the output file; None prints to stdout
        :return: None
        """
        filewriter = None
        if file is not None:
            filewriter = open(file, 'w')
            logger.info("Writing raw triples to %s", file)

        try:
            for (s, p, o) in graph:
                print(' '.join((s, p, o)), file=filewriter)
        finally:
            # release the file even when a triple cannot be written
            if filewriter is not None:
                filewriter.close()

        return

    def write_compact_triples(self, graph, file=None):
        """
        Will write out the raw triples,
        except it will replace the full uri with the curie prefix
        Not implemented yet; currently does nothing.
        :param graph:
        :param file:
        :return:
        """
        # TODO

        return

    def _getNode(self, id, materialize_bnode):
        """
        This is a wrapper for creating a node with a given identifier.
        If an id starts with an underscore, it assigns it to a BNode, otherwise
        it creates it with a standard URIRef. Alternatively,
        if materialize_bnode is True,
        it will add any nodes that would have been blank into the BASE space.
        An id starting with a colon is placed in the BASE space as well.
        This will return None if it can't map the node properly.
        :param id: curie, or underscore-prefixed blank node id
        :param materialize_bnode: put would-be blank nodes in the BASE space
        :return: URIRef, BNode or None
        """
        if id is None:
            # nothing to resolve; report it like any other unmappable id
            logger.error("couldn't make URI for %s", id)
            return None

        base = Namespace(self.curie_map.get(''))
        n = None
        if re.match(r'^_', id):
            if materialize_bnode is True:
                n = base[id]
            else:  # replace the leading underscore to make it cleaner
                n = BNode(re.sub(r'_', '', id, count=1))
        elif re.match(r'^\:', id):  # do we need to remove embedded ID colons?
            n = base[re.sub(r':', '', id, count=1)]
        else:
            u = self.cu.get_uri(id)
            if u is not None:
                n = URIRef(u)
            else:
                logger.error("couldn't make URI for %s", id)
        return n

    def getNode(self, id, materialize_bnode=False):
        """
        Public entry point for _getNode();
        blank nodes are not materialized unless asked for.
        """

        return self._getNode(id, materialize_bnode)

    def addTriple(
            self, graph, subject_id, predicate_id, object,
            object_is_literal=False):
        """
        Add a single triple; subject and predicate are resolved with
        getNode(), the object is resolved the same way unless
        object_is_literal is True, in which case it is wrapped in a Literal.
        """
        if object_is_literal is True:
            graph.add(
                (self.getNode(subject_id), self.getNode(predicate_id),
                 Literal(object)))
        else:
            graph.add(
                (self.getNode(subject_id), self.getNode(predicate_id),
                 self.getNode(object)))
        return

    def loadObjectProperties(self, graph, op):
        """
        Given a graph, it will load the supplied object properties
        as owl['ObjectProperty'] types
        A convenience.
        Status: DEPRECATED.  See loadProperties().
        :param graph:
        :param op: a dictionary of object properties
        :return: None

        """
        self.loadProperties(graph, op, self.OBJPROP)
        return

    def loadProperties(self, graph, op, property_type):
        """
        Given a graph, it will load the supplied object properties
        as the given property_type.
        An unsupported property_type is logged as an error and nothing
        is added.
        :param graph: a graph
        :param op: a dictionary of object properties
        :param property_type: one of OWL:(Annotation|Data|Object)Property
        :return: None

        """

        if property_type not in [self.OBJPROP, self.ANNOTPROP, self.DATAPROP]:
            logger.error(
                "bad property type assigned: %s, %s", property_type, op)
        else:
            for k in op:
                graph.add(
                    (self.getNode(op[k]), RDF['type'], property_type))
        return

    def loadAllProperties(self, graph):
        """
        A convenience to load all stored properties
        (object, data, and annotation) into the supplied graph.
        :param graph:
        :return:

        """

        self.loadProperties(graph, self.object_properties, self.OBJPROP)
        self.loadProperties(graph, self.annotation_properties, self.ANNOTPROP)
        self.loadProperties(graph, self.datatype_properties, self.DATAPROP)
        return

    def addOntologyDeclaration(self, graph, ontology_id):
        """
        Type ontology_id as an owl:Ontology.
        """

        graph.add((self.getNode(ontology_id), RDF['type'], OWL['Ontology']))
        return

    def addOWLVersionIRI(self, graph, ontology_id, version_iri):
        """
        Add version_iri (resolved as a node) as the owl:versionIRI
        of ontology_id.
        """
        graph.add(
            (self.getNode(ontology_id), OWL['versionIRI'],
             self.getNode(version_iri)))

        return

    def addOWLVersionInfo(self, graph, ontology_id, version_info):
        """
        Add version_info as the owl:versionInfo literal of ontology_id.
        """
        graph.add(
            (self.getNode(ontology_id), OWL['versionInfo'],
             Literal(version_info)))
        return

    def makeLeader(self, graph, node_id):
        """
        Add an annotation property to the given ```node_id```
        to be the clique_leader.
        This is a monarchism.
        :param graph:
        :param node_id:
        :return:
        """
        self.addTriple(
            graph, node_id, self.annotation_properties['clique_leader'],
            Literal(True, datatype=XSD['boolean']), True)
        return
Ejemplo n.º 21
0
    def process_catalog(self, limit=None):
        """
        Read the tab-delimited catalog file and, for every usable row, add
        to the graph: the publication, the SNP feature with its location
        and taxon, its variant types, the genes it is an allele of or lies
        next to, and an association to each mapped trait.

        Rows describing haplotypes, rows without a SNP id, and rows whose
        SNP id has an unrecognized form are skipped.
        In test mode only rows mentioning one of the test genes are kept.

        :param limit: stop once more than this many rows have been read
                      (not applied in test mode); None reads everything
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['catalog']['file']))
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())
        # built once here rather than once per trait of every row
        cu = CurieUtil(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        tax_id = 'NCBITaxon:9606'  # hardcode
        genome_version = 'GRCh38'  # hardcode

        # build a hashmap of genomic location to identifiers,
        # to try to get the equivalences

        loc_to_id_hash = {}

        # the test gene ids do not change from row to row
        test_gene_ids = None
        if self.testMode:
            test_gene_ids = {str(i) for i in self.test_ids['gene']}

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    continue

                line_counter += 1
                (date_added_to_catalog, pubmed_num, first_author, pub_date,
                 journal, link, study_name, disease_or_trait,
                 initial_sample_description, replicate_sample_description,
                 region, chrom_num, chrom_pos, reported_gene_nums,
                 mapped_gene, upstream_gene_num, downstream_gene_num,
                 snp_gene_nums, upstream_gene_distance,
                 downstream_gene_distance, strongest_snp_risk_allele, snps,
                 merged, snp_id_current, context, intergenic_flag,
                 risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
                 or_or_beta, confidence_interval_95,
                 platform_with_snps_passing_qc, cnv_flag, mapped_trait,
                 mapped_trait_uri) = row

                # skip if no matches found in test set
                if self.testMode and test_gene_ids.isdisjoint(
                        re.split(r',', snp_gene_nums)):
                    continue

                # Example row:
                # 06-May-2015	25917933	Zai CC	20-Nov-2014
                # J Psychiatr Res
                # http://europepmc.org/abstract/MED/25917933
                # A genome-wide association study of suicide severity
                # scores in bipolar disorder.
                # Suicide in bipolar disorder
                # 959 European ancestry individuals	NA
                # 10p11.22	10	32704340	C10orf68, CCDC7, ITGB1	CCDC7
                # rs7079041-A	rs7079041	0	7079041	intron	0
                # 2E-6	5.698970
                if chrom_num != '' and chrom_pos != '':
                    loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
                    loc_to_id_hash.setdefault(loc, set())
                else:
                    loc = None

                snp_label = strongest_snp_risk_allele.strip()
                # the id without the alteration, e.g. rs7079041-A -> rs7079041
                # TODO add the alteration (the part after the '-')
                # as a variation of the snp
                snp_base = re.sub(r'-.*$', '', snp_label).strip()

                if re.search(r' x ', strongest_snp_risk_allele) \
                        or re.search(r',', strongest_snp_risk_allele):
                    # TODO deal with haplotypes
                    logger.warning(
                        "We can't deal with haplotypes yet: %s",
                        strongest_snp_risk_allele)
                    continue
                elif re.match(r'rs', strongest_snp_risk_allele):
                    rs_id = 'dbSNP:'+snp_base
                elif re.match(r'kgp', strongest_snp_risk_allele):
                    # FIXME this isn't correct
                    rs_id = 'dbSNP:'+snp_base
                    # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                    # for some information
                    # They were created by Illumina for their genotyping
                    # platform before some variants identified during the
                    # pilot phase of the project had been assigned
                    # rs numbers.
                elif re.match(r'chr', strongest_snp_risk_allele):
                    # like: chr10:106180121-G
                    # The alteration is removed before the colon is turned
                    # into a dash; stripping it afterwards would cut at the
                    # first dash and reduce every such id to ':gwas'.
                    rs_id = ':gwas-' + re.sub(r':', '-', snp_base)
                elif snp_label == '':
                    # logger.debug(
                    #    "No strongest SNP risk allele for %s:\n%s",
                    #    pubmed_num, str(row))
                    # FIXME still consider adding in the EFO terms
                    # for what the study measured?
                    continue
                else:
                    logger.warning(
                        "There's a snp id i can't manage: %s",
                        strongest_snp_risk_allele)
                    continue

                if loc is not None:
                    loc_to_id_hash[loc].add(rs_id)

                pubmed_id = 'PMID:'+pubmed_num

                r = Reference(
                    pubmed_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)

                # create the chromosome
                chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

                # add the feature to the graph
                snp_description = None
                if risk_allele_frequency != '' and \
                        risk_allele_frequency != 'NR':
                    snp_description = \
                        str(risk_allele_frequency) + \
                        ' [risk allele frequency]'

                f = Feature(
                    rs_id, snp_label,
                    Feature.types[r'SNP'], snp_description)
                if chrom_num != '' and chrom_pos != '':
                    f.addFeatureStartLocation(chrom_pos, chrom_id)
                    f.addFeatureEndLocation(chrom_pos, chrom_id)
                f.addFeatureToGraph(g)
                f.addTaxonToFeature(g, tax_id)
                # TODO consider adding allele frequency as property;
                # but would need background info to do that

                # also want to add other descriptive info about
                # the variant from the context
                for c in re.split(r';', context):
                    cid = self._map_variant_type(c.strip())
                    if cid is not None:
                        gu.addType(g, rs_id, cid)

                # add deprecation information
                # (csv fields are strings, so compare against '1')
                snp_id_current = snp_id_current.strip()
                if merged.strip() == '1' and snp_id_current != '':
                    # get the current rs_id
                    current_rs_id = 'dbSNP:'
                    if not re.match(r'rs', snp_id_current):
                        current_rs_id += 'rs'
                    current_rs_id += snp_id_current
                    # record the id only once it is complete
                    if loc is not None:
                        loc_to_id_hash[loc].add(current_rs_id)
                    gu.addDeprecatedIndividual(g, rs_id, [current_rs_id])
                    # TODO check on this
                    # should we add the annotations to the current
                    # or orig?
                    gu.makeLeader(g, current_rs_id)
                else:
                    gu.makeLeader(g, rs_id)

                # add the feature as a sequence alteration
                # affecting various genes
                # note that intronic variations don't necessarily list
                # the genes such as for rs10448080  FIXME
                if snp_gene_nums != '':
                    for s in re.split(r',', snp_gene_nums):
                        s = s.strip()
                        # still have to test for this,
                        # because sometimes there's a leading comma
                        if s != '':
                            gene_id = 'NCBIGene:'+s
                            geno.addAlleleOfGene(rs_id, gene_id)

                # add the up and downstream genes if they are available;
                # the snp lies upstream of its downstream gene and
                # downstream of its upstream gene, so each triple is
                # guarded by the gene number it actually uses
                if downstream_gene_num != '':
                    downstream_gene_id = 'NCBIGene:'+downstream_gene_num
                    gu.addTriple(
                        g, rs_id,
                        Feature.object_properties[
                            r'upstream_of_sequence_of'],
                        downstream_gene_id)
                if upstream_gene_num != '':
                    upstream_gene_id = 'NCBIGene:'+upstream_gene_num
                    gu.addTriple(
                        g, rs_id,
                        Feature.object_properties[
                            'downstream_of_sequence_of'],
                        upstream_gene_id)

                # NOTE: the description is not attached to anything yet,
                # see the commented-out set_description() call below
                description = 'A study of ' + disease_or_trait + \
                    ' in ' + initial_sample_description
                if replicate_sample_description != '':
                    description = \
                        ' '.join(
                            (description, 'with',
                             replicate_sample_description))
                if platform_with_snps_passing_qc != '':
                    description = ' '.join(
                        (description, 'on platform',
                         platform_with_snps_passing_qc))
                description = ' '.join((description, '(p='+pvalue+')'))

                # make associations to the EFO terms; there can be >1
                if mapped_trait_uri.strip() != '':
                    for t in re.split(r',', mapped_trait_uri):
                        tid = cu.get_curie(t.strip())

                        assoc = G2PAssoc(
                            self.name, rs_id, tid,
                            gu.object_properties['contributes_to'])
                        assoc.add_source(pubmed_id)
                        # combinatorial evidence
                        # used in automatic assertion
                        eco_id = 'ECO:0000213'
                        assoc.add_evidence(eco_id)

                        # assoc.set_description(description)
                        # FIXME score should get added to provenance/study
                        # assoc.set_score(pvalue)
                        assoc.add_association_to_graph(g)

                if not self.testMode and\
                        (limit is not None and line_counter > limit):
                    break

            Assoc(self.name).load_all_properties(g)

        # loop through the location hash,
        # and report the locations that carry more than one snp id
        for location, snp_ids in loc_to_id_hash.items():
            if len(snp_ids) > 1:
                logger.info("%s has >1 snp id: %s", location, str(snp_ids))
        return
Ejemplo n.º 22
0
class StreamedGraph(DipperGraph):
    """
    Stream rdf triples to file or stdout
    Assumes a downstream process will sort then uniquify triples

    Theoretically could support both ntriple, rdfxml formats, for now
    just support nt
    """

    curie_map = curimap.get()
    curie_util = CurieUtil(curie_map)

    # NOTE: the path is relative to the current working directory
    with open('translationtable/GLOBAL_TERMS.yaml') as fhandle:
        globaltt = yaml.safe_load(fhandle).copy()
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self,
                 are_bnodes_skized=True,
                 identifier=None,
                 file_handle=None,
                 fmt='nt'):
        """
        :param are_bnodes_skized: turn blank node ids into skolem IRIs
        :param identifier: stored as is; not used by this class
        :param file_handle: writable text handle the triples go to;
                            None prints them to stdout
        :param fmt: stored as is; only nt output is produced
        """
        self.are_bnodes_skized = are_bnodes_skized
        self.fmt = fmt
        self.file_handle = file_handle
        self.identifier = identifier

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None):
        """
        Resolve the ids and write one triple.

        :param subject_id: curie, iri or blank node id of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: the object; an id, or a literal value
        :param object_is_literal: True/False to state what obj is; when
            None it is inferred: a str that looks like a curie or starts
            with http/https/ftp is a resource, everything else a literal
        :param literal_type: curie or iri of the literal's datatype
        :return: None
        """
        if obj is None:
            LOG.warning("Null value passed as object")
            return

        # trying making infrence on type of object if none is supplied
        if object_is_literal is None:
            if not isinstance(obj, str):
                # numbers and the like can only be literals
                object_is_literal = True
            elif self.curie_regexp.match(obj) is not None or\
                    obj.split(':')[0].lower() in ('http', 'https', 'ftp'):
                object_is_literal = False
            else:
                object_is_literal = True

        subject_iri = self._getnode(subject_id)
        predicate_iri = self._getnode(predicate_id)
        if not object_is_literal:
            obj = self._getnode(obj)

        if literal_type is not None:
            literal_type = self._getnode(literal_type)

        # the object id may have failed to resolve
        if obj is not None:
            self.serialize(subject_iri, predicate_iri, obj, object_is_literal,
                           literal_type)
        else:
            LOG.warning("Null value passed as object")
        return

    def skolemizeBlankNode(self, curie):
        """
        Build the skolem IRI for a blank node id of the form ``_:label``.

        :param curie: blank node id, including the leading ``_:``
        :return: str - the skolem IRI
        """
        # curie_map is indexed like a plain mapping elsewhere in this
        # file, so the base is requested from the CurieUtil instead
        # NOTE(review): confirm CurieUtil.get_base() returns the base IRI
        base_iri = StreamedGraph.curie_util.get_base()
        curie_id = curie.split(':')[1]
        # NOTE(review): RDF 1.1 spells the skolem path
        # "/.well-known/genid/"; left unchanged so existing IRIs stay stable
        skolem_iri = "{0}.wellknown/genid/{1}".format(base_iri, curie_id)
        return skolem_iri

    def serialize(self,
                  subject_iri,
                  predicate_iri,
                  obj,
                  object_is_literal=False,
                  literal_type=None):
        """
        Format one triple as a line of N-Triples and write it to
        self.file_handle, or print it when there is no file handle.

        :param subject_iri: subject as an iri or a ``_:`` blank node id
        :param predicate_iri: predicate iri
        :param obj: object iri / blank node id, or the literal value
        :param object_is_literal: treat obj as a literal value
        :param literal_type: datatype iri for the literal
        :return: None
        :raises TypeError: for an untyped literal that is neither a str,
                           an int nor a float
        """
        subject_term = self._nt_resource(subject_iri)
        if not object_is_literal:
            triple = "{} <{}> {} .".format(
                subject_term, predicate_iri, self._nt_resource(obj))
        elif literal_type is not None:
            triple = '{} <{}> {}^^<{}> .'.format(
                subject_term, predicate_iri, self._quote_encode(str(obj)),
                literal_type)
        else:
            if isinstance(obj, str):
                triple = '{} <{}> {} .'.format(subject_term, predicate_iri,
                                               self._quote_encode(obj))
            else:
                lit_type = self._getLiteralXSDType(obj)
                if lit_type is not None:
                    triple = '{} <{}> "{}"^^<{}> .'.format(
                        subject_term, predicate_iri, obj, lit_type)
                else:
                    raise TypeError("Cannot determine type of {}".format(obj))

        if self.file_handle is None:
            print(triple)
        else:
            self.file_handle.write("{}\n".format(triple))

    @staticmethod
    def _nt_resource(node):
        """
        Render a resource for N-Triples: blank node labels are written
        bare, anything else goes between angle brackets.
        """
        if isinstance(node, str) and node.startswith('_:'):
            return node
        return '<{}>'.format(node)

    def _getnode(self, curie):
        """
        Returns IRI, or blank node curie/iri depending on
        self.are_bnodes_skized setting

        :param curie: str id as curie or iri
        :return: str - the iri, or the untouched blank node id
        :raises TypeError: if the id is none of blank node, http/ftp iri
                           or prefix:local curie
        """
        if re.match(r'^_:', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:
                node = curie
        elif re.match(r'^http|^ftp', curie):
            node = curie
        elif len(curie.split(':')) == 2:
            node = StreamedGraph.curie_util.get_uri(curie)
        else:
            raise TypeError("Cannot process curie {}".format(curie))
        return node

    def _getLiteralXSDType(self, literal):
        """
        This could be much more nuanced, but for now
        if a literal is not a str, determine if it's
        a xsd int or double
        :param literal:
        :return: str - xsd full iri, or None for any other type
        """
        if isinstance(literal, int):
            return self._getnode("xsd:integer")
        if isinstance(literal, float):
            return self._getnode("xsd:double")
        return None

    @staticmethod
    def _quote_encode(literal):
        """
        Copy of code in rdflib here:
        https://github.com/RDFLib/rdflib/blob/776b90be/
        rdflib/plugins/serializers/nt.py#L76
        :param literal: str to quote
        :return: str - the literal in double quotes, with backslashes,
                 newlines, double quotes and carriage returns escaped
        """
        return '"%s"' % literal.replace('\\', '\\\\')\
            .replace('\n', '\\n')\
            .replace('"', '\\"')\
            .replace('\r', '\\r')
Ejemplo n.º 23
0
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # curie prefix map and the helper built on it, shared by all instances
    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # inverse of globaltt: value -> key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, blank node ids are skolemized
            into IRIs by _getnode()
        :param identifier: passed through to the parent constructor
        """
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO', ):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None):
        """
        Add one triple to the graph.

        A None object (or an empty resource id) is not added; a warning
        is logged instead.

        :param subject_id: str identifier of the subject, resolved
            with _getnode()
        :param predicate_id: str identifier of the predicate, resolved
            with _getnode()
        :param obj: the object: an identifier string for a resource,
            or the value of a literal
        :param object_is_literal: True/False to state what obj is;
            None (the default) infers it from obj
        :param literal_type: optional identifier of the literal datatype
        :return: None
        """
        # try making inference on type of object if none is supplied
        if object_is_literal is None:
            if not isinstance(obj, str):
                # Only a str can be a curie or an IRI.  Anything else
                # (a number, None, ...) is handled as a literal, which
                # avoids a TypeError from the regexp match and lets None
                # reach the warning below.
                object_is_literal = True
            elif self.curie_regexp.match(obj) is not None or\
                    obj.split(':')[0].lower() in ('http', 'https', 'ftp'):
                object_is_literal = False
            else:
                object_is_literal = True

        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getnode(literal_type)
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj)))
            else:
                LOG.warning("None as literal object for subj: %s and pred: %s",
                            subject_id, predicate_id)
                # get a sense of where the None is coming from
                # magic number here is "steps up the call stack"
                for call in range(2, 0, -1):
                    LOG.warning('\t%sfrom: %s', '\t' * call,
                                sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((self._getnode(subject_id), self._getnode(predicate_id),
                      self._getnode(obj)))
        else:
            LOG.warning("None/empty object IRI for subj: %s and pred: %s",
                        subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into a skolem IRI.

        The leading ``_:`` (or ``_``) is dropped, the rest is skolemized
        against self.curie_util.get_base(), and the 'rdflib/' string that
        rdflib adds is removed.

        :param curie: str blank node id
        :return: RDFLib URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getnode(self, curie):  # convention is lowercase names
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode, or,
        when self.are_bnodes_skized is True, to a skolemized URIRef.
        Ids starting with http, ftp or jdbc are used as IRIs directly.
        Anything else is expanded as a curie, and its prefix is bound
        to the graph.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object, or None (after
                 logging an error) when the curie cannot be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.
                # namespace_manager.namespaces() yields (prefix, namespace)
                # tuples, so a plain "prefix in namespaces()" test can never
                # match; bind unconditionally and leave the handling of an
                # already bound prefix to rdflib.
                prefix = curie.split(':')[0]
                self.bind(prefix, Namespace(self.curie_map[prefix]))
            else:
                LOG.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """
            Results in the RDF @prefix directives for every ingest
            being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    def serialize(  # rdflib version
            self,
            destination=None,
            format='turtle',
            base=None,
            encoding=None):
        """
        Serialize the graph with ConjunctiveGraph.serialize().

        :param destination: passed through to rdflib
        :param format: serialization format name, passed through to rdflib
        :param base: accepted but currently unused
        :param encoding: accepted but currently unused
        :return: whatever ConjunctiveGraph.serialize() returns
        """
        # NOTE(review): base and encoding are not forwarded; confirm with
        # the callers whether that is intended before passing them on.
        return ConjunctiveGraph.serialize(self, destination, format)
Ejemplo n.º 24
0
class ModelTestCase(unittest.TestCase):
    """
    Tests for Model.addIndividualToGraph() and Model.addComment(),
    run against a fresh RDFGraph per test.
    """

    def setUp(self):
        g = RDFGraph()
        self.model = Model(g)

        this_curie_map = curie_map.get()
        self.cutil = CurieUtil(this_curie_map)

        # stuff to make test triples
        self.test_cat_subj_curie = "MGI:1234"
        self.test_cat_subj = self.cutil.get_uri("MGI:1234")
        self.test_cat_default_pred = self.cutil.get_uri("biolink:category")
        self.test_named_indiv = self.cutil.get_uri("owl:NamedIndividual")
        self.test_label_pred = self.cutil.get_uri("rdfs:label")
        self.test_label = "some label"

        self.test_comment_IRI = self.cutil.get_uri("rdfs:comment")
        self.test_comment = 'bonus eruptus'

    def tearDown(self):
        # release the model (and the graph it wraps) built in setUp
        self.model = None

    def test_addIndividualToGraph_assign_label(self):
        """The label passed in ends up as the subject's rdfs:label."""
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        label_triple = list(
            self.model.graph.triples((URIRef(self.test_cat_subj),
                                      URIRef(self.test_label_pred), None)))

        self.assertEqual(len(label_triple), 1, "method didn't assign label")
        self.assertEqual(str(label_triple[0][2]), self.test_label,
                         "method didn't assign correct label")

    def test_addIndividualToGraph_assign_type_named_individual(self):
        """The subject gets exactly one triple pointing at owl:NamedIndividual."""
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        triples = list(
            self.model.graph.triples((URIRef(self.test_cat_subj), None,
                                      URIRef(self.test_named_indiv))))

        self.assertEqual(len(triples), 1,
                         "method didn't assign type as named individual")

    def test_addIndividualToGraph_assign_category(self):
        """Passing ind_category yields one biolink:category triple."""
        self.model.addIndividualToGraph(self.test_cat_subj_curie,
                                        self.test_label,
                                        ind_category=blv.terms['Genotype'])

        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj),
                 URIRef(self.test_cat_default_pred), None)))

        self.assertEqual(len(triples), 1, "method didn't assign category")

    def test_add_comment(self):
        """addComment() stores the comment as an rdfs:comment literal."""
        self.model.addComment(self.test_cat_subj, self.test_comment)

        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj), URIRef(self.test_comment_IRI),
                 Literal(self.test_comment))))

        self.assertEqual(len(triples), 1, "method didn't assign comment")

    def test_add_comment_assign_subject_category(self):
        """Passing subject_category yields one biolink:category triple."""
        self.model.addComment(self.test_cat_subj,
                              self.test_comment,
                              subject_category=blv.terms['Genotype'])

        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj),
                 URIRef(self.test_cat_default_pred), None)))
        self.assertEqual(len(triples), 1, "method didn't assign category")
Ejemplo n.º 25
0
    def test_missense_variant_cdna_model(self):
        """
        Missense variant that carries cdna information.

        Loads test data set 2 through add_variant_info_to_graph() and
        checks the graph with one SPARQL query.
        The triples of interest are:

        CGD:VariantID is an instance of OBO:SO_0001059
        CGD:VariantID is an instance of OBO:SO_0001583
        CGD:VariantID has the label "ABL1 T315I missense mutation"
        CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:25
        CGD:VariantID has location (faldo:location) AminoAcidRegionID
        CGD:VariantID has location (faldo:location) CDNARegionID
        CGD:VariantID has location (faldo:location) ChromosomalRegionID
        CGD:VariantID OBO:GENO_reference_amino_acid "T"
        CGD:VariantID OBO:GENO_results_in_amino_acid_change "I"
        CGD:VariantID owl:sameAs dbSNP:rs121913459
        CGD:VariantID owl:sameAs COSMIC:12560
        CGD:VariantID RO:0002205 (transcribed_to) CCDS:35166.1

        CCDS:35166.1 is an instance of OBO:SO_0000233
        CCDS:35166.1 has the label "CCDS35166.1"
        CCDS:35166.1 OBO:RO_0002513 (translates_to) UniProtKB:P00519#P00519-1
        CCDS:35166.1 OBO:RO_0002513 (translates_to) NCBIProtein:NP_005148.2

        UniProtKB:P00519#P00519-1 owl:sameAs NCBIProtein:NP_005148.2

        UniProtKB:P00519#P00519-1 is an instance of OBO:SO_0000104 (polypeptide)
        UniProtKB:P00519#P00519-1 has the label "P00519#P00519-1"

        NCBIProtein:NP_005148.2 is an instance of OBO:SO_0000104 (polypeptide)
        NCBIProtein:NP_005148.2 has the label "NP_005148.2"
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query helper, curie expander, then the source's bindings
        query_env = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        def as_uri(curie):
            # expand a curie with the test's curie map and wrap the result
            return URIRef(curie_util.get_uri(curie))

        # first row of the fixture, one name per column
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = self.test_set_2[0]

        gene_id = self.cgd.gene_map[transcript_gene]

        # values the fixture row is expected to produce
        ref_amino_acid = "T"
        altered_amino_acid = "I"
        position = 315
        db_snp_curie = "dbSNP:121913459"
        cosmic_curie = "COSMIC:12560"
        uniprot_curie = "UniProtKB:P00519#P00519-1"
        uniprot_id = "P00519#P00519-1"
        refseq_curie = "NCBIProtein:NP_005148.2"
        transcript_curie = "CCDS:35166.1"
        ccds_id = "35166.1"
        chromosome_curie = "hg19chr9"

        # identifiers of the variant, its regions and their coordinates
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
        aa_region_id = ":_{0}{1}{2}Region".format(
            position, position, uniprot_curie)
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chr_region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)
        aa_coord_id = ":_{0}-{1}".format(uniprot_id, position)
        cdna_coord_id = ":_{0}-{1}".format(ccds_id, bp_pos)
        # chr_coord_id = "CHR:{0}-{1}".format(chromosome_curie, genome_pos_start)
        chr_coord_id = ":_{0}-{1}".format(chromosome_curie, genome_pos_start)

        variant_uri = as_uri(variant_id)
        transcript_uri = as_uri(transcript_curie)
        gene_uri = as_uri(gene_id)
        db_snp_uri = as_uri(db_snp_curie)
        cosmic_uri = as_uri(cosmic_curie)
        uniprot_uri = as_uri(uniprot_curie)
        refseq_uri = as_uri(refseq_curie)
        aa_region_uri = as_uri(aa_region_id)
        cdna_region_uri = as_uri(cdna_region_id)
        chr_region_uri = as_uri(chr_region_id)
        aa_coord_uri = as_uri(aa_coord_id)
        cdna_coord_uri = as_uri(cdna_coord_id)
        chr_coord_uri = as_uri(chr_coord_id)

        query_template = """
                       SELECT ?cosmic ?gene ?aaRegion ?cdnaRegion ?chrRegion
                              ?dbSNP ?transcript ?uniprot ?refseq
                              ?aaCoord ?cdnaCoord ?chrCoord
                       WHERE {{
                           ?cosmic a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?aaRegion ;
                               faldo:location ?cdnaRegion ;
                               faldo:location ?chrRegion ;
                               OBO:GENO_reference_amino_acid "{0}" ;
                               OBO:GENO_reference_nucleotide "{1}" ;
                               OBO:GENO_altered_nucleotide "{2}" ;
                               OBO:GENO_results_in_amino_acid_change "{3}" ;
                               owl:sameAs ?dbSNP ;
                               RO:0002205 ?transcript .

                           ?cosmic owl:sameAs ?dbSNP .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{4}" ;
                               OBO:RO_0002513 ?uniprot ;
                               OBO:RO_0002513 ?refseq .

                           ?uniprot a OBO:SO_0000104 ;
                               rdfs:label "P00519-1" .

                           ?refseq a OBO:SO_0000104 ;
                               rdfs:label "NP_005148.2" .

                           ?refseq owl:sameAs ?uniprot .

                           ?aaRegion faldo:begin ?aaCoord .
                           ?cdnaRegion faldo:begin ?cdnaCoord .
                           ?chrRegion faldo:begin ?chrCoord .

                           ?aaCoord faldo:position {5} .
                           ?cdnaCoord faldo:position {6} .
                           ?chrCoord faldo:position {7} .

                           ?dbSNP rdfs:label "{8}" .
                       }}
                       """
        sparql_query = query_template.format(
            ref_amino_acid, ref_base, variant_base, altered_amino_acid,
            transcript_id, position, bp_pos, genome_pos_start, db_snp_id)

        # A single result row, in the order of the SELECT variables
        expected_results = [[
            cosmic_uri,
            gene_uri,
            aa_region_uri,
            cdna_region_uri,
            chr_region_uri,
            db_snp_uri,
            transcript_uri,
            uniprot_uri,
            refseq_uri,
            aa_coord_uri,
            cdna_coord_uri,
            chr_coord_uri,
        ]]

        sparql_output = query_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 26
0
    def test_missense_variant_protein_model(self):
        """
        Missense variant that carries only protein information.

        Loads test data set 1 through add_variant_info_to_graph() and
        checks the graph with one SPARQL query.
        The triples of interest are:

        CGD:VariantID is an instance of OBO:SO_0001059
        CGD:VariantID is an instance of OBO:SO_0001583
        CGD:VariantID has the label "CSF3R Q741X  missense mutation"
        CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:1441
        CGD:VariantID has location (faldo:location) CGD:RegionID
        CGD:VariantID OBO:GENO_reference_amino_acid "Q"
        CGD:VariantID OBO:GENO_results_in_amino_acid_change "X"
        CGD:VariantID RO:0002205 CCDS:413.1

        CCDS:413.1 is an instance of OBO:GENO_primary
        CCDS:413.1 has the label "CCDS413.1"
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Query helper, curie expander, then the source's bindings
        query_env = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        def as_uri(curie):
            # expand a curie with the test's curie map and wrap the result
            return URIRef(curie_util.get_uri(curie))

        # only the first eleven columns of the first row are needed
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = self.test_set_1[0][0:11]

        gene_id = self.cgd.gene_map[transcript_gene]

        # values the fixture row is expected to produce
        ref_amino_acid = "Q"
        altered_amino_acid = "X"
        position = 741
        uniprot_curie = "UniProtKB:Q99062#Q99062-1"

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
        transcript = "CCDS:413.1"
        region_id = ":_{0}{1}{2}Region".format(
            position, position, uniprot_curie)

        variant_uri = as_uri(variant_id)
        transcript_uri = as_uri(transcript)
        gene_uri = as_uri(gene_id)
        region_uri = as_uri(region_id)

        query_template = """
                       SELECT ?variant ?gene ?region ?transcript
                       WHERE {{
                           ?variant a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               rdfs:label "{0}" ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?region ;
                               OBO:GENO_reference_amino_acid "{1}" ;
                               OBO:GENO_results_in_amino_acid_change "{2}" ;
                               RO:0002205 ?transcript .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{3}" .
                       }}
                       """
        sparql_query = query_template.format(
            variant_label, ref_amino_acid, altered_amino_acid, transcript_id)

        # A single result row, in the order of the SELECT variables
        expected_results = [
            [variant_uri, gene_uri, region_uri, transcript_uri],
        ]

        sparql_output = query_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 27
0
    def test_chromosome_position_model(self):
        """
        Modelling of genomic positions.

        Loads test data set 2 through add_variant_info_to_graph() and
        checks that the region, its begin/end positions and the chromosome
        they refer to come back from one SPARQL query.
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query helper, curie expander, then the source's bindings
        query_env = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        def as_uri(curie):
            # expand a curie with the test's curie map and wrap the result
            return URIRef(curie_util.get_uri(curie))

        # first row of the fixture, one name per column
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = self.test_set_2[0]

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        chromosome_curie = ":MONARCH_hg19chr9"
        region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)
        start_id = ":_hg19chr9-{0}".format(genome_pos_start)
        end_id = ":_hg19chr9-{0}".format(genome_pos_end)

        region_uri = as_uri(region_id)
        start_uri = as_uri(start_id)
        end_uri = as_uri(end_id)
        chromosome_uri = as_uri(chromosome_curie)

        query_template = """
                       SELECT ?region ?startPosition ?endPosition ?chromosome
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?startPosition ;
                               faldo:end ?endPosition .

                           ?startPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?chromosome .

                           ?endPosition a faldo:Position ;
                               faldo:position {1} ;
                               faldo:reference ?chromosome .
                       }}
                       """
        sparql_query = query_template.format(genome_pos_start, genome_pos_end)

        # A single result row, in the order of the SELECT variables
        expected_results = [
            [region_uri, start_uri, end_uri, chromosome_uri],
        ]

        sparql_output = query_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 28
0
    def setUp(self):
        """
        Build the fixtures shared by the tests: a TestUtils helper, the
        association and evidence ids, one 28-column input row, two blank
        node ids and the expanded association IRI.
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One input row; each value is annotated with its column header
        self.test_set_1 = (
            'MGI:1920145',               # 01 marker_accession_id
            'Setd5',                     # 02 marker_symbol
            'WTSI',                      # 03 phenotyping_center
            'MEFW',                      # 04 colony_raw
            'male',                      # 05 sex
            'heterozygote',              # 06 zygosity
            'MGI:4432631',               # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',   # 08 allele_symbol
            # 09 allele_name
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',               # 10 strain_accession_id
            'C57BL/6N',                  # 11 strain_name
            'MGP',                       # 12 project_name
            # 13 project_fullname
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',       # 14 pipeline_name
            'MGP_001',                   # 15 pipeline_stable_id
            'MGP_XRY_001',               # 16 procedure_stable_id
            'X-ray',                     # 17 procedure_name
            'IMPC_XRY_008_001',          # 18 parameter_stable_id
            'Number of ribs right',      # 19 parameter_name
            'MP:0005390',                # 20 top_level_mp_term_id
            'skeleton phenotype',        # 21 top_level_mp_term_name
            'MP:0000480',                # 22 mp_term_id
            'increased rib number',      # 23 mp_term_name
            '1.637023E-010',             # 24 p_value
            '',                          # 25 percentage_change
            '8.885439E-007',             # 26 effect_size
            # 27 statistical_method
            'Wilcoxon rank sum test with continuity correction',
            'IMPC',                      # 28 resource_name
        )

        # Test curies; these are otherwise generated within
        # _add_evidence() and _add_study_provenance().
        # These blank nodes are hardcoded as NOT Skolemized ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI of the association, for testing sparql output
        expander = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
Ejemplo n.º 29
0
class GenotypeTestCase(unittest.TestCase):
    """
    Tests for Genotype.addGenotype() and
    Genotype.addGenomicBackgroundToGenotype(), run against a fresh
    RDFGraph per test.
    """

    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()
        self.genotype = Genotype(self.graph)
        self.cutil = CurieUtil(self.curie_map)
        # category predicate and the two category IRIs the tests look for
        self.test_cat_pred = self.cutil.get_uri(blv.terms['category'])
        self.test_cat_genotype_category = self.cutil.get_uri(
            blv.terms['Genotype'])
        self.test_cat_background_category = self.cutil.get_uri(
            blv.terms['PopulationOfIndividualOrganisms'])

    def tearDown(self):
        self.genotype = None

    def test_addGenotype(self):
        """addGenotype() stores the label as the genotype's rdfs:label."""
        gid = 'MGI:5515892'
        label = \
            'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
        self.genotype.addGenotype(gid, label)
        self.assertIn(
            (URIRef(self.cutil.get_uri(gid)), RDFS['label'], Literal(label)),
            self.genotype.graph)

    def test_addGenomicBackgroundToGenotype_adds_genotype(self):
        """
         test that addGenomicBackgroundToGenotype() adds the genotype
         category triple for the genotype
         """
        genotype_id = "GENO:0000002"
        background_id = "GENO:0000002"  # no idea what a good example background ID is
        self.genotype.addGenomicBackgroundToGenotype(
            background_id=background_id, genotype_id=genotype_id)

        geno_triples = list(
            self.graph.triples((URIRef(self.cutil.get_uri(genotype_id)),
                                URIRef(self.test_cat_pred),
                                URIRef(self.test_cat_genotype_category))))
        # without an assertion this test could never fail
        self.assertEqual(
            len(geno_triples), 1,
            "addTriples() didn't make exactly 1 genotype category triple")

    def test_addGenomicBackgroundToGenotype_adds_categories(self):
        """
         test that addGenomicBackgroundToGenotype() correctly assigns
         subject/object category
         """
        genotype_id = "GENO:0000002"
        background_id = "GENO:0000002"  # no idea what a good example background ID is
        self.genotype.addGenomicBackgroundToGenotype(
            background_id=background_id, genotype_id=genotype_id)

        geno_triples = list(
            self.graph.triples((URIRef(self.cutil.get_uri(genotype_id)),
                                URIRef(self.test_cat_pred),
                                URIRef(self.test_cat_genotype_category))))
        self.assertEqual(
            len(geno_triples), 1,
            "addTriples() didn't make exactly 1 genotype category triple")
        self.assertEqual(
            geno_triples[0][2], URIRef(self.test_cat_genotype_category),
            "addTriples() didn't assign the right genotype category")

        background_triples = list(
            self.graph.triples((URIRef(self.cutil.get_uri(background_id)),
                                URIRef(self.test_cat_pred),
                                URIRef(self.test_cat_background_category))))
        self.assertEqual(
            len(background_triples), 1,
            "addTriples() didn't make exactly 1 background category triple")
        self.assertEqual(
            background_triples[0][2],
            URIRef(self.test_cat_background_category),
            "addTriples() didn't assign the right background category")

    # TODO: add a test_addParts() covering the default 'has_part'
    # relationship, the TypeError raised for a None parent or child id,
    # and the subject/object categories passed on to addTriple().
Ejemplo n.º 30
0
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       efo_ontology,
                                       pubmed_id,
                                       description=None):
        """
        Associate a variant with each of its mapped traits.

        Every trait IRI is converted to a curie.  When the EFO ontology
        places an EFO trait under EFO:0000408 it is added to the graph as
        a class under DOID:4, and when it places it under EFO:0000651 as
        a class under UPHENO:0001001, using the label found in the
        ontology.  For every trait the publication is added as a journal
        article reference, and a 'contributes_to' G2PAssoc between the
        variant and the trait is added, with the publication as source
        and ECO:0000213 as evidence.

        :param variant_id: identifier of the variant
        :param mapped_trait_uri: str of comma separated trait IRIs;
            nothing is added when it is blank
        :param efo_ontology: object with a query() method taking SPARQL,
            presumably an rdflib graph holding EFO -- TODO confirm
        :param pubmed_id: str PubMed id without the 'PMID:' prefix
        :param description: optional description set on each association
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        if mapped_trait_uri.strip() == '':
            return

        # built once; none of these depend on the individual trait
        cu = CurieUtil(curie_map.get())
        pubmed_curie = 'PMID:' + pubmed_id
        label_query = """
                    SELECT ?trait
                    WHERE {{
                        {0} rdfs:subClassOf+ {1} .
                        {0} rdfs:label ?trait .
                    }}
                """
        # (EFO ancestor to look for, class the trait is then added under)
        trait_kinds = (
            ('EFO:0000408', 'DOID:4'),
            ('EFO:0000651', 'UPHENO:0001001'),
        )

        # make associations to the EFO terms; there can be >1
        for trait in mapped_trait_uri.split(','):
            trait = trait.strip()
            trait_id = cu.get_curie(trait)

            for efo_ancestor, parent_class in trait_kinds:
                # materialise the result once for both the emptiness
                # check and the label lookup
                rows = list(efo_ontology.query(
                    label_query.format(trait_id, efo_ancestor)))
                if rows and re.match(r'^EFO', trait_id):
                    model.addClassToGraph(trait_id, rows[0][0], parent_class)

            ref = Reference(g, pubmed_curie,
                            Reference.ref_types['journal_article'])
            ref.addRefToGraph()

            assoc = G2PAssoc(g, self.name, variant_id, trait_id,
                             model.object_properties['contributes_to'])
            assoc.add_source(pubmed_curie)
            # combinatorial evidence
            # used in automatic assertion
            eco_id = 'ECO:0000213'
            assoc.add_evidence(eco_id)

            if description is not None:
                assoc.set_description(description)

            # FIXME score should get added to provenance/study
            # assoc.set_score(pvalue)
            assoc.add_association_to_graph()
Ejemplo n.º 31
0
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLib's ConjunctiveGraph.

    The goal of this class is to wrap the creation of triples and to manage
    the creation of URIRefs, BNodes and Literals from an input curie or IRI.
    """

    # prefix -> IRI mapping, and the helper that expands CURIEs with it;
    # both are shared by every instance
    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
        os.path.join(
            os.path.dirname(__file__),
            '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # reverse lookup: translation table value -> its key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when true, ids starting with an underscore
            are turned into URIRefs under the 'BNODE' prefix by _getnode()
            instead of becoming RDFLib BNodes
        :param identifier: graph identifier handed to the parent constructor
        """
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized
        # prefixes seen by _getnode(); they get bound in serialize()
        self.prefixes = set()

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO',):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

    def _make_category_triple(
            self, subject, category, predicate=blv.terms['category']
    ):
        """
        add a triple to capture subject or object category (in CURIE form)
        that was passed to addTriple()

        Failures are logged as warnings and never propagated.

        :param subject: curie or iri of the node being categorised
        :param category: curie or iri of the category
        :param predicate: curie or iri of the predicate linking the two
        """
        try:
            self.add((
                self._getnode(subject),
                self._getnode(predicate),
                self._getnode(category)))
        except Exception:
            # deliberately best effort, but unlike a bare "except:" this
            # lets KeyboardInterrupt and SystemExit through
            LOG.warning(
                "Problem adding triple in _makeCategoryTriple for "
                "subj: %s pred: %s obj(category): %s",
                subject, predicate, category)

    def _is_literal(self, thing):
        """
        make inference on type (literal or CURIE)

        Anything that matches self.curie_regexp, or whose text before the
        first colon is http, https or ftp (case insensitive), is taken to be
        a resource; everything else is taken to be a literal.
        NOTE(review): curie_regexp is not defined in this class, presumably
        it is inherited from DipperGraph -- confirm.

        :param thing: str to classify
        :return: logical, True when thing looks like a literal
        """
        looks_like_curie = self.curie_regexp.match(thing) is not None
        looks_like_iri = thing.split(':')[0].lower() in ('http', 'https', 'ftp')
        return not (looks_like_curie or looks_like_iri)

    def addTriple(
            self,
            subject_id,
            predicate_id,
            obj,
            object_is_literal=None,
            literal_type=None,
            subject_category=None,
            object_category=None
    ):
        """
        Add the triple (subject_id, predicate_id, obj) to the graph, along
        with optional category triples for the subject and the object.

        :param subject_id: curie or iri of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: curie or iri of the object, or the value of a literal
        :param object_is_literal: logical, None means
            "infer it from obj with _is_literal()"
        :param literal_type: curie or iri of the datatype of a literal object
        :param subject_category: category (CURIE form) of the subject
        :param object_category: category (CURIE form) of the object,
            only honoured when the object is not a literal
        """
        if object_is_literal is None:
            object_is_literal = self._is_literal(obj)

        # add triples for subject category info
        if subject_category is not None:
            self._make_category_triple(subject_id, subject_category)

        if object_category is not None:
            if not object_is_literal:
                # add triples for obj category info
                self._make_category_triple(obj, object_category)
            else:  # a category makes no sense on a literal
                LOG.warning("I was given a category %s for obj: %s, "
                            "which seems to be a literal!",
                            object_category, obj)

        if object_is_literal is True:
            if isinstance(obj, str):
                # reduce any ws to a space; re.sub() returns a new string,
                # so the result has to be assigned back to take effect
                obj = re.sub(r'[\t\n\r\f\v]+', ' ', obj)
            if literal_type is not None and obj is not None and obj not in ("", " "):
                literal_type_iri = self._getnode(literal_type)

                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                # could attempt to infer a type here but there is no use case
                self.add((
                    self._getnode(subject_id), self._getnode(predicate_id),
                    Literal(obj)))
            else:
                LOG.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # get a sense of where the None is coming from
                # magic number here is "steps up the call stack"
                # TODO there may be easier/idiomatic ways to do this now
                for call in range(2, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call, sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((
                self._getnode(subject_id),
                self._getnode(predicate_id),
                self._getnode(obj)))
        else:
            LOG.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into a URIRef under the 'BNODE' prefix.

        :param curie: str blank node id, a leading '_:' or '_' is dropped
        :return: RDFLib URIRef
        """
        # count is passed by keyword, passing it positionally is deprecated
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        return URIRef(self.curie_map['BNODE'] + stripped_id)

    def _getnode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode, otherwise
        it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
            or None when the curie could not be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # remember the prefix so serialize() can bind it
                prefix = curie.split(':')[0]
                self.prefixes.add(prefix)
            else:
                LOG.error("couldn't make URI for %s", curie)
                # get a sense of where the CURIE-ish? thing is coming from
                # magic number here is "steps up the call stack"
                for call in range(3, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call, sys._getframe(call).f_code.co_name)
        return node

    def bind_all_namespaces(self):
        """
            Results in the RDF @prefix directives for every ingest
            being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    # rdflib version
    def serialize(
            self, destination=None, format='turtle', base=None, encoding=None
    ):
        """
        Bind every prefix collected by _getnode(), then serialize the graph.

        :param destination: handed on to ConjunctiveGraph.serialize()
        :param format: handed on to ConjunctiveGraph.serialize()
        :param base: accepted but NOT handed on
        :param encoding: accepted but NOT handed on
        :return: whatever ConjunctiveGraph.serialize() returns
        """
        for prefix in self.prefixes:
            self.bind(prefix, Namespace(self.curie_map[prefix]))
        return ConjunctiveGraph.serialize(self, destination, format)
Ejemplo n.º 32
0
 def __init__(self, curie_map, materialize_bnodes=False):
     """
     Keep the curie map, build a CurieUtil on top of it and
     remember the blank node flag.

     :param curie_map: mapping kept on the instance and handed to CurieUtil
     :param materialize_bnodes: stored as self.nobnodes
     """
     self.nobnodes = materialize_bnodes
     self.curie_map = curie_map
     # TEC: what is cu really?
     self.cu = CurieUtil(curie_map)
Ejemplo n.º 33
0
    def test_amino_acid_position_region_model(self):
        """
        Check how an amino acid position is modelled as a faldo region.

        Test data set 1 is pushed through add_variant_info_to_graph(),
        then the graph is queried for the following triples:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 741
        CGD:BothStrandPositionID faldo:reference UniProtID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Wrap the graph for querying, then load the bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # only variant_key is used below; the other names document
        # the first eleven columns of the test row
        first_row = self.test_set_1[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = first_row[0:11]

        position = 741
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        uniprot_id = "Q99062#Q99062-1"
        uniprot_curie = "UniProtKB:Q99062#Q99062-1"
        both_strand_id = ":_{0}-{1}".format(uniprot_id, position)
        region_id = ":_{0}{0}{1}Region".format(position, uniprot_curie)

        # Expected Results: one row of region, position and protein IRIs
        expected_row = [
            URIRef(curie_expander.get_uri(curie))
            for curie in (region_id, both_strand_id, uniprot_curie)]
        expected_results = [expected_row]

        sparql_query = """
                       SELECT ?region ?bsPosition ?protein
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?protein .
                       }}
                       """.format(position)

        # Query graph
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Ejemplo n.º 34
0
    def setUp(self):
        """
        Build the fixture: a TestUtils helper, the association and evidence
        ids, one 28 column test row, two blank node curies and the expanded
        association IRI.
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One test row; every field is tagged with its
        # column number and header name
        self.test_set_1 = (
            'MGI:1920145',              # 01 marker_accession_id
            'Setd5',                    # 02 marker_symbol
            'WTSI',                     # 03 phenotyping_center
            'MEFW',                     # 04 colony_raw
            'male',                     # 05 sex
            'heterozygote',             # 06 zygosity
            'MGI:4432631',              # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08 allele_symbol
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            #                             09 allele_name
            'MGI:2159965',              # 10 strain_accession_id
            'C57BL/6N',                 # 11 strain_name
            'MGP',                      # 12 project_name
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            #                             13 project_fullname
            'MGP Select Pipeline',      # 14 pipeline_name
            'MGP_001',                  # 15 pipeline_stable_id
            'MGP_XRY_001',              # 16 procedure_stable_id
            'X-ray',                    # 17 procedure_name
            'IMPC_XRY_008_001',         # 18 parameter_stable_id
            'Number of ribs right',     # 19 parameter_name
            'MP:0005390',               # 20 top_level_mp_term_id
            'skeleton phenotype',       # 21 top_level_mp_term_name
            'MP:0000480',               # 22 mp_term_id
            'increased rib number',     # 23 mp_term_name
            '1.637023E-010',            # 24 p_value
            '',                         # 25 percentage_change
            '8.885439E-007',            # 26 effect_size
            'Wilcoxon rank sum test with continuity correction',
            #                             27 statistical_method
            'IMPC',                     # 28 resource_name
        )

        # Test curies, these are otherwise generated within
        # _add_evidence() and _add_study_provenance();
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI for testing sparql output
        expander = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
Ejemplo n.º 35
0
class RDFGraphTestCase(unittest.TestCase):
    def setUp(self):
        """
        Create a fresh RDFGraph and expand the CURIEs the tests rely on.
        """
        self.graph = RDFGraph()

        self.cutil = CurieUtil(curie_map.get())
        expand = self.cutil.get_uri

        # subject, predicates, categories and objects for the test triples
        self.test_cat_subj = "http://www.google.com"
        self.test_cat_default_pred = expand("biolink:category")
        self.test_cat_nondefault_pred = expand("rdf:type")
        self.test_cat_default_category = expand("biolink:NamedThing")
        self.test_cat_nondefault_category = expand("biolink:Gene")
        self.test_cat_type = expand("rdf:type")
        self.test_cat_class = expand("rdf:class")

    def tearDown(self):
        """
        Drop the reference to the graph created in setUp()
        """
        self.graph = None

    def test_add_triple_makes_triple(self):
        """
        addTriple() has to leave at least one triple in the graph
        """
        self.graph.addTriple(
            subject_id=self.test_cat_subj,
            predicate_id="rdf:type",
            obj="rdf:class")
        triple_count = len(self.graph)
        self.assertTrue(
            triple_count > 0, "addTriples() didn't make >=1 triple")

    def test_add_triple_subject_category_assignment(self):
        """
        addTriple() has to record the subject category it was handed
        """
        self.graph.addTriple(
            subject_id=self.test_cat_subj,
            predicate_id="rdf:comment",
            obj="website",
            subject_category=self.test_cat_nondefault_category)

        # every category triple hanging off the subject
        category_pattern = (
            URIRef(self.test_cat_subj),
            URIRef(self.test_cat_default_pred),
            None)
        found = list(self.graph.triples(category_pattern))

        self.assertEqual(
            len(found), 1,
            "addTriples() didn't make exactly one triple subject category")
        self.assertEqual(
            found[0][2], URIRef(self.test_cat_nondefault_category),
            "addTriples() didn't assign the right triple subject category")

    def test_add_triple_object_category_assignment(self):
        """
        addTriple() has to record the object category it was handed
        """
        self.graph.addTriple(
            subject_id=self.test_cat_subj,
            predicate_id=self.test_cat_type,
            obj=self.test_cat_class,
            object_category=self.test_cat_nondefault_category)

        # every category triple hanging off the object
        category_pattern = (
            URIRef(self.test_cat_class),
            URIRef(self.test_cat_default_pred),
            None)
        found = list(self.graph.triples(category_pattern))

        self.assertEqual(
            len(found), 1,
            "addTriples() didn't make exactly one triple object category")
        self.assertEqual(
            found[0][2], URIRef(self.test_cat_nondefault_category),
            "addTriples() didn't assign the right triple object category")

    def read_graph_from_turtle_file(self, f):
        """
        A simple parsing test: load the given turtle file into a new
        RDFGraph and require that the graph is not empty afterwards.
        :param f: turtle file to parse
        :return:

        """
        turtle_graph = RDFGraph()
        abs_path = os.path.abspath(f)
        logger.info("Testing reading turtle file from %s", abs_path)
        turtle_graph.parse(f, format="turtle")
        logger.info(
            'Found %s graph nodes in %s', len(turtle_graph), abs_path)
        self.assertTrue(
            len(turtle_graph) > 0, "No nodes found in " + abs_path)

    def read_graph_into_owl(self, f):
        """
        test if the ttl can be parsed by owlparser
        this expects owltools to be accessible from commandline
        :param f: file of ttl
        :return:
        """

        import subprocess

        # call() hands back the exit status. check_call() raises
        # CalledProcessError on any non-zero status instead, which made the
        # error log below unreachable and the assertion always true.
        status = subprocess.call(["owltools", f], stderr=subprocess.STDOUT)
        # returns zero is success!
        if status != 0:
            logger.error('finished verifying with owltools with status %s',
                         status)
        self.assertEqual(status, 0)

    def test_make_category_triple_default(self):
        """
        _make_category_triple() called with just a subject has to add one
        triple carrying the default predicate and the default category
        """
        self.graph._make_category_triple(self.test_cat_subj)

        everything = (None, None, None)
        triples = list(self.graph.triples(everything))
        self.assertEqual(len(triples), 1,
                         "method didn't make exactly one triple")

        subj, pred, category = triples[0]
        self.assertEqual(subj, URIRef(self.test_cat_subj),
                         "didn't assign correct subject")
        self.assertEqual(pred, URIRef(self.test_cat_default_pred),
                         "didn't assign correct predicate")
        self.assertEqual(category, URIRef(self.test_cat_default_category),
                         "didn't assign correct category")

    def test_make_category_triple_non_default_category(self):
        """
        _make_category_triple() has to use the category it is given
        """
        self.graph._make_category_triple(
            self.test_cat_subj, self.test_cat_nondefault_category)

        everything = (None, None, None)
        triples = list(self.graph.triples(everything))
        self.assertEqual(len(triples), 1,
                         "method didn't make exactly one triple")

        wanted_category = URIRef(self.test_cat_nondefault_category)
        self.assertEqual(wanted_category, triples[0][2],
                         "didn't assign correct (non-default) category")

    def test_make_category_triple_non_default_pred(self):
        """
        test that method adds category triple to graph correctly (non default pred)
        """
        self.graph._make_category_triple(
            self.test_cat_subj,
            self.test_cat_default_category,
            predicate=self.test_cat_nondefault_pred)
        triples = list(self.graph.triples((None, None, None)))
        self.assertEqual(len(triples), 1,
                         "method didn't make exactly one triple")
        # the predicate is what gets compared here, so the failure message
        # names the predicate rather than the category
        self.assertEqual(URIRef(self.test_cat_nondefault_pred), triples[0][1],
                         "didn't assign correct (non-default) predicate")

    def test_make_category_triple_category_none_should_emit_named_thing(self):
        """
        _make_category_triple() called with category=None has to fall back
        to the default category
        """
        self.graph._make_category_triple(self.test_cat_subj, category=None)

        everything = (None, None, None)
        triples = list(self.graph.triples(everything))
        self.assertEqual(len(triples), 1,
                         "method didn't make exactly one triple")

        fallback_category = URIRef(self.test_cat_default_category)
        self.assertEqual(fallback_category, triples[0][2],
                         "didn't assign correct default category")

    def test_is_literal(self):
        """
        test that method infers type (either literal or CURIE) correctly
        """
        self.assertTrue(self.graph._is_literal("1"))
        self.assertTrue(not self.graph._is_literal("foo:bar"))
        self.assertTrue(not self.graph._is_literal("http://www.zombo.com/"))
        self.assertTrue(not self.graph._is_literal("https://www.zombo.com/"))
        self.assertTrue(
            not self.graph._is_literal("ftp://ftp.1000genomes.ebi.ac.uk/"))