コード例 #1
0
ファイル: test_impc.py プロジェクト: putmantime/dipper
    def setUp(self):
        """Prepare the curies, one sample IMPC row and the association IRI."""
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # A single input record kept as a flat tuple of 28 strings.
        sample_row = (
            # marker, center, colony, sex, zygosity
            'MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male', 'heterozygote',
            # allele
            'MGI:4432631',
            'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            # strain
            'MGI:2159965', 'C57BL/6N',
            # project
            'MGP',
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            # pipeline, procedure, parameter
            'MGP Select Pipeline', 'MGP_001',
            'MGP_XRY_001', 'X-ray',
            'IMPC_XRY_008_001', 'Number of ribs right',
            # phenotype terms
            'MP:0005390', 'skeleton phenotype',
            'MP:0000480', 'increased rib number',
            # statistics
            '1.637023E-010', '', '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction',
            # resource
            'IMPC',
        )
        self.test_set_1 = sample_row

        # Fixed stand-ins for the curies that are otherwise generated
        # within _add_evidence() and _add_study_provenance().
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # Expand the association curie to an IRI for checking sparql output.
        expander = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
コード例 #2
0
def hpo_to_tree(cls, hpo_terms, hpo_graph, tree, path):
    """
    Recursively record term metadata and grow the nested sub-class tree.

    :param cls: curie of the term currently being visited
    :param hpo_terms: dict filled in place; each new term gets the keys
                      'label', 'parents' (count of rdfs:subClassOf
                      objects) and 'lay_person'
    :param hpo_graph: graph queried for labels and sub-class relations
    :param tree: nested dict, must already contain the chain path + [cls]
    :param path: list of curies leading from the tree root to cls's parent
    :return: None, hpo_terms and tree are mutated in place
    """
    resolver = CurieUtil(curie_map.get())
    cls_iri = URIRef(resolver.get_uri(cls))

    # Work on a copy so sibling branches do not see each other's path.
    lineage = copy.copy(path)
    lineage.append(cls)

    if cls not in hpo_terms:
        entry = {'label': hpo_graph.label(cls_iri)}
        hpo_terms[cls] = entry
        entry['parents'] = len(
            list(hpo_graph.objects(cls_iri, RDFS.subClassOf)))
        entry["lay_person"] = get_lay_person(cls, hpo_graph)

    # Walk down the nested dicts until we reach the node for cls
    node = tree
    for step in lineage:
        node = node[step]

    # Attach every direct sub-class below that node and descend into it
    for child_iri in hpo_graph.subjects(RDFS.subClassOf, cls_iri):
        child = resolver.get_curie(child_iri).replace("OBO:HP_", "HP:")
        node[child] = {}
        hpo_to_tree(child, hpo_terms, hpo_graph, tree, lineage)
コード例 #3
0
ファイル: test_genotype.py プロジェクト: tegar9000/dipper-1
 def test_addGenotype(self):
     """addGenotype() must put an rdfs:label triple for the id in the graph."""
     genotype_id = 'MGI:5515892'
     genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
     expander = CurieUtil(self.curie_map)

     self.genotype.addGenotype(genotype_id, genotype_label)

     expected_triple = (
         URIRef(expander.get_uri(genotype_id)),
         RDFS['label'],
         Literal(genotype_label),
     )
     self.assertTrue(expected_triple in self.genotype.graph)
コード例 #4
0
ファイル: test_genotype.py プロジェクト: tegar9000/dipper-1
 def setUp(self):
     """Create a fresh graph, a Genotype on it and the category IRIs."""
     self.graph = RDFGraph()
     self.curie_map = curie_map.get()
     self.genotype = Genotype(self.graph)
     self.cutil = CurieUtil(self.curie_map)

     # Expand the biolink terms the tests compare against.
     to_iri = self.cutil.get_uri
     self.test_cat_pred = to_iri(blv.terms['category'])
     self.test_cat_genotype_category = to_iri(blv.terms['Genotype'])
     self.test_cat_background_category = to_iri(
         blv.terms['PopulationOfIndividualOrganisms'])
コード例 #5
0
 def test_addGenotype(self):
     """addGenotype() must put an rdfs:label triple for the id in the graph."""
     from rdflib.namespace import RDFS, URIRef
     from rdflib import Literal
     from dipper.utils.CurieUtil import CurieUtil

     genotype_id = 'MGI:5515892'
     genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
     expander = CurieUtil(self.curie_map)

     self.genotype.addGenotype(genotype_id, genotype_label)

     expected_triple = (
         URIRef(expander.get_uri(genotype_id)),
         RDFS['label'],
         Literal(genotype_label),
     )
     self.assertTrue(expected_triple in self.genotype.graph)
コード例 #6
0
ファイル: test_rdfgraph.py プロジェクト: tegar9000/dipper-1
    def setUp(self):
        """Create an empty RDFGraph and the IRIs used to build test triples."""
        self.graph = RDFGraph()
        self.cutil = CurieUtil(curie_map.get())
        to_iri = self.cutil.get_uri

        # subject is a plain IRI string, everything else is expanded
        # from a curie
        self.test_cat_subj = "http://www.google.com"
        self.test_cat_default_pred = to_iri("biolink:category")
        self.test_cat_nondefault_pred = to_iri("rdf:type")
        self.test_cat_default_category = to_iri("biolink:NamedThing")
        self.test_cat_nondefault_category = to_iri("biolink:Gene")
        self.test_cat_type = to_iri("rdf:type")
        self.test_cat_class = to_iri("rdf:class")
コード例 #7
0
    def setUp(self):
        """
        Load one disease/drug/variant row into a CGD graph and derive the
        URIs and labels the tests expect to find for it.
        """
        self.curie_map = curie_map.get()
        expander = CurieUtil(self.curie_map)

        def to_uri(curie):
            return URIRef(expander.get_uri(curie))

        # Fake credentials as these tests do not require a database connection
        self.cgd = CGD('foo', '******', '******')

        test_data = ((387, 'MLH1 any mutation', 13, 'Adenocarcinoma',
                      None, 'Colon', 'no response', 1,
                      '5FU-based adjuvant therapy', 'late trials',
                      '20498393'),)
        self.cgd.add_disease_drug_variant_to_graph(test_data)

        # Pull the single row apart; fields prefixed with "_" are not
        # needed below.
        (variant_key, variant_label, diagnoses_key, diagnoses,
         _specific_diagnosis, _organ, relationship,
         drug_key, drug, _therapy_status, pubmed_id) = test_data[0]

        make_id = self.cgd.make_cgd_id

        # Identifiers, built in the same order as before
        source_id = "PMID:{0}".format(pubmed_id)
        variant_id = make_id('variant{0}'.format(variant_key))
        disease_id = make_id(
            'disease{0}{1}'.format(diagnoses_key, diagnoses))
        relationship_id = "RO:has_environment"
        disease_quality = ("CGD:{0}".format(relationship)).replace(" ", "_")
        has_quality_property = "BFO:0000159"  # not used by this fixture
        drug_id = make_id('drug{0}'.format(drug_key))
        disease_instance_id = make_id(
            'phenotype{0}{1}{2}'.format(diagnoses, variant_key, relationship))
        variant_disease_annot = make_id(
            "assoc{0}{1}".format(variant_key, diagnoses))

        # Set up URIs
        self.source_uri = to_uri(source_id)
        self.variant_uri = to_uri(variant_id)
        self.disease_uri = to_uri(disease_id)
        self.disease_ind_uri = to_uri(disease_instance_id)
        self.relationship_uri = to_uri(relationship_id)
        self.drug_uri = to_uri(drug_id)
        self.vd_annot_uri = to_uri(variant_disease_annot)
        self.disease_quality_uri = to_uri(disease_quality)

        # Labels
        self.variant_label = variant_label
        self.disease_label = diagnoses
        self.disease_instance_label = "{0} with {1} to therapy".format(
            diagnoses, relationship)
        self.drug_label = drug
コード例 #8
0
ファイル: test_model.py プロジェクト: tegar9000/dipper-1
    def setUp(self):
        """Build a Model over a fresh RDFGraph plus the IRIs/labels to test."""
        self.model = Model(RDFGraph())
        self.cutil = CurieUtil(curie_map.get())
        to_iri = self.cutil.get_uri

        # subject is kept both as a curie and as its expanded IRI
        self.test_cat_subj_curie = "MGI:1234"
        self.test_cat_subj = to_iri("MGI:1234")

        # predicates and types
        self.test_cat_default_pred = to_iri("biolink:category")
        self.test_named_indiv = to_iri("owl:NamedIndividual")
        self.test_label_pred = to_iri("rdfs:label")
        self.test_comment_IRI = to_iri("rdfs:comment")

        # literal values
        self.test_label = "some label"
        self.test_comment = 'bonus eruptus'
コード例 #9
0
    def test_associations(self):
        """
        Query the graph built in setUp for the variant/disease association.

        The query selects the row for which all of these hold:
        CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance

        A CGD:AssociationID OBO:RO_0002558 Traceable Author Statement (ECO:0000033)
        A CGD:AssociationID dc:source PMID:20498393
        A CGD:AssociationID has_environment CGD:DrugID
        A CGD:AssociationID OBAN:association_has_subject CGD:VariantID
        A CGD:AssociationID OBAN:association_has_object_property has_phenotype
        A CGD:AssociationID OBAN:association_has_object CGD:DiseaseInstance
        """
        from dipper.utils.TestUtils import TestUtils

        # Make testutils object and load bindings
        expander = CurieUtil(self.curie_map)
        test_env = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()
        evidence_uri = URIRef(expander.get_uri('OBO:ECO_0000033'))

        # The only substituted value is the has_environment predicate IRI
        query_template = """
                       SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence
                       WHERE {{
                           ?variant OBO:RO_0002200 ?diseaseInd .

                           ?vdannot a OBAN:association ;
                               OBO:RO_0002558 ?evidence ;
                               dc:source ?source ;
                               <{0}> ?drug ;
                               OBAN:association_has_object ?diseaseInd ;
                               OBAN:association_has_object_property OBO:RO_0002200 ;
                               OBAN:association_has_subject ?variant .
                       }}
                       """
        sparql_query = query_template.format(self.relationship_uri)

        # One row, columns in SELECT order
        expected_row = [
            self.disease_ind_uri,
            self.variant_uri,
            self.drug_uri,
            self.vd_annot_uri,
            self.source_uri,
            evidence_uri,
        ]
        expected_results = [expected_row]

        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
コード例 #10
0
ファイル: CGD.py プロジェクト: zzygyx9119/mckb
    def _replace_entity(graph, old_id, new_id, bindings=None,
                        is_property=False):
        """
        Replace entity in graph
        Replace one ID with another by running SPARQL DELETE/INSERT updates

        Both ids are expanded with CurieUtil.get_uri() and wrapped in
        URIRef before being substituted into the update templates.

        :param graph rdflib.graph object, its update() method is called
        :param old_id, String identifier to be replaced
        :param new_id, String identifier to replace the old id
        :param bindings, Dict, dictionary of namespace prefixes, passed as
                         the third positional argument of graph.update();
                         None (the default) means an empty dict
        :param is_property, Boolean, is an id a property/predicate rather than
                                 a class, individual, or literal.
                                 Only the exact value False selects the
                                 subject/object rewrite.
        :return: None
        """
        # A fresh dict per call, a shared {} default would leak state
        # between calls if anything ever mutated it.
        if bindings is None:
            bindings = {}

        cu = CurieUtil(curie_map.get())
        old_uri = URIRef(cu.get_uri(old_id))
        new_uri = URIRef(cu.get_uri(new_id))

        # NOTE: the templates go through str.format(), so every literal
        # brace must be doubled.  The property template used to contain
        # "{?obj}", which format() parsed as a replacement field and
        # failed on with KeyError before any update ran.
        if is_property is False:
            # rewrite the entity as a subject first, then as an object
            templates = (
                """
                DELETE {{ <{0}> ?pred ?obj }}
                INSERT {{ <{1}> ?pred ?obj }}
                WHERE {{ <{0}> ?pred ?obj }}
                """,
                """
                DELETE {{ ?sub ?pred <{0}> }}
                INSERT {{ ?sub ?pred <{1}> }}
                WHERE {{ ?sub ?pred <{0}> }}
                """,
            )
        else:
            templates = (
                """
                DELETE {{ ?sub <{0}> ?obj }}
                INSERT {{ ?sub <{1}> ?obj }}
                WHERE {{ ?sub <{0}> ?obj }}
                """,
            )

        for template in templates:
            sparql_update = template.format(old_uri, new_uri)
            graph.update(sparql_update, 'sparql', bindings)
コード例 #11
0
ファイル: test_impc.py プロジェクト: tegar9000/dipper-1
    def setUp(self):
        """Prepare test helpers, one annotated IMPC row and the assoc IRI."""
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # (header, value) pairs in input column order; only the values
        # end up in self.test_set_1.
        columns = (
            ('marker_accession_id', 'MGI:1920145'),
            ('marker_symbol', 'Setd5'),
            ('phenotyping_center', 'WTSI'),
            ('colony_raw', 'MEFW'),
            ('sex', 'male'),
            ('zygosity', 'heterozygote'),
            ('allele_accession_id', 'MGI:4432631'),
            ('allele_symbol', 'Setd5<tm1a(EUCOMM)Wtsi>'),
            ('allele_name',
             'targeted mutation 1a, Wellcome Trust Sanger Institute'),
            ('strain_accession_id', 'MGI:2159965'),
            ('strain_name', 'C57BL/6N'),
            ('project_name', 'MGP'),
            ('project_fullname',
             'Wellcome Trust Sanger Institute Mouse Genetics Project'),
            ('pipeline_name', 'MGP Select Pipeline'),
            ('pipeline_stable_id', 'MGP_001'),
            ('procedure_stable_id', 'MGP_XRY_001'),
            ('procedure_name', 'X-ray'),
            ('parameter_stable_id', 'IMPC_XRY_008_001'),
            ('parameter_name', 'Number of ribs right'),
            ('top_level_mp_term_id', 'MP:0005390'),
            ('top_level_mp_term_name', 'skeleton phenotype'),
            ('mp_term_id', 'MP:0000480'),
            ('mp_term_name', 'increased rib number'),
            ('p_value', '1.637023E-010'),
            ('percentage_change', ''),
            ('effect_size', '8.885439E-007'),
            ('statistical_method',
             'Wilcoxon rank sum test with continuity correction'),
            ('resource_name', 'IMPC'),
        )
        self.test_set_1 = tuple(value for _header, value in columns)

        # Fixed stand-ins for the curies that are otherwise generated
        # within _add_evidence() and _add_study_provenance();
        # these blank nodes are hardcoded as NOT Skolemized
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # Expand the association curie to an IRI for checking sparql output.
        expander = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
コード例 #12
0
ファイル: RDFGraph.py プロジェクト: matthewbrickley/dipper
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # prefix -> IRI mapping and the curie expander built on it,
    # shared by every instance
    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # inverse lookup, value -> key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, ids starting with '_' are
            turned into skolem IRIs instead of BNodes (see _getnode)
        :param identifier: passed through to the ConjunctiveGraph constructor
        """
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO', ):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None):
        """
        Add one triple to the graph.

        :param subject_id: curie or IRI string for the subject
        :param predicate_id: curie or IRI string for the predicate
        :param obj: curie/IRI string for a resource, or the literal value
        :param object_is_literal: True/False to force the interpretation
            of obj; None means infer it from the shape of obj
        :param literal_type: optional curie/IRI of the literal datatype
        :return: None; a None/empty object is logged and nothing is added
        """
        # try making an inference on the type of object if none is supplied.
        # Only strings can look like a curie or IRI; anything else
        # (including None) is handled as a literal so that the warning
        # below is reached instead of a TypeError from the regexp.
        if object_is_literal is None:
            looks_like_resource = isinstance(obj, str) and (
                self.curie_regexp.match(obj) is not None or
                obj.split(':')[0].lower() in ('http', 'https', 'ftp'))
            object_is_literal = not looks_like_resource

        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getnode(literal_type)
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj)))
            else:
                LOG.warning("None as literal object for subj: %s and pred: %s",
                            subject_id, predicate_id)
                # get a sense of where the None is coming from
                # magic number here is "steps up the call stack"
                for call in range(2, 0, -1):
                    try:
                        caller = sys._getframe(call).f_code.co_name
                    except ValueError:
                        # the call stack is shallower than `call` frames
                        continue
                    LOG.warning('\t%sfrom: %s', '\t' * call, caller)

        elif obj is not None and obj != '':  # object is a resource
            self.add((self._getnode(subject_id), self._getnode(predicate_id),
                      self._getnode(obj)))
        else:
            LOG.warning("None/empty object IRI for subj: %s and pred: %s",
                        subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into a skolem IRI under the curie_util base.

        :param curie: str blank node id, a leading '_:' or '_' is stripped
        :return: URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, 1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getnode(self, curie):  # convention is lowercase names
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode, otherwise
        it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
                 None if the curie could not be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.
                # namespaces() yields (prefix, namespace) pairs, so the
                # prefixes have to be pulled out before the membership
                # test; testing the bare string against the pairs never
                # matches and caused a bind() on every call.
                prefix = curie.split(':')[0]
                bound_prefixes = {
                    pfx for pfx, _ in self.namespace_manager.namespaces()}
                if prefix not in bound_prefixes:
                    mapped_iri = self.curie_map[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                LOG.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """
            Results in the RDF @prefix directives for every ingest
            being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    def serialize(  # rdflib version
            self,
            destination=None,
            format='turtle',
            base=None,
            encoding=None):
        """
        Serialize through ConjunctiveGraph.serialize, defaulting to turtle.

        base and encoding are forwarded; they used to be accepted
        and silently dropped.
        """
        return ConjunctiveGraph.serialize(
            self, destination, format, base=base, encoding=encoding)
コード例 #13
0
    def __init__(self, curie_map):
        """
        Keep the given curie map and a CurieUtil built from it.

        :param curie_map: mapping handed unchanged to CurieUtil
        """
        self.curie_map = curie_map
        self.cu = CurieUtil(self.curie_map)
コード例 #14
0
ファイル: StreamedGraph.py プロジェクト: moon3stars/dipper
class StreamedGraph(DipperGraph):
    """
    Stream rdf triples to file or stdout
    Assumes a downstream process will sort then uniquify triples

    Theoretically could support both ntriple, rdfxml formats, for now
    just support nt
    """

    # prefix -> IRI mapping and the curie expander built on it,
    # shared by every instance
    curie_map = curimap.get()
    curie_util = CurieUtil(curie_map)

    # NOTE(review): the path is relative to the current working directory
    with open('translationtable/GLOBAL_TERMS.yaml') as fhandle:
        globaltt = yaml.safe_load(fhandle).copy()
        # inverse lookup, value -> key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self,
                 are_bnodes_skized=True,
                 identifier=None,
                 file_handle=None,
                 fmt='nt'):
        """
        :param are_bnodes_skized: when True, '_:' ids are written as
            skolem IRIs (see _getnode)
        :param identifier: stored on the instance, not used by this class
        :param file_handle: writable text handle; None means print()
        :param fmt: stored on the instance, not used by this class
        """
        self.are_bnodes_skized = are_bnodes_skized
        self.fmt = fmt
        self.file_handle = file_handle
        self.identifier = identifier

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None):
        """
        Expand the ids and write one triple through serialize().

        :param subject_id: curie or IRI string for the subject
        :param predicate_id: curie or IRI string for the predicate
        :param obj: curie/IRI string for a resource, or the literal value
        :param object_is_literal: True/False to force the interpretation
            of obj; None means infer it from the shape of obj
        :param literal_type: optional curie/IRI of the literal datatype
        :return: None; a None object is logged and nothing is written
        """
        # try making an inference on the type of object if none is supplied.
        # The inference must only run when the caller gave no answer: the
        # former dangling "else" overwrote an explicit False with True and
        # left a failed inference as None.  Only strings can look like a
        # curie or IRI, everything else is a literal.
        if object_is_literal is None:
            looks_like_resource = isinstance(obj, str) and (
                bool(self.curie_regexp.match(obj)) or
                obj.split(':')[0].lower() in ('http', 'https', 'ftp'))
            object_is_literal = not looks_like_resource

        subject_iri = self._getnode(subject_id)
        predicate_iri = self._getnode(predicate_id)
        # None is left alone so that the warning below is reached
        if not object_is_literal and obj is not None:
            obj = self._getnode(obj)

        if literal_type is not None:
            literal_type = self._getnode(literal_type)

        if obj is not None:
            self.serialize(subject_iri, predicate_iri, obj, object_is_literal,
                           literal_type)
        else:
            LOG.warning("Null value passed as object")

    def skolemizeBlankNode(self, curie):
        """
        Build the skolem IRI string for a blank node curie.

        :param curie: str of the form '_:<id>'
        :return: str "<base>.wellknown/genid/<id>"
        """
        # curie_map is a plain mapping here; the base IRI comes from
        # the CurieUtil built on it.
        base_iri = StreamedGraph.curie_util.get_base()
        curie_id = curie.split(':')[1]
        skolem_iri = "{0}.wellknown/genid/{1}".format(base_iri, curie_id)
        return skolem_iri

    def serialize(self,
                  subject_iri,
                  predicate_iri,
                  obj,
                  object_is_literal=False,
                  literal_type=None):
        """
        Format one triple and write it to file_handle (or stdout).

        :param subject_iri: str, already expanded
        :param predicate_iri: str, already expanded
        :param obj: expanded IRI str, or the literal value
        :param object_is_literal: falsy means obj is written as <obj>
        :param literal_type: expanded datatype IRI, or None
        :raises TypeError: for an untyped literal that is neither a str
            nor covered by _getLiteralXSDType
        """
        if not object_is_literal:
            triple = "<{}> <{}> <{}> .".format(subject_iri, predicate_iri, obj)
        elif literal_type is not None:
            triple = '<{}> <{}> {}^^<{}> .'.format(
                subject_iri, predicate_iri, self._quote_encode(str(obj)),
                literal_type)
        elif isinstance(obj, str):
            triple = '<{}> <{}> {} .'.format(
                subject_iri, predicate_iri, self._quote_encode(obj))
        else:
            lit_type = self._getLiteralXSDType(obj)
            # must test the looked-up type, not the builtin `type`,
            # otherwise "^^<None>" is written for unsupported values
            if lit_type is None:
                raise TypeError("Cannot determine type of {}".format(obj))
            triple = '<{}> <{}> "{}"^^<{}> .'.format(
                subject_iri, predicate_iri, obj, lit_type)

        if self.file_handle is None:
            print(triple)
        else:
            self.file_handle.write("{}\n".format(triple))

    def _getnode(self, curie):
        """
        Returns IRI, or blank node curie/iri depending on
        self.are_bnodes_skized setting

        :param curie: str id as curie or iri
        :return: str
        :raises TypeError: when the id is none of '_:' id, http/ftp IRI
            or a two part curie
        """
        if re.match(r'^_:', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:
                node = curie
        elif re.match(r'^http|^ftp', curie):
            node = curie
        elif len(curie.split(':')) == 2:
            node = StreamedGraph.curie_util.get_uri(curie)
        else:
            raise TypeError("Cannot process curie {}".format(curie))
        return node

    def _getLiteralXSDType(self, literal):
        """
        This could be much more nuanced, but for now
        if a literal is not a str, determine if it's
        a xsd int or double
        :param literal:
        :return: str - xsd full iri, or None for any other type
        """
        if isinstance(literal, int):
            return self._getnode("xsd:integer")
        if isinstance(literal, float):
            return self._getnode("xsd:double")
        return None

    @staticmethod
    def _quote_encode(literal):
        """
        Copy of code in rdflib here:
        https://github.com/RDFLib/rdflib/blob/776b90be/
        rdflib/plugins/serializers/nt.py#L76
        :param literal: str
        :return: str, the escaped literal wrapped in double quotes
        """
        return '"%s"' % literal.replace('\\', '\\\\')\
            .replace('\n', '\\n')\
            .replace('"', '\\"')\
            .replace('\r', '\\r')
コード例 #15
0
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    curie_util = CurieUtil(curie_map.get())
    # the module-level curie_map object itself (its get() and get_base()
    # are used below), not the mapping returned by get()
    curie_map = curie_map

    def __init__(self, are_bnodes_skized=True):
        """
        :param are_bnodes_skized: when True, ids starting with '_' are
            turned into skolem IRIs instead of BNodes (see _getNode)
        """
        super().__init__()
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=False,
                  literal_type=None):
        """
        Add one triple to the graph.

        :param subject_id: curie or IRI string for the subject
        :param predicate_id: curie or IRI string for the predicate
        :param obj: curie/IRI string for a resource, or the literal value
        :param object_is_literal: only the exact value True makes obj a
            literal
        :param literal_type: optional curie/IRI of the literal datatype
        :return: None; a None/empty object is logged and nothing is added
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                # Logger.warn() is a deprecated alias of warning()
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
        elif obj is not None and obj != '':
            self.add((self._getNode(subject_id), self._getNode(predicate_id),
                      self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into a skolem IRI under curie_map.get_base().

        :param curie: str blank node id, a leading '_:' or '_' is stripped
        :return: URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, 1)
        node = BNode(stripped_id).skolemize(self.curie_map.get_base())
        node = re.sub(r'rdflib/', '', node)  # drop the 'rdflib/' segment
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode, otherwise
        it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
                 None if the curie could not be expanded
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # replace the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))
        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph.
                # namespaces() yields (prefix, namespace) pairs, so the
                # prefixes have to be pulled out before the membership
                # test; testing the bare string against the pairs never
                # matches and caused a bind() on every call.
                prefix = curie.split(':')[0]
                bound_prefixes = {
                    pfx for pfx, _ in self.namespace_manager.namespaces()}
                if prefix not in bound_prefixes:
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """Bind every prefix of the curie map to this graph."""
        # fetch the mapping once instead of once per prefix
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
コード例 #16
0
ファイル: RDFGraph.py プロジェクト: nicholsn/dipper
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    # prefix -> IRI mapping and the curie expander built on it,
    # shared by every instance
    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
        os.path.join(
            os.path.dirname(__file__),
            '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # inverse lookup, value -> key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when truthy, ids starting with '_' are
            turned into IRIs under the 'BNODE' prefix instead of BNodes
        :param identifier: passed through to the ConjunctiveGraph constructor
        """
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized
        # prefixes seen by _getnode(); they are bound in serialize()
        self.prefixes = set()

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        for pfx in ('OBO',):  # , 'ORPHA'):
            self.bind(pfx, Namespace(self.curie_map[pfx]))

    def _make_category_triple(
            self, subject, category, predicate=blv.terms['category']
    ):
        """
        add a triple to capture subject or object category (in CURIE form) that was
        passed to addTriple()

        Failures are logged as a warning and not propagated.
        """
        try:
            self.add((
                self._getnode(subject),
                self._getnode(predicate),
                self._getnode(category)))
        except Exception:
            # deliberately best effort, but a bare "except:" would also
            # swallow KeyboardInterrupt and SystemExit
            LOG.warning(
                "Problem adding triple in _makeCategoryTriple for "
                "subj: %s pred: %s obj(category): %s",
                subject, predicate, category)

    def _is_literal(self, thing):
        """
        make inference on type (literal or CURIE)

        Only strings can look like a curie or IRI; every other value
        (including None) is reported as a literal.

        return: logical
        """
        if not isinstance(thing, str):
            return True
        looks_like_resource = (
            self.curie_regexp.match(thing) is not None or
            thing.split(':')[0].lower() in ('http', 'https', 'ftp'))
        return not looks_like_resource

    def addTriple(
            self,
            subject_id,
            predicate_id,
            obj,
            object_is_literal=None,
            literal_type=None,
            subject_category=None,
            object_category=None
    ):
        """
        Add one triple, plus optional category triples, to the graph.

        :param subject_id: curie or IRI string for the subject
        :param predicate_id: curie or IRI string for the predicate
        :param obj: curie/IRI string for a resource, or the literal value
        :param object_is_literal: True/False to force the interpretation
            of obj; None means infer it with _is_literal()
        :param literal_type: optional curie/IRI of the literal datatype
        :param subject_category: optional category curie for the subject
        :param object_category: optional category curie for the object;
            only a warning is logged when the object is a literal
        :return: None; a None/empty object is logged and nothing is added
        """
        if object_is_literal is None:
            object_is_literal = self._is_literal(obj)

        # add triples for subject category info
        if subject_category is not None:
            self._make_category_triple(subject_id, subject_category)

        # add triples for obj category info, if obj is not a literal
        if object_category is not None:
            if not object_is_literal:
                self._make_category_triple(obj, object_category)
            else:  # emit warning if object category is given for a literal
                LOG.warning("I was given a category %s for obj: %s, " +
                            "which seems to be a literal!",
                            object_category, obj)

        if object_is_literal is True:
            if isinstance(obj, str):
                # reduce any ws to a space; re.sub() returns a new string,
                # so the result has to be assigned back
                obj = re.sub(r'[\t\n\r\f\v]+', ' ', obj)
            if literal_type is not None and obj is not None and obj not in ("", " "):
                literal_type_iri = self._getnode(literal_type)

                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                # could attempt to infer a type here but there is no use case
                self.add((
                    self._getnode(subject_id), self._getnode(predicate_id),
                    Literal(obj)))
            else:
                LOG.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # get a sense of where the None is coming from
                self._log_callers(2)

        elif obj is not None and obj != '':  # object is a resource
            self.add((
                self._getnode(subject_id),
                self._getnode(predicate_id),
                self._getnode(obj)))
        else:
            LOG.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    @staticmethod
    def _log_callers(depth):
        """Log the function names `depth` .. 1 frames above the caller."""
        # +1 skips this helper's own frame so that the reported frames
        # are counted from the method that asked for the trace
        for call in range(depth, 0, -1):
            try:
                name = sys._getframe(call + 1).f_code.co_name
            except ValueError:
                # the call stack is shallower than requested
                continue
            LOG.warning('\t%sfrom: %s', '\t' * call, name)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into an IRI under the 'BNODE' prefix.

        :param curie: str blank node id, a leading '_:' or '_' is stripped
        :return: URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, 1)
        return URIRef(self.curie_map['BNODE'] + stripped_id)

    def _getnode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode, otherwise
        it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is truthy,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
                 None if the curie could not be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # remember the prefix, it is bound at serialize() time
                prefix = curie.split(':')[0]
                self.prefixes.add(prefix)
            else:
                LOG.error("couldn't make URI for %s", curie)
                # get a sense of where the CURIE-ish? thing is coming from
                self._log_callers(3)
        return node

    def bind_all_namespaces(self):
        """
            Results in the RDF @prefix directives for every ingest
            being added to this ingest.
        """
        for prefix, iri in self.curie_map.items():
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    # rdflib version
    def serialize(
            self, destination=None, format='turtle', base=None, encoding=None
    ):
        """
        Bind every prefix collected by _getnode(), then serialize through
        ConjunctiveGraph.serialize, defaulting to turtle.

        base and encoding are forwarded; they used to be accepted
        and silently dropped.
        """
        for prefix in self.prefixes:
            mapped_iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(mapped_iri))
        return ConjunctiveGraph.serialize(
            self, destination, format, base=base, encoding=encoding)
コード例 #17
0
ファイル: RDFGraph.py プロジェクト: alexgarciac/dipper
class RDFGraph(ConjunctiveGraph, DipperGraph):
    """
    Extends RDFLib's ConjunctiveGraph.

    The goal of this class is to wrap the creation of triples and to
    manage the creation of URIRefs, BNodes, and Literals from input
    curies.
    """

    # prefix -> IRI map shared by all instances, and the helper that
    # expands curies against it
    curie_map = curie_map.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    # (relative path: resolved against the working directory at the
    # moment this class body is executed)
    with open('translationtable/GLOBAL_TERMS.yaml') as fh:
        globaltt = yaml.safe_load(fh)
        # reverse lookup of the table: value -> key
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        Create the graph on an 'IOMemory' store and bind the OBO prefix.

        :param are_bnodes_skized: bool, when True ids starting with an
            underscore are skolemized into IRIs instead of becoming BNodes
        :param identifier: forwarded to the parent constructor
        """
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))

        # try adding them all
        # self.bind_all_namespaces()  # too much

    def addTriple(self, subject_id, predicate_id, obj,
                  object_is_literal=False, literal_type=None):
        """
        Add one triple to the graph, building the nodes from curies/IRIs.

        Nothing is added (a warning is logged instead) when ``obj`` is
        None, or when a non-literal ``obj`` is the empty string.

        :param subject_id: str curie or iri of the subject
        :param predicate_id: str curie or iri of the predicate
        :param obj: literal value, or str curie/iri of the object
        :param object_is_literal: only the value True makes ``obj`` a Literal
        :param literal_type: optional curie/iri of the literal's datatype
        :return: None
        """
        if object_is_literal is True:
            if literal_type is not None and obj is not None:
                literal_type_iri = self._getNode(literal_type)
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                self.add(
                    (self._getNode(subject_id), self._getNode(predicate_id),
                     Literal(obj)))
            else:
                logger.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # magic number 2 here is "steps up the stack"
                logger.warning(sys._getframe(2).f_code.co_name)
        elif obj is not None and obj != '':
            self.add((
                self._getNode(subject_id), self._getNode(predicate_id),
                self._getNode(obj)))
        else:
            logger.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into a URIRef under the curie util's base.

        :param curie: str blank node id, with a leading '_:' or '_'
        :return: URIRef of the skolemized node
        """
        # count is given by keyword: passing it positionally is deprecated
        # in recent Python versions
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        node = BNode(stripped_id).skolemize(self.curie_util.get_base())
        node = re.sub(r'rdflib/', '', node)  # remove string added by rdflib
        return URIRef(node)

    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or BNode object
        from a given curie or iri string.

        If an id starts with an underscore it becomes a BNode, or, when
        self.are_bnodes_skized is True, a skolemized URIRef.
        Ids starting with 'http' or 'ftp' are used as IRIs unchanged.
        Anything else is expanded as a curie; if that fails an error is
        logged and None is returned.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object, or None
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                # reuse the expansion instead of computing it a second time
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                # namespace_manager.namespaces() yields (prefix, namespace)
                # pairs, so testing a bare prefix string against it never
                # matched and the prefix was re-bound on every call; ask
                # the store whether the prefix is bound instead
                if self.store.namespace(prefix) is None:
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node

    def bind_all_namespaces(self):
        """
        Bind every prefix of the curie map to this graph.
        """
        for prefix, iri in curie_map.get().items():
            self.bind(prefix, Namespace(iri))
コード例 #18
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_amino_acid_position_region_model(self):
        """
        Verify how an amino acid position is modelled as a FALDO region.

        Test data set 1 is loaded with add_variant_info_to_graph() and
        the graph is then expected to hold the following triples:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 741
        CGD:BothStrandPositionID faldo:reference UniProtID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Leading eleven columns of the first fixture row
        first_row = self.test_set_1[0][0:11]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = first_row

        aa_position = 741
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        uniprot_curie = "UniProtKB:Q99062#Q99062-1"
        uniprot_id = "Q99062#Q99062-1"
        region_id = ":_{0}{1}{2}Region".format(
            aa_position, aa_position, uniprot_curie)
        both_strand_id = ":_{0}-{1}".format(uniprot_id, aa_position)

        # Expected row, in the order of the SELECT variables
        expected_row = [
            URIRef(curie_expander.get_uri(identifier))
            for identifier in (region_id, both_strand_id, uniprot_curie)]

        query_template = """
                       SELECT ?region ?bsPosition ?protein
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?protein .
                       }}
                       """
        sparql_query = query_template.format(aa_position)

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)
コード例 #19
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_missense_variant_cdna_model(self):
        """
        Verify the model of a missense variant that has cdna information.

        Test data set 2 is loaded with add_variant_info_to_graph() and
        the graph is then expected to hold the following triples:

        CGD:VariantID is an instance of OBO:SO_0001059
        CGD:VariantID is an instance of OBO:SO_0001583
        CGD:VariantID has the label "ABL1 T315I missense mutation"
        CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:25
        CGD:VariantID has location (faldo:location) AminoAcidRegionID
        CGD:VariantID has location (faldo:location) CDNARegionID
        CGD:VariantID has location (faldo:location) ChromosomalRegionID
        CGD:VariantID OBO:GENO_reference_amino_acid "T"
        CGD:VariantID OBO:GENO_results_in_amino_acid_change "I"
        CGD:VariantID owl:sameAs dbSNP:rs121913459
        CGD:VariantID owl:sameAs COSMIC:12560
        CGD:VariantID RO:0002205 (transcribed_to) CCDS:35166.1

        CCDS:35166.1 is an instance of OBO:SO_0000233
        CCDS:35166.1 has the label "CCDS35166.1"
        CCDS:35166.1 OBO:RO_0002513 (translates_to) UniProtKB:P00519#P00519-1
        CCDS:35166.1 OBO:RO_0002513 (translates_to) NCBIProtein:NP_005148.2

        UniProtKB:P00519#P00519-1 owl:sameAs NCBIProtein:NP_005148.2

        UniProtKB:P00519#P00519-1 is an instance of OBO:SO_0000104 (polypeptide)
        UniProtKB:P00519#P00519-1 has the label "P00519#P00519-1"

        NCBIProtein:NP_005148.2 is an instance of OBO:SO_0000104 (polypeptide)
        NCBIProtein:NP_005148.2 has the label "NP_005148.2"
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Every column of the first fixture row
        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        gene_id = self.cgd.gene_map[transcript_gene]

        # Fixed values this fixture row is expected to produce
        ref_amino_acid = "T"
        altered_amino_acid = "I"
        db_snp_curie = "dbSNP:121913459"
        cosmic_curie = "COSMIC:12560"
        uniprot_curie = "UniProtKB:P00519#P00519-1"
        uniprot_id = "P00519#P00519-1"
        refseq_curie = "NCBIProtein:NP_005148.2"
        transcript_curie = "CCDS:35166.1"
        ccds_id = "35166.1"
        aa_position = 315
        chromosome_curie = "hg19chr9"

        # Identifiers of the variant, its three regions and their start
        # coordinates (the chromosome coordinate uses the ':_' form, not
        # a 'CHR:' curie)
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
        aa_region_id = ":_{0}{1}{2}Region".format(
            aa_position, aa_position, uniprot_curie)
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chr_region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)
        aa_coord_id = ":_{0}-{1}".format(uniprot_id, aa_position)
        cdna_coord_id = ":_{0}-{1}".format(ccds_id, bp_pos)
        chr_coord_id = ":_{0}-{1}".format(chromosome_curie, genome_pos_start)

        # Expand all identifiers to URIRefs in one pass
        (variant_uri, transcript_uri, gene_uri, db_snp_uri, cosmic_uri,
         uniprot_uri, refseq_uri, aa_region_uri, cdna_region_uri,
         chr_region_uri, aa_coord_uri, cdna_coord_uri, chr_coord_uri) = [
             URIRef(curie_expander.get_uri(identifier))
             for identifier in (
                 variant_id, transcript_curie, gene_id, db_snp_curie,
                 cosmic_curie, uniprot_curie, refseq_curie, aa_region_id,
                 cdna_region_id, chr_region_id, aa_coord_id, cdna_coord_id,
                 chr_coord_id)]

        query_template = """
                       SELECT ?cosmic ?gene ?aaRegion ?cdnaRegion ?chrRegion
                              ?dbSNP ?transcript ?uniprot ?refseq
                              ?aaCoord ?cdnaCoord ?chrCoord
                       WHERE {{
                           ?cosmic a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?aaRegion ;
                               faldo:location ?cdnaRegion ;
                               faldo:location ?chrRegion ;
                               OBO:GENO_reference_amino_acid "{0}" ;
                               OBO:GENO_reference_nucleotide "{1}" ;
                               OBO:GENO_altered_nucleotide "{2}" ;
                               OBO:GENO_results_in_amino_acid_change "{3}" ;
                               owl:sameAs ?dbSNP ;
                               RO:0002205 ?transcript .

                           ?cosmic owl:sameAs ?dbSNP .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{4}" ;
                               OBO:RO_0002513 ?uniprot ;
                               OBO:RO_0002513 ?refseq .

                           ?uniprot a OBO:SO_0000104 ;
                               rdfs:label "P00519-1" .

                           ?refseq a OBO:SO_0000104 ;
                               rdfs:label "NP_005148.2" .

                           ?refseq owl:sameAs ?uniprot .

                           ?aaRegion faldo:begin ?aaCoord .
                           ?cdnaRegion faldo:begin ?cdnaCoord .
                           ?chrRegion faldo:begin ?chrCoord .

                           ?aaCoord faldo:position {5} .
                           ?cdnaCoord faldo:position {6} .
                           ?chrCoord faldo:position {7} .

                           ?dbSNP rdfs:label "{8}" .
                       }}
                       """
        sparql_query = query_template.format(
            ref_amino_acid, ref_base, variant_base, altered_amino_acid,
            transcript_id, aa_position, bp_pos, genome_pos_start, db_snp_id)

        # Expected row, in the order of the SELECT variables
        expected_row = [
            cosmic_uri, gene_uri, aa_region_uri, cdna_region_uri,
            chr_region_uri, db_snp_uri, transcript_uri, uniprot_uri,
            refseq_uri, aa_coord_uri, cdna_coord_uri, chr_coord_uri
        ]

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)
コード例 #20
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_variant_position_region_model(self):
        """
        Verify how a variant position on a transcript is modelled.

        Test data set 2 is loaded with add_variant_info_to_graph() and
        the graph is then expected to hold the following triples:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 944
        CGD:BothStrandPositionID faldo:reference CGD:TranscriptID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Every column of the first fixture row
        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        transcript_curie = self.cgd._make_transcript_curie(transcript_id)
        ccds_id = "35166.1"
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        region_id = ":_{0}Region".format(transcript_curie)
        both_strand_id = ":_{0}-{1}".format(ccds_id, bp_pos)

        # Expected row, in the order of the SELECT variables
        expected_row = [
            URIRef(curie_expander.get_uri(identifier))
            for identifier in (region_id, both_strand_id, transcript_curie)]

        query_template = """
                       SELECT ?region ?bsPosition ?transcript
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?transcript .
                       }}
                       """
        sparql_query = query_template.format(bp_pos)

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)
コード例 #21
0
ファイル: GWASCatalog.py プロジェクト: putmantime/dipper
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       efo_ontology,
                                       pubmed_id,
                                       description=None):
        """
        Associate a variant with each of its mapped traits.

        Every IRI in ``mapped_trait_uri`` is converted to a curie.  When
        ``efo_ontology`` reports the trait as a subclass of EFO:0000408
        or of EFO:0000651, and the curie starts with 'EFO', the trait is
        added as a class under DOID:4 or UPHENO:0001001 respectively,
        labelled with the first query result.  In every case a
        'contributes_to' association from the variant to the trait is
        added, with the PubMed reference as source and ECO:0000213 as
        evidence.

        :param variant_id: identifier of the variant (association subject)
        :param mapped_trait_uri: str of comma-separated trait IRIs;
            nothing is done when it is blank
        :param efo_ontology: object queried through ``query(sparql)``
            (presumably an rdflib graph of EFO -- confirm against caller)
        :param pubmed_id: str appended to the 'PMID:' prefix
        :param description: optional description set on each association
        :return: None
        """
        g = self.testgraph if self.testMode else self.graph
        model = Model(g)

        # make associations to the EFO terms; there can be >1
        if mapped_trait_uri.strip() == '':
            return

        # Loop-invariant values, built once instead of once per trait
        cu = CurieUtil(curie_map.get())
        pubmed_curie = 'PMID:' + pubmed_id
        label_query = """
            SELECT ?trait
            WHERE {{
                {0} rdfs:subClassOf+ {1} .
                {0} rdfs:label ?trait .
            }}
        """
        # (EFO ancestor to test for, parent class used when it matches)
        efo_parents = (
            ('EFO:0000408', 'DOID:4'),
            ('EFO:0000651', 'UPHENO:0001001'),
        )

        for trait in mapped_trait_uri.split(','):
            # NOTE(review): get_curie may yield no curie for an unmapped
            # IRI; that case is not handled here -- confirm upstream data
            trait_id = cu.get_curie(trait.strip())

            for efo_ancestor, parent_id in efo_parents:
                # materialise the result once; its first row carries
                # the label
                labels = list(efo_ontology.query(
                    label_query.format(trait_id, efo_ancestor)))
                if labels and trait_id.startswith('EFO'):
                    model.addClassToGraph(trait_id, labels[0][0], parent_id)

            ref = Reference(g, pubmed_curie,
                            Reference.ref_types['journal_article'])
            ref.addRefToGraph()

            assoc = G2PAssoc(g, self.name, variant_id, trait_id,
                             model.object_properties['contributes_to'])
            assoc.add_source(pubmed_curie)
            # combinatorial evidence
            # used in automatic assertion
            eco_id = 'ECO:0000213'
            assoc.add_evidence(eco_id)

            if description is not None:
                assoc.set_description(description)

            # FIXME score should get added to provenance/study
            # assoc.set_score(pvalue)
            assoc.add_association_to_graph()
コード例 #22
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_genome_build_chromosome_model(self):
        """
        Verify the model of genomes, builds, and chromosomes.

        Test data set 2 is loaded with add_variant_info_to_graph().
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Identifier and label of each of the four expected nodes
        genome = ":9606genome"
        genome_label = "Human genome"
        chromosome = "CHR:9606chr9"
        chromosome_label = "chr9 (Human)"
        build_curie = "UCSC:hg19"
        build_label = "hg19"
        chrom_on_build = ":MONARCH_hg19chr9"
        chrom_build_label = "chr9 (hg19)"

        # Expected row, in the order of the SELECT variables
        expected_row = [
            URIRef(curie_expander.get_uri(identifier))
            for identifier in (
                genome, chromosome, build_curie, chrom_on_build)]

        # An alternative form of the query, kept as an inert string
        # literal; it is never executed
        '''
        sparql_query = """
                       SELECT ?genome ?chromosome ?build ?chromOnBuild
                       WHERE {{
                           ?genome a owl:Class ;
                               rdfs:label "{0}" ;
                               OBO:RO_0002162 OBO:NCBITaxon_9606 ;
                               OBO:RO_0002351 ?chromosome ;
                               rdfs:subClassOf OBO:SO_0001026 .

                           ?chromosome a owl:Class ;
                               rdfs:label "{1}" ;
                               OBO:RO_0002350 ?genome ;
                               rdfs:subClassOf OBO:SO_0000340 .

                           ?build a OBO:SO_0001505 ;
                               a ?genome ;
                               rdfs:label "{2}" ;
                               OBO:RO_0002351 ?chromOnBuild ;
                               rdfs:subClassOf ?genome .

                           ?chromOnBuild a ?chromosome ;
                               rdfs:label "{3}" ;
                               OBO:RO_0002350 ?build .
                       }}
                       """.format(genome_label, chromosome_label,
                                  build_label, chrom_build_label)
        '''
        query_template = """
                       SELECT ?genome ?chromosome ?build ?chromOnBuild
                       WHERE {{
                           ?genome a owl:Class ;
                               rdfs:label "{0}" ;
                               rdfs:subClassOf OBO:SO_0001026 .

                           ?chromosome a owl:Class ;
                               rdfs:label "{1}" ;
                               rdfs:subClassOf OBO:SO_0000340 .

                           ?build a OBO:SO_0001505 ;
                               a ?genome ;
                               rdfs:label "{2}" ;
                               OBO:RO_0002162 OBO:NCBITaxon_9606 ;
                               OBO:RO_0002351 ?chromOnBuild .

                           ?chromOnBuild a ?chromosome ;
                               a OBO:SO_0000340 ;
                               rdfs:label "{3}" ;
                               OBO:RO_0002350 ?build .
                       }}
                       """
        sparql_query = query_template.format(
            genome_label, chromosome_label, build_label, chrom_build_label)

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)
コード例 #23
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_chromosome_position_model(self):
        """
        Verify the model of genomic positions.

        Test data set 2 is loaded with add_variant_info_to_graph().
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Every column of the first fixture row
        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        chromosome_curie = ":MONARCH_hg19chr9"
        region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)
        start_id = ":_hg19chr9-{0}".format(genome_pos_start)
        end_id = ":_hg19chr9-{0}".format(genome_pos_end)

        # Expected row, in the order of the SELECT variables
        expected_row = [
            URIRef(curie_expander.get_uri(identifier))
            for identifier in (
                region_id, start_id, end_id, chromosome_curie)]

        query_template = """
                       SELECT ?region ?startPosition ?endPosition ?chromosome
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?startPosition ;
                               faldo:end ?endPosition .

                           ?startPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?chromosome .

                           ?endPosition a faldo:Position ;
                               faldo:position {1} ;
                               faldo:reference ?chromosome .
                       }}
                       """
        sparql_query = query_template.format(genome_pos_start, genome_pos_end)

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)
コード例 #24
0
ファイル: test_genotype.py プロジェクト: zzygyx9119/mckb
    def test_missense_variant_protein_model(self):
        """
        Verify the model of a missense variant with only protein information.

        Test data set 1 is loaded with add_variant_info_to_graph() and
        the graph is then expected to hold the following triples:

        CGD:VariantID is an instance of OBO:SO_0001059
        CGD:VariantID is an instance of OBO:SO_0001583
        CGD:VariantID has the label "CSF3R Q741X  missense mutation"
        CGD:VariantID is_sequence_variant_instance_of (OBO:GENO_0000408) NCBIGene:1441
        CGD:VariantID has location (faldo:location) CGD:RegionID
        CGD:VariantID OBO:GENO_reference_amino_acid "Q"
        CGD:VariantID OBO:GENO_results_in_amino_acid_change "X"
        CGD:VariantID RO:0002205 CCDS:413.1

        CCDS:413.1 is an instance of OBO:GENO_primary
        CCDS:413.1 has the label "CCDS413.1"
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Query wrapper around the populated graph and a curie expander;
        # the prefix bindings are loaded once both exist
        graph_tester = TestUtils(self.cgd.graph)
        curie_expander = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Leading eleven columns of the first fixture row
        first_row = self.test_set_1[0][0:11]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = first_row

        gene_id = self.cgd.gene_map[transcript_gene]
        ref_amino_acid = "Q"
        altered_amino_acid = "X"
        aa_position = 741
        uniprot_curie = "UniProtKB:Q99062#Q99062-1"

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
        transcript = "CCDS:413.1"
        region_id = ":_{0}{1}{2}Region".format(
            aa_position, aa_position, uniprot_curie)

        # Expand the identifiers to URIRefs in one pass
        variant_uri, transcript_uri, gene_uri, region_uri = [
            URIRef(curie_expander.get_uri(identifier))
            for identifier in (variant_id, transcript, gene_id, region_id)]

        query_template = """
                       SELECT ?variant ?gene ?region ?transcript
                       WHERE {{
                           ?variant a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               rdfs:label "{0}" ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?region ;
                               OBO:GENO_reference_amino_acid "{1}" ;
                               OBO:GENO_results_in_amino_acid_change "{2}" ;
                               RO:0002205 ?transcript .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{3}" .
                       }}
                       """
        sparql_query = query_template.format(
            variant_label, ref_amino_acid, altered_amino_acid, transcript_id)

        # Expected row, in the order of the SELECT variables
        expected_row = [variant_uri, gene_uri, region_uri, transcript_uri]

        # Run the query and compare against the single expected row
        sparql_output = graph_tester.query_graph(sparql_query)

        self.assertEqual([expected_row], sparql_output)