Exemple #1
0
    def setUp(self):
        """Create the test helper and one sample phenotype record.

        The record is a flat mapping from column name to string value and is
        stored on ``self.test_set_1``.
        """
        self.test_util = TestUtils()

        # Long free-text value, assembled from two adjacent literals.
        details = (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants'
        )
        self.test_set_1 = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': details,
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }
Exemple #2
0
 def setUp(self):
     """Create the test helper and an Orphanet source.

     The source's ``rawdir`` is redirected to the ``resources/orphanet``
     directory that sits next to this test module.
     """
     self.test_util = TestUtils()
     self.orphanet = Orphanet('rdf_graph', True)

     module_dir = os.path.dirname(__file__)
     self.orphanet.rawdir = os.path.join(module_dir, 'resources/orphanet')
 def setUp(self):
     """Create a GWASCatalog source with a fresh graph and one sample row.

     The sample row (``self.test_data``) maps field names to string values
     for a single SNP.
     """
     self.test_util = TestUtils()

     catalog = GWASCatalog('rdf_graph', True)
     catalog.graph = RDFGraph(True)  # start every test from a new graph
     catalog.graph.bind_all_namespaces()
     self.source = catalog

     self.test_data = {
         'snp_label': 'rs1491921-C',
         'chrom_num': '5',
         'chrom_pos': '21259029',
         'context': 'intergenic_variant',
         'allele_freq': '0.013',
         'trait': 'Diisocyanate-induced asthma',
         'trait_uri':
             'http://www.ebi.ac.uk/efo/EFO_0006995, http://www.ebi.ac.uk/efo/EFO_0003949',
         'pvalue': '0.0000007',
         'merged': '0',
         'snp_id_current': '1491921',
         'mapped_gene': 'LOC102723561 - GUSBP1',
         'snp_gene_nums': '',
         'upstream_gene_num': '107986179',
         'downstream_gene_num': '107986180',
         'init_sample_desc':
             '74 European ancestry cases, 824 European ancestry controls',
         'replicated_sample_desc': 'NA',
         'platform': 'Illumina [1556551]',
         'pubmed': '25918132',
     }
Exemple #4
0
 def setUp(self):
     """Create a GWASCatalog source with a fresh graph and one sample row.

     The sample row (``self.test_data``) carries several SNPs at once:
     the multi-valued fields are ``;``-separated strings.
     """
     self.test_util = TestUtils()

     catalog = GWASCatalog('rdf_graph', True)
     catalog.graph = RDFGraph(True)
     self.source = catalog

     self.test_data = {
         'snp_label': 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?',
         'chrom_num': '9;9;9;9',
         'chrom_pos': '36998996;37002118;37000690;36997420',
         'context':
             'intron_variant; intron_variant; intron_variant; intron_variant',
         'allele_freq': 'NR',
         'trait': 'Intelligence',
         'trait_uri': 'http://www.ebi.ac.uk/efo/EFO_0004337',
         'pvalue': '0.00000004',
         'merged': '0',
         'snp_id_current': '',
         'mapped_gene': 'PAX5; PAX5; PAX5; PAX5',
         'snp_gene_nums': '',
         'upstream_gene_num': '107986179',
         'downstream_gene_num': '107986180',
         'init_sample_desc':
             '656 European ancestry individuals from ADHD families',
         'replicated_sample_desc': 'NA',
         'platform': 'Illumina [795637]',
         'pubmed': '22449649',
     }
Exemple #5
0
 def setUp(self):
     """Create the test helper and one sample six-field record tuple."""
     self.test_util = TestUtils()
     self.test_set_1 = (
         'ENSBTAP00000013354',
         'R-BTA-3000480',
         'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
         'Scavenging by Class A Receptors',
         'IEA',
         'Bos taurus',
     )
Exemple #6
0
 def setUp(self):
     """Create a CTD source with a fresh graph and one sample input row.

     ``self.test_row`` is a list of ten string columns; unused columns are
     empty strings.
     """
     self.test_util = TestUtils()

     ctd_source = CTD('rdf_graph', True)
     ctd_source.graph = RDFGraph(True)
     self.source = ctd_source

     self.test_row = [
         'Nicotine',
         'D009538',
         '',
         'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
         'OMIM:188890',
         'therapeutic',
         '',
         '',
         '',
         '12345|56789',
     ]
Exemple #7
0
    def setUp(self):
        """Create a MyChem source preloaded with the first test record.

        Stands in for ``source.fetch()``: the JSON document at ``TESTDATA``
        is loaded into ``self.test_data`` and its first entry is appended to
        both ``drugbank_targets`` and ``drugcentral_interactors`` on the
        source.
        """
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)

        # Replaces source.fetch(). The context manager guarantees the file
        # handle is closed even when json.load() raises.
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)

        first_record = self.test_data[0]
        self.source.drugbank_targets.append(first_record)
        self.source.drugcentral_interactors.append(first_record)
Exemple #8
0
 def setUp(self):
     """Create an Orphanet source wired to the bundled test resources.

     Loads ``resources/test_terms.yaml`` through the source's own YAML
     loader and points the source's ``rawdir`` at ``resources/orphanet``;
     both paths are resolved relative to this test module.
     """
     self.test_util = TestUtils()
     self.orphanet = Orphanet('rdf_graph', True)

     module_dir = os.path.dirname(__file__)

     # Use a pinned term table so tests don't break when terms are updated.
     # NOTE(review): the table is stored on the test case, not on
     # self.orphanet -- confirm that is the intended override target.
     terms_path = os.path.join(module_dir, './resources/test_terms.yaml')
     self.globaltt = self.orphanet.open_and_parse_yaml(terms_path)

     self.orphanet.rawdir = os.path.join(module_dir, 'resources/orphanet')
Exemple #9
0
    def test_therapeutic_relationship(self):
        """Check that the expected chemical/disease association is in the graph.

        Loads the source's test graph from turtle, runs a SPARQL query for
        OBAN associations, and asserts that the row built from the expected
        association, disease, relation and chemical nodes is among the
        results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.models.Model import Model

        # Query helper bound to the source graph, populated from turtle.
        query_helper = TestUtils(self.source.graph)
        query_helper.load_testgraph_from_turtle(self.source)
        graph = self.source.graph
        model = Model(graph)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models
        # to be automatically generated?
        query = """
                       SELECT ?assoc ?disease ?rel ?chemical
                       WHERE {
                           ?assoc a OBAN:association ;
                           OBAN:association_has_object ?disease ;
                           OBAN:association_has_predicate ?rel ;
                           OBAN:association_has_subject ?chemical .}
                       """

        # Identifiers of the nodes the query is expected to bind.
        chem_id = 'MESH:D009538'
        disease_id = 'OMIM:188890'
        chem_node = graph._getNode(chem_id)
        disease_node = graph._getNode(disease_id)
        rel_id = model.object_properties['substance_that_treats']
        rel_node = graph._getNode(rel_id)
        # TODO: pubmed ('PMID:16785264') and evidence ('ECO:0000033') are
        # not checked by this test.

        association = G2PAssoc(
            graph, self.source.name, chem_id, disease_id, rel_id)
        assoc_node = self.source.graph._getNode(association.make_g2p_id())

        # One of the rows the query should return.
        wanted_row = [assoc_node, disease_node, rel_node, chem_node]

        rows = query_helper.query_graph(query)

        failure_message = ''.join((
            "did not find expected association: ",
            str(wanted_row),
            " found ",
            str(len(rows)),
            " others:\n",
            str(rows),
        ))
        self.assertTrue(wanted_row in rows, failure_message)

        logger.info("Test query data finished.")
Exemple #10
0
    def test_therapeutic_relationship(self):
        """Check that the expected chemical/disease association is in the graph.

        Loads the source's test graph from turtle, runs a SPARQL query for
        OBAN associations, and asserts that the row built from the expected
        association, disease, relation and chemical nodes is among the
        results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.models.Model import Model

        # Query helper bound to the source graph, populated from turtle.
        helper = TestUtils(self.source.graph)
        helper.load_testgraph_from_turtle(self.source)
        graph = self.source.graph
        model = Model(graph)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models
        # to be automatically generated?
        oban_query = """
                       SELECT ?assoc ?disease ?rel ?chemical
                       WHERE {
                           ?assoc a OBAN:association ;
                           OBAN:association_has_object ?disease ;
                           OBAN:association_has_predicate ?rel ;
                           OBAN:association_has_subject ?chemical .}
                       """

        # Identifiers of the nodes the query is expected to bind.
        chem_id = 'MESH:D009538'
        disease_id = 'OMIM:188890'
        chem_uri = graph._getNode(chem_id)
        disease_uri = graph._getNode(disease_id)
        rel_id = model.object_properties['substance_that_treats']
        rel_uri = graph._getNode(rel_id)
        # TODO: pubmed ('PMID:16785264') and evidence ('ECO:0000033') are
        # not checked by this test.

        g2p = G2PAssoc(graph, self.source.name, chem_id, disease_id, rel_id)
        g2p_id = g2p.make_g2p_id()
        assoc_uri = self.source.graph._getNode(g2p_id)

        # One of the rows the query should return.
        expected_row = [assoc_uri, disease_uri, rel_uri, chem_uri]

        found_rows = helper.query_graph(oban_query)

        message_parts = [
            "did not find expected association: ",
            str(expected_row),
            " found ",
            str(len(found_rows)),
            " others:\n",
            str(found_rows),
        ]
        self.assertTrue(expected_row in found_rows, ''.join(message_parts))

        logger.info("Test query data finished.")
Exemple #11
0
 def setUp(self):
     """Create an MGI source that reads its evidence file from test data.

     ``_process_evidence_view`` locates the evidence file through
     ``self.rawdir``, so the default is overridden here to point at the
     test resources. The file name there must match the one that method
     expects (``evidence_view``).
     """
     self.test_util = TestUtils()
     self.mgi = MGI('rdf_graph', True)

     module_dir = os.path.dirname(__file__)
     self.mgi.rawdir = os.path.join(module_dir, 'resources/mgi')

     # Pre-seed the annotation id lookup with the one id used by the test.
     self.mgi.idhash['annot']['6901981'] = ':association'
Exemple #12
0
 def setUp(self):
     """Create the test helper and one sample six-field record tuple."""
     self.test_util = TestUtils()

     protein_id = 'ENSBTAP00000013354'
     pathway_id = 'R-BTA-3000480'
     pathway_url = 'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480'
     pathway_label = 'Scavenging by Class A Receptors'
     self.test_set_1 = (
         protein_id, pathway_id, pathway_url, pathway_label,
         'IEA', 'Bos taurus')
Exemple #13
0
class EvidenceTestCase(unittest.TestCase):
    """Graph-equality test for the triples produced by the MGI evidence view."""

    def setUp(self):
        """Create an MGI source that reads its evidence file from test data.

        ``_process_evidence_view`` locates the evidence file through
        ``self.rawdir``, so the default is overridden here to point at the
        test resources. The file name there must match the one that method
        expects (``evidence_view``).
        """
        self.test_util = TestUtils()
        self.mgi = MGI('rdf_graph', True)

        module_dir = os.path.dirname(__file__)
        self.mgi.rawdir = os.path.join(module_dir, 'resources/mgi')

        # Pre-seed the annotation id lookup with the one id used by the test.
        self.mgi.idhash['annot']['6901981'] = ':association'

    def tearDown(self):
        """Drop the reference to the MGI source created in ``setUp``."""
        self.mgi = None

    def test_sex_specificity_model(self):
        """Process the evidence view and compare the graph to fixed triples."""
        self.mgi.graph = RDFGraph(True)  # Reset graph
        self.mgi._process_evidence_view(limit=None)

        produced_turtle = self.mgi.graph.serialize(
            format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", produced_turtle)

        expected_triples = """
        :association RO:0002558 ECO:0000006 ;
            dc:source J:74619 ;
            :has_sex_specificity PATO:0000384 .

        J:74619 a IAO:0000310 .
        """
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, self.mgi.graph)
        self.assertTrue(graphs_match)
Exemple #14
0
    def test_parse(self):
        """Run the ClinVar parser on each RCV and compare with reference turtle.

        For every entry in ``RCVS`` (one sub-test each) the parser is invoked
        with a mocked command line, its n-triples output is loaded into a
        graph, a dot rendering of that graph is written under ``DOT_PATH``,
        and the graph is compared with the reference turtle file for that
        RCV.
        """
        for rcv in RCVS:
            output_nt = rcv + '.nt'
            input_xml = rcv + '.xml.gz'
            reference_ttl = TTL_PATH + rcv + '.ttl'
            with self.subTest(rcv=rcv):

                mock_args = [
                    "test_clinvar.py", "--inputdir", XML_PATH, "--filename",
                    input_xml, "--mapfile", MAP_FILE, "--destination", NT_PATH,
                    "--output", output_nt
                ]

                # Scope the argv override to the parser call so that the
                # real sys.argv is restored afterwards; a bare
                # patch(...).start() would leave it patched for the rest
                # of the process.
                with patch('sys.argv', mock_args):
                    clinvar_parse()

                query_graph = RDFGraph()
                query_graph.bind_all_namespaces()
                query_graph.parse(NT_PATH + output_nt, format='nt')

                # NOTE(review): readlines() keeps each line's newline, so
                # joining with "\n" doubles the line breaks -- left as is;
                # confirm whether a plain read() was intended.
                with open(reference_ttl, 'r') as ref_fh:
                    ref_graph = "\n".join(ref_fh.readlines())

                # debug
                LOG.debug(
                    "Reference graph: %s",
                    query_graph.serialize(format="turtle").decode("utf-8"))

                # Convert output from ClinVar parse to dot then png
                dot_file_path = DOT_PATH + rcv + ".dot"
                with open(dot_file_path, 'w') as dot_file:
                    rdf2dot(query_graph, dot_file)

                self.assertTrue(
                    TestUtils.test_graph_equality(ref_graph, query_graph))
Exemple #15
0
    def setUp(self):
        """Create the test helper, fixed identifiers and one sample record.

        Sets the association and evidence-code CURIEs, a sample record tuple
        (``self.test_set_1``), placeholder blank-node CURIEs for study and
        evidence, and the association IRI expanded from its CURIE.
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        self.test_set_1 = (
            'MGI:1920145',
            'Setd5',
            'WTSI',
            'MEFW',
            'male',
            'heterozygote',
            'MGI:4432631',
            'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',
            'C57BL/6N',
            'MGP',
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',
            'MGP_001',
            'MGP_XRY_001',
            'X-ray',
            'IMPC_XRY_008_001',
            'Number of ribs right',
            'MP:0005390',
            'skeleton phenotype',
            'MP:0000480',
            'increased rib number',
            '1.637023E-010',
            '',
            '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction',
            'IMPC',
        )

        # Test curies; these are otherwise generated within _add_evidence()
        # and _add_study_provenance(). The blank nodes are hardcoded as
        # NOT Skolemized.
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI used when checking sparql output.
        curie_resolver = CurieUtil(curie_map.get())
        expanded = curie_resolver.get_uri(self.assoc_curie)
        self.assoc_iri = URIRef(expanded)
    def test_parse(self):
        """Run ``self.gwascatalog.parse()`` per variant and write a dot file.

        For every entry in ``VARIANTS`` (one sub-test each) the fixture is
        rebuilt, the source's ``rawdir`` is pointed at that variant's
        directory, the parsed graph is rendered to a dot file under
        ``DOT_PATH``, and the graph is compared against the variant's
        reference turtle path.

        This is less of a unit test and more a way of viewing the output of
        an entire run on a single variant; per the original note, dot files
        can be converted to images using scripts/dot-to-svg.sh.
        """
        for variant in VARIANTS:
            with self.subTest(variant_id=variant):
                # Rebuild the fixture first; self.gwascatalog is read only
                # after this call.
                self.tearDownAndSetUp()
                catalog = self.gwascatalog
                catalog.rawdir = RAW_PATH + '/' + variant
                catalog.parse()

                dot_file_path = DOT_PATH + variant + ".dot"
                with open(dot_file_path, 'w') as dot_file:
                    rdf2dot(self.gwascatalog.graph, dot_file)

                # debug
                serialized = self.gwascatalog.graph.serialize(
                    format="turtle").decode("utf-8")
                LOG.debug("Reference graph: %s", serialized)

                reference_ttl = TTL_PATH + variant + '.ttl'
                graphs_match = TestUtils.test_graph_equality(
                    reference_ttl, self.gwascatalog.graph)
                self.assertTrue(graphs_match)
Exemple #17
0
class TestMyChemParser(unittest.TestCase):
    """Graph-equality test for the triples produced by the MyChem source."""

    def setUp(self):
        """Create a MyChem source preloaded with the first test record.

        Stands in for ``source.fetch()``: the JSON document at ``TESTDATA``
        is loaded into ``self.test_data`` and its first entry is appended to
        both ``drugbank_targets`` and ``drugcentral_interactors`` on the
        source.
        """
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)

        # Replaces source.fetch(). The context manager guarantees the file
        # handle is closed even when json.load() raises.
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)

        first_record = self.test_data[0]
        self.source.drugbank_targets.append(first_record)
        self.source.drugcentral_interactors.append(first_record)

    def tearDown(self):
        """Drop the reference to the source created in ``setUp``."""
        self.source = None

    def test_parse(self):
        """Parse the preloaded record and compare the graph to fixed triples."""
        self.source.graph = RDFGraph(True)  # Reset graph
        # assertEqual reports the actual triple count on failure.
        self.assertEqual(len(list(self.source.graph)), 0)

        self.source.parse()

        triples = """
        UNII:46U771ERWK RO:0002606 SNOMED:386761002 ;
            rdfs:subClassOf CHEBI:23367 .

        SNOMED:386761002 rdfs:label "Local anesthesia" ;
            rdfs:subClassOf DOID:4 .
        """

        # dbg
        logger.debug(
            "Reference graph: %s",
            self.source.graph.serialize(format="turtle").decode("utf-8"))
        self.assertTrue(
            self.test_util.test_graph_equality(triples, self.source.graph))
Exemple #18
0
    def test_therapeutic_relationship(self):
        """Check that the expected annotation row is returned by the query.

        Loads the source's test graph from turtle, queries it for
        annotations with the fixed evidence and predicate shown in the
        query, and asserts that the row built from the expected association,
        pubmed, disease and chemical nodes is among the results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils
        from dipper import curie_map

        # Query helper bound to the source graph, populated from turtle.
        query_helper = TestUtils(self.source.graph)
        query_helper.load_testgraph_from_turtle(self.source)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models to be
        # automatically generated?
        annotation_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # Identifiers and nodes the query is expected to bind.
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_node = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_node = gu.getNode(disease_id)
        eco = 'ECO:0000033'
        rel_id = gu.object_properties['substance_that_treats']
        pubmed_id = 'PMID:16785264'
        pubmed_node = gu.getNode(pubmed_id)

        # consider replacing with make_ctd_chem_disease_assoc_id()
        assoc_id = self.source.make_association_id(
            'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_node = gu.getNode(assoc_id)

        # One of the rows the query should return.
        wanted_row = [assoc_node, pubmed_node, disease_node, chem_node]

        rows = query_helper.query_graph(annotation_query)

        failure_message = (
            "did not find expected association: " + assoc_id
            + " found: " + pprint.pformat(rows))
        self.assertTrue(wanted_row in rows, failure_message)

        logger.info("Test query data finished.")
    def test_therapeutic_relationship(self):
        """Check that the expected annotation row is returned by the query.

        Loads bindings on ``self.ctd``, queries its graph for annotations
        with the fixed evidence and predicate shown in the query, and
        asserts that the row built from the expected association, pubmed,
        disease and chemical nodes is among the results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils

        # Query helper bound to the CTD graph; bindings are loaded after it
        # is created.
        query_helper = TestUtils(self.ctd.graph)
        self.ctd.load_bindings()

        # Expected structure
        annotation_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # Identifiers and nodes the query is expected to bind.
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_node = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_node = gu.getNode(disease_id)
        pubmed_id = 'PMID:16785264'
        pubmed_node = gu.getNode(pubmed_id)
        rel_id = gu.object_properties['substance_that_treats']
        eco = 'ECO:0000033'

        # TODO PYLINT  make_association_id() does not exist in CTD
        # there is "_make_association()" with a different sig
        id_parts = ('ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_id = self.ctd.make_association_id(*id_parts)
        assoc_node = gu.getNode(assoc_id)

        # The row the query should return.
        wanted_row = [assoc_node, pubmed_node, disease_node, chem_node]

        rows = query_helper.query_graph(annotation_query)

        self.assertTrue(wanted_row in rows)

        logger.info("Test finished.")
Exemple #20
0
    def test_classes_indiv_properties(self):
        """Check the classes, individuals and properties created for the input.

        The query below expects, for the sample input:
        - a disease that is an owl:Class, a subclass of DOID:4, and carries
          the disease label;
        - a disease instance that is an individual of that disease class,
          carries the instance label, and has a BFO:0000159 value;
        - a drug that is an owl:Class, a subclass of CHEBI:23888, and
          carries the drug label;
        - the relationship URI typed as an owl:ObjectProperty;
        - a source typed as IAO:0000013 (journal article).

        Exactly one result row, built from the URIs stored on the test
        case, must come back.
        """
        from dipper.utils.TestUtils import TestUtils

        # Query helper bound to the CGD graph; bindings are loaded after it
        # is created.
        query_helper = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()

        # Doubled braces are literal braces once str.format() has run.
        query_template = """
                       SELECT ?disease ?diseaseInd ?diseaseQual ?drug ?source
                       WHERE {{
                           ?disease a owl:Class ;
                               rdfs:subClassOf DOID:4 ;
                               rdfs:label "{0}" .
                           ?diseaseInd a ?disease ;
                               rdfs:label "{1}" ;
                               BFO:0000159 ?diseaseQual .
                           ?drug a owl:Class ;
                               rdfs:subClassOf CHEBI:23888 ;
                               rdfs:label "{2}" .
                           <{3}> a owl:ObjectProperty .
                           ?source a IAO:0000013 .
                       }}
                       """
        query = query_template.format(
            self.disease_label,
            self.disease_instance_label,
            self.drug_label,
            self.relationship_uri)

        # The single row the query should return.
        wanted_row = [
            self.disease_uri,
            self.disease_ind_uri,
            self.disease_quality_uri,
            self.drug_uri,
            self.source_uri,
        ]
        wanted_rows = [wanted_row]

        rows = query_helper.query_graph(query)

        self.assertEqual(wanted_rows, rows)
Exemple #21
0
    def setUp(self):
        """Create the test helper and one sample association record.

        The record (``self.test_set_1``) is a nested mapping with evidence,
        object, relation and subject sub-mappings plus the raw
        tab-separated source line it was derived from.
        """
        self.test_util = TestUtils()

        evidence = {
            'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
            'type': 'IED',
            'with_support_from': [],
        }
        subject = {
            'fullname': 'endothelin receptor type A',
            'id': 'RGD:2535',
            'label': 'Ednra',
            'synonyms': [],
            'taxon': {'id': 'NCBITaxon:10116'},
            'type': 'gene',
        }
        # Raw input line, assembled from three adjacent literals.
        source_line = (
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n'
        )

        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': evidence,
            'negated': False,
            'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line': source_line,
            'subject': subject,
            'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
        }
Exemple #22
0
    def test_associations(self):
        """Check the variant/disease association created for the input.

        The query below expects a variant linked to a disease instance by
        OBO:RO_0002200, and an OBAN association that has:
        - an OBO:RO_0002558 evidence value,
        - a dc:source,
        - a drug attached through the relationship URI,
        - the disease instance as OBAN:association_has_object,
        - OBO:RO_0002200 as OBAN:association_has_object_property,
        - the variant as OBAN:association_has_subject.

        Exactly one result row, built from the URIs stored on the test case
        plus the expanded evidence CURIE, must come back.
        """
        from dipper.utils.TestUtils import TestUtils

        # CURIE expander and query helper; bindings are loaded after the
        # helper is created.
        curie_resolver = CurieUtil(self.curie_map)
        query_helper = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()
        evidence = 'OBO:ECO_0000033'
        evidence_uri = URIRef(curie_resolver.get_uri(evidence))

        # Doubled braces are literal braces once str.format() has run.
        query_template = """
                       SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence
                       WHERE {{
                           ?variant OBO:RO_0002200 ?diseaseInd .

                           ?vdannot a OBAN:association ;
                               OBO:RO_0002558 ?evidence ;
                               dc:source ?source ;
                               <{0}> ?drug ;
                               OBAN:association_has_object ?diseaseInd ;
                               OBAN:association_has_object_property OBO:RO_0002200 ;
                               OBAN:association_has_subject ?variant .
                       }}
                       """
        query = query_template.format(self.relationship_uri)

        # The single row the query should return.
        wanted_row = [
            self.disease_ind_uri,
            self.variant_uri,
            self.drug_uri,
            self.vd_annot_uri,
            self.source_uri,
            evidence_uri,
        ]
        wanted_rows = [wanted_row]

        rows = query_helper.query_graph(query)

        self.assertEqual(wanted_rows, rows)
Exemple #23
0
    def test_therapeutic_relationship(self):
        """Check that the expected annotation row is returned by the query.

        Queries the CTD graph for annotations with the fixed evidence and
        predicate shown in the query, and asserts that the row built from
        the expected association, pubmed, disease and chemical nodes is
        among the results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils

        # Query helper bound to the CTD graph.
        query_helper = TestUtils(self.ctd.graph)

        # Expected structure
        annotation_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # Identifiers and nodes the query is expected to bind.
        chem_id = 'MESH:D009538'
        chem_node = self.graph._getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_node = self.graph._getNode(disease_id)
        pubmed_id = 'PMID:16785264'
        pubmed_node = self.graph._getNode(pubmed_id)
        rel_id = self.model.object_properties['substance_that_treats']
        eco = 'ECO:0000033'

        # TODO PYLINT  make_association_id() does not exist in CTD
        # there is "_make_association()" with a different sig
        id_parts = ('ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_id = self.ctd.make_association_id(*id_parts)
        assoc_node = self.graph._getNode(assoc_id)

        # The row the query should return.
        wanted_row = [assoc_node, pubmed_node, disease_node, chem_node]

        rows = query_helper.query_graph(annotation_query)

        self.assertTrue(wanted_row in rows)

        logger.info("Test finished.")
Exemple #24
0
class SGDTestCase(unittest.TestCase):
    """Graph-equality test for the association built from one SGD record."""

    def setUp(self):
        """Create the test helper and one sample phenotype record."""
        self.test_util = TestUtils()

        # Long free-text value, assembled from two adjacent literals.
        details = (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants'
        )
        self.test_set_1 = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': details,
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }

    def tearDown(self):
        """Nothing to clean up."""

    def testSGDParser(self):
        """Build the association for the sample record and compare graphs."""
        parser = SGD('rdf_graph', True)
        parser.graph = RDFGraph(True)
        sample = self.test_set_1
        parser.make_association(sample)

        # The expected description is produced by the parser itself and
        # substituted into the turtle template below.
        description = parser._make_description(sample)

        template = """
        :MONARCH_ba748c98c0f167739128 a OBAN:association ;
            OBO:RO_0002558 OBO:APO_0000020 ;
            dc:description "{0}";
            dc:source PMID:21715656 ;
            OBAN:association_has_object MONARCH:APO_0000309APO_0000245 ;
            OBAN:association_has_predicate OBO:RO_0002200 ;
            OBAN:association_has_subject SGD:S000007268 .
            
        SGD:S000007268 rdfs:label "ATP6" ;
        RO:0002200 MONARCH:APO_0000309APO_0000245 .

        APO:0000020 rdfs:label "classical genetics" .

        PMID:21715656 a OBO:IAO_0000311 ;
        owl:sameAs SGD_REF:S000145858 .

        MONARCH:APO_0000309APO_0000245 rdfs:label "respiratory growth:decreased rate" ;
        rdfs:subClassOf UPHENO:0001001 .

        """
        expected_triples = template.format(description)

        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, parser.graph)
        self.assertTrue(graphs_match)
Exemple #25
0
class SGDTestCase(unittest.TestCase):
    """Graph-equality test for the association built from one SGD record."""

    def setUp(self):
        """Create the test helper and one sample phenotype record."""
        self.test_util = TestUtils()

        # Long free-text value, assembled from two adjacent literals.
        details = (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants'
        )
        self.test_set_1 = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': details,
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }

    def tearDown(self):
        """Nothing to clean up."""

    def testSGDParser(self):
        """Build the association for the sample record and compare graphs."""
        parser = SGD('rdf_graph', True)
        parser.graph = RDFGraph(True)
        sample = self.test_set_1
        parser.make_association(sample)

        # The expected description is produced by the parser itself and
        # substituted into the turtle template below.
        description = parser._make_description(sample)

        template = """
        :MONARCH_95158d413dd73476 a OBAN:association ;
            OBO:RO_0002558 OBO:APO_0000020 ;
            dc:description "{0}";
            dc:source PMID:21715656 ;
            OBAN:association_has_object MONARCH:OBO_APO_0000309OBO_APO_0000245 ;
            OBAN:association_has_predicate OBO:RO_0002200 ;
            OBAN:association_has_subject SGD:S000007268 .
            
        SGD:S000007268 rdfs:label "ATP6" ;
        RO:0002200 MONARCH:OBO_APO_0000309OBO_APO_0000245 .

        APO:0000020 rdfs:label "classical genetics" .

        PMID:21715656 a OBO:IAO_0000311 ;
        owl:sameAs SGD_REF:S000145858 .

        MONARCH:OBO_APO_0000309OBO_APO_0000245 rdfs:label "respiratory growth:decreased rate" ;
        rdfs:subClassOf UPHENO:0001001 .

        """
        expected_triples = template.format(description)

        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, parser.graph)
        self.assertTrue(graphs_match)
Exemple #26
0
class RGDTestCase(unittest.TestCase):
    """Run RGD.make_association() on a single hand-built annotation record."""

    def setUp(self):
        """Create the graph-comparison helper and the annotation fixture."""
        self.test_util = TestUtils()

        evidence = {
            'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
            'type': 'IED',
            'with_support_from': [],
        }
        subject = {
            'fullname': 'endothelin receptor type A',
            'id': 'RGD:2535',
            'label': 'Ednra',
            'synonyms': [],
            'taxon': {'id': 'NCBITaxon:10116'},
            'type': 'gene',
        }
        # The tab-separated line is stored in three adjacent literals that
        # Python concatenates into one string.
        source_line = (
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n'
        )

        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': evidence,
            'negated': False,
            'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line': source_line,
            'subject': subject,
            'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
        }

    def tearDown(self):
        """No cleanup is performed after a test."""
        return None

    def testRGDParser(self):
        """
        Start from an empty graph, add the fixture association, and compare
        the graph with the expected Turtle.
        """
        source = RGD('rdf_graph', True)
        source.graph = RDFGraph(True)

        initial_triples = list(source.graph)
        self.assertTrue(len(initial_triples) == 0)

        source.make_association(record=self.test_set_1)
        expected = """
    :MONARCH_b4650e8c3d865f11a1a5 a OBAN:association ;
        RO:0002558 ECO:0005611 ;
        dc:source RGDRef:1581841 ;
        OBAN:association_has_object OBO:MP_0003340 ;
        OBAN:association_has_predicate OBO:RO_0002200 ;
        OBAN:association_has_subject RGD:2535 ;
        pav:createdOn "2006-10-26" .
    
    RGD:2535 OBO:RO_0002200 MP:0003340 .
        RGDRef:1581841 a IAO:0000311 ;
        owl:sameAs PMID:12799311 .
        """
        # dbg: log the graph that was actually produced
        produced_ttl = source.graph.serialize(format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", produced_ttl)

        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)
Exemple #27
0
    def setUp(self):
        """
        Create the MyChem source and seed it with the first record of the
        JSON test file instead of fetching data.

        The test file is read inside a ``with`` block so the handle is
        closed even when ``json.load`` raises.
        """
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)

        # Replaces source.fetch()
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)
        self.source.drugbank_targets.append(self.test_data[0])
        self.source.drugcentral_interactors.append(self.test_data[0])
Exemple #28
0
    def setUp(self):
        """
        Build two single-row interaction fixtures, the matching column
        names, and the protein/gene map returned by Ensembl for '9606'.
        """
        self.test_util = TestUtils()

        # Both rows carry the same eight numeric values after the two ids.
        numeric_columns = [0, 0, 0, 0, 300, 0, 150, 800]

        # Test set with two proteins from same species
        same_species_pair = ['9606.ENSP00000000233', '9606.ENSP00000003084']
        self.test_set_1 = [same_species_pair + numeric_columns]

        # Test set with deprecated protein id
        deprecated_pair = ['9606.ENSP00000000233', '9606.ENSP00000006101']
        self.test_set_2 = [deprecated_pair + numeric_columns]

        self.columns = [
            'protein1',
            'protein2',
            'neighborhood',
            'fusion',
            'cooccurence',
            'coexpression',
            'experimental',
            'database',
            'textmining',
            'combined_score',
        ]

        ensembl_source = Ensembl('rdf_graph', True)
        self.protein_list = ensembl_source.fetch_protein_gene_map('9606')
Exemple #29
0
class CTDTestCase(unittest.TestCase):
    """Check the triples CTD._process_interactions() emits for one row."""

    def setUp(self):
        """Create a CTD source with a new graph and a ten-field input row."""
        self.test_util = TestUtils()
        self.source = CTD('rdf_graph', True)
        self.source.graph = RDFGraph(True)
        self.test_row = [
            'Nicotine', 'D009538', '',
            'TOBACCO ADDICTION, SUSCEPTIBILITY TO', 'OMIM:188890',
            'therapeutic', '', '', '',
            '12345|56789',
        ]

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None

    def test_therapeutic_relationship(self):
        """
        Process the fixture row and compare the resulting graph with the
        expected Turtle.
        """
        # test that graph is empty
        starting_triples = list(self.source.graph)
        self.assertTrue(len(starting_triples) == 0)

        self.source._process_interactions(self.test_row)

        expected = """
            :MONARCH_b6c289df47cb72653f79 a OBAN:association ;
                RO:0002558 ECO:0000033 ;
                dcterms:source PMID:12345, PMID:56789 ;
                OBAN:association_has_object OMIM:188890 ;
                OBAN:association_has_predicate RO:0002606 ;
                OBAN:association_has_subject MESH:D009538 .

            MESH:D009538 a owl:Class ;
                rdfs:label "Nicotine" ;
                biolink:category biolink:ChemicalSubstance ;
                RO:0002606 OMIM:188890 .

            PMID:12345 a IAO:0000013 .

            PMID:56789 a IAO:0000013 .

            OMIM:188890 a owl:Class ;
                biolink:category biolink:DiseaseOrPhenotypicFeature .
        """
        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(
            expected, self.source.graph)
        self.assertTrue(graphs_match)
Exemple #30
0
 def setUp(self):
     """
     Create an MGI source whose rawdir points at the bundled test files.

     The default rawdir is overridden with the 'resources/mgi' directory
     next to this file, because _process_evidence_view uses self.rawdir
     to find the evidence file. The file name there must match the one
     used in that method - evidence_view.
     """
     test_resources = os.path.join(os.path.dirname(__file__), 'resources/mgi')

     self.test_util = TestUtils()
     self.mgi = MGI('rdf_graph', True)
     self.mgi.rawdir = test_resources
     self.mgi.idhash['annot']['6901981'] = ':association'
Exemple #31
0
class CTDTestCase(unittest.TestCase):
    """Check the triples CTD._process_interactions() emits for one row."""

    def setUp(self):
        """Create a CTD source with a new graph and a ten-field input row."""
        self.test_util = TestUtils()
        self.source = CTD('rdf_graph', True)
        self.source.graph = RDFGraph(True)
        self.test_row = [
            'Nicotine', 'D009538', '',
            'TOBACCO ADDICTION, SUSCEPTIBILITY TO', 'OMIM:188890',
            'therapeutic', '', '', '',
            '12345|56789',
        ]

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None

    def test_therapeutic_relationship(self):
        """
        Process the fixture row and compare the resulting graph with the
        expected Turtle.
        """
        # test that graph is empty
        starting_triples = list(self.source.graph)
        self.assertTrue(len(starting_triples) == 0)

        self.source._process_interactions(self.test_row)

        expected = """
            :MONARCH_b6c289df47cb72653f79 a OBAN:association ;
                RO:0002558 ECO:0000033 ;
                dc:source PMID:12345, PMID:56789 ;
                OBAN:association_has_object OMIM:188890 ;
                OBAN:association_has_predicate RO:0002606 ;
                OBAN:association_has_subject MESH:D009538 .
            
            MESH:D009538 a owl:Class ;
                rdfs:label "Nicotine" ;
                RO:0002606 OMIM:188890 .
                
            PMID:12345 a IAO:0000013 .
            PMID:56789 a IAO:0000013 .
            
            OMIM:188890 a owl:Class .
        """
        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(
            expected, self.source.graph)
        self.assertTrue(graphs_match)
Exemple #32
0
class ReactomeTestCase(unittest.TestCase):
    """Check Reactome._add_component_pathway_association() on one record."""

    def setUp(self):
        """Create the comparison helper, one record, and an evidence map."""
        self.test_util = TestUtils()
        self.test_set_1 = (
            'ENSBTAP00000013354',
            'R-BTA-3000480',
            'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
            'Scavenging by Class A Receptors',
            'IEA',
            'Bos taurus',
        )
        self.gaf_eco = {"IEA": "ECO:0000501"}

    def tearDown(self):
        """No cleanup is performed after a test."""
        return None

    def testEnsemblReactomeParser(self):
        """
        Add one gene-to-pathway association to an empty graph and compare
        the graph with the expected Turtle.
        """
        source = Reactome('rdf_graph', True)
        source.graph = RDFGraph(True)
        self.assertTrue(len(list(source.graph)) == 0)
        # reactome.parse_gaf_eco('gaf-eco-mapping')

        # The pathway IRI and species name are unpacked but not used below.
        (gene, pathway_id, _pathway_iri, pathway_label, go_ecode,
         _species_name) = self.test_set_1

        gene_curie = 'ENSEMBL:' + gene
        pathway_curie = 'REACT:' + pathway_id
        eco_id = self.gaf_eco[go_ecode]
        source._add_component_pathway_association(
            gene_curie, pathway_curie, pathway_label, eco_id)

        expected = """
        ENSEMBL:ENSBTAP00000013354 RO:0002331 REACT:R-BTA-3000480 .

        :MONARCH_b582c188b7ec20016206 a OBAN:association ;
            OBO:RO_0002558 ECO:0000501 ;
            OBAN:association_has_object REACT:R-BTA-3000480 ;
            OBAN:association_has_predicate RO:0002331 ;
            OBAN:association_has_subject ENSEMBL:ENSBTAP00000013354 .

        REACT:R-BTA-3000480 a owl:Class ;
            rdfs:label "Scavenging by Class A Receptors" ;
            rdfs:subClassOf GO:0009987,
                PW:0000001 .
        """
        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)
Exemple #33
0
    def test_gene_xref(self):
        """
        Run FlyBase._process_gene_xref() once per entry in ALLELES, each in
        its own subTest, and compare the graph with that allele's
        reference gene_xref.ttl file.
        """
        for allele in ALLELES:
            with self.subTest(allele_id=allele):
                self.tearDownAndSetUp()
                self.flybase.rawdir = '/'.join((RAW_PATH, allele))
                self.flybase._process_gene_xref(limit=None)

                produced_ttl = self.flybase.graph.serialize(
                    format="turtle").decode("utf-8")
                LOG.debug("Reference graph: %s", produced_ttl)

                reference_ttl = TTL_PATH + allele + '/gene_xref.ttl'
                graphs_match = TestUtils.test_graph_equality(
                    reference_ttl, self.flybase.graph)
                self.assertTrue(graphs_match)
Exemple #34
0
 def setUp(self):
     """Create a CTD source with a new graph and a ten-field input row."""
     self.test_util = TestUtils()
     self.source = CTD('rdf_graph', True)
     self.source.graph = RDFGraph(True)
     self.test_row = [
         'Nicotine', 'D009538', '',
         'TOBACCO ADDICTION, SUSCEPTIBILITY TO', 'OMIM:188890',
         'therapeutic', '', '', '',
         '12345|56789',
     ]
Exemple #35
0
    def setUp(self):
        """Create the graph-comparison helper and one phenotype record."""
        self.test_util = TestUtils()

        # Two adjacent literals that Python concatenates into one string.
        details = (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants'
        )
        self.test_set_1 = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': details,
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }
Exemple #36
0
    def setUp(self):
        """
        Build two single-row interaction fixtures, the matching column
        names, and the protein/gene map returned by Ensembl for '9606'.
        """
        self.test_util = TestUtils()

        first_protein = '9606.ENSP00000000233'
        # Both rows carry the same eight numeric values after the two ids.
        numeric_columns = (0, 0, 0, 0, 300, 0, 150, 800)

        # Test set with two proteins from same species
        self.test_set_1 = [
            [first_protein, '9606.ENSP00000003084', *numeric_columns]]

        # Test set with deprecated protein id
        self.test_set_2 = [
            [first_protein, '9606.ENSP00000006101', *numeric_columns]]

        self.columns = [
            'protein1', 'protein2',
            'neighborhood', 'fusion', 'cooccurence', 'coexpression',
            'experimental', 'database', 'textmining',
            'combined_score',
        ]

        ensembl_source = Ensembl('rdf_graph', True)
        self.protein_list = ensembl_source.fetch_protein_gene_map('9606')
Exemple #37
0
class ReactomeTestCase(unittest.TestCase):
    """Check Reactome._add_component_pathway_association() on one record."""

    def setUp(self):
        """Create the comparison helper and a single six-field record."""
        self.test_util = TestUtils()
        self.test_set_1 = (
            'ENSBTAP00000013354',
            'R-BTA-3000480',
            'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
            'Scavenging by Class A Receptors',
            'IEA',
            'Bos taurus',
        )

    def tearDown(self):
        """No cleanup is performed after a test."""
        return None

    def testEnsemblReactomeParser(self):
        """
        Add one gene-to-pathway association to an empty graph, using the
        evidence-code map obtained from Reactome.get_eco_map(), and compare
        the graph with the expected Turtle.
        """
        source = Reactome('rdf_graph', True)
        source.graph = RDFGraph(True)
        self.assertTrue(len(list(source.graph)) == 0)

        eco_map = Reactome.get_eco_map(Reactome.map_files['eco_map'])

        # The pathway IRI and species name are unpacked but not used below.
        (gene, pathway_id, _pathway_iri, pathway_label,
         go_ecode, _species_name) = self.test_set_1
        source._add_component_pathway_association(
            eco_map,
            gene, 'ENSEMBL',
            pathway_id, 'REACT',
            pathway_label,
            go_ecode)

        expected = """
        ENSEMBL:ENSBTAP00000013354 RO:0002331 REACT:R-BTA-3000480 .
        
        :MONARCH_b582c188b7ec20016206 a OBAN:association ;
            OBO:RO_0002558 ECO:0000501 ;
            OBAN:association_has_object REACT:R-BTA-3000480 ;
            OBAN:association_has_predicate RO:0002331 ;
            OBAN:association_has_subject ENSEMBL:ENSBTAP00000013354 .

        REACT:R-BTA-3000480 a owl:Class ;
            rdfs:label "Scavenging by Class A Receptors" ;
            rdfs:subClassOf GO:0009987,
                PW:0000001 .
        """
        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)
Exemple #38
0
class TestMyChemParser(unittest.TestCase):
    """Parse one MyChem record loaded from the JSON test file."""

    def setUp(self):
        """
        Create the MyChem source and seed it with the first record of the
        JSON test file instead of fetching data.

        The test file is read inside a ``with`` block so the handle is
        closed even when ``json.load`` raises.
        """
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)

        # Replaces source.fetch()
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)
        self.source.drugbank_targets.append(self.test_data[0])
        self.source.drugcentral_interactors.append(self.test_data[0])

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None

    def test_parse(self):
        """
        Run parse() on a new, empty graph and compare the result with the
        expected Turtle.
        """
        self.source.graph = RDFGraph(True)  # Reset graph
        self.assertTrue(len(list(self.source.graph)) == 0)

        self.source.parse()

        triples = """
        UNII:46U771ERWK RO:0002606 SNOMED:386761002 ;
            rdfs:subClassOf CHEBI:23367 .

        SNOMED:386761002 rdfs:label "Local anesthesia" ;
            rdfs:subClassOf DOID:4 .
        """

        # dbg: log the graph that was actually produced
        logger.debug(
            "Reference graph: %s",
            self.source.graph.serialize(format="turtle").decode("utf-8"))
        self.assertTrue(self.test_util.test_graph_equality(
            triples, self.source.graph))
Exemple #39
0
class EvidenceTestCase(unittest.TestCase):
    """Check the triples MGI._process_evidence_view() emits for test data."""

    def setUp(self):
        """
        Create an MGI source whose rawdir points at the bundled test files.

        The default rawdir is overridden with the 'resources/mgi' directory
        next to this file, because _process_evidence_view uses self.rawdir
        to find the evidence file. The file name there must match the one
        used in that method - evidence_view.
        """
        here = os.path.dirname(__file__)

        self.test_util = TestUtils()
        self.mgi = MGI('rdf_graph', True)
        self.mgi.rawdir = os.path.join(here, 'resources/mgi')
        self.mgi.idhash['annot']['6901981'] = ':association'

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.mgi = None

    def test_sex_specificity_model(self):
        """
        Process the evidence view into a new graph and compare it with the
        expected Turtle.
        """
        self.mgi.graph = RDFGraph(True)  # Reset graph

        self.mgi._process_evidence_view(limit=None)

        produced_ttl = self.mgi.graph.serialize(
            format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", produced_ttl)

        expected_triples = """
        :association RO:0002558 ECO:0000006 ;
            dc:source J:74619 ;
            :has_sex_specificity PATO:0000384 .

        J:74619 a IAO:0000310 .
        """
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, self.mgi.graph)
        self.assertTrue(graphs_match)
Exemple #40
0
    def setUp(self):
        """Create the graph-comparison helper and the annotation fixture."""
        self.test_util = TestUtils()

        evidence = {
            'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
            'type': 'IED',
            'with_support_from': [],
        }
        subject = {
            'fullname': 'endothelin receptor type A',
            'id': 'RGD:2535',
            'label': 'Ednra',
            'synonyms': [],
            'taxon': {'id': 'NCBITaxon:10116'},
            'type': 'gene',
        }
        # The tab-separated line is stored in three adjacent literals that
        # Python concatenates into one string.
        source_line = (
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n'
        )

        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': evidence,
            'negated': False,
            'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line': source_line,
            'subject': subject,
            'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
        }
Exemple #41
0
 def setUp(self):
     """Create a GWASCatalog source with a new graph and one data record."""
     self.test_util = TestUtils()
     self.source = GWASCatalog('rdf_graph', True)
     self.source.graph = RDFGraph(True)

     record = {}
     record['snp_label'] = 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?'
     record['chrom_num'] = '9;9;9;9'
     record['chrom_pos'] = '36998996;37002118;37000690;36997420'
     record['context'] = (
         'intron_variant; intron_variant; intron_variant; intron_variant')
     record['allele_freq'] = 'NR'
     record['trait'] = 'Intelligence'
     record['trait_uri'] = 'http://www.ebi.ac.uk/efo/EFO_0004337'
     record['pvalue'] = '0.00000004'
     record['merged'] = '0'
     record['snp_id_current'] = ''
     record['mapped_gene'] = 'PAX5; PAX5; PAX5; PAX5'
     record['snp_gene_nums'] = ''
     record['upstream_gene_num'] = '107986179'
     record['downstream_gene_num'] = '107986180'
     record['init_sample_desc'] = (
         '656 European ancestry individuals from ADHD families')
     record['replicated_sample_desc'] = 'NA'
     record['platform'] = 'Illumina [795637]'
     record['pubmed'] = '22449649'
     self.test_data = record
Exemple #42
0
def main():
    """
    Command-line entry point for Dipper.

    Parses the command-line options and then does one of two things:

    * With ``--query``: for every requested source, checks the query syntax
      and loads the source through ``TestUtils``, prints the query result
      and exits with status 0.
    * Otherwise, for every requested source in turn: fetches it (unless
      ``--parse_only``), runs its test suite (unless ``--no_verify`` or
      ``--skip_tests``), and, unless ``--test_only`` or ``--fetch_only``
      is set, parses it; for the ``rdf_graph`` graph type it then adds
      property axioms and writes the graph in the requested format.

    Exits with status 1 when ``--dest_fmt`` is not a supported format.
    A source name that is not in the source map raises ``KeyError``.
    """
    # Local import: only needed here, for explicit process exit codes.
    import sys

    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',   # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                   # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha', # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfinslim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD',
        'mychem': 'MyChem'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int,
        help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify',
        help='ignore the verification step', action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet',
        help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)

    # TODO this should live in a global data file
    #   and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
        ' comma delimited\n'
        'Implemented taxa per source\n'
        'NCBIGene: 9606,10090,7955\n'
        'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
        'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
        'UCSCBands: 9606\n'
        'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)

    parser.add_argument(
        '--version', '-v',
        help='version of source',
        type=str)

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # Names of the source classes that accept a tax_ids constraint
    # (these are class names, not taxa).
    taxa_supported = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl',
        'ntriples', 'nt',
        'nquads', 'nq',
        'rdfxml', 'xml',
        'notation3', 'n3',
        'raw']

    # Accepted spellings that are rewritten to the name passed to write().
    format_aliases = {
        'ttl': 'turtle',
        'ntriples': 'nt',
        'nq': 'nquads',
        'xml': 'rdfxml',
        'notation3': 'n3',
    }

    # --quiet takes precedence over --debug.
    if args.quiet:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    run_tests = not (args.no_verify or args.skip_tests)

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # The map holds class *names*; the class itself is looked up on
            # the imported module below (calling the name would fail).
            class_name = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(class_name)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, class_name)

            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        sys.exit(0)

    # run initial tests
    if run_tests:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in formats_supported:
        args.dest_fmt = format_aliases.get(args.dest_fmt, args.dest_fmt)
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # An invalid option is an error, so report failure to the caller.
        sys.exit(1)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)

        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)
        if not args.parse_only:
            # time.clock() was removed in Python 3.8; use perf_counter().
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if run_tests:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse-start_parse)
            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info("Property axioms added: %d sec",
                            end_axiom_exp-start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                end_write = time.perf_counter()
                logger.info("Writing time: %d sec", end_write-start_write)
        # if args.no_verify is not True:

        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error(
        #            'Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)
    # load configuration parameters
    # for example, keys

    logger.info("All done.")
Exemple #43
0
class TestGwasHaplotypeModel(unittest.TestCase):
    """
    Check the triples GWASCatalog._process_haplotype() emits for one
    sample GWAS catalog record whose SNP label lists several SNPs.
    """

    def setUp(self):
        """Create a GWASCatalog source with a new graph and one data record."""
        self.test_util = TestUtils()
        self.source = GWASCatalog('rdf_graph', True)
        self.source.graph = RDFGraph(True)

        record = {}
        record['snp_label'] = 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?'
        record['chrom_num'] = '9;9;9;9'
        record['chrom_pos'] = '36998996;37002118;37000690;36997420'
        record['context'] = (
            'intron_variant; intron_variant; intron_variant; intron_variant')
        record['allele_freq'] = 'NR'
        record['trait'] = 'Intelligence'
        record['trait_uri'] = 'http://www.ebi.ac.uk/efo/EFO_0004337'
        record['pvalue'] = '0.00000004'
        record['merged'] = '0'
        record['snp_id_current'] = ''
        record['mapped_gene'] = 'PAX5; PAX5; PAX5; PAX5'
        record['snp_gene_nums'] = ''
        record['upstream_gene_num'] = '107986179'
        record['downstream_gene_num'] = '107986180'
        record['init_sample_desc'] = (
            '656 European ancestry individuals from ADHD families')
        record['replicated_sample_desc'] = 'NA'
        record['platform'] = 'Illumina [795637]'
        record['pubmed'] = '22449649'
        self.test_data = record

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None

    def test_snp_model(self):
        """
        Test output model of _process_haplotype().

        The call under test receives, in order: the variant curie, the SNP
        label, chromosome numbers, chromosome positions, context, allele
        frequency, mapped gene, and an SO ontology graph parsed from the
        URL in ``self.source.files['so']['url']``.
        """
        record = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        # Only the curie is used; the returned type is discarded.
        variant_curie, _variant_type = (
            self.source._get_curie_and_type_from_id(record['snp_label']))

        so_ontology = RDFGraph()
        LOG.info("Loading SO ontology in separate rdf graph")
        so_url = self.source.files['so']['url']
        so_ontology.parse(so_url, format='xml')
        so_ontology.bind_all_namespaces()
        LOG.info("Finished loading SO ontology")

        self.source._process_haplotype(
            variant_curie,
            record['snp_label'],
            record['chrom_num'],
            record['chrom_pos'],
            record['context'],
            record['allele_freq'],
            record['mapped_gene'],
            so_ontology)

        expected = """
:haplotype_bb627b1f64039b0f751a a OBO:GENO_0000871 ;
    rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
    OBO:GENO_0000382 dbSNP:rs1329573,
        dbSNP:rs3758171,
        dbSNP:rs3824344,
        dbSNP:rs7020413 ;
    OBO:SO_0001627 HGNC:8619 ;
    OBO:RO_0002162 OBO:NCBITaxon_9606 .

dbSNP:rs1329573 a OBO:SO_0000694,
        SO:0001627 ;
    rdfs:label "rs1329573-?" ;
    faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
    OBO:SO_0001627 HGNC:8619 ;
    OBO:RO_0002162 OBO:NCBITaxon_9606 .

dbSNP:rs3758171 a OBO:SO_0000694,
        OBO:SO_0001627 ;
    rdfs:label "rs3758171-?" ;
    faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
    OBO:SO_0001627 HGNC:8619 ;
    OBO:RO_0002162 OBO:NCBITaxon_9606 .

dbSNP:rs3824344 a OBO:SO_0000694,
        OBO:SO_0001627 ;
    rdfs:label "rs3824344-?" ;
    faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
    OBO:SO_0001627 HGNC:8619 ;
    OBO:RO_0002162 OBO:NCBITaxon_9606 .

dbSNP:rs7020413 a OBO:SO_0000694,
        OBO:SO_0001627 ;
    rdfs:label "rs7020413-?" ;
    faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
    OBO:SO_0001627 HGNC:8619 ;
    OBO:RO_0002162 OBO:NCBITaxon_9606 .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
    faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
    faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
    faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
    faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
    faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
    faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
    faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
    faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
    faldo:position 36997420 ;
    faldo:reference OBO:CHR_GRCh38chr9 .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
    faldo:position 36998996 ;
    faldo:reference OBO:CHR_GRCh38chr9 .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
    faldo:position 37000690 ;
    faldo:reference OBO:CHR_GRCh38chr9 .

<https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
    faldo:position 37002118 ;
    faldo:reference OBO:CHR_GRCh38chr9 .
        """

        # dbg
        # LOG.debug(
        #    "Reference graph: %s",
        #   self.source.graph.serialize(format="turtle").decode("utf-8"))

        # NOTE (original author): the comparison does not seem to
        # acknowledge these constant triples.
        graphs_match = self.test_util.test_graph_equality(
            expected, self.source.graph)
        self.assertTrue(graphs_match)
Exemple #44
0
class RGDTestCase(unittest.TestCase):
    """
    Functional test for the RGD source: one annotation record is passed
    through make_association() and the resulting graph is compared with
    a hand-written reference.
    """

    def setUp(self):
        """Create the graph-comparison helper and a single test record."""
        self.test_util = TestUtils()
        # Handed to RGD.make_association() as ``record`` in testRGDParser.
        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': {
                'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
                'type': 'IED',
                'with_support_from': [],
            },
            'negated': False,
            'object': {
                'id': 'MP:0003340',
                'taxon': 'NCBITaxon:10116',
            },
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {
                'id': None,
            },
            # presumably the raw line the other fields were parsed from
            'source_line': (
                'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
                'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
                '20061026\tRGD\t\t\n'
            ),
            'subject': {
                'fullname': 'endothelin receptor type A',
                'id': 'RGD:2535',
                'label': 'Ednra',
                'synonyms': [],
                'taxon': {
                    'id': 'NCBITaxon:10116',
                },
                'type': 'gene',
            },
            'subject_extensions': [{
                'filler': '\n',
                'property': 'isoform',
            }],
        }

    def tearDown(self):
        """Nothing to release; the test builds its own graph."""

    def testRGDParser(self):
        """
        Feed test_set_1 to make_association() on an empty graph and
        check the produced triples against the expected turtle.
        """
        rgd = RGD('rdf_graph', True)
        rgd.graph = RDFGraph(True)

        # assertEqual reports the actual triple count if this ever fails
        self.assertEqual(len(list(rgd.graph)), 0)

        rgd.make_association(record=self.test_set_1)
        triples = """
    :MONARCH_b4650e8c3d865f11a1a5 a OBAN:association ;
        RO:0002558 ECO:0005611 ;
        dcterms:source RGDRef:1581841 ;
        OBAN:association_has_object OBO:MP_0003340 ;
        OBAN:association_has_predicate OBO:RO_0002200 ;
        OBAN:association_has_subject RGD:2535 ;
        pav:createdOn "2006-10-26" .

    RGD:2535 OBO:RO_0002200 MP:0003340 .
        RGDRef:1581841 a IAO:0000311 ;
        owl:sameAs PMID:12799311 .
        """
        # dbg: dump the graph that was actually generated
        logger.debug("Reference graph: %s",
                     rgd.graph.serialize(format="turtle").decode("utf-8"))
        self.assertTrue(self.test_util.test_graph_equality(triples, rgd.graph))
Exemple #45
0
class EvidenceProvenanceTestCase(unittest.TestCase):
    """
    Functional tests for the evidence, study-provenance and
    assertion-provenance triples written by the IMPC source.
    """

    def setUp(self):
        """Prepare the comparison helper, one data row and test curies."""
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One row of data; each comment gives the 1-based column number
        # and the column header, the tests below slice it by position.
        self.test_set_1 = (
            'MGI:1920145',              # 01 marker_accession_id
            'Setd5',                    # 02 marker_symbol
            'WTSI',                     # 03 phenotyping_center
            'MEFW',                     # 04 colony_raw
            'male',                     # 05 sex
            'heterozygote',             # 06 zygosity
            'MGI:4432631',              # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08 allele_symbol
            'targeted mutation 1a, Wellcome Trust Sanger Institute',  # 09 allele_name
            'MGI:2159965',              # 10 strain_accession_id
            'C57BL/6N',                 # 11 strain_name
            'MGP',                      # 12 project_name
            'Wellcome Trust Sanger Institute Mouse Genetics Project',  # 13 project_fullname
            'MGP Select Pipeline',      # 14 pipeline_name
            'MGP_001',                  # 15 pipeline_stable_id
            'MGP_XRY_001',              # 16 procedure_stable_id
            'X-ray',                    # 17 procedure_name
            'IMPC_XRY_008_001',         # 18 parameter_stable_id
            'Number of ribs right',     # 19 parameter_name
            'MP:0005390',               # 20 top_level_mp_term_id
            'skeleton phenotype',       # 21 top_level_mp_term_name
            'MP:0000480',               # 22 mp_term_id
            'increased rib number',     # 23 mp_term_name
            '1.637023E-010',            # 24 p_value
            '',                         # 25 percentage_change
            '8.885439E-007',            # 26 effect_size
            'Wilcoxon rank sum test with continuity correction',  # 27 statistical_method
            'IMPC',                     # 28 resource_name
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_util = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)  # Reset graph
        # Test graph is empty; assertEqual shows the count on failure
        self.assertEqual(len(list(impc.graph)), 0)

        # columns 24-26 of the row
        (p_value, percentage_change, effect_size) = self.test_set_1[23:26]

        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change, effect_size,
            self.study_curie)

        triples = """
:MONARCH_test_association SEPIO:0000007 <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> .

<https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> a ECO:0000015 ;
    SEPIO:0000084 <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888>,
        <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> ;
    SEPIO:0000085 <https://monarchinitiative.org/.well-known/genid/study> .

<https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> a OBI:0000175 ;
    RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
    STATO:0000129 1.637023e-10 .

<https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888> a STATO:0000085 ;
    RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
    STATO:0000129 "8.885439E-007" .
        """

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertEqual(len(list(impc.graph)), 0)

        # columns 03-04, 12-19 and 27-28 of the row
        (phenotyping_center, colony) = self.test_set_1[2:4]
        (project_name,
         _project_fullname,     # present in the row, not passed on below
         pipeline_name,
         pipeline_stable_id,
         procedure_stable_id,
         procedure_name,
         parameter_stable_id,
         parameter_name) = self.test_set_1[11:19]
        (statistical_method, resource_name) = self.test_set_1[26:28]

        impc._add_study_provenance(
            phenotyping_center, colony,
            project_name,
            pipeline_name, pipeline_stable_id,
            procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name,
            statistical_method, resource_name)

        # dbg: this is the generated graph, logged once
        LOG.info(
            "Provenance graph as turtle:\n%s\n",
            impc.graph.serialize(format="turtle").decode("utf-8")
        )

        triples = """
<https://monarchinitiative.org/.well-known/genid/b0b26361b8687b5ad9ef> a owl:NamedIndividual ;
    rdfs:label "MEFW" .

<https://monarchinitiative.org/.well-known/genid/b6f14f763c8d0629360e> a OBI:0000471 ;
    BFO:0000050 <http://www.sanger.ac.uk/science/data/mouse-genomes-project>,
        IMPC-pipe:MGP_001 ;
    BFO:0000051 STATO:0000076,
        IMPC-proc:MGP_XRY_001 ;
    SEPIO:0000017 <http://www.sanger.ac.uk/> ;
    SEPIO:0000114 <https://www.mousephenotype.org/impress/OntologyInfo?action=list&procID=MGP_XRY_001#IMPC_XRY_008_001> .

<http://www.sanger.ac.uk/> a foaf:organization ;
    rdfs:label "WTSI" .

<http://www.sanger.ac.uk/science/data/mouse-genomes-project> a VIVO:Project ;
    rdfs:label "MGP" .

<https://www.mousephenotype.org/impress/OntologyInfo?action=list&procID=MGP_XRY_001#IMPC_XRY_008_001> a owl:NamedIndividual ;
    rdfs:label "Number of ribs right (X-ray)" .

IMPC-pipe:MGP_001 a owl:NamedIndividual ;
    rdfs:label "MGP Select Pipeline" .

IMPC-proc:MGP_XRY_001 a owl:NamedIndividual ;
    rdfs:label "X-ray" .
"""

        self.assertTrue(
            self.test_util.test_graph_equality(triples, impc.graph))

    def test_assertion_model(self):
        """
        Functional test for _add_assertion_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertEqual(len(list(impc.graph)), 0)

        impc._add_assertion_provenance(self.assoc_curie, self.evidence_curie)

        triples = """
    MONARCH:test_association SEPIO:0000015 <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> .
    <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> a SEPIO:0000001 ;
        SEPIO:0000018 <https://www.mousephenotype.org/> ;
        SEPIO:0000111 <https://monarchinitiative.org/.well-known/genid/evidence>  .

    <https://www.mousephenotype.org/> a foaf:organization ;
        rdfs:label "International Mouse Phenotyping Consortium" .
        """
        # dbg
        LOG.info(
            "Assertion graph:\n %s\n", impc.graph.serialize(
                format="turtle").decode("utf-8")
        )

        self.assertTrue(self.test_util.test_graph_equality(triples, impc.graph))

    @unittest.skip("Timeouts on travis")
    def test_random_data_set(self):
        """
        Take one row of the gzipped data file and run it through the
        evidence and provenance functions, then query the resulting graph.

        Line of data is hardcoded, but theoretically should work on any line
        """
        line_to_test = 1129
        impc = IMPC('rdf_graph', False)   # Not Skolem
        self.test_set_N = []
        # NOTE: the download step, impc.fetch(True), is disabled here,
        # so the raw file has to be on disk already.
        file_path = '/'.join((impc.rawdir, impc.files['all']['file']))
        with gzip.open(file_path, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # 1-based line numbers; stop reading once the row is found
            for count, row in enumerate(filereader, start=1):
                if count == line_to_test:
                    self.test_set_N = row
                    LOG.info("stopped at line:\t%s\n", count)
                    break

        # Fail with a readable message rather than an unpacking error
        # when the file is shorter than line_to_test.
        self.assertTrue(
            self.test_set_N,
            "no data found at line {} of {}".format(line_to_test, file_path))

        # Some DRY violation with the above tests
        (phenotyping_center, colony) = self.test_set_N[2:4]
        (project_name, _project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = self.test_set_N[11:19]
        (statistical_method, resource_name) = self.test_set_N[26:28]

        (p_value, percentage_change, effect_size) = self.test_set_N[23:26]

        # adding evidence
        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change, effect_size,
            self.study_curie)

        # adding  study
        impc._add_study_provenance(
            phenotyping_center, colony, project_name,
            pipeline_name,
            pipeline_stable_id,
            procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name,
            statistical_method, resource_name, line_to_test)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of  and has_part links to individuals
        # which results in ambiguity = hard to test

        # dbg
        LOG.info(
            "Row %i graph as ntriples:\n%s\n",
            line_to_test, impc.graph.serialize(format="ntriples").decode("utf-8")
        )

        sparql_query = """
SELECT *
WHERE {
    ?assoc SEPIO:0000007 ?evidenceline .
    ?evidenceline a ECO:0000015 ;
        SEPIO:0000085 _:study .

    ?study a OBI:0000471 ;
        SEPIO:0000114 ?param ;
        SEPIO:0000017 ?agent .
}
"""

        sparql_output = impc.graph.query(sparql_query)
        LOG.info(
            "Test that query for row %i passes and returns one row", line_to_test)

        # the result is expected to hold exactly one row of bindings
        self.assertEqual(len(list(sparql_output)), 1)

    def tearDown(self):
        """Nothing to release; every test builds its own graph."""
Exemple #46
0
    def setUp(self):
        """Prepare the comparison helper, one data row and test curies."""
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One row of data; each comment gives the 1-based column number
        # and the column header.
        self.test_set_1 = (
            'MGI:1920145',              # 01 marker_accession_id
            'Setd5',                    # 02 marker_symbol
            'WTSI',                     # 03 phenotyping_center
            'MEFW',                     # 04 colony_raw
            'male',                     # 05 sex
            'heterozygote',             # 06 zygosity
            'MGI:4432631',              # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08 allele_symbol
            'targeted mutation 1a, Wellcome Trust Sanger Institute',  # 09 allele_name
            'MGI:2159965',              # 10 strain_accession_id
            'C57BL/6N',                 # 11 strain_name
            'MGP',                      # 12 project_name
            'Wellcome Trust Sanger Institute Mouse Genetics Project',  # 13 project_fullname
            'MGP Select Pipeline',      # 14 pipeline_name
            'MGP_001',                  # 15 pipeline_stable_id
            'MGP_XRY_001',              # 16 procedure_stable_id
            'X-ray',                    # 17 procedure_name
            'IMPC_XRY_008_001',         # 18 parameter_stable_id
            'Number of ribs right',     # 19 parameter_name
            'MP:0005390',               # 20 top_level_mp_term_id
            'skeleton phenotype',       # 21 top_level_mp_term_name
            'MP:0000480',               # 22 mp_term_id
            'increased rib number',     # 23 mp_term_name
            '1.637023E-010',            # 24 p_value
            '',                         # 25 percentage_change
            '8.885439E-007',            # 26 effect_size
            'Wilcoxon rank sum test with continuity correction',  # 27 statistical_method
            'IMPC',                     # 28 resource_name
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_util = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))
Exemple #47
0
class StringTestFakeData(unittest.TestCase):
    """
    Tests for StringDB._process_protein_links() using hand-made rows
    in place of a real protein-links file.
    """

    def setUp(self):
        """Build the fake rows, their column names and the protein map."""
        self.test_util = TestUtils()
        # Test set with two proteins from same species
        self.test_set_1 = [[
            '9606.ENSP00000000233', '9606.ENSP00000003084',
            0, 0, 0, 0, 300, 0, 150, 800]]

        # Test set with deprecated protein id
        self.test_set_2 = [[
            '9606.ENSP00000000233', '9606.ENSP00000006101',
            0, 0, 0, 0, 300, 0, 150, 800]]

        self.columns = [
            'protein1', 'protein2', 'neighborhood', 'fusion', 'cooccurence',
            'coexpression', 'experimental', 'database', 'textmining', 'combined_score']

        # Fetched once per test (setUp runs before each test method).
        ensembl = Ensembl('rdf_graph', True)
        self.protein_list = ensembl.fetch_protein_gene_map('9606')

    def tearDown(self):
        """Nothing to release."""

    def testFakeDataSet1(self):
        """
        Two proteins of the same species must yield exactly one
        gene-to-gene triple.
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph(True)
        self.assertEqual(len(string_db.graph), 0)

        # Reuse the map fetched in setUp instead of fetching it a second
        # time; build a new dict so self.protein_list stays untouched.
        prot_map = {
            protein: ["ENSEMBL:{}".format(gene) for gene in genes]
            for protein, genes in self.protein_list.items()
        }

        print(
            "Finished fetching ENSP IDs, fetched {} proteins"
            .format(len(prot_map)))
        dataframe = pd.DataFrame(data=self.test_set_1, columns=self.columns)

        string_db._process_protein_links(dataframe, prot_map, '9606')

        # g1 <interacts with> g2
        triples = """
ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
        """

        self.assertTrue(self.test_util.test_graph_equality(triples, string_db.graph))

    def testFakeDataSet2(self):
        """
        Dataset contains a deprecated protein ID
        that we expect is filtered out by ensembl biomart
        We test that this returns an empty graph
        :return:
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph()
        self.assertEqual(len(string_db.graph), 0)

        dataframe = pd.DataFrame(data=self.test_set_2, columns=self.columns)
        string_db._process_protein_links(dataframe, self.protein_list, '9606')
        self.assertEqual(len(string_db.graph), 0)
Exemple #48
0
class TestGwasSNPModel(unittest.TestCase):
    """
    Test the modelling of a  SNP to trait association
    from sample GWAS catalog data
    """

    def setUp(self):
        """Create a GWASCatalog source with an empty graph and sample data."""
        self.test_util = TestUtils()
        self.source = GWASCatalog('rdf_graph', True)
        self.source.graph = RDFGraph(True)  # Reset graph
        self.source.graph.bind_all_namespaces()
        self.test_data = {
            'snp_label': 'rs1491921-C',
            'chrom_num': '5',
            'chrom_pos': '21259029',
            'context': 'intergenic_variant',
            'allele_freq': '0.013',
            'trait': 'Diisocyanate-induced asthma',
            'trait_uri': (
                'http://www.ebi.ac.uk/efo/EFO_0006995, '
                'http://www.ebi.ac.uk/efo/EFO_0003949'
            ),
            'pvalue': '0.0000007',
            'merged': '0',
            'snp_id_current': '1491921',
            'mapped_gene': 'LOC102723561 - GUSBP1',
            'snp_gene_nums': '',
            'upstream_gene_num': '107986179',
            'downstream_gene_num': '107986180',
            'init_sample_desc': '74 European ancestry cases, 824 European ancestry controls',
            'replicated_sample_desc': 'NA',
            'platform': 'Illumina [1556551]',
            'pubmed': '25918132',
        }

    def tearDown(self):
        """Drop the reference to the source built in setUp."""
        self.source = None

    def _variant_curie(self):
        """Return the curie resolved from the sample SNP label."""
        curie, _ = self.source._get_curie_and_type_from_id(
            self.test_data['snp_label'])
        return curie

    def test_snp_type_resolution(self):
        """
        Given the label: rs1491921-C
        return dbSNP:rs1491921, snp
        """
        self.assertEqual(len(list(self.source.graph)), 0)
        variant_curie, variant_type = self.source._get_curie_and_type_from_id(
            self.test_data['snp_label'])

        self.assertEqual(variant_curie, "dbSNP:rs1491921")
        self.assertEqual(variant_type, 'snp')

    def test_snp_model(self):
        """
        Test output model of _add_snp_to_graph()
        """
        self.assertEqual(len(list(self.source.graph)), 0)
        variant_curie = self._variant_curie()

        self.source._add_snp_to_graph(
            variant_curie, self.test_data['snp_label'], self.test_data['chrom_num'],
            self.test_data['chrom_pos'], self.test_data['context'],
            self.test_data['allele_freq'])

        triples = """
    dbSNP:rs1491921 a OBO:SO_0000694, OBO:SO_0001628 ;
        rdfs:label "rs1491921-C" ;
        faldo:location  <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029-21259029-Region> ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 ;
        dc:description "0.013 [risk allele frequency]" .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029-21259029-Region> a faldo:Region ;
        faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> ;
        faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> a faldo:Position ;
        faldo:position 21259029 ;
        faldo:reference OBO:CHR_GRCh38chr5 .
"""
        self.assertTrue(self.test_util.test_graph_equality(triples, self.source.graph))

    def test_snp_gene_relation(self):
        """
        test the _add_snp_gene_relation function
        :return:
        """
        self.assertEqual(len(list(self.source.graph)), 0)
        variant_curie = self._variant_curie()

        self.source._add_snp_gene_relation(
            variant_curie, self.test_data['snp_gene_nums'],
            self.test_data['upstream_gene_num'],
            self.test_data['downstream_gene_num'])

        triples = """
        dbSNP:rs1491921 OBO:RO_0002528 NCBIGene:107986180 ;
            OBO:RO_0002529 NCBIGene:107986179 .
        """
        self.assertTrue(self.test_util.test_graph_equality(triples, self.source.graph))

    def test_deprecated_snp(self):
        """
        test the _add_deprecated_snp
        :return:
        """
        self.assertEqual(len(list(self.source.graph)), 0)
        # fake data
        snp_id_current = '12345'
        merged = '1'

        variant_curie = self._variant_curie()

        self.source._add_deprecated_snp(
            variant_curie, snp_id_current, merged,
            self.test_data['chrom_num'], self.test_data['chrom_pos'])

        triples = """
        dbSNP:rs1491921 a owl:NamedIndividual ;
            OBO:IAO_0100001 dbSNP:rs12345 ;
            owl:deprecated true .

        dbSNP:rs12345 MONARCH:cliqueLeader true .
        """
        self.assertTrue(self.test_util.test_graph_equality(triples, self.source.graph))

    def test_snp_trait_association(self):
        """
        test the _add_variant_trait_association
        :return:
        """
        self.assertEqual(len(list(self.source.graph)), 0)
        # The ontology is parsed from the url configured on the source.
        efo_ontology = RDFGraph()
        LOG.info("Loading EFO ontology in separate rdf graph")
        efo_ontology.parse(self.source.files['efo']['url'], format='xml')
        efo_ontology.bind_all_namespaces()
        LOG.info("Finished loading EFO ontology")

        variant_curie = self._variant_curie()

        description = self.source._make_description(
            self.test_data['trait'], self.test_data['init_sample_desc'],
            self.test_data['replicated_sample_desc'],
            self.test_data['platform'], self.test_data['pvalue'])

        self.source._add_variant_trait_association(
            variant_curie, self.test_data['trait_uri'], efo_ontology,
            self.test_data['pubmed'], description)

        # {0} is filled with the generated description
        triples = """


    MONARCH:bffc7a930c08cc8fe931 a OBAN:association ;
        dc:description "{0}" ;
        OBO:RO_0002558 OBO:ECO_0000213 ;
        dc:source PMID:25918132 ;
        OBAN:association_has_object EFO:0003949 ;
        OBAN:association_has_predicate RO:0003304 ;
        OBAN:association_has_subject dbSNP:rs1491921 .

    MONARCH:bff9b97458d67ed7f517 a OBAN:association ;
        dc:description "{0}" ;
        OBO:RO_0002558 OBO:ECO_0000213 ;
        dc:source PMID:25918132 ;
        OBAN:association_has_object EFO:0006995 ;
        OBAN:association_has_predicate RO:0003304 ;
        OBAN:association_has_subject dbSNP:rs1491921 .

    EFO:0003949 a owl:Class ;
        rdfs:label "eye color"^^xsd:string ;
        rdfs:subClassOf UPHENO:0001001 .

    dbSNP:rs1491921 RO:0003304 EFO:0003949,
            EFO:0006995 .

    PMID:25918132 a OBO:IAO_0000013 .
        """.format(description)

        self.assertTrue(self.test_util.test_graph_equality(triples, self.source.graph))
Exemple #49
0
class UDPTestCase(unittest.TestCase):
    """
    Test UDP parser
    """

    def setUp(self):
        """Create the graph-comparison helper."""
        self.test_util = TestUtils()

    def tearDown(self):
        """Nothing to release; every test builds its own UDP instance."""

    @staticmethod
    def _mock_file(lines):
        """Return a mock_open() object whose handle iterates over *lines*."""
        handle = MagicMock()
        handle.__iter__.return_value = iter(lines)
        return mock_open(mock=handle)

    def test_dbsnp_indel_resolution(self):
        """
        unit test for _get_rs_id()
        Test that we can resolve indels that
        have different insertion sequence(s)
        for one rsid
        15  51766637  374313651  in-del  -/A/AA/AAA/AAAA/CAAA/TAAA
        """
        udp = UDP('rdf_graph', True)
        rs_map = udp._parse_rs_map_file(udp.map_files['dbsnp_map'])
        variant_type = 'indel'
        variant = {
            'build': 'hg19',
            'chromosome': 'chr15',
            'reference_allele': '-',
            'variant_allele': 'AAAA',
            'position': '51766637',
        }
        rsid = udp._get_rs_id(variant, rs_map, variant_type)

        self.assertEqual(rsid, '374313651')

    def test_dbsnp_snp_mapping(self):
        """
        unit test for _get_rs_id()
        Test that we can resolve snps in dbsnp
        to rsids
        """
        udp = UDP('rdf_graph', True)
        rs_map = udp._parse_rs_map_file(udp.map_files['dbsnp_map'])
        variant_type = 'snp'
        variant = {
            'build': 'hg19',
            'chromosome': 'chr15',
            'reference_allele': 'A',
            'variant_allele': 'C',
            'position': '54624219',
        }
        rsid = udp._get_rs_id(variant, rs_map, variant_type)

        self.assertEqual(rsid, '755532609')

    def test_patient_phenotype_model(self):
        """
        functional test for _parse_patient_phenotypes()
        """
        udp = UDP('rdf_graph', True)
        udp.graph = RDFGraph(True)

        # test that graph is empty; assertEqual shows the count on failure
        self.assertEqual(len(list(udp.graph)), 0)

        mock_file = self._mock_file([
            'patient_1\tHP:000001\tyes',
            'patient_1\tHP:000002\tno',
        ])
        udp._parse_patient_phenotypes(mock_file)
        triples = """
        :patient_1 a foaf:Person ;
            rdfs:label "patient_1" ;
            RO:0002200 DOID:4,
              HP:000001 .
        """

        self.assertTrue(self.test_util.test_graph_equality(
            triples, udp.graph))

    def test_variant_model(self):
        """
        functional test for _parse_patient_variants()
        """
        udp = UDP('rdf_graph', True)
        udp.graph = RDFGraph(True)
        # test that graph is empty
        self.assertEqual(len(list(udp.graph)), 0)

        # the 23 fields of one variant line, joined with tabs below
        data = [
            'patient_1',
            'family_1',
            '1',
            'HG19',
            '155230432',
            'G',
            'A',
            'Maternal',
            'Biallelic',
            'Non-synonymous;DOWNSTREAM',
            'CLK2',
            '', '', '', '', '', '', '',     # seven empty fields
            'Compound heterozygous',
            'Heterozygous',
            '',
            '0.002747253',
            '',
        ]
        mock_file = self._mock_file(["\t".join(data)])

        udp._parse_patient_variants(mock_file)

        triples = """
        :patient_1 GENO:0000222 <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a> .

        <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f> a SO:0001059 ;
            rdfs:label "hg19chr1(CLK2):g.155230432G>A" ;
            GENO:0000418 HGNC:2069 ;
            RO:0002162 NCBITaxon:9606 ;
            owl:sameAs dbSNP:rs11557757 .

        <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a> a GENO:0000000 ;
            rdfs:label "patient_1 genotype" ;
            GENO:0000382 <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f> .
        """

        self.assertTrue(self.test_util.test_graph_equality(triples, udp.graph))
Exemple #50
0
def main():
    """
    Command-line entry point for the Dipper ingest pipeline.

    Parses the command line, configures logging, and then either

    * runs a SPARQL query (``--query``) against the turtle output of the
      requested sources and exits, or
    * for every requested source: fetches it (unless ``--parse_only``),
      runs its test suite (unless ``--no_verify``), and parses and writes
      it as turtle (unless ``--test_only`` or ``--fetch_only``).

    Raises:
        SystemExit: after a ``--query`` run (status 0), or from argparse
            on invalid arguments.
        KeyError: if a name given to ``--sources`` is not a known source.
    """
    source_to_class_map = {
        'hpoa': HPOAnnotations,  # ~3 min
        'zfin': ZFIN,
        'omim': OMIM,  # full file takes ~15 min, due to required throttling
        'biogrid': BioGrid,  # interactions file takes <10 minutes
        'mgi': MGI,
        'impc': IMPC,
        'panther': Panther,  # this takes a very long time, ~1hr to map 7 species-worth of associations
        'ncbigene': NCBIGene,  # takes about 4 minutes to process 2 species
        'ucscbands': UCSCBands,
        'ctd': CTD,
        'genereviews': GeneReviews,
        'eom': EOM,  # Takes about 5 seconds.
        'coriell': Coriell,
        'clinvar': ClinVar,
        'monochrom': Monochrom,
        'kegg': KEGG,
        'animalqtldb': AnimalQTLdb,
        'ensembl': Ensembl,
        'hgnc': HGNC,
        'orphanet': Orphanet
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description='Dipper: Data Ingestion'
                                                 ' Pipeline for SciGraph',
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-s', '--sources', type=str, required=True,
                        help='comma separated list of sources')
    parser.add_argument('-l', '--limit', type=int, help='limit number of rows')
    # NOTE: the original statement ended in a stray comma, which wrapped
    # the call result in a discarded one-element tuple.
    parser.add_argument('--parse_only', action='store_true',
                        help='parse files without writing')
    parser.add_argument('--fetch_only', action='store_true',
                        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument('--no_verify', help='ignore the verification step',
                        action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument('-q', '--quiet', help='turn off info logging',
                        action="store_true")
    parser.add_argument('--debug', help='turn on debug logging',
                        action="store_true")

    # BNodes can't be visualized in Protege, so you can materialize them for testing purposes with this flag
    parser.add_argument('-nb', '--no_bnodes', help="convert blank nodes into identified nodes", action="store_true")

    # TODO this preconfiguration should probably live in the conf.json, and the same filter be applied to all sources
    parser.add_argument('-t', '--taxon', type=str,
                        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers, comma delimited\n'
                             'Implemented taxa per source\n'
                             'NCBIGene: 9606,10090,7955\n'
                             'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
                             'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
                             'UCSCBands: 9606')
    parser.add_argument('-o', '--test_only', help='only process and output the pre-configured test subset',
                        action="store_true")

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(taxon) for taxon in args.taxon.split(',')]

    # source classes whose constructor accepts the taxon id list
    taxa_supported = [Panther, NCBIGene, BioGrid, UCSCBands]

    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level)

    if args.no_bnodes:
        logger.info("Will materialize all BNodes into BASE space")

    if args.query is not None:
        # query mode: load each source's turtle, run the query, and stop
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            mysource = source_to_class_map[source]()
            test_query.check_query_syntax(args.query, mysource)
            test_query.load_graph_from_turtle(mysource)

        print(test_query.query_graph(args.query, True))
        # SystemExit directly: the bare ``exit`` helper is provided by the
        # ``site`` module and is not guaranteed to exist.
        raise SystemExit(0)

    # run initial tests
    if not args.no_verify:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]
        if src in taxa_supported:
            mysource = src(tax_ids)
        else:
            mysource = src()
        if not args.parse_only:
            mysource.fetch(args.force)

        mysource.settestonly(args.test_only)
        mysource.setnobnodes(args.no_bnodes)

        # run tests first
        if not args.no_verify:
            suite = mysource.getTestSuite()
            if suite is None:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            mysource.parse(args.limit)
            mysource.write(format='turtle')

        # if args.no_verify is not True:

        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error('Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)
    # load configuration parameters
    # for example, keys

    logger.info("All done.")
Exemple #51
0
class EvidenceProvenanceTestCase(unittest.TestCase):

    def setUp(self):
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # Headers:
        # 01 marker_accession_id,
        # 02 marker_symbol,
        # 03 phenotyping_center,
        # 04 colony_raw,
        # 05 sex,
        # 06 zygosity,
        # 07 allele_accession_id,
        # 08 allele_symbol,
        # 09 allele_name,
        # 10 strain_accession_id,
        # 11 strain_name,
        # 12 project_name,
        # 13 project_fullname,
        # 14 pipeline_name,
        # 15 pipeline_stable_id,
        # 16 procedure_stable_id,
        # 17 procedure_name,
        # 18 parameter_stable_id,
        # 19 parameter_name,
        # 20 top_level_mp_term_id,
        # 21 top_level_mp_term_name,
        # 22 mp_term_id,
        # 23 mp_term_name,
        # 24 p_value,
        # 25 percentage_change,
        # 26 effect_size,
        # 27 statistical_method,
        # 28 resource_name

        self.test_set_1 = (
            'MGI:1920145',              # 01
            'Setd5',                    # 02
            'WTSI',                     # 03
            'MEFW',                     # 04
            'male',                     # 05
            'heterozygote',             # 06
            'MGI:4432631',              # 07
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08
            'targeted mutation 1a, Wellcome Trust Sanger Institute',    # 09
            'MGI:2159965',              # 10
            'C57BL/6N',                 # 11
            'MGP',                      # 12
            'Wellcome Trust Sanger Institute Mouse Genetics Project',   # 13
            'MGP Select Pipeline',      # 14
            'MGP_001',                  # 15
            'MGP_XRY_001',              # 16
            'X-ray',                    # 17
            'IMPC_XRY_008_001',         # 18
            'Number of ribs right',     # 19
            'MP:0005390',               # 20
            'skeleton phenotype',       # 21
            'MP:0000480',               # 22
            'increased rib number',     # 23
            '1.637023E-010',            # 24
            '',                         # 25
            '8.885439E-007',            # 26
            'Wilcoxon rank sum test with continuity correction',    # 27
            'IMPC'            # 28
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_dict = curie_map.get()
        curie_util = CurieUtil(curie_dict)
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

        return

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)  # Reset graph
        # Test graph is empty
        self.assertTrue(len(list(impc.graph)) == 0)

        (p_value, percentage_change, effect_size) = self.test_set_1[23:26]

        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change, effect_size,
            self.study_curie)

        triples = """
:MONARCH_test_association SEPIO:0000007 <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> .

<https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> a ECO:0000015 ;
    SEPIO:0000084 <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888>,
        <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> ;
    SEPIO:0000085 <https://monarchinitiative.org/.well-known/genid/study> .

<https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> a OBI:0000175 ;
    RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
    STATO:0000129 1.637023e-10 .

<https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888> a STATO:0000085 ;
    RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
    STATO:0000129 "8.885439E-007" .
        """

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertTrue(len(list(impc.graph)) == 0)

        (phenotyping_center, colony) = self.test_set_1[2:4]
        (project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = self.test_set_1[12:19]
        (statistical_method, resource_name) = self.test_set_1[26:28]

        impc._add_study_provenance(
            phenotyping_center, colony,
            project_fullname,
            pipeline_name, pipeline_stable_id,
            procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name,
            statistical_method, resource_name, 0)

        # dbg
        logger.info(
            "Provenance graph as turtle:\n%s\n",
            impc.graph.serialize(format="turtle").decode("utf-8")
        )

        triples = """
<https://monarchinitiative.org/.well-known/genid/bdd05a8ca155ddaf415e> a OBI:0000471 ;
  BFO:0000051 OBO:STATO_0000076,
      <https://www.mousephenotype.org/impress/protocol/175/15> ;
  BFO:0000050  IMPRESS-procedure:15 ,
      <http://www.sanger.ac.uk/science/data/mouse-genomes-project> ;
  SEPIO:0000114 <https://www.mousephenotype.org/impress/parameterontologies/1867/91> ;
  SEPIO:0000017 <http://www.sanger.ac.uk/>  .

<https://monarchinitiative.org/.well-known/genid/b0b26361b8687b5ad9ef> a owl:NamedIndividual ;
    rdfs:label "MEFW" .

<http://www.sanger.ac.uk/> a foaf:organization ;
    rdfs:label "WTSI" .

<http://www.sanger.ac.uk/science/data/mouse-genomes-project> a VIVO:Project ;
    rdfs:label "Wellcome Trust Sanger Institute Mouse Genetics Project" .

<https://www.mousephenotype.org/impress/parameterontologies/1867/91> a owl:NamedIndividual ;
    rdfs:label "Number of ribs right (X-ray)" .

IMPRESS-procedure:15 a owl:NamedIndividual ;
    rdfs:label "MGP Select Pipeline" .

<https://www.mousephenotype.org/impress/protocol/175/15> a owl:NamedIndividual ;
    rdfs:label "X-ray" .
"""

        # dbg
        logger.debug(
            "Reference graph: %s", impc.graph.serialize(format="turtle").decode("utf-8")
        )
        self.assertTrue(
            self.test_util.test_graph_equality(triples, impc.graph))

    def test_assertion_model(self):
        """
        Functional test for _add_study_provenance()
        """

        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertTrue(len(list(impc.graph)) == 0)

        impc._add_assertion_provenance(self.assoc_curie, self.evidence_curie)

        triples = """
    MONARCH:test_association SEPIO:0000015 <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> .
    <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> a SEPIO:0000001 ;
        SEPIO:0000018 <https://www.mousephenotype.org/> ;
        SEPIO:0000111 <https://monarchinitiative.org/.well-known/genid/evidence>  .

    <https://www.mousephenotype.org/> a foaf:organization ;
        rdfs:label "International Mouse Phenotyping Consortium" .

        """
        # dbg
        logger.info(
            "Assertion graph:\n %s\n", impc.graph.serialize(
                format="turtle").decode("utf-8")
        )

        self.assertTrue(self.test_util.test_graph_equality(triples, impc.graph))

    def test_random_data_set(self):
        """
        Download dataset using fetch(), then take a row of data and
        run through evidence and provenance functions to test the output

        Line of data is hardcoded, but theoretically should work on any line
        """
        line_to_test = 1129
        count = 0
        impc = IMPC('rdf_graph', False)   # Not Skolem
        self.test_set_N = []
        # fetch file
        # impc.fetch(True)
        file_path = '/'.join((impc.rawdir, impc.files['all']['file']))
        with gzip.open(file_path, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                count += 1
                if count < line_to_test:
                    continue
                elif count == line_to_test:
                    self.test_set_N = row
                elif count > line_to_test:
                    logger.info("stopped at line:\t%s\n", count)
                    break

        # Some DRY violation with the above tests
        (phenotyping_center, colony) = self.test_set_N[2:4]
        (project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = self.test_set_N[12:19]
        (statistical_method, resource_name) = self.test_set_N[26:28]

        (p_value, percentage_change, effect_size) = self.test_set_N[23:26]

        # adding evidence
        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change, effect_size,
            self.study_curie)

        # adding  study
        impc._add_study_provenance(
            phenotyping_center, colony, project_fullname,
            pipeline_name,
            pipeline_stable_id,
            procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name,
            statistical_method, resource_name, line_to_test)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of  and has_part links to individuals
        # which results in ambiguity = hard to test

        # dbg
        logger.info(
            "Row %i graph as ntriples:\n%s\n", line_to_test, impc.graph.serialize(
                format="ntriples").decode("utf-8")
        )

        sparql_query = """
SELECT *
WHERE {
    ?assoc SEPIO:0000007 ?evidenceline .
    ?evidenceline a ECO:0000015 ;
        SEPIO:0000085 _:study .

    ?study a OBI:0000471 ;
        SEPIO:0000114 ?param ;
        SEPIO:0000017 ?agent .
}
"""

        sparql_output = impc.graph.query(sparql_query)
        logger.info("Test that query for row %i passes and returns one row", int(line_to_test))

        # print("Sparql Output: %s\n", list(sparql_output) )
        # it is an array with one list with five vars in it

        self.assertEqual(len(list(sparql_output)), 1)

    def tearDown(self):
        return
Exemple #52
0
    def setUp(self):
        """Prepare the shared fixture row, curies and helper objects."""
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # Column layout of one IMPC row (1-based, matching the numbered
        # comments on the fixture tuple below):
        #   01 marker_accession_id      02 marker_symbol
        #   03 phenotyping_center       04 colony_raw
        #   05 sex                      06 zygosity
        #   07 allele_accession_id      08 allele_symbol
        #   09 allele_name              10 strain_accession_id
        #   11 strain_name              12 project_name
        #   13 project_fullname         14 pipeline_name
        #   15 pipeline_stable_id       16 procedure_stable_id
        #   17 procedure_name           18 parameter_stable_id
        #   19 parameter_name           20 top_level_mp_term_id
        #   21 top_level_mp_term_name   22 mp_term_id
        #   23 mp_term_name             24 p_value
        #   25 percentage_change        26 effect_size
        #   27 statistical_method       28 resource_name
        self.test_set_1 = (
            'MGI:1920145',                                             # 01
            'Setd5',                                                   # 02
            'WTSI',                                                    # 03
            'MEFW',                                                    # 04
            'male',                                                    # 05
            'heterozygote',                                            # 06
            'MGI:4432631',                                             # 07
            'Setd5<tm1a(EUCOMM)Wtsi>',                                 # 08
            'targeted mutation 1a, Wellcome Trust Sanger Institute',   # 09
            'MGI:2159965',                                             # 10
            'C57BL/6N',                                                # 11
            'MGP',                                                     # 12
            'Wellcome Trust Sanger Institute Mouse Genetics Project',  # 13
            'MGP Select Pipeline',                                     # 14
            'MGP_001',                                                 # 15
            'MGP_XRY_001',                                             # 16
            'X-ray',                                                   # 17
            'IMPC_XRY_008_001',                                        # 18
            'Number of ribs right',                                    # 19
            'MP:0005390',                                              # 20
            'skeleton phenotype',                                      # 21
            'MP:0000480',                                              # 22
            'increased rib number',                                    # 23
            '1.637023E-010',                                           # 24
            '',                                                        # 25
            '8.885439E-007',                                           # 26
            'Wilcoxon rank sum test with continuity correction',       # 27
            'IMPC',                                                    # 28
        )

        # Test curies; outside of the tests these are generated within
        # _add_evidence() and _add_study_provenance().
        # These blank nodes are hardcoded as NOT Skolemized ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRI of the association, for testing sparql output
        curie_util = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))
Exemple #53
0
class StringTestFakeData(unittest.TestCase):
    def setUp(self):
        self.test_util = TestUtils()
        # Test set with two proteins from same species
        self.test_set_1 = \
            [['9606.ENSP00000000233', '9606.ENSP00000003084',
             0, 0, 0, 0, 300, 0, 150, 800]]

        # Test set with deprecated protein id
        self.test_set_2 = \
            [['9606.ENSP00000000233', '9606.ENSP00000006101',
              0, 0, 0, 0, 300, 0, 150, 800]]

        self.columns = [
            'protein1', 'protein2', 'neighborhood', 'fusion', 'cooccurence',
            'coexpression', 'experimental', 'database', 'textmining',
            'combined_score'
        ]

        ensembl = Ensembl('rdf_graph', True)
        self.protein_list = ensembl.fetch_protein_gene_map(9606)

        return

    def tearDown(self):
        return

    def testFakeDataSet1(self):
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph(True)
        self.assertEqual(len(string_db.graph), 0)

        ensembl = Ensembl('rdf_graph', True)
        prot_map = ensembl.fetch_protein_gene_map(9606)
        for key in prot_map.keys():
            prot_map[key] = "ENSEMBL:{}".format(prot_map[key])

        print("Finished fetching ENSP IDs, fetched {} proteins".format(
            len(prot_map.keys())))
        dataframe = pd.DataFrame(data=self.test_set_1, columns=self.columns)

        string_db._process_protein_links(dataframe, prot_map, 9606)

        triples = """
            ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
        """

        self.assertTrue(
            self.test_util.test_graph_equality(triples, string_db.graph))

    def testFakeDataSet2(self):
        """
        Dataset contains a deprecated protein ID
        that we expect if filtered out by ensembl biomart
        We test that this returns an empty graph
        :return:
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph()
        self.assertEqual(len(string_db.graph), 0)

        dataframe = pd.DataFrame(data=self.test_set_2, columns=self.columns)
        string_db._process_protein_links(dataframe, self.protein_list, 9606)
        self.assertEqual(len(string_db.graph), 0)
Exemple #54
0
def main():
    """
    Command-line entry point for the Dipper ingest pipeline.

    Parses the command line, configures logging, and then either

    * runs a SPARQL query (``--query``) against the turtle output of the
      requested sources and exits, or
    * for every requested source: imports ``dipper.sources.<ClassName>``,
      instantiates the class, fetches (unless ``--parse_only``), runs the
      source's test suite (unless ``--no_verify``/``--skip_tests``), and
      parses (unless ``--test_only``/``--fetch_only``).  For the
      ``rdf_graph`` graph type, property axioms are then added and the
      graph is written in the requested serialization.

    Raises:
        SystemExit: status 0 after a ``--query`` run, status 1 when
            ``--dest_fmt`` names an unsupported serializer, or from
            argparse on invalid arguments.
        KeyError: if a name given to ``--sources`` is not a known source.
    """
    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    # Maps the command-line source key to the class name; the class is
    # looked up in the module of the same name under dipper.sources.
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',   # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                   # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha', # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfin-slim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int,
        help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify',
        help='ignore the verification step', action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet',
        help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)

    # TODO this should live in a global data file
    #   and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
        ' comma delimited\n'
        'Implemented taxa per source\n'
        'NCBIGene: 9606,10090,7955\n'
        'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
        'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
        'UCSCBands: 9606\n'
        'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)

    parser.add_argument(
        '--version', '-v',
        help='version of source',
        type=str)

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # class names (not taxa) of the sources that accept a tax_ids argument
    taxa_supported = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl',
        'ntriples', 'nt',
        'nquads', 'nq',
        'rdfxml', 'xml',
        'notation3', 'n3',
        'raw']

    # accepted spellings that are rewritten to the name handed to write()
    format_aliases = {
        'ttl': 'turtle',
        'ntriples': 'nt',
        'nq': 'nquads',
        'xml': 'rdfxml',
        'notation3': 'n3',
    }

    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    skip_testing = args.no_verify or args.skip_tests

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # The map holds class *names*; the previous code called the
            # string itself, which raised TypeError on every --query run.
            class_name = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(class_name)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, class_name)

            # NOTE(review): the class object (not an instance) is passed
            # on, as the code after the import always did -- confirm that
            # TestUtils does not need instance attributes here.
            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        raise SystemExit(0)

    # run initial tests
    if not skip_testing:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in formats_supported:
        args.dest_fmt = format_aliases.get(args.dest_fmt, args.dest_fmt)
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # an unusable argument is a failure; report it in the exit status
        raise SystemExit(1)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)

        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        if not args.parse_only:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if not skip_testing:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse-start_parse)
            # axioms and serialization only happen for the rdf_graph type
            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info("Property axioms added: %d sec",
                            end_axiom_exp-start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                end_write = time.perf_counter()
                logger.info("Writing time: %d sec", end_write-start_write)
        # if args.no_verify is not True:

        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error(
        #            'Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)
    # load configuration parameters
    # for example, keys

    logger.info("All done.")
Exemple #55
0
 def setUp(self):
     """Create the TestUtils helper used by the tests."""
     self.test_util = TestUtils()
Exemple #56
0
class GeneVariantDiseaseTest(unittest.TestCase):
    """
    Functional tests for Orphanet._process_diseasegene().

    Each test points the source at one XML fixture under
    resources/orphanet, runs the disease-gene processing step and
    compares the resulting graph with a hand-written set of triples.
    """

    def setUp(self):
        """
        Build the Orphanet source under test and redirect its raw
        directory to the fixtures that live next to this test module.
        """
        self.test_util = TestUtils()
        self.orphanet = Orphanet('rdf_graph', True)
        self.orphanet.rawdir = os.path.join(os.path.dirname(__file__),
                                            'resources/orphanet')

    def tearDown(self):
        """
        Drop the reference to the source built in setUp().
        """
        self.orphanet = None

    def _assert_diseasegene_graph(self, xml_file, expected_triples):
        """
        Process one disease-gene fixture and check the produced graph.

        :param xml_file: file name of the fixture, relative to the
                         rawdir configured in setUp()
        :param expected_triples: triples the graph is expected to
                                 contain, passed unchanged to
                                 TestUtils.test_graph_equality()
        """
        # Every test starts from an empty graph so that triples from
        # a previous fixture cannot leak into the comparison
        self.orphanet.graph = RDFGraph()  # Reset graph
        self.orphanet.files['disease-gene']['file'] = xml_file

        self.orphanet._process_diseasegene(limit=None)
        LOG.debug(
            "Reference graph: %s",
            self.orphanet.graph.serialize(format="turtle").decode("utf-8"))
        self.assertTrue(
            self.test_util.test_graph_equality(expected_triples,
                                               self.orphanet.graph))

    def test_germline_variant_to_disease(self):
        """
        Process orph-germline.xml and compare the graph
        with the expected triples
        """
        expected_triples = """
MONARCH:ba2ac5d2153c70e2bb98 a OBAN:association ;
    RO:0002558 ECO:0000322 ;
    OBAN:association_has_object ORPHA:938475 ;
    OBAN:association_has_predicate RO:0004013 ;
    OBAN:association_has_subject HGNC:30497 .

ENSEMBL:ENSG00000166813 a owl:Class .

HGNC:30497 a owl:Class ;
    RO:0004013 ORPHA:938475 ;
    oboInOwl:hasExactSynonym "KAS1" ;
    owl:equivalentClass ENSEMBL:ENSG00000166813,
       ORPHA:268061 .

ORPHA:268061 a owl:Class .

ORPHA:938475 a owl:Class ;
    rdfs:label "too much unit testing disorder" .
    
ENSEMBL:ENSG00000166813 biolink:category biolink:Gene .
ECO:0000322 biolink:category biolink:EvidenceType .
HGNC:30497 biolink:category biolink:Genotype .
HGNC:30497 biolink:category biolink:Gene .
ORPHA:268061 biolink:category biolink:Gene .
ORPHA:938475 biolink:category biolink:Disease .

MONARCH:ba2ac5d2153c70e2bb98 biolink:category biolink:Association .
        """
        self._assert_diseasegene_graph('orph-germline.xml', expected_triples)

    def test_germline_lof_variant_to_disease(self):
        """
        Process orph-germline-lof.xml and compare the graph
        with the expected triples
        """
        expected_triples = """
MONARCH:b9ad1b0c562ad4db3f1e a OBAN:association ;
    RO:0002558 ECO:0000322 ;
    OBAN:association_has_object ORPHA:938475 ;
    OBAN:association_has_predicate RO:0004012 ;
    OBAN:association_has_subject ORPHA:268061 .

ORPHA:268061 RO:0004012 ORPHA:938475 ;
    oboInOwl:hasExactSynonym "KAS1" .

ORPHA:938475 a owl:Class ;
    rdfs:label "too much unit testing disorder" .
    
ECO:0000322 biolink:category biolink:EvidenceType .
ORPHA:268061 biolink:category biolink:Gene .
ORPHA:268061 biolink:category biolink:Genotype .
ORPHA:938475 biolink:category biolink:Disease .
    
MONARCH:b9ad1b0c562ad4db3f1e biolink:category biolink:Association .
        """
        self._assert_diseasegene_graph(
            'orph-germline-lof.xml', expected_triples)

    def test_gene_to_disease(self):
        """
        Process orph-no-variant.xml and compare the graph
        with the expected triples
        """
        expected_triples = """
MONARCH:bdbeb077e365ddedda20 a OBAN:association ;
    RO:0002558 ECO:0000322 ;
    OBAN:association_has_object ORPHA:938475 ;
    OBAN:association_has_predicate RO:0004015 ;
    OBAN:association_has_subject ORPHA:268061 .

ORPHA:268061 RO:0004015 ORPHA:938475 ;
    oboInOwl:hasExactSynonym "KAS1" .

ORPHA:938475 a owl:Class ;
    rdfs:label "too much unit testing disorder" .
    
ECO:0000322 biolink:category biolink:EvidenceType .
ORPHA:268061 biolink:category biolink:Gene .
ORPHA:268061 biolink:category biolink:Genotype .
ORPHA:938475 biolink:category biolink:Disease .

MONARCH:bdbeb077e365ddedda20 biolink:category biolink:Association .
        """
        self._assert_diseasegene_graph('orph-no-variant.xml', expected_triples)

    def test_unmapped_disease_assoc_type(self):
        """
        Test that a gene disease type that we have
        not mapped in translationtable/orphanet.yaml
        raises a KeyError
        """
        self.orphanet.graph = RDFGraph()  # Reset graph
        self.orphanet.files['disease-gene']['file'] = 'orph-no-mapping.xml'
        # Context-manager form: only the processing call itself is
        # allowed to raise, and no throwaway lambda is needed
        with self.assertRaises(KeyError):
            self.orphanet._process_diseasegene(limit=None)
Exemple #57
0
class EvidenceProvenanceTestCase(unittest.TestCase):
    """
    Functional tests for the evidence / provenance helpers of the
    IMPC source: _add_evidence(), _add_study_provenance() and
    _add_assertion_provenance().
    """

    def setUp(self):
        """
        Prepare the identifiers and the sample data row
        shared by the tests below.
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One sample row; the tests read its columns by position
        # ([2:4], [12:19], [23:26] and [26:28])
        self.test_set_1 = (
            'MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male',
            'heterozygote', 'MGI:4432631', 'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965', 'C57BL/6N', 'MGP',
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline', 'MGP_001', 'MGP_XRY_001', 'X-ray',
            'IMPC_XRY_008_001', 'Number of ribs right', 'MP:0005390',
            'skeleton phenotype', 'MP:0000480', 'increased rib number',
            '1.637023E-010', '', '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction', 'IMPC')

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_dict = curie_map.get()
        curie_util = CurieUtil(curie_dict)
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

    def _new_impc_with_empty_graph(self):
        """
        Return an IMPC source whose graph has been replaced by a
        fresh RDFGraph, after asserting that the graph holds no triples.
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)  # Reset graph
        # Test graph is empty
        self.assertEqual(len(list(impc.graph)), 0)
        return impc

    def _add_provenance_from_row(self, impc, impc_map, impress_map, row):
        """
        Unpack the provenance columns of ``row`` by position and
        hand them to impc._add_study_provenance().

        :param impc: the IMPC source whose graph receives the triples
        :param impc_map: mapping loaded from impc.map_files['impc_map']
        :param impress_map: mapping loaded from
                            impc.map_files['impress_map']
        :param row: sequence with at least 28 columns laid out like
                    self.test_set_1
        """
        (phenotyping_center, colony) = row[2:4]
        (project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = row[12:19]
        (statistical_method, resource_name) = row[26:28]

        impc._add_study_provenance(
            impc_map, impress_map, phenotyping_center, colony,
            project_fullname, pipeline_name, pipeline_stable_id,
            procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name,
            statistical_method, resource_name)

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        impc = self._new_impc_with_empty_graph()

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        (p_value, percentage_change, effect_size) = self.test_set_1[23:26]

        impc._add_evidence(self.assoc_curie, self.eco_id, impc_map, p_value,
                           percentage_change, effect_size, self.study_curie)

        triples = """
    :MONARCH_test_association SEPIO:0000007 <https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> .
    
    <https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> a ECO:0000015 ;
        SEPIO:0000084 <https://monarchinitiative.org/.well-known/genid/b89ee584330837c9>,
            <https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> ;
        SEPIO:0000085 <https://monarchinitiative.org/.well-known/genid/study> .

    <https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> a OBI:0000175 ;
        RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
        STATO:0000129 1.637023e-10 .

    <https://monarchinitiative.org/.well-known/genid/b89ee584330837c9> a STATO:0000085 ;
        RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
        STATO:0000129 "8.885439E-007" .
        """

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        impc = self._new_impc_with_empty_graph()

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])
        impress_map = json.loads(
            impc.fetch_from_url(
                impc.map_files['impress_map']).read().decode('utf-8'))

        self._add_provenance_from_row(
            impc, impc_map, impress_map, self.test_set_1)

        triples = """
    <https://monarchinitiative.org/.well-known/genid/bbdd05a8ca155dda> a OBI:0000471 ;
      BFO:0000051 OBO:STATO_0000076,
          <https://www.mousephenotype.org/impress/protocol/175/15> ;
      BFO:0000050  IMPRESS-procedure:15 ,
          <http://www.sanger.ac.uk/science/data/mouse-genomes-project> ;
      SEPIO:0000114 <https://www.mousephenotype.org/impress/parameterontologies/1867/91> ;
      SEPIO:0000017 <http://www.sanger.ac.uk/>  .
      
    <https://monarchinitiative.org/.well-known/genid/bc0b26361b8687b5> a owl:NamedIndividual ;
        rdfs:label "MEFW" .

    <http://www.sanger.ac.uk/> a foaf:organization ;
        rdfs:label "WTSI" .

    <http://www.sanger.ac.uk/science/data/mouse-genomes-project> a VIVO:Project ;
        rdfs:label "Wellcome Trust Sanger Institute Mouse Genetics Project" .

    <https://www.mousephenotype.org/impress/parameterontologies/1867/91> a owl:NamedIndividual ;
        rdfs:label "Number of ribs right (X-ray)" .

    IMPRESS-procedure:15 a owl:NamedIndividual ;
        rdfs:label "MGP Select Pipeline" .

    <https://www.mousephenotype.org/impress/protocol/175/15> a owl:NamedIndividual ;
        rdfs:label "X-ray" .
"""
        # dbg
        logger.debug(
            "Reference graph: %s",
            impc.graph.serialize(format="turtle").decode("utf-8"))

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_assertion_model(self):
        """
        Functional test for _add_assertion_provenance()
        """
        impc = self._new_impc_with_empty_graph()

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        impc._add_assertion_provenance(
            self.assoc_curie, self.evidence_curie, impc_map)

        triples = """
    MONARCH:test_association SEPIO:0000015 <https://monarchinitiative.org/.well-known/genid/bcb2c00a5c2f9c43> .
    <https://monarchinitiative.org/.well-known/genid/bcb2c00a5c2f9c43> a SEPIO:0000001 ;
        SEPIO:0000018 <http://www.mousephenotype.org/> ;
        SEPIO:0000111 <https://monarchinitiative.org/.well-known/genid/evidence>  .

    <http://www.mousephenotype.org/> a foaf:organization ;
        rdfs:label "International Mouse Phenotyping Consortium" .

        """
        # dbg
        logger.debug(
            "Reference graph: %s",
            impc.graph.serialize(format="turtle").decode("utf-8"))

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_random_data_set(self):
        """
        Download dataset using fetch(), then take a row of data and
        run through evidence and provenance functions to test the output

        Line of data is hardcoded, but theoretically should work on any line

        The test fails explicitly when the downloaded file has fewer
        lines than the hardcoded line number, instead of silently
        testing a different row.
        """
        line_to_test = 1129
        impc = IMPC('rdf_graph', False)   # Not Skolem
        impress_map = json.loads(
            impc.fetch_from_url(
                impc.map_files['impress_map']).read().decode('utf-8'))
        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        # fetch file
        impc.fetch(True)
        file_path = '/'.join((impc.rawdir, impc.files['all']['file']))

        # Stays None unless the requested line is actually reached
        row = None
        with gzip.open(file_path, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for count, candidate in enumerate(filereader, start=1):
                if count == line_to_test:
                    row = candidate
                    break

        if row is None:
            self.fail(
                "Line %d not found in %s" % (line_to_test, file_path))

        # Evidence and provenance are both built from the same row
        (p_value, percentage_change, effect_size) = row[23:26]

        impc._add_evidence(self.assoc_curie, self.eco_id, impc_map, p_value,
                           percentage_change, effect_size, self.study_curie)

        self._add_provenance_from_row(impc, impc_map, impress_map, row)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of  and has_part links to individuals
        # which results in ambiguity = hard to test
        sparql_query = """
SELECT *
WHERE {
    ?assoc SEPIO:0000007 ?evidenceline .
    ?evidenceline a ECO:0000015 ;
        SEPIO:0000085 _:study .

    ?study a OBI:0000471 ;
        SEPIO:0000114 ?param ;
        SEPIO:0000017 ?agent .
}
"""
        sparql_output = impc.graph.query(sparql_query)
        # Test that query passes and returns one row
        self.assertEqual(len(list(sparql_output)), 1)

    def tearDown(self):
        """
        Nothing to clean up; kept as an explicit no-op.
        """
        return