def setUp(self):
    """Prepare a GWASCatalog source with a new RDFGraph and a multi-SNP sample row."""
    self.test_util = TestUtils()

    gwas = GWASCatalog('rdf_graph', True)
    gwas.graph = RDFGraph(True)
    self.source = gwas

    # One catalog row that lists four SNPs; every ';'-separated field
    # carries one entry per SNP.
    sample_row = {
        'snp_label': 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?',
        'chrom_num': '9;9;9;9',
        'chrom_pos': '36998996;37002118;37000690;36997420',
        'context': 'intron_variant; intron_variant; intron_variant; intron_variant',
        'allele_freq': 'NR',
        'trait': 'Intelligence',
        'trait_uri': 'http://www.ebi.ac.uk/efo/EFO_0004337',
        'pvalue': '0.00000004',
        'merged': '0',
        'snp_id_current': '',
        'mapped_gene': 'PAX5; PAX5; PAX5; PAX5',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc': '656 European ancestry individuals from ADHD families',
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [795637]',
        'pubmed': '22449649',
    }
    self.test_data = sample_row
def setUp(self):
    """Create the Orphanet source and aim its raw directory at the test resources."""
    self.test_util = TestUtils()
    self.orphanet = Orphanet('rdf_graph', True)

    # Resolve the resources folder relative to this test module
    here = os.path.dirname(__file__)
    self.orphanet.rawdir = os.path.join(here, 'resources/orphanet')
def setUp(self):
    """Store a TestUtils helper and a single six-field sample record."""
    self.test_util = TestUtils()

    sample_record = (
        'ENSBTAP00000013354',
        'R-BTA-3000480',
        'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
        'Scavenging by Class A Receptors',
        'IEA',
        'Bos taurus',
    )
    self.test_set_1 = sample_record
def setUp(self):
    """Store a TestUtils helper and one phenotype annotation row keyed by column name."""
    self.test_util = TestUtils()

    phenotype_row = {
        'Allele': 'atp6-L183R (L183R)',
        'Chemical': 'glycerol',
        'Condition':
            'elevated temperature (35 deg C)|nonfermentable carbon source',
        'Details':
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants',
        'Experiment Type': 'classical genetics',
        'Feature Name': 'Q0085',
        'Feature Type': 'ORF',
        'Gene Name': 'ATP6',
        'Mutant Type': 'reduction of function',
        'Phenotype': 'respiratory growth: decreased rate',
        'Reference': 'PMID: 21715656|SGD_REF: S000145858',
        'Reporter': ' ',
        'SGDID': 'S000007268',
        'Strain Background': 'Other',
    }
    self.test_set_1 = phenotype_row
def setUp(self):
    """Prepare a GWASCatalog source with a reset graph and a single-SNP sample row."""
    self.test_util = TestUtils()
    self.source = GWASCatalog('rdf_graph', True)

    # Reset graph, then bind namespaces on the replacement
    self.source.graph = RDFGraph(True)
    self.source.graph.bind_all_namespaces()

    # One catalog row for a single SNP whose trait_uri lists two EFO terms
    single_snp_row = {
        'snp_label': 'rs1491921-C',
        'chrom_num': '5',
        'chrom_pos': '21259029',
        'context': 'intergenic_variant',
        'allele_freq': '0.013',
        'trait': 'Diisocyanate-induced asthma',
        'trait_uri':
            'http://www.ebi.ac.uk/efo/EFO_0006995, http://www.ebi.ac.uk/efo/EFO_0003949',
        'pvalue': '0.0000007',
        'merged': '0',
        'snp_id_current': '1491921',
        'mapped_gene': 'LOC102723561 - GUSBP1',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc':
            '74 European ancestry cases, 824 European ancestry controls',
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [1556551]',
        'pubmed': '25918132',
    }
    self.test_data = single_snp_row
def setUp(self):
    """Prepare a CTD source with a new RDFGraph and one ten-column sample row."""
    self.test_util = TestUtils()

    ctd = CTD('rdf_graph', True)
    ctd.graph = RDFGraph(True)
    self.source = ctd

    sample_row = [
        'Nicotine',
        'D009538',
        '',
        'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
        'OMIM:188890',
        'therapeutic',
        '',
        '',
        '',
        '12345|56789',
    ]
    self.test_row = sample_row
def setUp(self):
    """
    Prepare a MyChem source seeded from the JSON test fixture.

    The fixture at TESTDATA is loaded into ``self.test_data`` and its first
    record is appended to both ``drugbank_targets`` and
    ``drugcentral_interactors`` on the source.
    """
    self.test_util = TestUtils()
    self.source = MyChem('rdf_graph', True)

    # Replaces source.fetch()
    # The context manager closes the file even when json.load() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(TESTDATA, 'r') as data_fh:
        self.test_data = json.load(data_fh)

    first_record = self.test_data[0]
    self.source.drugbank_targets.append(first_record)
    self.source.drugcentral_interactors.append(first_record)
def setUp(self):
    """
    Create the Orphanet source, load the test term table, and aim the
    source's raw directory at the test resources.
    """
    self.test_util = TestUtils()
    self.orphanet = Orphanet('rdf_graph', True)

    here = os.path.dirname(__file__)

    # Override so tests don't break when we update terms
    terms_path = os.path.join(here, './resources/test_terms.yaml')
    self.globaltt = self.orphanet.open_and_parse_yaml(terms_path)

    self.orphanet.rawdir = os.path.join(here, 'resources/orphanet')
def test_therapeutic_relationship(self):
    """
    Check that the loaded test graph contains the expected OBAN association
    linking chemical MESH:D009538 to disease OMIM:188890 through the
    'substance_that_treats' property.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.models.Model import Model

    # Make testutils object and load ttl
    test_query = TestUtils(self.source.graph)
    test_query.load_testgraph_from_turtle(self.source)

    graph = self.source.graph
    model = Model(graph)

    # Expected structure
    # TODO can this be unified OBAN and the Annot models
    # to be automatically generated?
    sparql_query = """
        SELECT ?assoc ?disease ?rel ?chemical
        WHERE {
            ?assoc a OBAN:association ;
                OBAN:association_has_object ?disease ;
                OBAN:association_has_predicate ?rel ;
                OBAN:association_has_subject ?chemical .}
        """

    # SPARQL variables to check
    chemical_curie = 'MESH:D009538'
    disease_curie = 'OMIM:188890'
    relation_curie = model.object_properties['substance_that_treats']

    chemical_node = graph._getNode(chemical_curie)
    disease_node = graph._getNode(disease_curie)
    relation_node = graph._getNode(relation_curie)

    # TODO unused
    # pubmed_id = 'PMID:16785264'
    # pubmed_uri = gu.getNode(pubmed_id)
    # eco = 'ECO:0000033'

    association = G2PAssoc(
        graph, self.source.name, chemical_curie, disease_curie, relation_curie)
    association_curie = association.make_g2p_id()
    association_node = self.source.graph._getNode(association_curie)

    # One of the expected outputs from query
    expected_output = [
        association_node, disease_node, relation_node, chemical_node]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    failure_message = "".join([
        "did not find expected association: ",
        str(expected_output),
        " found ",
        str(len(sparql_output)),
        " others:\n",
        str(sparql_output),
    ])
    self.assertTrue(expected_output in sparql_output, failure_message)

    logger.info("Test query data finished.")
def setUp(self):
    """
    Point the MGI source at the bundled test files.

    _process_evidence_view locates the evidence file through self.rawdir,
    so the default is replaced here with the test resources directory.
    Note the file name must match what is in that method - evidence_view
    """
    self.test_util = TestUtils()
    self.mgi = MGI('rdf_graph', True)

    resources_dir = os.path.join(os.path.dirname(__file__), 'resources/mgi')
    self.mgi.rawdir = resources_dir

    # Pre-register the annotation key used by the test data
    self.mgi.idhash['annot']['6901981'] = ':association'
def test_classes_indiv_properties(self):
    """
    Check the class, individual and property triples for the sample input.

    Expected in the graph:
      - CGD:DiseaseID is an OWL Class, a subclass of DOID:4,
        labelled "Adenocarcinoma"
      - CGD:DiseaseInstance is an individual of CGD:DiseaseID,
        labelled "Adenocarcinoma with response {1} to therapy"
      - CGD:DrugID is an OWL Class, a subclass of CHEBI:23888,
        labelled "5FU-based adjuvant therapy"
      - CGD:RelationID is an object property
      - PMID:12345 is a IAO:0000013 (journal article)
    """
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    self.cgd.load_bindings()

    query_template = """
        SELECT ?disease ?diseaseInd ?diseaseQual ?drug ?source
        WHERE {{
            ?disease a owl:Class ;
                rdfs:subClassOf DOID:4 ;
                rdfs:label "{0}" .
            ?diseaseInd a ?disease ;
                rdfs:label "{1}" ;
                BFO:0000159 ?diseaseQual .
            ?drug a owl:Class ;
                rdfs:subClassOf CHEBI:23888 ;
                rdfs:label "{2}" .
            <{3}> a owl:ObjectProperty .
            ?source a IAO:0000013 .
        }}
        """
    sparql_query = query_template.format(
        self.disease_label,
        self.disease_instance_label,
        self.drug_label,
        self.relationship_uri)

    # Expected Results
    expected_row = [
        self.disease_uri,
        self.disease_ind_uri,
        self.disease_quality_uri,
        self.drug_uri,
        self.source_uri,
    ]
    expected_results = [expected_row]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def setUp(self):
    """Store a TestUtils helper and one parsed annotation record as a nested dict."""
    self.test_util = TestUtils()

    # Pieces of the record, assembled below in the original key order
    evidence = {
        'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
        'type': 'IED',
        'with_support_from': []
    }
    annotated_object = {
        'id': 'MP:0003340',
        'taxon': 'NCBITaxon:10116'
    }
    subject = {
        'fullname': 'endothelin receptor type A',
        'id': 'RGD:2535',
        'label': 'Ednra',
        'synonyms': [],
        'taxon': {
            'id': 'NCBITaxon:10116'
        },
        'type': 'gene'
    }

    self.test_set_1 = {
        'aspect': 'N',
        'date': '2006-10-26',
        'evidence': evidence,
        'negated': False,
        'object': annotated_object,
        'provided_by': 'RGD',
        'qualifiers': [],
        'relation': {
            'id': None
        },
        'source_line':
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n',
        'subject': subject,
        'subject_extensions': [{
            'filler': '\n',
            'property': 'isoform'
        }]
    }
def test_associations(self):
    """
    Check the association triples for the sample input.

    Expected in the graph:
      - CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance
      - CGD:AssociationID OBO:RO_0002558
        Traceable Author Statement (ECO:0000033)
      - CGD:AssociationID dc:source PMID:20498393
      - CGD:AssociationID has_environment CGD:DrugID
      - CGD:AssociationID OBAN:association_has_subject CGD:VariantID
      - CGD:AssociationID OBAN:association_has_object_property has_phenotype
      - CGD:AssociationID OBAN:association_has_object CGD:DiseaseInstance
    """
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    cu = CurieUtil(self.curie_map)
    test_env = TestUtils(self.cgd.graph)
    self.cgd.load_bindings()

    evidence_curie = 'OBO:ECO_0000033'
    evidence_uri = URIRef(cu.get_uri(evidence_curie))

    query_template = """
        SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence
        WHERE {{
            ?variant OBO:RO_0002200 ?diseaseInd .
            ?vdannot a OBAN:association ;
                OBO:RO_0002558 ?evidence ;
                dc:source ?source ;
                <{0}> ?drug ;
                OBAN:association_has_object ?diseaseInd ;
                OBAN:association_has_object_property OBO:RO_0002200 ;
                OBAN:association_has_subject ?variant .
        }}
        """
    sparql_query = query_template.format(self.relationship_uri)

    # Expected Results
    expected_row = [
        self.disease_ind_uri,
        self.variant_uri,
        self.drug_uri,
        self.vd_annot_uri,
        self.source_uri,
        evidence_uri,
    ]
    expected_results = [expected_row]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_therapeutic_relationship(self):
    """
    Check that the CTD graph holds the expected Annotation linking chemical
    MESH:D009538 to disease OMIM:188890 with source PMID:16785264.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.utils.GraphUtils import GraphUtils

    # Make testutils object and load bindings
    test_query = TestUtils(self.ctd.graph)

    # Expected structure
    sparql_query = """
        SELECT ?assoc ?pubmed ?disease ?chemical
        WHERE {
            ?assoc a Annotation: ;
                dc:evidence OBO:ECO_0000033 ;
                dc:source ?pubmed ;
                :hasObject ?disease ;
                :hasPredicate OBO:RO_0002606 ;
                :hasSubject ?chemical .}
        """

    # SPARQL variables to check
    chemical_curie = 'MESH:D009538'
    chemical_node = self.graph._getNode(chemical_curie)
    disease_curie = 'OMIM:188890'
    disease_node = self.graph._getNode(disease_curie)
    pubmed_curie = 'PMID:16785264'
    pubmed_node = self.graph._getNode(pubmed_curie)
    relation_curie = self.model.object_properties['substance_that_treats']
    evidence_code = 'ECO:0000033'

    # TODO PYLINT make_association_id() does not exist in CTD
    # there is "_make_association()" with a different sig
    association_curie = self.ctd.make_association_id(
        'ctd', chemical_curie, relation_curie, disease_curie,
        evidence_code, pubmed_curie)
    association_node = self.graph._getNode(association_curie)

    # Expected output from query
    expected_output = [
        association_node, pubmed_node, disease_node, chemical_node]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    self.assertTrue(expected_output in sparql_output)

    logger.info("Test finished.")
def setUp(self):
    """
    Build two one-row protein pair fixtures, the matching column names,
    and the protein list fetched from Ensembl for taxon 9606.
    """
    self.test_util = TestUtils()

    # The ten values line up with self.columns below
    same_species_pair = [
        '9606.ENSP00000000233', '9606.ENSP00000003084',
        0, 0, 0, 0, 300, 0, 150, 800]
    deprecated_id_pair = [
        '9606.ENSP00000000233', '9606.ENSP00000006101',
        0, 0, 0, 0, 300, 0, 150, 800]

    # Test set with two proteins from same species
    self.test_set_1 = [same_species_pair]

    # Test set with deprecated protein id
    self.test_set_2 = [deprecated_id_pair]

    self.columns = [
        'protein1',
        'protein2',
        'neighborhood',
        'fusion',
        'cooccurence',
        'coexpression',
        'experimental',
        'database',
        'textmining',
        'combined_score',
    ]

    ensembl = Ensembl('rdf_graph', True)
    self.protein_list = ensembl.fetch_protein_gene_map('9606')
def test_amino_acid_position_region_model(self):
    """
    Check the FALDO modelling of an amino acid position.

    Feeds test data set 1 through add_variant_info_to_graph() and expects:
      - CGD:RegionID is an instance of faldo:Region
      - CGD:RegionID faldo:begin BothStrandPositionID
      - CGD:RegionID faldo:end BothStrandPositionID
      - CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
      - CGD:BothStrandPositionID is an instance of faldo:Position
      - CGD:BothStrandPositionID faldo:position 741
      - CGD:BothStrandPositionID faldo:reference UniProtID
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_1)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # Only the first eleven columns of the row are relevant here
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source) = self.test_set_1[0][0:11]

    aa_position = 741
    uniprot_curie = "UniProtKB:Q99062#Q99062-1"
    uniprot_local_id = "Q99062#Q99062-1"

    # The returned id is not needed by the assertions below
    self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    region_curie = ":_{0}{1}{2}Region".format(
        aa_position, aa_position, uniprot_curie)
    position_curie = ":_{0}-{1}".format(uniprot_local_id, aa_position)

    region_uri = as_uri(region_curie)
    both_strand_uri = as_uri(position_curie)
    uniprot_uri = as_uri(uniprot_curie)

    query_template = """
        SELECT ?region ?bsPosition ?protein
        WHERE {{
            ?region a faldo:Region ;
                faldo:begin ?bsPosition ;
                faldo:end ?bsPosition .
            ?bsPosition a faldo:Position ;
                faldo:position {0} ;
                faldo:reference ?protein .
        }}
        """
    sparql_query = query_template.format(aa_position)

    # Expected Results
    expected_results = [[region_uri, both_strand_uri, uniprot_uri]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def setUp(self):
    """
    Prepare the association/evidence identifiers and one 28-column sample
    row, plus the expanded IRI of the test association.
    """
    self.test_util = TestUtils()
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    # Column headers, in order; the trailing number on each value of
    # test_set_1 refers to this list:
    #   01 marker_accession_id,    02 marker_symbol,
    #   03 phenotyping_center,     04 colony_raw,
    #   05 sex,                    06 zygosity,
    #   07 allele_accession_id,    08 allele_symbol,
    #   09 allele_name,            10 strain_accession_id,
    #   11 strain_name,            12 project_name,
    #   13 project_fullname,       14 pipeline_name,
    #   15 pipeline_stable_id,     16 procedure_stable_id,
    #   17 procedure_name,         18 parameter_stable_id,
    #   19 parameter_name,         20 top_level_mp_term_id,
    #   21 top_level_mp_term_name, 22 mp_term_id,
    #   23 mp_term_name,           24 p_value,
    #   25 percentage_change,      26 effect_size,
    #   27 statistical_method,     28 resource_name
    sample_row = (
        'MGI:1920145',  # 01
        'Setd5',  # 02
        'WTSI',  # 03
        'MEFW',  # 04
        'male',  # 05
        'heterozygote',  # 06
        'MGI:4432631',  # 07
        'Setd5<tm1a(EUCOMM)Wtsi>',  # 08
        'targeted mutation 1a, Wellcome Trust Sanger Institute',  # 09
        'MGI:2159965',  # 10
        'C57BL/6N',  # 11
        'MGP',  # 12
        'Wellcome Trust Sanger Institute Mouse Genetics Project',  # 13
        'MGP Select Pipeline',  # 14
        'MGP_001',  # 15
        'MGP_XRY_001',  # 16
        'X-ray',  # 17
        'IMPC_XRY_008_001',  # 18
        'Number of ribs right',  # 19
        'MP:0005390',  # 20
        'skeleton phenotype',  # 21
        'MP:0000480',  # 22
        'increased rib number',  # 23
        '1.637023E-010',  # 24
        '',  # 25
        '8.885439E-007',  # 26
        'Wilcoxon rank sum test with continuity correction',  # 27
        'IMPC'  # 28
    )
    self.test_set_1 = sample_row

    # Generate test curies, these are otherwise generated
    # within _add_evidence() and _add_study_provenance()
    # these blank nodes are hardcoded as NOT Skolemized ...
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRIs for testing sparql output
    curie_util = CurieUtil(curie_map.get())
    assoc_iri_text = curie_util.get_uri(self.assoc_curie)
    self.assoc_iri = URIRef(assoc_iri_text)
def test_variant_position_region_model(self):
    """
    Check the FALDO modelling of a variant position on a transcript.

    Feeds test data set 2 through add_variant_info_to_graph() and expects:
      - CGD:RegionID is an instance of faldo:Region
      - CGD:RegionID faldo:begin BothStrandPositionID
      - CGD:RegionID faldo:end BothStrandPositionID
      - CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
      - CGD:BothStrandPositionID is an instance of faldo:Position
      - CGD:BothStrandPositionID faldo:position 944
      - CGD:BothStrandPositionID faldo:reference CGD:TranscriptID
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # The row must supply exactly these 27 columns
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna,
     cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
     variant_base, primary_transcript_exons,
     primary_transcript_variant_sub_types, variant_type, chromosome,
     genome_build, build_version, build_date) = self.test_set_2[0]

    transcript_curie = self.cgd._make_transcript_curie(transcript_id)
    ccds_local_id = "35166.1"

    # The returned id is not needed by the assertions below
    self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    region_curie = ":_{0}Region".format(transcript_curie)
    position_curie = ":_{0}-{1}".format(ccds_local_id, bp_pos)

    region_uri = as_uri(region_curie)
    both_strand_uri = as_uri(position_curie)
    ccds_uri = as_uri(transcript_curie)

    query_template = """
        SELECT ?region ?bsPosition ?transcript
        WHERE {{
            ?region a faldo:Region ;
                faldo:begin ?bsPosition ;
                faldo:end ?bsPosition .
            ?bsPosition a faldo:Position ;
                faldo:position {0} ;
                faldo:reference ?transcript .
        }}
        """
    sparql_query = query_template.format(bp_pos)

    # Expected Results
    expected_results = [[region_uri, both_strand_uri, ccds_uri]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_genome_build_chromosome_model(self):
    """
    Check the modelling of genome, build and chromosome nodes.

    Feeds test data set 2 through add_variant_info_to_graph() and queries
    for the genome and chromosome classes, the build instance, and the
    chromosome-on-build instance, matching each by its label.
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # Labels substituted into the query
    genome_label = "Human genome"
    chromosome_label = "chr9 (Human)"
    build_label = "hg19"
    chrom_build_label = "chr9 (hg19)"

    # Nodes the query is expected to bind
    genome_uri = as_uri(":9606genome")
    chromosome_uri = as_uri("CHR:9606chr9")
    build_uri = as_uri("UCSC:hg19")
    chrom_on_build_uri = as_uri(":MONARCH_hg19chr9")

    # An earlier form of this query was kept here as a dead string
    # literal. It additionally required RO_0002162/RO_0002351 on the
    # genome, RO_0002350 from chromosome to genome, and the build to be
    # rdfs:subClassOf the genome. Only the query below is executed.
    query_template = """
        SELECT ?genome ?chromosome ?build ?chromOnBuild
        WHERE {{
            ?genome a owl:Class ;
                rdfs:label "{0}" ;
                rdfs:subClassOf OBO:SO_0001026 .
            ?chromosome a owl:Class ;
                rdfs:label "{1}" ;
                rdfs:subClassOf OBO:SO_0000340 .
            ?build a OBO:SO_0001505 ;
                a ?genome ;
                rdfs:label "{2}" ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 ;
                OBO:RO_0002351 ?chromOnBuild .
            ?chromOnBuild a ?chromosome ;
                a OBO:SO_0000340 ;
                rdfs:label "{3}" ;
                OBO:RO_0002350 ?build .
        }}
        """
    sparql_query = query_template.format(
        genome_label, chromosome_label, build_label, chrom_build_label)

    # Expected Results
    expected_results = [[
        genome_uri, chromosome_uri, build_uri, chrom_on_build_uri
    ]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def main():
    """
    Command-line entry point: fetch, test, parse and write the requested sources.

    Arguments are read from ``sys.argv`` by argparse.  For every source named
    in ``--sources`` the matching class is imported from
    ``dipper.sources.<ClassName>``, instantiated, optionally fetched, tested,
    parsed and serialized.  When ``--query`` is given, the query is run
    against each source instead and the process exits afterwards.

    Exits with status 0 after the ``--query`` path and with status 1 when
    ``--dest_fmt`` names an unsupported serializer.

    Raises:
        KeyError: if a name in ``--sources`` is not in the source map.
    """
    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',  # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                    # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha',  # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfin-slim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int, help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify', help='ignore the verification step',
        action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet', help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing",
        action="store_true", default=False)

    # TODO this should live in a global data file
    # and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
             ' comma delimited\n'
             'Implemented taxa per source\n'
             'NCBIGene: 9606,10090,7955\n'
             'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
             'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
             'UCSCBands: 9606\n'
             'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")
    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)
    parser.add_argument(
        '--version', '-v', help='version of source', type=str)

    args = parser.parse_args()

    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # Source classes (not taxa) that accept a tax_ids constraint
    taxa_supported = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl',
        'ntriples', 'nt',
        'nquads', 'nq',
        'rdfxml', 'xml',
        'notation3', 'n3',
        'raw']

    # Long or alternate spellings mapped onto the names used downstream
    format_aliases = {
        'ttl': 'turtle',
        'ntriples': 'nt',
        'nq': 'nquads',
        'xml': 'rdfxml',
        'notation3': 'n3',
    }

    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # The map holds class *names*; calling the looked-up string
            # (as the code previously did) raised TypeError.
            src = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(src)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, src)

            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

            print(test_query.query_graph(args.query, True))
        exit(0)

    run_tests = not (args.no_verify or args.skip_tests)

    # run initial tests
    if run_tests:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in formats_supported:
        args.dest_fmt = format_aliases.get(args.dest_fmt, args.dest_fmt)
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # Invalid input is a failure; report it through the exit status
        exit(1)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)

        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)

        # time.clock() no longer exists in Python 3.8+; perf_counter()
        # gives elapsed wall-clock time for each stage.
        if not args.parse_only:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if run_tests:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse-start_parse)

            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(
                    mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info(
                    "Property axioms added: %d sec",
                    end_axiom_exp-start_axiom_exp)

            start_write = time.perf_counter()
            mysource.write(fmt=args.dest_fmt)
            end_write = time.perf_counter()
            logger.info("Writing time: %d sec", end_write-start_write)

            # if args.no_verify is not True:
            #     status = mysource.verify()
            #     if status is not True:
            #         logger.error(
            #             'Source %s did not pass verification tests.', source)
            #         exit(1)
            # else:
            #     logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)

    # load configuration parameters
    # for example, keys

    logger.info("All done.")
def test_chromosome_position_model(self):
    """
    Check the FALDO modelling of genomic start/end positions.

    Feeds test data set 2 through add_variant_info_to_graph() and expects a
    faldo:Region whose begin and end positions both reference the
    chromosome-on-build node.
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # The row must supply exactly these 27 columns
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna,
     cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
     variant_base, primary_transcript_exons,
     primary_transcript_variant_sub_types, variant_type, chromosome,
     genome_build, build_version, build_date) = self.test_set_2[0]

    # The returned id is not needed by the assertions below
    self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    chromosome_curie = ":MONARCH_hg19chr9"
    region_curie = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)
    start_curie = ":_hg19chr9-{0}".format(genome_pos_start)
    end_curie = ":_hg19chr9-{0}".format(genome_pos_end)

    region_uri = as_uri(region_curie)
    start_uri = as_uri(start_curie)
    end_uri = as_uri(end_curie)
    chromosome_uri = as_uri(chromosome_curie)

    query_template = """
        SELECT ?region ?startPosition ?endPosition ?chromosome
        WHERE {{
            ?region a faldo:Region ;
                faldo:begin ?startPosition ;
                faldo:end ?endPosition .
            ?startPosition a faldo:Position ;
                faldo:position {0} ;
                faldo:reference ?chromosome .
            ?endPosition a faldo:Position ;
                faldo:position {1} ;
                faldo:reference ?chromosome .
        }}
        """
    sparql_query = query_template.format(genome_pos_start, genome_pos_end)

    # Expected Results
    expected_results = [[region_uri, start_uri, end_uri, chromosome_uri]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_missense_variant_protein_model(self):
    """
    Check a missense variant that carries only protein information.

    Feeds test data set 1 through add_variant_info_to_graph() and expects:
      - CGD:VariantID is an instance of OBO:SO_0001059
      - CGD:VariantID is an instance of OBO:SO_0001583
      - CGD:VariantID has the label "CSF3R Q741X missense mutation"
      - CGD:VariantID is_sequence_variant_instance_of
        (OBO:GENO_0000408) NCBIGene:1441
      - CGD:VariantID has location (faldo:location) CGD:RegionID
      - CGD:VariantID OBO:GENO_reference_amino_acid "Q"
      - CGD:VariantID OBO:GENO_results_in_amino_acid_change "X"
      - CGD:VariantID RO:0002205 CCDS:413.1
      - CCDS:413.1 is an instance of OBO:GENO_primary
      - CCDS:413.1 has the label "CCDS413.1"
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_1)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # Only the first eleven columns of the row are relevant here
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source) = self.test_set_1[0][0:11]

    gene_curie = self.cgd.gene_map[transcript_gene]
    reference_residue = "Q"
    altered_residue = "X"
    aa_position = 741
    uniprot_curie = "UniProtKB:Q99062#Q99062-1"
    variant_curie = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
    transcript_curie = "CCDS:413.1"
    region_curie = ":_{0}{1}{2}Region".format(
        aa_position, aa_position, uniprot_curie)

    variant_uri = as_uri(variant_curie)
    transcript_uri = as_uri(transcript_curie)
    gene_uri = as_uri(gene_curie)
    region_uri = as_uri(region_curie)

    query_template = """
        SELECT ?variant ?gene ?region ?transcript
        WHERE {{
            ?variant a OBO:SO_0001059;
                a OBO:SO_0001583 ;
                rdfs:label "{0}" ;
                OBO:GENO_0000408 ?gene ;
                faldo:location ?region ;
                OBO:GENO_reference_amino_acid "{1}" ;
                OBO:GENO_results_in_amino_acid_change "{2}" ;
                RO:0002205 ?transcript .
            ?transcript a OBO:SO_0000233 ;
                rdfs:label "{3}" .
        }}
        """
    sparql_query = query_template.format(
        variant_label, reference_residue, altered_residue, transcript_id)

    # Expected Results
    expected_results = [[
        variant_uri, gene_uri, region_uri, transcript_uri
    ]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def setUp(self):
    """Attach a new TestUtils helper to the test case."""
    helper = TestUtils()
    self.test_util = helper
def test_missense_variant_cdna_model(self):
    """
    Check a missense variant that carries cDNA information.

    Feeds test data set 2 through add_variant_info_to_graph() and expects:
      - CGD:VariantID is an instance of OBO:SO_0001059
      - CGD:VariantID is an instance of OBO:SO_0001583
      - CGD:VariantID has the label "ABL1 T315I missense mutation"
      - CGD:VariantID is_sequence_variant_instance_of
        (OBO:GENO_0000408) NCBIGene:25
      - CGD:VariantID has location (faldo:location) AminoAcidRegionID
      - CGD:VariantID has location (faldo:location) CDNARegionID
      - CGD:VariantID has location (faldo:location) ChromosomalRegionID
      - CGD:VariantID OBO:GENO_reference_amino_acid "T"
      - CGD:VariantID OBO:GENO_results_in_amino_acid_change "I"
      - CGD:VariantID owl:sameAs dbSNP:rs121913459
      - CGD:VariantID owl:sameAs COSMIC:12560
      - CGD:VariantID RO:0002205 (transcribed_to) CCDS:35166.1
      - CCDS:35166.1 is an instance of OBO:SO_0000233
      - CCDS:35166.1 has the label "CCDS35166.1"
      - CCDS:35166.1 OBO:RO_0002513 (translates_to)
        UniProtKB:P00519#P00519-1
      - CCDS:35166.1 OBO:RO_0002513 (translates_to)
        NCBIProtein:NP_005148.2
      - UniProtKB:P00519#P00519-1 owl:sameAs NCBIProtein:NP_005148.2
      - UniProtKB:P00519#P00519-1 is an instance of
        OBO:SO_0000104 (polypeptide)
      - UniProtKB:P00519#P00519-1 has the label "P00519#P00519-1"
      - NCBIProtein:NP_005148.2 is an instance of
        OBO:SO_0000104 (polypeptide)
      - NCBIProtein:NP_005148.2 has the label "NP_005148.2"
    """
    from dipper.utils.TestUtils import TestUtils

    self.cgd.add_variant_info_to_graph(self.test_set_2)

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    cu = CurieUtil(self.curie_map)
    self.cgd.load_bindings()

    def as_uri(curie):
        # Expand a curie with the test curie map and wrap it as a URIRef
        return URIRef(cu.get_uri(curie))

    # The row must supply exactly these 27 columns
    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna,
     cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
     variant_base, primary_transcript_exons,
     primary_transcript_variant_sub_types, variant_type, chromosome,
     genome_build, build_version, build_date) = self.test_set_2[0]

    gene_curie = self.cgd.gene_map[transcript_gene]
    reference_residue = "T"
    altered_residue = "I"
    aa_position = 315

    # Fixed identifiers the graph is expected to contain
    db_snp_curie = "dbSNP:121913459"
    cosmic_curie = "COSMIC:12560"
    uniprot_curie = "UniProtKB:P00519#P00519-1"
    uniprot_local_id = "P00519#P00519-1"
    refseq_curie = "NCBIProtein:NP_005148.2"
    transcript_curie = "CCDS:35166.1"
    ccds_local_id = "35166.1"
    chromosome_local_id = "hg19chr9"

    variant_curie = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

    # Region curies for the three coordinate systems
    aa_region_curie = ":_{0}{1}{2}Region".format(
        aa_position, aa_position, uniprot_curie)
    cdna_region_curie = ":_{0}Region".format(transcript_curie)
    chr_region_curie = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    # Position curies for the three coordinate systems
    aa_coord_curie = ":_{0}-{1}".format(uniprot_local_id, aa_position)
    cdna_coord_curie = ":_{0}-{1}".format(ccds_local_id, bp_pos)
    # chr_coord_id = "CHR:{0}-{1}".format(chromosome_curie, genome_pos_start)
    chr_coord_curie = ":_{0}-{1}".format(
        chromosome_local_id, genome_pos_start)

    variant_uri = as_uri(variant_curie)
    transcript_uri = as_uri(transcript_curie)
    gene_uri = as_uri(gene_curie)
    db_snp_uri = as_uri(db_snp_curie)
    cosmic_uri = as_uri(cosmic_curie)
    uniprot_uri = as_uri(uniprot_curie)
    refseq_uri = as_uri(refseq_curie)
    aa_region_uri = as_uri(aa_region_curie)
    cdna_region_uri = as_uri(cdna_region_curie)
    chr_region_uri = as_uri(chr_region_curie)
    aa_coord_uri = as_uri(aa_coord_curie)
    cdna_coord_uri = as_uri(cdna_coord_curie)
    chr_coord_uri = as_uri(chr_coord_curie)

    query_template = """
        SELECT ?cosmic ?gene ?aaRegion ?cdnaRegion ?chrRegion
               ?dbSNP ?transcript ?uniprot ?refseq
               ?aaCoord ?cdnaCoord ?chrCoord
        WHERE {{
            ?cosmic a OBO:SO_0001059;
                a OBO:SO_0001583 ;
                OBO:GENO_0000408 ?gene ;
                faldo:location ?aaRegion ;
                faldo:location ?cdnaRegion ;
                faldo:location ?chrRegion ;
                OBO:GENO_reference_amino_acid "{0}" ;
                OBO:GENO_reference_nucleotide "{1}" ;
                OBO:GENO_altered_nucleotide "{2}" ;
                OBO:GENO_results_in_amino_acid_change "{3}" ;
                owl:sameAs ?dbSNP ;
                RO:0002205 ?transcript .
            ?cosmic owl:sameAs ?dbSNP .
            ?transcript a OBO:SO_0000233 ;
                rdfs:label "{4}" ;
                OBO:RO_0002513 ?uniprot ;
                OBO:RO_0002513 ?refseq .
            ?uniprot a OBO:SO_0000104 ;
                rdfs:label "P00519-1" .
            ?refseq a OBO:SO_0000104 ;
                rdfs:label "NP_005148.2" .
            ?refseq owl:sameAs ?uniprot .
            ?aaRegion faldo:begin ?aaCoord .
            ?cdnaRegion faldo:begin ?cdnaCoord .
            ?chrRegion faldo:begin ?chrCoord .
            ?aaCoord faldo:position {5} .
            ?cdnaCoord faldo:position {6} .
            ?chrCoord faldo:position {7} .
            ?dbSNP rdfs:label "{8}" .
        }}
        """
    sparql_query = query_template.format(
        reference_residue, ref_base, variant_base, altered_residue,
        transcript_id, aa_position, bp_pos, genome_pos_start, db_snp_id)

    # Expected Results
    expected_results = [[
        cosmic_uri, gene_uri, aa_region_uri, cdna_region_uri,
        chr_region_uri, db_snp_uri, transcript_uri, uniprot_uri,
        refseq_uri, aa_coord_uri, cdna_coord_uri, chr_coord_uri
    ]]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)