class TestSchemaOrg(unittest.TestCase):
    """Smoke-test SchemaClass and SchemaProperty against the BioThings schema.

    NOTE(review): despite the class name, ``setUp`` loads the BioThings
    JSON-LD schema, not a schema.org-only schema.

    Every class and property listed by the schema is looked up again by name
    and ``describe()`` is called on the result.  The tests make no assertion
    on the returned value; they pass as long as no call raises.
    """

    def setUp(self):
        """Load the BioThings schema and cache its class and property lists."""
        # preload biothings schema
        schema_url = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        self.se = Schema(schema_url)
        # list_all_classes / list_all_properties are exercised here as well
        self.clses = self.se.list_all_classes()
        self.props = self.se.list_all_properties()

    def test_schemaclass_class(self):
        """Look up and describe every class in the BioThings schema."""
        for _cls in self.clses:
            # subTest reports which class failed instead of aborting the loop
            # at the first error.
            with self.subTest(cls=_cls.name):
                self.se.get_class(_cls.name).describe()

    def test_schemaproperty_class(self):
        """Look up and describe every property in the BioThings schema."""
        for _prop in self.props:
            with self.subTest(prop=_prop.name):
                self.se.get_property(_prop.name).describe()
class TestSchemaOrg(unittest.TestCase):
    """Smoke-test SchemaClass and SchemaProperty against the schema.org schema.

    Every class and property listed by the schema is looked up once without
    an explicit ``output_type`` and once for each of "curie", "uri" and
    "label", and ``describe()`` is called on each result.  The tests make no
    assertion on the returned value; they pass as long as no call raises.
    """

    # Keyword arguments for each lookup variant.  The empty dict exercises
    # whatever output type the library applies when none is given.
    _LOOKUP_VARIANTS = (
        {},
        {"output_type": "curie"},
        {"output_type": "uri"},
        {"output_type": "label"},
    )

    def setUp(self):
        """Load the schema.org-only schema and cache its class/property lists."""
        # preload schemaorg-only schema
        self.se = Schema(base_schema=["schema.org"])
        # list_all_classes / list_all_properties are exercised here as well
        self.clses = self.se.list_all_classes()
        self.props = self.se.list_all_properties()

    def test_schemaclass_class(self):
        """Look up and describe every schema.org class in every output type."""
        for _cls in self.clses:
            for kwargs in self._LOOKUP_VARIANTS:
                # subTest reports the failing class/output type and lets the
                # remaining combinations still run.
                with self.subTest(cls=_cls.name, **kwargs):
                    self.se.get_class(_cls.name, **kwargs).describe()

    def test_schemaproperty_class(self):
        """Look up and describe every schema.org property in every output type."""
        for _prop in self.props:
            for kwargs in self._LOOKUP_VARIANTS:
                with self.subTest(prop=_prop.name, **kwargs):
                    self.se.get_property(_prop.name, **kwargs).describe()
class TestSchemaClass(unittest.TestCase):
    """Test the listing and lookup methods of Schema on the BioThings schema."""

    def setUp(self):
        """Load the BioThings schema from its remote JSON-LD document."""
        schema_url = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        self.se = Schema(schema_url)

    def test_list_all_classes(self):
        """ Test list_all_classes function """
        all_cls = self.se.list_all_classes()
        all_cls_names = [_cls.name for _cls in all_cls]
        # assert root level Class in all classes
        self.assertIn('schema:Thing', all_cls_names)
        # assert class "Gene" in all classes
        self.assertIn('bts:Gene', all_cls_names)
        # class 'ffff' should not be one of the classes
        self.assertNotIn('bts:ffff', all_cls_names)
        # class name should be curie, so the bare label must be absent
        self.assertNotIn('Thing', all_cls_names)
        # every listed item is wrapped as a SchemaClass
        self.assertIsInstance(all_cls[0], SchemaClass)

    def test_list_all_properties(self):
        """ Test list_all_properties function"""
        all_props = self.se.list_all_properties()
        all_prop_names = [_prop.name for _prop in all_props]
        # assert "name" in all props
        self.assertIn('schema:name', all_prop_names)
        # property name should be curie, so the bare label must be absent
        self.assertNotIn('name', all_prop_names)
        # assert "ffff" should not be one of the props
        self.assertNotIn('bts:ffff', all_prop_names)
        # every listed item is wrapped as a SchemaProperty
        self.assertIsInstance(all_props[0], SchemaProperty)

    def test_get_class(self):
        """ Test get_class function"""
        # Gene is listed under the "bts:" prefix in this schema (see
        # test_list_all_classes), so look it up by that CURIE.
        scls = self.se.get_class("bts:Gene")
        self.assertIsInstance(scls, SchemaClass)
        # A type check alone cannot tell a real class from a placeholder for
        # an unknown identifier, so also require that the class is defined.
        self.assertTrue(scls.defined_in_schema)

    def test_get_property(self):
        """ Test get_property function"""
        # NOTE(review): "ensembl" is passed as a bare label; confirm it
        # resolves to a property that is actually defined in the schema.
        sp = self.se.get_property("ensembl")
        self.assertIsInstance(sp, SchemaProperty)
class TestSchemaClass(unittest.TestCase):
    """Test the listing and lookup methods of Schema on a bioschemas extension."""

    def setUp(self):
        """Load the schema from the local ``data/extend_from_bioschemas.json``."""
        schema_file = os.path.join(_CURRENT, 'data', 'extend_from_bioschemas.json')
        self.se = Schema(schema_file)

    def test_list_all_classes(self):
        """ Test list_all_classes function """
        all_cls = self.se.list_all_classes()
        all_cls_names = [_cls.name for _cls in all_cls]
        # assert class "Gene" from the bioschemas namespace is listed
        self.assertIn('bioschemas:Gene', all_cls_names)
        # class name should be curie, so the bare label must be absent
        self.assertNotIn('Gene', all_cls_names)
        # every listed item is wrapped as a SchemaClass
        self.assertIsInstance(all_cls[0], SchemaClass)

    def test_list_all_properties(self):
        """ Test list_all_properties function"""
        all_props = self.se.list_all_properties()
        all_prop_names = [_prop.name for _prop in all_props]
        # assert "name" in all props
        self.assertIn('schema:name', all_prop_names)
        # property name should be curie, so the bare label must be absent
        self.assertNotIn('name', all_prop_names)
        # assert "ffff" should not be one of the props
        self.assertNotIn('bts:ffff', all_prop_names)
        # every listed item is wrapped as a SchemaProperty
        self.assertIsInstance(all_props[0], SchemaProperty)

    def test_get_class(self):
        """ Test get_class function"""
        scls = self.se.get_class("bioschemas:Gene")
        self.assertIsInstance(scls, SchemaClass)

    def test_get_property(self):
        """ Test get_property function"""
        sp = self.se.get_property("bioschemas:encodesBioChemEntity")
        self.assertIsInstance(sp, SchemaProperty)
class TestSchemaClassClass(unittest.TestCase):
    """Test the SchemaClass objects returned by ``Schema.get_class``.

    All tests run against the remote BioThings JSON-LD schema.  Hierarchy
    accessors are checked in the default output type (objects exposing
    ``.name``) and in the "curie", "uri" and "label" output types (plain
    strings).
    """

    # Remote BioThings schema shared by every test in this class.
    SCHEMA_URL = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'

    def setUp(self):
        """Load the BioThings schema."""
        self.se = Schema(self.SCHEMA_URL)

    def test_initialization_with_context_works(self):
        """Passing the ``@context`` of a local copy must yield the same schema."""
        biothings_jsonld_path = os.path.join(_CURRENT, 'data', 'biothings_test.jsonld')
        biothings_schema = load_json_or_yaml(biothings_jsonld_path)
        se_with_context = Schema(self.SCHEMA_URL, biothings_schema['@context'])
        self.assertEqual(se_with_context.schema, self.se.schema)

    def test_initialization(self):
        """Test name/uri/label resolution for CURIE and URI inputs."""
        # if input class is not in schema, defined_in_schema should be False
        scls = self.se.get_class("dd")
        self.assertFalse(scls.defined_in_schema)
        # NOTE(review): lookup by bare label (e.g. "Gene") is not exercised
        # here; only the CURIE and the full URI are.
        for identifier in ("bts:Gene", "http://schema.biothings.io/Gene"):
            with self.subTest(identifier=identifier):
                scls = self.se.get_class(identifier)
                self.assertEqual(scls.name, "bts:Gene")
                self.assertEqual(scls.uri, "http://schema.biothings.io/Gene")
                self.assertEqual(scls.label, "Gene")

    def test_parent_classes(self):
        """ Test parent_classes function """
        scls = self.se.get_class("bts:Gene")
        parents = scls.parent_classes
        # the first item of the first parent path should be 'Thing'
        self.assertEqual(parents[0][0].name, 'schema:Thing')
        # the root class and an undefined class both have no parent paths
        for identifier in ("Thing", "dd"):
            with self.subTest(identifier=identifier):
                scls = self.se.get_class(identifier)
                self.assertEqual(scls.parent_classes, [])
        # with an explicit output type the path items are plain strings
        expected_root = {
            "uri": 'http://schema.org/Thing',
            "label": 'Thing',
            "curie": 'schema:Thing',
        }
        for output_type, root in expected_root.items():
            with self.subTest(output_type=output_type):
                scls = self.se.get_class("bts:Gene", output_type=output_type)
                parents = scls.parent_classes
                self.assertEqual(parents[0][0], root)

    def test_ancestor_classes(self):
        """ Test ancestor_classes function"""
        # default output type: items are objects exposing .name
        scls = self.se.get_class("bts:MolecularEntity")
        ancestor_names = [_item.name for _item in scls.ancestor_classes]
        self.assertIn('schema:Thing', ancestor_names)
        self.assertIn('bts:BiologicalEntity', ancestor_names)
        # Gene is a descendant, not an ancestor
        self.assertNotIn('bts:Gene', ancestor_names)
        # the class itself should not be among its ancestors
        self.assertNotIn('bts:MolecularEntity', ancestor_names)
        # the root class and an undefined class both have no ancestors
        for identifier in ("Thing", "dd"):
            with self.subTest(identifier=identifier):
                scls = self.se.get_class(identifier)
                self.assertEqual(scls.ancestor_classes, [])
        # with an explicit output type the items are plain strings
        expected = {
            "curie": ('bts:BiologicalEntity', 'schema:Thing'),
            "label": ('Thing', 'BiologicalEntity'),
            "uri": ('http://schema.biothings.io/BiologicalEntity',
                    'http://schema.org/Thing'),
        }
        for output_type, members in expected.items():
            with self.subTest(output_type=output_type):
                scls = self.se.get_class("bts:MolecularEntity", output_type=output_type)
                ancestors = scls.ancestor_classes
                for member in members:
                    self.assertIn(member, ancestors)

    def test_descendant_classes(self):
        """ Test descendant_classes function"""
        # default output type: items are objects exposing .name
        scls = self.se.get_class("bts:MolecularEntity")
        descendant_names = [_item.name for _item in scls.descendant_classes]
        self.assertIn('bts:Gene', descendant_names)
        # Thing is an ancestor, not a descendant
        self.assertNotIn('schema:Thing', descendant_names)
        # the class itself should not be among its descendants
        self.assertNotIn('bts:MolecularEntity', descendant_names)
        # a leaf class and an undefined class both have no descendants
        for identifier in ("bts:Gene", "dd"):
            with self.subTest(identifier=identifier):
                scls = self.se.get_class(identifier)
                self.assertEqual(scls.descendant_classes, [])
        # with an explicit output type the items are plain strings
        expected = {
            "curie": 'bts:Gene',
            "label": 'Gene',
            "uri": 'http://schema.biothings.io/Gene',
        }
        for output_type, member in expected.items():
            with self.subTest(output_type=output_type):
                scls = self.se.get_class("bts:MolecularEntity", output_type=output_type)
                self.assertIn(member, scls.descendant_classes)

    def test_child_classes(self):
        """ Test child_classes function"""
        # default output type: items are objects exposing .name
        scls = self.se.get_class("bts:MolecularEntity")
        children_names = [_item.name for _item in scls.child_classes]
        self.assertIn('bts:GeneFamily', children_names)
        # Gene is a deeper descendant, not a direct child
        self.assertNotIn('bts:Gene', children_names)
        # Thing is an ancestor, not a child
        self.assertNotIn('schema:Thing', children_names)
        # the class itself should not be among its children
        self.assertNotIn('bts:MolecularEntity', children_names)
        # a leaf class and an undefined class both have no children
        for identifier in ("bts:Gene", "dd"):
            with self.subTest(identifier=identifier):
                scls = self.se.get_class(identifier)
                self.assertEqual(scls.child_classes, [])
        # with an explicit output type the items are plain strings
        expected = {
            "curie": 'bts:GeneFamily',
            "uri": 'http://schema.biothings.io/GeneFamily',
            "label": 'GeneFamily',
        }
        for output_type, member in expected.items():
            with self.subTest(output_type=output_type):
                scls = self.se.get_class("bts:MolecularEntity", output_type=output_type)
                self.assertIn(member, scls.child_classes)

    def test_used_by(self):
        """ Test used_by function"""
        scls = self.se.get_class("bts:GenomicEntity")
        usage = scls.used_by()
        # check the container type first so a wrong type is reported as such
        self.assertIsInstance(usage, list)
        # assertGreater prints the actual length on failure
        self.assertGreater(len(usage), 1)
        # an undefined class is used by nothing
        scls = self.se.get_class("dd")
        self.assertEqual(scls.used_by(), [])

    def test_describe(self):
        """test describe function"""
        # describing an undefined class yields an empty dict
        scls = self.se.get_class("dd")
        self.assertEqual(scls.describe(), {})
class SchemaExtractor():
    """Extract BioThings Schema and construct networkx graph."""

    def __init__(self, schema):
        """Load biothings schema.

        :arg schema: passed unchanged to ``Schema``
        """
        self.se = Schema(schema)
        # all properties which are descendants of the "identifier" property,
        # requested with output_type="curie"
        self.all_ids = self.se.get_property('identifier', output_type="curie").descendant_properties

    def find_descendants(self, lst):
        """Find all descendants for a collection of classes.

        :arg lst: iterable of class identifiers accepted by
            ``Schema.get_class`` (``schema2networkx`` passes a set of class
            names)
        :returns: set merging the ``descendant_classes`` of every input
            class, requested with output_type="curie"; an empty set when
            ``lst`` is empty
        """
        # if input is empty, return an empty set
        if not lst:
            return set()
        # find descendants of each class and merge them into one set
        return set(itertools.chain.from_iterable(
            self.se.get_class(_cls, output_type="curie").descendant_classes
            for _cls in lst
        ))

    def find_cls_ids(self, _cls):
        """Find all identifiers which belong to a class.

        :arg _cls: class identifier accepted by ``Schema.get_class``
        :returns: list of the ``'curie'`` values of the class's properties
            that are also present in ``self.all_ids``
        """
        class_properties = self.se.get_class(_cls).list_properties(group_by_class=False)
        # falsy entries are skipped before their 'curie' key is read
        return [
            _prop['curie']
            for _prop in class_properties
            if _prop and _prop['curie'] in self.all_ids
        ]

    def schema2networkx(self):
        """Convert schema into a networkx graph.

        Logics
        ~~~~~~
        Each identifier represents a node
        node properties include its semantic type (class name)
        The edge is represented by non-identifier properties

        For every non-identifier property, each identifier of its domain
        classes (and their descendants) is linked to each identifier of its
        range classes (and their descendants), with the property's label
        stored as the edge's ``label`` attribute.

        :returns: the resulting ``nx.DiGraph``
        """
        G = nx.DiGraph()
        class_graph = self.se.full_class_only_graph
        # list all properties defined in the schema
        for _property in self.se.list_all_defined_properties():
            # all_ids is compared against 'curie' strings elsewhere in this
            # class, so test the property's name rather than the property
            # object itself.
            if _property.name in self.all_ids:
                continue
            # domain classes known to the class graph, plus their descendants
            input_clses = {_cls.name for _cls in _property.domain
                           if _cls.uri in class_graph}
            input_clses |= self.find_descendants(input_clses)
            # range classes known to the class graph, plus their descendants
            output_clses = {_cls.name for _cls in _property.range
                            if _cls.uri in class_graph}
            output_clses |= self.find_descendants(output_clses)
            if not (input_clses and output_clses):
                continue
            input_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in input_clses))
            output_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in output_clses))
            if input_ids and output_ids:
                # Link every input identifier to every output identifier.
                # Pairing the two sets positionally would depend on set
                # iteration order and silently drop the surplus of the
                # larger set.
                G.add_edges_from(itertools.product(input_ids, output_ids),
                                 label=_property.label)
        return G
# %% [markdown] # ## Question # Thread #cvisb andrew 21 days ago # > For each repository, show how many datasets have each metadata field populated. # %% from collections import defaultdict, Counter from functools import partial from elasticsearch_dsl import Search from elasticsearch import Elasticsearch from biothings_schema import Schema schema = Schema() dataset = schema.get_class("schema:Dataset") properties = sorted([ prop['label'] for prop in dataset.list_properties( class_specific=False, group_by_class=False) ]) # %% client = Elasticsearch('su07:9199') indicies = ('zenodo', 'omicsdi', 'harvard_dataverse','ncbi_geo_transformed') result = defaultdict(partial(defaultdict, Counter)) count = 0 for index in indicies: search = Search(using=client, index=index) for doc in search.scan():