class TestSchemaOrg(unittest.TestCase):
    """Smoke-test ``describe()`` on every class and property of a schema.

    Note: despite the class name, ``setUp`` loads the BioThings JSON-LD
    schema (see the URL below), not a schema.org-only schema.
    """

    def setUp(self):
        # Build the Schema once per test from the BioThings JSON-LD URL.
        schema_url = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        schema = Schema(schema_url)
        self.se = schema
        # Cache the full class / property listings used by the tests below.
        self.clses = schema.list_all_classes()
        self.props = schema.list_all_properties()

    def test_schemaclass_class(self):
        """Look up every listed class by name and call ``describe()`` on it."""
        # No assertions: the test passes as long as nothing raises.
        for listed_cls in self.clses:
            self.se.get_class(listed_cls.name).describe()

    def test_schemaproperty_class(self):
        """Look up every listed property by name and call ``describe()`` on it."""
        # No assertions: the test passes as long as nothing raises.
        for listed_prop in self.props:
            self.se.get_property(listed_prop.name).describe()
class TestSchemaOrg(unittest.TestCase):
    """Smoke-test ``describe()`` for every class and property of schema.org.

    Each item is fetched with the default output type and then again with
    the "curie", "uri" and "label" output types.
    """

    # Explicit output types exercised after the default lookup, in order.
    _OUTPUT_TYPES = ("curie", "uri", "label")

    def setUp(self):
        # Load a schema.org-only schema and cache its listings.
        schema = Schema(base_schema=["schema.org"])
        self.se = schema
        self.clses = schema.list_all_classes()
        self.props = schema.list_all_properties()

    def test_schemaclass_class(self):
        """Call ``describe()`` on every class under each output type."""
        # No assertions: the test passes as long as nothing raises.
        for listed_cls in self.clses:
            # Default output type first, then each explicit one.
            self.se.get_class(listed_cls.name).describe()
            for output_type in self._OUTPUT_TYPES:
                self.se.get_class(listed_cls.name, output_type=output_type).describe()

    def test_schemaproperty_class(self):
        """Call ``describe()`` on every property under each output type."""
        # No assertions: the test passes as long as nothing raises.
        for listed_prop in self.props:
            # Default output type first, then each explicit one.
            self.se.get_property(listed_prop.name).describe()
            for output_type in self._OUTPUT_TYPES:
                self.se.get_property(listed_prop.name, output_type=output_type).describe()
class TestSchemaClass(unittest.TestCase):
    """Exercise the listing and lookup methods of ``Schema`` on the BioThings schema."""

    def setUp(self):
        # A fresh Schema is built from the BioThings JSON-LD URL for each test.
        self.se = Schema('https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld')

    def test_list_all_classes(self):
        """``list_all_classes`` returns SchemaClass objects named by CURIE."""
        classes = self.se.list_all_classes()
        names = [item.name for item in classes]
        # The root class and a BioThings-defined class must both be listed.
        for present in ('schema:Thing', 'bts:Gene'):
            self.assertIn(present, names)
        # A made-up class and a bare (non-CURIE) label must not be listed.
        for absent in ('bts:ffff', 'Thing'):
            self.assertNotIn(absent, names)
        # Items are exactly SchemaClass instances (exact type, not subclass).
        self.assertEqual(SchemaClass, type(classes[0]))

    def test_list_all_properties(self):
        """``list_all_properties`` returns SchemaProperty objects named by CURIE."""
        properties = self.se.list_all_properties()
        names = [item.name for item in properties]
        # "name" is listed under its CURIE form only.
        self.assertIn('schema:name', names)
        for absent in ('name', 'bts:ffff'):
            self.assertNotIn(absent, names)
        # Items are exactly SchemaProperty instances.
        self.assertEqual(SchemaProperty, type(properties[0]))

    def test_get_class(self):
        """``get_class`` returns a SchemaClass object."""
        # NOTE(review): the listing test expects 'bts:Gene'; confirm that
        # looking up 'schema:Gene' here is intentional.
        looked_up = self.se.get_class("schema:Gene")
        self.assertEqual(SchemaClass, type(looked_up))

    def test_get_property(self):
        """``get_property`` returns a SchemaProperty object."""
        looked_up = self.se.get_property("ensembl")
        self.assertEqual(SchemaProperty, type(looked_up))
class TestSchemaClass(unittest.TestCase):
    """Exercise ``Schema`` listing and lookup on a local bioschemas extension file."""

    def setUp(self):
        # Load the fixture stored under <_CURRENT>/data.
        self.se = Schema(os.path.join(_CURRENT, 'data', 'extend_from_bioschemas.json'))

    def test_list_all_classes(self):
        """``list_all_classes`` returns SchemaClass objects named by CURIE."""
        classes = self.se.list_all_classes()
        names = [item.name for item in classes]
        # The class appears under its CURIE, never under its bare label.
        self.assertIn('bioschemas:Gene', names)
        self.assertNotIn('Gene', names)
        # Items are exactly SchemaClass instances (exact type, not subclass).
        self.assertEqual(SchemaClass, type(classes[0]))

    def test_list_all_properties(self):
        """``list_all_properties`` returns SchemaProperty objects named by CURIE."""
        properties = self.se.list_all_properties()
        names = [item.name for item in properties]
        # "name" is listed under its CURIE form only.
        self.assertIn('schema:name', names)
        for absent in ('name', 'bts:ffff'):
            self.assertNotIn(absent, names)
        # Items are exactly SchemaProperty instances.
        self.assertEqual(SchemaProperty, type(properties[0]))

    def test_get_class(self):
        """``get_class`` returns a SchemaClass object for a CURIE."""
        looked_up = self.se.get_class("bioschemas:Gene")
        self.assertEqual(SchemaClass, type(looked_up))

    def test_get_property(self):
        """``get_property`` returns a SchemaProperty object for a CURIE."""
        looked_up = self.se.get_property("bioschemas:encodesBioChemEntity")
        self.assertEqual(SchemaProperty, type(looked_up))
class MappingParser:
    """Parse the mapping file between biothings schema and biothings API.

    Typical use: construct the parser, call :meth:`load_mapping` with a
    mapping document, then call :meth:`connect` to obtain a graph whose
    edges link identifier keys of the mapping to identifier keys of the
    objects it points at.
    """

    BIOTHINGS_SCHEMA_PATH = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'

    def __init__(self, se=None):
        """Prepare the identifier and link-property lookups.

        :arg se: an already loaded ``Schema``; when falsy, the schema at
            ``BIOTHINGS_SCHEMA_PATH`` is loaded instead.
        """
        if not se:
            self.se = Schema(self.BIOTHINGS_SCHEMA_PATH)
        else:
            self.se = se
        # list all properties which are descendants of identifiers
        self.id_list = self.se.get_property("identifier", output_type="curie").descendant_properties
        # get all classes defined in biothings schema JSON-LD file
        self.defined_clses = [_item.name for _item in self.se.list_all_defined_classes()]
        # list of properties whose "range" is among defined classes;
        # the lookup set is built once rather than once per property
        defined_cls_names = set(self.defined_clses)
        self.linked_prop_list = [
            _prop.name
            for _prop in self.se.list_all_defined_properties()
            if any(_item.name in defined_cls_names for _item in _prop.range)
        ]
        self.cls_prop_clsf = {}

    def load_mapping(self, mapping, api=None):
        """Load a mapping document and remember which API it describes.

        :arg mapping: passed as-is to ``load_json_or_yaml``
        :arg api: API name; used by :meth:`connect` as the ``api`` edge
            attribute and as the key into the module-level ``metadata``
        """
        self.mapping = load_json_or_yaml(mapping)
        self.api = api

    def classify_keys_in_json(self, json_doc):
        """Classify the keys in a json doc.

        :arg dict json_doc: document whose keys are inspected
        :returns: a ``defaultdict(list)`` with key ``'id'`` for keys found
            in ``self.id_list`` and ``'links'`` for keys found in
            ``self.linked_prop_list``; other keys are ignored
        """
        result = defaultdict(list)
        for _key in json_doc:
            if _key in self.id_list:
                result['id'].append(_key)
            elif _key in self.linked_prop_list:
                result['links'].append(_key)
        return result

    def connect(self):
        """Build a ``networkx.MultiDiGraph`` from the loaded mapping.

        :meth:`load_mapping` must have been called first. For each "links"
        key of the mapping, every object carrying an ``"@type"`` yields one
        edge per (input id, output id) pair. When the module-level
        ``metadata`` marks ``self.api`` as a ``'biothings'`` API, a reverse
        edge labelled with the inverse property is added as well.

        Side effect: a link value given as a single object is replaced in
        ``self.mapping`` by a one-element list.

        :returns: the populated ``MultiDiGraph``
        """
        G = nx.MultiDiGraph()
        self.type = self.mapping.get("@type")
        # classify the keys in the JSON doc
        clsf = self.classify_keys_in_json(self.mapping)
        # for each "links" properties, find its ids
        for predicate in clsf['links']:
            # isinstance (not an exact type check) so dict subclasses such
            # as OrderedDict are wrapped too instead of iterated by key
            if isinstance(self.mapping[predicate], dict):
                self.mapping[predicate] = [self.mapping[predicate]]
            for _pred in self.mapping[predicate]:
                # only typed objects describe an edge; skip anything else
                if not isinstance(_pred, dict) or "@type" not in _pred:
                    continue
                sp = self.se.get_property(predicate)
                obj_clsf = self.classify_keys_in_json(_pred)
                common_prefix = find_common_path(get_dict_values(_pred))
                # an explicit "$input" / "$source" overrides the defaults
                input_id = [_pred['$input']] if '$input' in _pred else clsf['id']
                source = _pred['$source'] if '$source' in _pred else self.api
                for in_key, out_key in itertools.product(input_id, obj_clsf['id']):
                    output_field = _pred[out_key]
                    input_field = self.mapping[in_key]
                    # multiple fields are flattened to a comma-separated string
                    if isinstance(input_field, list):
                        input_field = ','.join(input_field)
                    if isinstance(output_field, list):
                        output_field = ','.join(output_field)
                    G.add_edge(in_key, out_key,
                               label=predicate,
                               mapping_key=predicate,
                               api=self.api,
                               source=source,
                               input_field=input_field,
                               input_type=self.mapping["@type"],
                               input_id=in_key,
                               output_id=out_key,
                               output_type=_pred["@type"],
                               output_field=common_prefix if common_prefix else output_field)
                    if metadata[self.api].get('api_type') == 'biothings':
                        inverse_property = None if not sp.inverse_property else sp.inverse_property.name
                        if not inverse_property:
                            # report predicates without an inverse property
                            print(predicate)
                        # reverse edge is added either way; its label is
                        # None when no inverse property exists
                        G.add_edge(out_key, in_key,
                                   api=self.api,
                                   input_field=output_field,
                                   input_type=_pred["@type"],
                                   source=source,
                                   input_id=out_key,
                                   output_id=in_key,
                                   output_type=self.mapping["@type"],
                                   output_field=input_field,
                                   label=inverse_property,
                                   mapping_key=in_key)
        return G
class SchemaExtractor:
    """Extract BioThings Schema and construct networkx graph."""

    def __init__(self, schema):
        """Load biothings schema.

        :arg schema: passed as-is to ``Schema``
        """
        self.se = Schema(schema)
        # get all properties which are descendants of "identifier" property
        self.all_ids = self.se.get_property('identifier', output_type="curie").descendant_properties

    def find_descendants(self, lst):
        """Find all descendants for a list of schemaclass classes.

        :arg list lst: a list of schemaclass classes
        :returns: a set merging the ``descendant_classes`` of every item;
            an empty set when ``lst`` is empty or ``None``
        """
        # if input is empty list, return an empty set
        if not lst:
            return set()
        # find descendant of each class and then merge together into a set
        return set(itertools.chain.from_iterable(
            self.se.get_class(_cls, output_type="curie").descendant_classes
            for _cls in lst
        ))

    def find_cls_ids(self, _cls):
        """Find all identifiers which belongs to a class.

        :arg cls _cls: a class reference accepted by ``Schema.get_class``
        :returns: list of the ``'curie'`` values of the class's properties
            that are also in ``self.all_ids``
        """
        # get all properties belong to the cls which are descendants of "identifiers"
        cls_properties = self.se.get_class(_cls).list_properties(group_by_class=False)
        return [_prop['curie'] for _prop in cls_properties
                if _prop and _prop['curie'] in self.all_ids]

    def schema2networkx(self):
        """Convert schema into a networkx graph.

        Logics
        ~~~~~~
        Each identifier represents a node
        node properties include its semantic type (class name)
        The edge is represented by non-identifier properties

        :returns: a ``networkx.DiGraph`` whose edges carry the property's
            label in the ``label`` attribute
        """
        G = nx.DiGraph()
        class_graph = self.se.full_class_only_graph
        # list all properties defined in the schema
        for _property in self.se.list_all_defined_properties():
            # Skip identifier properties. ``find_cls_ids`` compares CURIE
            # strings against ``all_ids``, so the property object alone would
            # presumably never match; check both the object and its name.
            if _property in self.all_ids or _property.name in self.all_ids:
                continue
            # find all descendants of domain classes
            input_clses = {_cls.name for _cls in _property.domain if _cls.uri in class_graph}
            input_clses |= self.find_descendants(input_clses)
            # find all descendants of range classes
            output_clses = {_cls.name for _cls in _property.range if _cls.uri in class_graph}
            output_clses |= self.find_descendants(output_clses)
            if not (input_clses and output_clses):
                continue
            input_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in input_clses))
            output_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in output_clses))
            if input_ids and output_ids:
                # NOTE(review): zip pairs the two sets positionally (set order
                # is arbitrary) and stops at the shorter one; confirm whether
                # every input/output combination was intended instead.
                G.add_edges_from(zip(input_ids, output_ids), label=_property.label)
        return G
class TestSchemaPropertyClass(unittest.TestCase):
    """Test SchemaProperty Class"""

    def setUp(self):
        # Only build the fixture here; the lookup assertions live in their
        # own test so a lookup regression does not fail every test.
        schema_url = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        self.se = Schema(schema_url)

    def test_get_property_input_forms(self):
        """A property resolves identically from its NAME, CURIE or URI."""
        for identifier in ("ensembl",
                           "bts:ensembl",
                           "http://schema.biothings.io/ensembl"):
            sp = self.se.get_property(identifier)
            self.assertEqual(sp.name, "bts:ensembl")
            self.assertEqual(sp.uri, "http://schema.biothings.io/ensembl")
            self.assertEqual(sp.label, "ensembl")

    def test_initialization(self):
        """A property that is not in the schema is flagged as undefined."""
        # if input property is not in schema, defined_in_schema should be False
        sp = SchemaProperty('dd', self.se)
        self.assertFalse(sp.defined_in_schema)

    def test_parent_properties(self):
        """ Test parent_properties function """
        sp = self.se.get_property("ensembl")
        parents = sp.parent_properties
        # "schema:identifier" should be one of the parents
        self.assertIn("schema:identifier", [_item.name for _item in parents])
        # check negative cases
        self.assertNotIn("bts:sgd", [_item.name for _item in parents])
        # if input doesn't have parent properties, should return empty list
        sp = self.se.get_property("identifier")
        parents = sp.parent_properties
        self.assertEqual(parents, [])
        # test if input is not defined
        sp = self.se.get_property('dd')
        parents = sp.parent_properties
        self.assertEqual(parents, [])

    def test_child_properties(self):
        """ Test child_properties function"""
        sp = self.se.get_property("identifier")
        children = sp.child_properties
        child_names = [_item.name for _item in children]
        # check if ensembl is in children
        self.assertIn('bts:ensembl', child_names)
        # check that affectsExpressionOf is not in children
        self.assertNotIn('bts:affectsExpressionOf', child_names)
        # the property itself should not be in its own children
        self.assertNotIn('schema:identifier', child_names)
        # test if input property is the leaf property
        sp = self.se.get_property("ensembl")
        children = sp.child_properties
        self.assertEqual(children, [])
        # test if input is not defined
        sp = self.se.get_property("dd")
        children = sp.child_properties
        self.assertEqual(children, [])

    def test_describe(self):
        """test describe function"""
        # an undefined property describes itself as an empty dict
        sp = self.se.get_property("dd")
        describe = sp.describe()
        self.assertEqual(describe, {})