Example #1
0
class TestBioThingsSchema(unittest.TestCase):
    """Using the BioThings schema to test all functions in biothings_schema

    The class has its own name (it was previously also called
    ``TestSchemaOrg``) because a second class definition with the same name
    in one module rebinds the name, and the earlier suite is then never
    collected by the test runner.
    """
    def setUp(self):
        # preload biothings schema (fetched from GitHub, so network is needed)
        PATH = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        self.se = Schema(PATH)
        # listing everything up front also exercises list_all_classes
        self.clses = self.se.list_all_classes()
        # ... and list_all_properties
        self.props = self.se.list_all_properties()

    def test_schemaclass_class(self):
        """ Test the SchemaClass Class using all classes in BioThings schema"""
        # loop through all classes
        for _cls in self.clses:
            # look the class up again by name, then make sure describe()
            # runs without raising; its return value is not inspected
            self.se.get_class(_cls.name).describe()

    def test_schemaproperty_class(self):
        """ Test the SchemaProperty Class using all properties in BioThings schema
        """
        # loop through all properties
        for _prop in self.props:
            # look the property up again by name, then make sure describe()
            # runs without raising; its return value is not inspected
            self.se.get_property(_prop.name).describe()
class TestSchemaOrg(unittest.TestCase):
    """Run the biothings_schema API over a schema built from schema.org only
    """
    def setUp(self):
        # build a Schema whose only base schema is schema.org
        self.se = Schema(base_schema=["schema.org"])
        # keep every class and property around for the tests to walk
        self.clses = self.se.list_all_classes()
        self.props = self.se.list_all_properties()

    def test_schemaclass_class(self):
        """ Call describe() on every class of the Schemaorg schema"""
        for entry in self.clses:
            # first with the default output type ...
            self.se.get_class(entry.name).describe()
            # ... then once for each explicit output type
            for fmt in ("curie", "uri", "label"):
                self.se.get_class(entry.name, output_type=fmt).describe()

    def test_schemaproperty_class(self):
        """ Call describe() on every property of the Schemaorg schema
        """
        for entry in self.props:
            # first with the default output type ...
            self.se.get_property(entry.name).describe()
            # ... then once for each explicit output type
            for fmt in ("curie", "uri", "label"):
                self.se.get_property(entry.name, output_type=fmt).describe()
Example #3
0
class TestSchemaClass(unittest.TestCase):
    """Test the Schema class against the BioThings schema
    """
    def setUp(self):
        # the schema is fetched from GitHub, so these tests need network access
        schema_url = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'
        self.se = Schema(schema_url)

    def test_list_all_classes(self):
        """ Test list_all_classes function
        """
        all_cls = self.se.list_all_classes()
        all_cls_names = [_cls.name for _cls in all_cls]
        # assert root level Class in all classes
        self.assertIn('schema:Thing', all_cls_names)
        # assert class "Gene" in all classes
        self.assertIn('bts:Gene', all_cls_names)
        # class 'ffff' should not be one of the classes
        self.assertNotIn('bts:ffff', all_cls_names)
        # class name should be curie
        self.assertNotIn('Thing', all_cls_names)
        # assert type of the class is exactly SchemaClass
        self.assertIs(type(all_cls[0]), SchemaClass)

    def test_list_all_properties(self):
        """ Test list_all_properties function"""
        all_props = self.se.list_all_properties()
        all_prop_names = [_prop.name for _prop in all_props]
        # assert "name" in all props
        self.assertIn('schema:name', all_prop_names)
        # property name should be curie
        self.assertNotIn('name', all_prop_names)
        # assert "ffff" should not be one of the props
        self.assertNotIn('bts:ffff', all_prop_names)
        # assert type of the property is exactly SchemaProperty
        self.assertIs(type(all_props[0]), SchemaProperty)

    def test_get_class(self):
        """ Test get_class function"""
        # Gene lives in the "bts" namespace of this schema (see
        # test_list_all_classes), so look it up under that prefix rather
        # than "schema:", which names a class this schema does not list
        scls = self.se.get_class("bts:Gene")
        self.assertIs(type(scls), SchemaClass)

    def test_get_property(self):
        """ Test get_property function"""
        sp = self.se.get_property("ensembl")
        self.assertIs(type(sp), SchemaProperty)
Example #4
0
class TestSchemaClass(unittest.TestCase):
    """Check the Schema API against the local extend_from_bioschemas.json file
    """
    def setUp(self):
        # the schema document sits in the "data" folder under _CURRENT
        fixture = os.path.join(_CURRENT, 'data', 'extend_from_bioschemas.json')
        self.se = Schema(fixture)

    def test_list_all_classes(self):
        """ list_all_classes should return SchemaClass objects named by curie
        """
        classes = self.se.list_all_classes()
        names = [item.name for item in classes]
        # the curie form must be listed ...
        self.assertIn('bioschemas:Gene', names)
        # ... while the bare label must not
        self.assertNotIn('Gene', names)
        # entries are SchemaClass objects
        self.assertEqual(SchemaClass, type(classes[0]))

    def test_list_all_properties(self):
        """ list_all_properties should return SchemaProperty objects named by curie"""
        found = self.se.list_all_properties()
        names = [item.name for item in found]
        # the curie form of "name" must be listed ...
        self.assertIn('schema:name', names)
        # ... while the bare label must not
        self.assertNotIn('name', names)
        # a made-up property must not show up
        self.assertNotIn('bts:ffff', names)
        # entries are SchemaProperty objects
        self.assertEqual(SchemaProperty, type(found[0]))

    def test_get_class(self):
        """ get_class should hand back a SchemaClass"""
        self.assertEqual(SchemaClass,
                         type(self.se.get_class("bioschemas:Gene")))

    def test_get_property(self):
        """ get_property should hand back a SchemaProperty"""
        self.assertEqual(
            SchemaProperty,
            type(self.se.get_property("bioschemas:encodesBioChemEntity")))
class TestSchemaClassClass(unittest.TestCase):
    """Test SchemaClass Class
    """
    # BioThings schema used by every test in this class (fetched from GitHub)
    SCHEMA_URL = 'https://raw.githubusercontent.com/data2health/schemas/biothings/biothings/biothings_curie_kevin.jsonld'

    def setUp(self):
        self.se = Schema(self.SCHEMA_URL)

    def test_initialization_with_context_works(self):
        """ A Schema built with an explicit context should equal the plain one"""
        fixture = os.path.join(_CURRENT, 'data', 'biothings_test.jsonld')
        # take the @context block from the local fixture document
        context = load_json_or_yaml(fixture)['@context']
        self.se_with_context = Schema(self.SCHEMA_URL, context)
        self.assertEqual(self.se_with_context.schema, self.se.schema)

    def test_initialization(self):
        """ Test the attributes set on a SchemaClass for different inputs"""
        # a class missing from the schema is flagged as not defined
        unknown = self.se.get_class("dd")
        self.assertFalse(unknown.defined_in_schema)
        # the CURIE (looked up twice) and the URI must resolve to the same
        # name / uri / label
        lookups = ("bts:Gene", "bts:Gene", "http://schema.biothings.io/Gene")
        for identifier in lookups:
            gene = self.se.get_class(identifier)
            self.assertEqual(gene.name, "bts:Gene")
            self.assertEqual(gene.uri, "http://schema.biothings.io/Gene")
            self.assertEqual(gene.label, "Gene")

    def test_parent_classes(self):
        """ Test parent_classes function
        """
        # default output: first element of the first entry is the root class
        lineage = self.se.get_class("bts:Gene").parent_classes
        self.assertEqual(lineage[0][0].name, 'schema:Thing')
        # the root class has no parents
        lineage = self.se.get_class("Thing").parent_classes
        self.assertEqual(lineage, [])
        # neither does a class that does not exist
        lineage = self.se.get_class("dd").parent_classes
        self.assertEqual(lineage, [])
        # with an explicit output_type the entries are plain values
        expected_root = (
            ("uri", 'http://schema.org/Thing'),
            ("label", 'Thing'),
            ("curie", 'schema:Thing'),
        )
        for fmt, root in expected_root:
            lineage = self.se.get_class("bts:Gene",
                                        output_type=fmt).parent_classes
            self.assertEqual(lineage[0][0], root)

    def test_ancestor_classes(self):
        """ Test ancestor_classes function"""
        # default output: a list of objects carrying a .name
        found = self.se.get_class("bts:MolecularEntity").ancestor_classes
        names = [item.name for item in found]
        self.assertIn('schema:Thing', names)
        self.assertIn('bts:BiologicalEntity', names)
        # Gene is below MolecularEntity, so it is not an ancestor
        self.assertNotIn('bts:Gene', names)
        # a class is not its own ancestor
        self.assertNotIn('bts:MolecularEntity', names)
        # the root class has no ancestors
        self.assertEqual(self.se.get_class("Thing").ancestor_classes, [])
        # neither does a class that does not exist
        self.assertEqual(self.se.get_class("dd").ancestor_classes, [])
        # with an explicit output_type the entries are plain values
        expected = (
            ("curie", ('bts:BiologicalEntity', 'schema:Thing')),
            ("label", ('Thing', 'BiologicalEntity')),
            ("uri", ('http://schema.biothings.io/BiologicalEntity',
                     'http://schema.org/Thing')),
        )
        for fmt, members in expected:
            found = self.se.get_class("bts:MolecularEntity",
                                      output_type=fmt).ancestor_classes
            for member in members:
                self.assertIn(member, found)

    def test_descendant_classes(self):
        """ Test descendant_classes function"""
        # default output: a list of objects carrying a .name
        found = self.se.get_class("bts:MolecularEntity").descendant_classes
        names = [item.name for item in found]
        self.assertIn('bts:Gene', names)
        # Thing is above MolecularEntity, so it is not a descendant
        self.assertNotIn('schema:Thing', names)
        # a class is not its own descendant
        self.assertNotIn('bts:MolecularEntity', names)
        # Gene has no descendants
        found = self.se.get_class("bts:Gene").descendant_classes
        self.assertEqual(found, [])
        # neither does a class that does not exist
        found = self.se.get_class("dd").descendant_classes
        self.assertEqual(found, [])
        # with an explicit output_type the entries are plain values
        expected = (
            ("curie", 'bts:Gene'),
            ("label", 'Gene'),
            ("uri", 'http://schema.biothings.io/Gene'),
        )
        for fmt, member in expected:
            found = self.se.get_class("bts:MolecularEntity",
                                      output_type=fmt).descendant_classes
            self.assertIn(member, found)

    def test_child_classes(self):
        """ Test child_classes function"""
        # default output: a list of objects carrying a .name
        found = self.se.get_class("bts:MolecularEntity").child_classes
        names = [item.name for item in found]
        self.assertIn('bts:GeneFamily', names)
        # Gene is a descendant but not a direct child
        self.assertNotIn('bts:Gene', names)
        # Thing is above MolecularEntity, so it is not a child
        self.assertNotIn('schema:Thing', names)
        # a class is not its own child
        self.assertNotIn('bts:MolecularEntity', names)
        # Gene has no children
        found = self.se.get_class("bts:Gene").child_classes
        self.assertEqual(found, [])
        # neither does a class that does not exist
        found = self.se.get_class("dd").child_classes
        self.assertEqual(found, [])
        # with an explicit output_type the entries are plain values
        expected = (
            ("curie", 'bts:GeneFamily'),
            ("uri", 'http://schema.biothings.io/GeneFamily'),
            ("label", 'GeneFamily'),
        )
        for fmt, member in expected:
            found = self.se.get_class("bts:MolecularEntity",
                                      output_type=fmt).child_classes
            self.assertIn(member, found)

    def test_used_by(self):
        """ Test used_by function"""
        references = self.se.get_class("bts:GenomicEntity").used_by()
        # more than one entry, returned as a plain list
        self.assertTrue(len(references) > 1)
        self.assertEqual(list, type(references))
        # a class that does not exist is used by nothing
        references = self.se.get_class("dd").used_by()
        self.assertEqual(references, [])

    def test_describe(self):
        """test describe function"""
        # describing a class that does not exist yields an empty dict
        summary = self.se.get_class("dd").describe()
        self.assertEqual(summary, {})
Example #6
0
class SchemaExtractor():
    """Extract BioThings Schema and construct networkx graph."""

    def __init__(self, schema):
        """Load biothings schema.

        :arg schema: passed unchanged to ``Schema``
        """
        self.se = Schema(schema)
        # curies of all properties which are descendants of the "identifier"
        # property (output_type="curie" is requested for that reason)
        self.all_ids = self.se.get_property('identifier',
                                            output_type="curie").descendant_properties

    def find_descendants(self, lst):
        """Find all descendants for a collection of classes.

        :arg lst: an iterable of class identifiers accepted by
            ``Schema.get_class`` (``schema2networkx`` passes a set of names)
        :returns: a set merging the ``descendant_classes`` of every input
            class, requested with ``output_type="curie"``; an empty set when
            ``lst`` is empty
        """
        # if input is empty, return an empty set
        if not lst:
            return set()
        # find descendants of each class and merge them together into a set
        return set(itertools.chain.from_iterable(
            self.se.get_class(_cls, output_type="curie").descendant_classes
            for _cls in lst))

    def find_cls_ids(self, _cls):
        """Find all identifiers which belong to a class.

        :arg _cls: a class identifier accepted by ``Schema.get_class``
            (``schema2networkx`` passes class names)
        :returns: a list with the curie of each property of the class that
            is listed in ``self.all_ids``
        """
        cls_properties = self.se.get_class(_cls).list_properties(group_by_class=False)
        # falsy entries are skipped before their 'curie' key is read
        return [_prop['curie'] for _prop in cls_properties
                if _prop and _prop['curie'] in self.all_ids]

    def _is_identifier(self, _property):
        """Tell whether a property object is one of the identifier properties.

        ``self.all_ids`` holds curie strings, so the property's ``name`` is
        compared; testing the property object itself against that list would
        not match the strings.
        """
        return _property.name in self.all_ids

    def schema2networkx(self):
        """Convert schema into a networkx graph.

        Logics
        ~~~~~~
        Each identifier represents a node
        node properties include its semantic type (class name)
        The edge is represented by non-identifier properties

        :returns: a ``networkx.DiGraph`` whose edges carry the label of the
            property they were derived from
        """
        G = nx.DiGraph()
        # list all properties defined in the schema
        for _property in self.se.list_all_defined_properties():
            # identifier properties are nodes, not edges
            if self._is_identifier(_property):
                continue
            # domain classes known to the class graph, plus their descendants
            input_clses = {_cls.name for _cls in _property.domain
                           if _cls.uri in self.se.full_class_only_graph}
            input_clses |= self.find_descendants(input_clses)
            # range classes known to the class graph, plus their descendants
            output_clses = {_cls.name for _cls in _property.range
                            if _cls.uri in self.se.full_class_only_graph}
            output_clses |= self.find_descendants(output_clses)
            if not (input_clses and output_clses):
                continue
            input_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in input_clses))
            output_ids = set(itertools.chain.from_iterable(
                self.find_cls_ids(_cls) for _cls in output_clses))
            if input_ids and output_ids:
                # NOTE(review): zip() pairs the two sets element by element
                # in set iteration order and stops at the shorter one, so the
                # pairing is arbitrary; if every input id should link to
                # every output id this presumably wants itertools.product --
                # confirm intent before changing.
                G.add_edges_from(zip(input_ids, output_ids),
                                 label=_property.label)
        return G
Example #7
0

# %% [markdown]
# ## Question
# Thread #cvisb andrew  21 days ago
# > For each repository, show how many datasets have each metadata field populated.

# %%
from collections import defaultdict, Counter
from functools import partial
from elasticsearch_dsl import Search
from elasticsearch import Elasticsearch
from biothings_schema import Schema

# Build a Schema with default arguments and look up the schema.org Dataset
# class on it.
schema = Schema()
dataset = schema.get_class("schema:Dataset")
# Sorted labels of the properties listed for Dataset.
# NOTE(review): class_specific=False presumably also pulls in properties
# inherited from parent classes -- confirm against biothings_schema.
properties = sorted([
    prop['label'] for prop in dataset.list_properties(
        class_specific=False, group_by_class=False)
])

# %%
# NOTE(review): 'su07:9199' looks like an internal host:port -- confirm it is
# reachable from wherever this cell is run.
client = Elasticsearch('su07:9199')
# names of the Elasticsearch indices that are scanned below, one at a time
indicies = ('zenodo', 'omicsdi', 'harvard_dataverse','ncbi_geo_transformed')

# two-level mapping created on demand: result[a][b] is a Counter
result = defaultdict(partial(defaultdict, Counter))

count = 0
for index in indicies:
    search = Search(using=client, index=index)
    for doc in search.scan():