def extract(name, url, description, about=None, thumbnail=None, output_file=None, **kwargs):
    '''extract a DataCatalog to describe some dataset(s).

    Any additional keyword arguments (kwargs) are added as extra
    properties (they are silently skipped if not part of the schema).

    Parameters
    ==========
    name: the name of the DataCatalog
    url: the url to get the catalog
    description: a description of the DataCatalog
    about: text about the data catalog (optional)
    thumbnail: an image thumbnail (web url) (optional)
    output_file: an html output file to write catalog to (optional)
    '''
    # Step 0. Define absolute path to the recipe shipped with this module
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Create the Data Catalog
    catalog = Schema("DataCatalog")

    # Step 2: Load required and recommended fields from the recipe
    recipe = RecipeParser(recipe_yml)

    # datacatalog.properties
    # Fix: 'url' was previously set to the "about" text instead of the
    # url argument, which was ignored entirely.
    catalog.add_property('url', url)
    catalog.add_property('name', name)
    catalog.add_property('description', description)
    catalog.add_property('thumbnailUrl', thumbnail)
    catalog.add_property('about', about)

    # Additional properties (won't be added if not part of schema)
    for key, value in kwargs.items():
        catalog.add_property(key, value)

    # Step 3: Validate the data structure against the recipe
    recipe.validate(catalog)

    # Step 4: If the user wants to write to html, do it before return
    if output_file is not None:
        make_dataset(catalog, output_file)
    return catalog
def setUp(self):
    """Build and validate a demo Dataset and list the templates under test."""
    # Scratch directory for rendered output
    self.tmpdir = os.path.join(tempfile.gettempdir(), 'schemaorg-test')
    if not os.path.exists(self.tmpdir):
        os.mkdir(self.tmpdir)

    # The recipe file lives next to this test module
    self.here = os.path.abspath(os.path.dirname(__file__))
    self.recipe = RecipeParser(os.path.join(self.here, "recipe.yml"))

    self.dataset = Schema("Dataset")
    self.templates = ['google/dataset-table.html',      # bootstrap
                      'google/visual-dataset.html',     # default
                      'google/dataset.html',            # json only
                      'google/dataset-vue-table.html']  # vue js

    # Maintainer contact for the dataset
    maintainer = make_person(name="Dinosaur Pancakes",
                             description='Dataset maintainer',
                             url='https://www.github.com/vsoch',
                             contact_type='customer support',
                             telephone='999-999-9999')

    for field, value in [
            ('creator', maintainer),
            ('version', "1.0.0"),
            ('description', "This is the best dataset."),
            ('name', "Dinosaur Dataset"),
            ('thumbnailUrl',
             'https://vsoch.github.io/datasets/assets/img/avocado.png'),
            ('about', "This is a dataset")]:
        self.dataset.add_property(field, value)

    # A single csv download serves as the distribution
    download = Schema('DataDownload')
    download.add_property(
        'contentUrl',
        'https://vsoch.github.io/datasets/assets/img/avocado.png')
    download.add_property('encodingFormat', 'CSV')
    self.dataset.add_property('distribution', [download])
    self.recipe.validate(self.dataset)
# Step 1: Create the schema to populate (spec_yml is defined above)
containerRecipe = Schema(spec_yml)

# Step 2: Show required and recommended fields from recipe
recipe = RecipeParser(recipe_yml)
print(recipe.loaded)

# Step 3: Extract Container Things! First, the recipe file
from spython.main.parse.parsers import DockerParser
parser = DockerParser(dockerfile).parse()

# containerRecipe.properties
for field, value in [('version', containerRecipe.version),
                     ('labels', parser.labels),          # currently lists
                     ('environment', parser.environ),    # currently a list
                     ('entrypoint', parser.entrypoint),
                     ('description', 'A Dockerfile build recipe'),
                     # This would be extracted at build --> push time,
                     # so we know the uri.
                     ('name', "vsoch/salad"),
                     ('ContainerImage', parser.fromHeader)]:
    containerRecipe.add_property(field, value)

# Step 4: Validate Data Structure
recipe.validate(containerRecipe)

# Step 5, get extra metadata we would get with container-diff!
# Kids don't run command line things from Python at home, it's just bad :)
# Step 2: Show required and recommended fields from recipe
recipe = RecipeParser(recipe_yml)
print(recipe.loaded)

# Step 3: Extract Container Things! First, the recipe file
from spython.main.parse.parsers import DockerParser
parser = DockerParser(dockerfile).parse()

# See definitions at containerRecipe._properties.keys();
# once added, they are found at containerRecipe.properties
for field, value in [('version', containerRecipe.version),
                     ('environment', parser.environ),  # currently a list
                     ('entrypoint', parser.entrypoint),
                     ('description', 'A Dockerfile build recipe'),
                     # This would be extracted at build --> push time,
                     # so we know the uri.
                     ('name', "vanessa/sregistry"),
                     ('ContainerImage', parser.fromHeader)]:
    containerRecipe.add_property(field, value)

# Step 4: Validate Data Structure
recipe.validate(containerRecipe)

# Step 5, get extra metadata we would get with container-diff!
# Kids don't run command line things from Python at home, it's just bad :)
from schemaorg.templates.google import (make_person, make_dataset)

# make_person(name, description, url="", telephone="", email="")
person = make_person(name="@vsoch",
                     description='research software engineer, dinosaur')

# Step 3: Create SoftwareSourceCode from the parsed Dockerfile
from spython.main.parse.parsers import DockerParser
parser = DockerParser(dockerfile).parse()
sourceCode = Schema("SoftwareSourceCode")

# sourceCode.properties
for field, value in [('creator', person),
                     ('version', sourceCode.version),
                     ('description', 'A Dockerfile build recipe'),
                     ('name', parser.fromHeader)]:
    sourceCode.add_property(field, value)

# Step 4: Validate Data Structure
recipe.validate(sourceCode)

# Step 5a: Add additional fields (extra parsing!)
# Since this is a demo, we won't do this here (we don't have URI)
# I'll do a separate example for this using vsoch/dockerfiles on Github

# Step 5b: Generate dataset, meaning writing metadata into template.
# If we provide an output file it, it would write to it.
def make_person(name, description, url="", telephone="", email="", contact_type="customer support"):
    '''Create an individual (persona) with a nested ContactPoint.

    Parameters
    ==========
    name: the person's name (required)
    description: a description of the person (required)
    url: an optional web url for the ContactPoint
    telephone: an optional telephone for the ContactPoint
    email: an optional email for the ContactPoint
    contact_type: the ContactPoint contactType (default "customer support")
    '''
    # Create an individual (persona)
    person = Schema('Person')
    person.add_property('name', name)
    # Fix: description was accepted (and required) but never added to the
    # Person, so callers' descriptions were silently dropped.
    person.add_property('description', description)

    # Update the contact point with the remaining details
    contactPoint = Schema('ContactPoint')
    contactPoint.add_property('telephone', telephone)
    contactPoint.add_property('email', email)
    contactPoint.add_property('url', url)
    contactPoint.add_property('contactType', contact_type)

    # Update the person with it
    person.add_property('contactPoint', contactPoint)
    return person
def extract(name, url=None, telephone=None, email=None, output_file=None, contact_type="customer_service", **kwargs):
    '''extract an Organization, including a nested ContactPoint.

    Extra keyword arguments (kwargs) become additional properties
    (silently skipped if not part of the schema).

    Parameters
    ==========
    name: the name of the Organization
    url: the url of the Organization (optional)
    telephone: the telephone of the ContactPoint (optional)
    email: the email of the ContactPoint (optional)
    output_file: an html output file to write the result to (optional)
    contact_type: the type of contact for the ContactPoint
    '''
    # Step 0. The recipe file lives alongside this module
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Build the nested ContactPoint
    contact = Schema("ContactPoint")
    for field, value in [('contactType', contact_type),
                         ('telephone', telephone),
                         ('email', email)]:
        contact.add_property(field, value)

    # Step 2: Load required and recommended fields from the recipe
    recipe = RecipeParser(recipe_yml)

    # Organization properties
    org = Schema("Organization")
    org.add_property('contactPoint', contact)
    org.add_property('url', url)
    org.add_property('name', name)

    # Additional properties (won't be added if not part of schema)
    for key, value in kwargs.items():
        org.add_property(key, value)

    # Step 3: Validate the data structure
    recipe.validate(org)

    # Step 4: Optionally write html before returning
    if output_file is not None:
        make_dataset(org, output_file)
    return org
def extract(name, description=None, thumbnail=None, sameAs=None, version=None, about=None, output_file=None, person=None, repository=None, runtime=None, **kwargs):
    '''extract a SoftwareSourceCode to describe a codebase.

    Extra keyword arguments (kwargs) become additional properties
    (silently skipped if not part of the schema).

    Parameters
    ==========
    name: the name of the SoftwareSourceCode
    description: a description of the codebase (optional)
    thumbnail: an image thumbnail (web url) (optional)
    sameAs: a url identifying the same resource (optional)
    version: the software version; if not provided, the schemaorg
             version is used instead
    about: text about the codebase (optional)
    output_file: an html output file to write the result to (optional)
    person: a creator, e.g. from make_person (optional)
    repository: the code repository url (optional)
    runtime: the runtime platform (optional)
    '''
    # Step 0. The recipe file lives alongside this module
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Load required and recommended fields from the recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create SoftwareSourceCode
    ssc = Schema("SoftwareSourceCode")

    # ssc.properties
    for field, value in [('creator', person),
                         ('version', version or ssc.version),
                         ('description', description),
                         ('name', name),
                         ('thumbnailUrl', thumbnail),
                         ('sameAs', sameAs),
                         ('about', about),
                         ('codeRepository', repository),
                         ('runtime', runtime)]:
        ssc.add_property(field, value)

    # Step 3: Additional properties (won't be added if not part of schema)
    for key, value in kwargs.items():
        ssc.add_property(key, value)

    # Step 4: Validate the data structure
    recipe.validate(ssc)

    # Step 5: Optionally write html before returning
    if output_file is not None:
        make_dataset(ssc, output_file)
    return ssc
class TestTemplates(unittest.TestCase):
    '''Render a validated demo Dataset through each google template.'''

    def setUp(self):
        # Scratch directory for rendered html output
        self.tmpdir = os.path.join(tempfile.gettempdir(), 'schemaorg-test')
        if not os.path.exists(self.tmpdir):
            os.mkdir(self.tmpdir)

        # The recipe file lives next to this test module
        self.here = os.path.abspath(os.path.dirname(__file__))
        self.recipe = RecipeParser(os.path.join(self.here, "recipe.yml"))

        self.dataset = Schema("Dataset")
        self.templates = ['google/dataset-table.html',      # bootstrap
                          'google/visual-dataset.html',     # default
                          'google/dataset.html',            # json only
                          'google/dataset-vue-table.html']  # vue js

        # Maintainer contact for the dataset
        maintainer = make_person(name="Dinosaur Pancakes",
                                 description='Dataset maintainer',
                                 url='https://www.github.com/vsoch',
                                 contact_type='customer support',
                                 telephone='999-999-9999')

        for field, value in [
                ('creator', maintainer),
                ('version', "1.0.0"),
                ('description', "This is the best dataset."),
                ('name', "Dinosaur Dataset"),
                ('thumbnailUrl',
                 'https://vsoch.github.io/datasets/assets/img/avocado.png'),
                ('about', "This is a dataset")]:
            self.dataset.add_property(field, value)

        # A single csv download serves as the distribution
        download = Schema('DataDownload')
        download.add_property(
            'contentUrl',
            'https://vsoch.github.io/datasets/assets/img/avocado.png')
        download.add_property('encodingFormat', 'CSV')
        self.dataset.add_property('distribution', [download])
        self.recipe.validate(self.dataset)

    def tearDown(self):
        # Rendered files are deliberately left behind for inspection
        pass  # shutil.rmtree(self.tmpdir)

    def test_templates(self):
        print('Testing templates generation...')
        for template in self.templates:
            prefix, _ = os.path.splitext(template)
            output_file = os.path.join(self.tmpdir,
                                       os.path.basename(prefix) + '.html')
            html = make_dataset(self.dataset, output_file, template=template)
            print(output_file)
def extract(name, version=None, contact=None, output_html=True, description=None, thumbnail=None, sameAs=None, about=None, repository=None):
    '''extract a Dataset to describe some Github repository.

    Most fields can be overridden by environment variables (DATASET_*,
    CONTACT_*, GITHUB_*), which is how the extractors container is
    typically driven.

    Parameters
    ==========
    name: the name of the Dataset
    version: the dataset version (optional)
    contact: name of a person that is in charge of the dataset (optional)
    output_html: if True, return rendered html; otherwise dump json
    description: a description of the Dataset (optional)
    thumbnail: an image thumbnail (web url) (optional)
    sameAs: a url identifying the same resource (optional; currently unused)
    about: text about the data catalog (optional)
    repository: the Github repository, e.g. "org/repo" (optional)
    '''
    # Step 0. Define absolute path to the recipe shipped with this module
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Dataset
    dataset = Schema("Dataset")

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'DATASET_THUMBNAIL',
        thumbnail or 'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'DATASET_ABOUT',
        about or 'This is a Dataset parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY',
                                repository or 'openschemas/extractors')
    # Fix: fall back to the description argument (it was previously
    # ignored, unlike thumbnail/about/repository above).
    description = os.environ.get('DATASET_DESCRIPTION',
                                 description or 'A Dataset')
    email = os.environ.get('DATASET_EMAIL')  # NOTE(review): read but unused
    template = os.environ.get('DATASET_TEMPLATE', "google/dataset-table.html")
    # Can be one of:
    # google/dataset-table.html (bootstrap)
    # google/visual-dataset.html (see vsoch.github.io/zenodo-ml)
    # google/dataset.html (just blank page, json metadata)
    # google/dataset-vue-table.html
    # see https://openschemas.github.io/schemaorg#7-embed-in-html-with-json-ld

    # Contact metadata
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION',
                                         'Dataset maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')
    # Removed: contact = add_kwargs(contact, 'DATASET_DOWNLOAD_KWARGS')
    # That applied download kwargs to the contact *name* string — an
    # apparent copy-paste slip: DATASET_DOWNLOAD_KWARGS is applied to the
    # download below, and CONTACT_KWARGS to the person.

    # Download Link
    download_link = os.environ.get('DATASET_DOWNLOAD_LINK')
    encoding = os.environ.get('DATASET_ENCODING_FORMAT')

    # Fix: previously guarded on the undefined name "download" (NameError);
    # the check belongs on the link read from the environment.
    if download_link is not None:
        download = Schema('DataDownload')
        download.add_property('encodingFormat', encoding)
        download.add_property('contentUrl', download_link)
        download = add_kwargs(download, 'DATASET_DOWNLOAD_KWARGS')
        dataset.add_property('distribution', [download])

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        person = add_kwargs(person, 'CONTACT_KWARGS')
        dataset.add_property('creator', person)

    # dataset.properties
    dataset.add_property('version', version)
    dataset.add_property('description', description)
    dataset.add_property('name', name)
    dataset.add_property('thumbnailUrl', thumbnail)
    dataset.add_property('about', about)
    dataset = add_kwargs(dataset, 'DATASET_KWARGS')

    # Step 5: Validate Data Structure
    recipe.validate(dataset)

    # Step 6: Render html, or dump pretty-printed json
    if output_html:
        return make_dataset(dataset, template=template)
    return dataset.dump_json(pretty_print=True)