Example 1
def extract(name, description=None, thumbnail=None, sameAs=None, version=None,
            about=None, output_file=None, person=None, repository=None,
            runtime=None, **kwargs):

    ''' extract a SoftwareSourceCode to describe a codebase. To add more
        properties, just add them via additional keyword args (kwargs).

        Parameters
        ==========
        name: the name of the SoftwareSourceCode
        description: a description of the SoftwareSourceCode
        thumbnail: an image thumbnail (web url)
        sameAs: a url for a reference page that identifies the same resource
        version: the software version. If not provided, uses the schemaorg version
        about: text about the software (optional)
        person: the creator of the SoftwareSourceCode
        repository: the url of the code repository
        runtime: the runtime of the software
        output_file: an html output file to write the result to (optional)
    '''

    # Step 0. Define the absolute path to our recipe
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")
    
    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)
    
    # Step 2: Create SoftwareSourceCode
    ssc = Schema("SoftwareSourceCode")

    # dataset.properties
    ssc.add_property('creator', person)
    ssc.add_property('version', version or ssc.version)
    ssc.add_property('description', description)
    ssc.add_property('name', name)
    ssc.add_property('thumbnailUrl', thumbnail)
    ssc.add_property('sameAs', sameAs)
    ssc.add_property('about', about)
    ssc.add_property('codeRepository', repository)
    ssc.add_property('runtime', runtime)

    # Step 3: Additional (won't be added if not part of schema)
    for key, value in kwargs.items():
        ssc.add_property(key, value)

    # Step 4: Validate Data Structure
    recipe.validate(ssc)

    # Step 5: If the user wants to write to html, do it before return
    if output_file is not None:
        make_dataset(ssc, output_file)

    return ssc
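
For context, the snippet assumes that os, Schema, RecipeParser, and make_dataset are already in scope. A minimal usage sketch with hypothetical argument values follows; the import paths are assumed from the schemaorg library layout:

import os
from schemaorg.main import Schema
from schemaorg.main.parse import RecipeParser
from schemaorg.templates.google import make_dataset, make_person

# Hypothetical values for illustration only
person = make_person(name="@vsoch", description="research software engineer")
ssc = extract(name="schemaorg",
              description="Python functions for applied schema.org",
              repository="https://www.github.com/openschemas/schemaorg",
              person=person,
              output_file="index.html")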
Example 2
def extract(name,
            url=None,
            telephone=None,
            email=None,
            output_file=None,
            contact_type="customer_service",
            **kwargs):
    ''' extract an Organization. Some of the fields required are for
        a "ContactPoint" included within.

        Parameters
        ==========
        name: the name of the Organization
        url: the url of the Organization
        contact_type: the type of contact for the ContactPoint
        telephone: the telephone of the ContactPoint
        email: the email of the ContactPoint
        output_file: an html output file to write the result to (optional)
    '''

    # Step 0. Define the absolute path to our recipe
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Create Contact Point
    contact = Schema("ContactPoint")
    contact.add_property('contactType', contact_type)
    contact.add_property('telephone', telephone)
    contact.add_property('email', email)

    # Step 2: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 3: Organization properties
    org = Schema("Organization")
    org.add_property('contactPoint', contact)
    org.add_property('url', url)
    org.add_property('name', name)

    # Additional (won't be added if not part of schema)
    for key, value in kwargs.items():
        org.add_property(key, value)

    # Step 4: Validate Data Structure
    recipe.validate(org)

    # Step 5: If the user wants to write to html, do it before return
    if output_file is not None:
        make_dataset(org, output_file)

    return org
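
For illustration, the Organization embeds its ContactPoint, so the rendered JSON-LD should look roughly like the structure below. This is a hand-written sketch with hypothetical values, not actual library output:

# A hand-written sketch of the expected JSON-LD shape (hypothetical values)
expected = {
    "@context": "https://schema.org",
    "@type": "Organization",
    "name": "Dinosaur Labs",
    "url": "https://www.github.com/openschemas",
    "contactPoint": {
        "@type": "ContactPoint",
        "contactType": "customer_service",
        "telephone": "999-999-9999",
        "email": "dinosaur@labs.org",
    },
}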
Example 3
    def test_templates(self):

        print('Testing templates generation...')
        for template in self.templates:
            prefix, ext = os.path.splitext(template)
            output_file = os.path.join(self.tmpdir,
                                       os.path.basename(prefix) + '.html')
            html = make_dataset(self.dataset, output_file, template=template)
            print(output_file)
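
The test above depends on fixtures defined elsewhere in the test class. A sketch of what that setUp might look like, assuming the templates live under a templates/google directory and that schemaorg's Schema is importable (all paths hypothetical):

import os
import glob
import tempfile
import unittest

from schemaorg.main import Schema  # assumed import path

class TestTemplates(unittest.TestCase):

    def setUp(self):
        # Hypothetical locations; the real test discovers its own templates
        here = os.path.abspath(os.path.dirname(__file__))
        self.tmpdir = tempfile.mkdtemp()
        self.templates = glob.glob(
            os.path.join(here, "templates", "google", "*.html"))
        self.dataset = Schema("Dataset")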
Example 4
def extract(name, url, description,
            about=None, thumbnail=None, output_file=None, **kwargs):
    ''' extract a DataCatalog to describe some dataset(s). To add more
        properties, just add them via additional keyword args (kwargs)
    
        Parameters
        ==========
        output_file: An html output file to write catalog to (optional)
        url: the url to get the catalog
        name: the name of the DataCatalog
        description: a description of the DataCatalog
        thumbnail: an image thumbnail (web url)
        about: text about the data catalog (optional).
    '''

    # Step 0. Define the absolute path to our recipe
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")
    
    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Data Catalog
    catalog = Schema("DataCatalog")
    
    # datacatalog.properties
    catalog.add_property('url', url)
    catalog.add_property('name', name)
    catalog.add_property('description', description)
    catalog.add_property('thumbnailUrl', thumbnail)    
    catalog.add_property('about', about)

    # Additional (won't be added if not part of schema)
    for key, value in kwargs.items():
        catalog.add_property(key, value)

    # Step 4: Validate Data Structure
    recipe.validate(catalog)

    # Step 5: If the user wants to write to html, do it before return
    if output_file is not None:
        make_dataset(catalog, output_file)

    return catalog
Example 5
# Container Diff
response = run_command([
    "container-diff", "analyze", uri, "--type=pip", "--type=file",
    "--type=apt", "--type=history", "--json", '--quiet',
    '--verbosity=panic'
])

# note that we can also get pip, apt, packages here...
if response['return_code'] == 0:
    layers = json.loads(response['message'])
    for layer in layers:
        if layer['AnalyzeType'] == "File":
            print('Found %s files!' % len(layer['Analysis']))

    # Found 12615 files!
    # Here we can go to town parsing the guts to label the container meaningfully
    # TODO: we need some llama magic / NLP here to extract software tokens

else:
    print("Return value is not zero, %s" % response['message'])

# Note to readers - we can parse a ContainerRecipe from a manifest!
# manifest['ContainerConfig'] And it has a name! Hmm.

# Step 6. When above is done, generate json-ld
from schemaorg.templates.google import make_dataset

dataset = make_dataset(containerRecipe, index_html)
print(dataset)
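
run_command is not shown in this snippet. Below is a minimal sketch of a compatible helper built on subprocess, returning the {'message': ..., 'return_code': ...} dictionary the code above consumes; this is an assumption about the contract, not the original implementation:

import subprocess

def run_command(cmd):
    # Run a command list and mirror the response shape used above;
    # container-diff prints its JSON report to stdout when --json is set
    result = subprocess.run(cmd, capture_output=True, text=True)
    return {'message': result.stdout, 'return_code': result.returncode}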
Example 6
person = make_person(name="@vsoch",
                     description='research software engineer, dinosaur')

# Step 3: Create SoftwareSourceCode

from spython.main.parse.parsers import DockerParser
parser = DockerParser(dockerfile).parse()

sourceCode = Schema("SoftwareSourceCode")

# sourceCode.properties

sourceCode.add_property('creator', person)
sourceCode.add_property('version', sourceCode.version)
sourceCode.add_property('description', 'A Dockerfile build recipe')
sourceCode.add_property('name', parser.fromHeader)

# Step 4: Validate Data Structure

recipe.validate(sourceCode)

# Step 5a: Add additional fields (extra parsing!)
#          Since this is a demo, we won't do this here (we don't have a URI).
#          I'll do a separate example for this using vsoch/dockerfiles on Github.

# Step 5b: Generate the dataset, meaning writing the metadata into a template.
#          If we provide an output file, the result is also written to it.

dataset = make_dataset(sourceCode, index_html)
print(dataset)
Example 7
def extract(dockerfile, contact, container_name=None, output_html=True):
    '''extract a dataset from a given dockerfile, write to html output file.
       Use container-diff and spython to get information about the container.
    '''

    # Step 0. Define absolute paths to our recipe and specification
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")
    spec_yml = os.path.join(here, "specification.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create the image schema from our specification
    parser = DockerRecipe(dockerfile)
    image = Schema(spec_yml)

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'IMAGE_THUMBNAIL',
        'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'IMAGE_ABOUT',
        'This is a Dockerfile parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY', 'openschemas/extractors')
    description = os.environ.get('IMAGE_DESCRIPTION',
                                 'A Dockerfile build recipe')

    # Step 3: Generate a Person (these are Google Helper functions)
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION',
                                         'Dockerfile maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        image.properties['creator'] = person
        image.properties['author'] = person

    # image.properties
    if len(parser.environ) > 0:
        image.properties['environment'] = parser.environ
    image.properties['entrypoint'] = parser.entrypoint
    image.properties['version'] = image.version
    image.properties['description'] = description
    image.properties['ContainerImage'] = parser.fromHeader
    image.properties['name'] = container_name

    # Fun properties :)
    image.properties['thumbnailUrl'] = thumbnail
    image.properties['sameAs'] = 'ImageDefinition'
    image.properties['about'] = about
    image.properties['codeRepository'] = 'https://www.github.com/%s' % repository
    image.properties['runtime'] = 'Docker'

    # Generate temporary filename
    output_file = "%s.json" % get_tmpfile("image-definition")

    # Try using container name, if not available default to ContainerImage (FROM)
    layers = run_container_diff(container_name, parser.fromHeader, output_file)

    if len(layers) > 0:

        # softwareRequirements
        requires = []  # APT and PIP

        # note that the top level key here can be history, files, pip, apt, etc.
        for layer in layers:

            ## Pip and Apt will go into softwareRequirements
            if layer['AnalyzeType'] in ["Pip", "Apt"]:
                for pkg in layer['Analysis']:
                    requires.append(
                        '%s > %s==%s' %
                        (layer['AnalyzeType'], pkg['Name'], pkg['Version']))

        image.properties["softwareRequirements"] = requires

    if output_html:
        return make_dataset(image)
    return image.dump_json(pretty_print=True)
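
A usage sketch for this extractor; the Dockerfile path, contact handle, and output filename are hypothetical:

# Hypothetical invocation, assuming the imports used by extract are in scope
html = extract(dockerfile="Dockerfile",
               contact="@vsoch",
               container_name="openschemas/extractors")

with open("index.html", "w") as output:
    output.write(html)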
Example 8
def extract(name,
            version=None,
            contact=None,
            output_html=True,
            description=None,
            thumbnail=None,
            sameAs=None,
            about=None,
            repository=None):
    ''' extract a Dataset to describe some Github repository. Additional
        properties can be added via the *_KWARGS environment variables.

        Parameters
        ==========
        name: the name of the Dataset
        version: the version of the Dataset
        contact: name of a person that is in charge of the dataset
        output_html: if True, render the dataset into an html template
        description: a description of the Dataset
        thumbnail: an image thumbnail (web url)
        sameAs: a url for a reference page that identifies the same resource
        about: text about the dataset (optional)
        repository: the Github repository that holds the dataset
    '''

    # Step 0. Define the absolute path to our recipe
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Dataset
    dataset = Schema("Dataset")

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'DATASET_THUMBNAIL', thumbnail
        or 'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'DATASET_ABOUT', about
        or 'This is a Dataset parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY', repository
                                or 'openschemas/extractors')
    description = os.environ.get('DATASET_DESCRIPTION',
                                 description or 'A Dataset')
    email = os.environ.get('DATASET_EMAIL')
    template = os.environ.get('DATASET_TEMPLATE', "google/dataset-table.html")

    # Can be one of:
    # google/dataset-table.html  (bootstrap)
    # google/visual-dataset.html (see vsoch.github.io/zenodo-ml)
    # google/dataset.html        (just blank page, json metadata)
    # google/dataset-vue-table.html
    # see https://openschemas.github.io/schemaorg#7-embed-in-html-with-json-ld

    # Contact metadata
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION',
                                         'Dataset maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')

    # Download Link
    download_link = os.environ.get('DATASET_DOWNLOAD_LINK')
    encoding = os.environ.get('DATASET_ENCODING_FORMAT')
    if download_link is not None:
        download = Schema('DataDownload')
        download.add_property('encodingFormat', encoding)
        download.add_property('contentUrl', download_link)
        download = add_kwargs(download, 'DATASET_DOWNLOAD_KWARGS')
        dataset.add_property('distribution', [download])

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        person = add_kwargs(person, 'CONTACT_KWARGS')
        dataset.add_property('creator', person)

    # dataset.properties
    dataset.add_property('version', version)
    dataset.add_property('description', description)
    dataset.add_property('name', name)
    dataset.add_property('thumbnailUrl', thumbnail)
    dataset.add_property('about', about)
    dataset = add_kwargs(dataset, 'DATASET_KWARGS')

    # Step 5: Validate Data Structure
    recipe.validate(dataset)

    if output_html:
        return make_dataset(dataset, template=template)
    return dataset.dump_json(pretty_print=True)
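
add_kwargs is also defined outside this snippet. One plausible contract, guessed from its use above: read a JSON object from the named environment variable and add each pair as a property. A sketch under that assumption, not the actual helper:

import os
import json

def add_kwargs(entity, envar):
    # Sketch: merge extra properties from a JSON-encoded environment
    # variable, e.g. DATASET_KWARGS='{"license": "MIT"}'. The real
    # helper's expected format may differ.
    extras = os.environ.get(envar)
    if extras is not None:
        for key, value in json.loads(extras).items():
            entity.add_property(key, value)
    return entity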
Example 9
# Add more (not required) fields - some of these belong with ContainerImage
containerRecipe.add_property('operatingSystem', manifest['Os'])
containerRecipe.add_property('softwareVersion', manifest['Id'])  # shasum
containerRecipe.add_property('identifier', manifest['RepoTags']) # tag

# Note to readers - we can parse a ContainerRecipe from a manifest!
# manifest['ContainerConfig'] And it has a name! Hmm.

# Container Diff
response = run_command(["container-diff", "analyze", uri,
                        "--type=pip", "--type=file", "--type=apt", "--type=history",
                        "--json", '--quiet', '--verbosity=panic'])

# note that we can also get pip, apt, packages here...
if response['return_code'] == 0:
    layers = json.loads(response['message'])
    for layer in layers:
        if layer['AnalyzeType'] == "File":
            print('Found %s files!' % len(layer['Analysis']))

# Found 12615 files!
# Here we can go to town parsing the guts to label the container meaningfully
# TODO: we need some llama magic / NLP here to extract software tokens

# Step 6. When above is done, generate json-ld
from schemaorg.templates.google import make_dataset
dataset = make_dataset(containerRecipe)
print(dataset)
person = make_person(name="@vsoch",
                     description='research software engineer, dinosaur')

# Step 3: Create SoftwareSourceCode

from spython.main.parse.parsers import DockerParser
parser = DockerParser('Dockerfile').parse()

sourceCode = Schema("SoftwareSourceCode")

# sourceCode.properties

sourceCode.add_property('creator', person)
sourceCode.add_property('version', sourceCode.version)
sourceCode.add_property('description', 'A Dockerfile build recipe')
sourceCode.add_property('name', parser.fromHeader)

# Step 4: Validate Data Structure

recipe.validate(sourceCode)

# Step 5a: Add additional fields (extra parsing!)
#          Since this is a demo, we won't do this here (we don't have a URI).
#          I'll do a separate example for this using vsoch/dockerfiles on Github.

# Step 5b: Generate the dataset, meaning writing the metadata into a template.
#          If we provide an output file, the result is also written to it.

dataset = make_dataset(sourceCode)
print(dataset)