Beispiel #1
0
def extract(dockerfile, contact, container_name=None, output_html=True):
    '''describe a Dockerfile as an ImageDefinition, returned as html or json.

       The Dockerfile is handed to DockerRecipe, the schema is loaded from the
       specification.yml that sits next to this module, and the layers
       returned by run_container_diff are used to fill softwareRequirements.

       Parameters
       ==========
       dockerfile: the Dockerfile, passed as-is to DockerRecipe
       contact: name of the maintainer; the GITHUB_ACTOR environment variable
                takes precedence when it is set
       container_name: passed to run_container_diff and stored as "name"
       output_html: if True return make_dataset(image), otherwise return
                    image.dump_json(pretty_print=True)
    '''
    env = os.environ.get

    # Resources that live in the same folder as this module
    base_dir = os.path.abspath(os.path.dirname(__file__))

    # The recipe is parsed first, then the Dockerfile, then the schema
    recipe = RecipeParser(os.path.join(base_dir, "recipe.yml"))
    parser = DockerRecipe(dockerfile)
    image = Schema(os.path.join(base_dir, "specification.yml"))

    # Descriptive metadata: the environment wins, otherwise fall back to defaults
    repository = env('GITHUB_REPOSITORY', 'openschemas/extractors')
    description = env('IMAGE_DESCRIPTION', 'A Dockerfile build recipe')
    about = env(
        'IMAGE_ABOUT',
        'This is a Dockerfile parsed by the openschemas/extractors container.')
    thumbnail = env(
        'IMAGE_THUMBNAIL',
        'https://vsoch.github.io/datasets/assets/img/avocado.png')

    # Contact metadata
    contact = env('GITHUB_ACTOR', contact)
    contact_url = env('CONTACT_URL', repository)

    # A value that is not already a url is treated as a GitHub path
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(
            name=contact,
            description=env('CONTACT_DESCRIPTION', 'Dockerfile maintainer'),
            url=contact_url,
            contact_type=env('CONTACT_TYPE', 'customer support'),
            telephone=env('CONTACT_TELEPHONE'))

        # The same person is recorded in both roles
        for role in ('creator', 'author'):
            image.properties[role] = person

    # Properties taken from the parsed Dockerfile and the schema
    if len(parser.environ) > 0:
        image.properties['environment'] = parser.environ
    image.properties['entrypoint'] = parser.entrypoint
    image.properties['version'] = image.version
    image.properties['description'] = description
    image.properties['ContainerImage'] = parser.fromHeader
    image.properties['name'] = container_name

    # Remaining descriptive properties, added in a fixed order
    extras = (
        ('thumbnailUrl', thumbnail),
        ('sameAs', 'ImageDefinition'),
        ('about', about),
        ('codeRepository', 'https://www.github.com/%s' % repository),
        ('runtime', 'Docker'),
    )
    for key, value in extras:
        image.properties[key] = value

    # Temporary json file name given to run_container_diff
    output_file = "%s.json" % get_tmpfile("image-definition")

    # Try using container name, if not available default to ContainerImage (FROM)
    layers = run_container_diff(container_name, parser.fromHeader, output_file)

    if len(layers) > 0:

        # Only the Pip and Apt analyses contribute to softwareRequirements;
        # other analysis types (history, files, ...) are skipped
        image.properties["softwareRequirements"] = [
            '%s > %s==%s' % (layer['AnalyzeType'], pkg['Name'], pkg['Version'])
            for layer in layers
            if layer['AnalyzeType'] in ("Pip", "Apt")
            for pkg in layer['Analysis']
        ]

    if not output_html:
        return image.dump_json(pretty_print=True)
    return make_dataset(image)
Beispiel #2
0
def extract(name,
            version=None,
            contact=None,
            output_html=True,
            description=None,
            thumbnail=None,
            sameAs=None,
            about=None,
            repository=None):
    ''' extract a Dataset to describe some Github repository. Additional
        properties can be supplied through the environment: add_kwargs is
        called with the DATASET_KWARGS, CONTACT_KWARGS and
        DATASET_DOWNLOAD_KWARGS prefixes.

        For every setting below the matching environment variable, when set,
        takes precedence over the function argument.

        Parameters
        ==========
        name: the name of the Dataset
        version: the version of the Dataset
        contact: name of a person that is in charge of the dataset
                 (environment: GITHUB_ACTOR)
        output_html: if True return make_dataset(dataset, template=...),
                     otherwise return dataset.dump_json(pretty_print=True)
        description: a description of the Dataset
                     (environment: DATASET_DESCRIPTION)
        thumbnail: an image thumbnail (web url)
                   (environment: DATASET_THUMBNAIL)
        sameAs: accepted for interface compatibility, not used here
        about: text about the dataset (optional)
               (environment: DATASET_ABOUT)
        repository: the repository, used as the default contact url
                    (environment: GITHUB_REPOSITORY)
    '''

    # Step 0. Define absolute path to the recipe that sits next to this module
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Dataset
    dataset = Schema("Dataset")

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'DATASET_THUMBNAIL', thumbnail
        or 'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'DATASET_ABOUT', about
        or 'This is a Dataset parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY', repository
                                or 'openschemas/extractors')

    # Honor the description argument the same way as thumbnail/about/repository
    # (it was previously ignored in favor of the hard-coded default)
    description = os.environ.get('DATASET_DESCRIPTION', description
                                 or 'A Dataset')
    template = os.environ.get('DATASET_TEMPLATE', "google/dataset-table.html")

    # Can be one of:
    # google/dataset-table.html  (bootstrap)
    # google/visual-dataset.html (see vsoch.github.io/zenodo-ml)
    # google/dataset.html        (just blank page, json metadata)
    # google/dataset-vue-table.html
    # see https://openschemas.github.io/schemaorg#7-embed-in-html-with-json-ld

    # Contact metadata
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION',
                                         'Dataset maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')

    # NOTE(review): contact is a plain name (or None) at this point, and the
    # prefix is the download one -- this call looks unintended. It is kept
    # unchanged because add_kwargs is not visible here; confirm and remove.
    contact = add_kwargs(contact, 'DATASET_DOWNLOAD_KWARGS')

    # Download Link: only add a distribution when a link was provided.
    # The guard must test download_link; "download" does not exist yet.
    download_link = os.environ.get('DATASET_DOWNLOAD_LINK')
    encoding = os.environ.get('DATASET_ENCODING_FORMAT')
    if download_link is not None:
        download = Schema('DataDownload')
        download.add_property('encodingFormat', encoding)
        download.add_property('contentUrl', download_link)
        download = add_kwargs(download, 'DATASET_DOWNLOAD_KWARGS')
        dataset.add_property('distribution', [download])

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        person = add_kwargs(person, 'CONTACT_KWARGS')
        dataset.add_property('creator', person)

    # dataset.properties
    dataset.add_property('version', version)
    dataset.add_property('description', description)
    dataset.add_property('name', name)
    dataset.add_property('thumbnailUrl', thumbnail)
    dataset.add_property('about', about)
    dataset = add_kwargs(dataset, 'DATASET_KWARGS')

    # Step 5: Validate Data Structure
    recipe.validate(dataset)

    if output_html:
        return make_dataset(dataset, template=template)
    return dataset.dump_json(pretty_print=True)