Example #1
import os
from pathlib import Path

from frictionless import Package, Schema, describe_package, describe_resource

# Project-level classes from the sdg-build package; the exact module paths are
# assumptions based on that project's layout.
from sdg.outputs import OutputBase
from sdg.data_schemas import DataSchemaInputIndicator


class OutputDataPackage(OutputBase):
    """Output a tabular data package (https://specs.frictionlessdata.io/data-package/).
    """
    def __init__(self,
                 inputs,
                 schema,
                 output_folder='_site',
                 translations=None,
                 indicator_options=None,
                 data_schema=None,
                 package_properties=None,
                 resource_properties=None,
                 field_properties=None,
                 sorting='alphabetical',
                 logging=None):
        """Constructor for OutputDataPackage.

        Parameters
        ----------

        Inherits all the parameters from OutputBase, plus the following:

        data_schema : DataSchemaInputBase
            An optional instance of a DataSchemaInputBase subclass, used to
            provide the data schema for each indicator.
        package_properties : dict
            Common properties to add to all the data packages.
            Note that this can also be set per-indicator in a "datapackage" metadata property.
        resource_properties : dict
            Common properties to add to the resource in all data packages.
            Note that this can also be set per-indicator in a "datapackage" metadata property.
        field_properties : dict of dicts
            Properties to add to specific fields, keyed by field name.
            Note that this can also be set per-indicator in a "datapackage" metadata property.
        sorting : string
            Strategy to use when sorting the columns and values. Options are:
            - alphabetical: Sort columns/values in alphabetical order
            - default: Use the default sorting, either from the data_schema or
              from the position of columns/values in the source data
        """

        OutputBase.__init__(self,
                            inputs,
                            schema,
                            output_folder=output_folder,
                            translations=translations,
                            indicator_options=indicator_options,
                            logging=logging)
        self.top_level_package = None
        self.data_schema = data_schema
        if package_properties is None:
            package_properties = {}
        if resource_properties is None:
            resource_properties = {}
        if field_properties is None:
            field_properties = {}
        self.package_properties = package_properties
        self.resource_properties = resource_properties
        self.field_properties = field_properties
        self.sorting = sorting
        self.per_indicator_metadata_field = 'datapackage'

    def get_base_folder(self):
        return 'data-packages'

    def build(self, language=None):
        """Write the JSON output expected by Open SDG. Overrides parent."""
        status = True

        self.create_top_level_package('all', 'All indicators')

        backup_data_schema = DataSchemaInputIndicator(source=self.indicators)
        data_schema_not_specified = False
        if self.data_schema is None:
            data_schema_not_specified = True
            self.data_schema = backup_data_schema

        for indicator_id in self.get_indicator_ids():
            # Make sure the folder exists.
            package_folder = os.path.join(self.output_folder,
                                          self.get_base_folder(), indicator_id)
            Path(package_folder).mkdir(parents=True, exist_ok=True)

            indicator = self.get_indicator_by_id(indicator_id).language(
                language)
            backup_schema_was_used = False
            data_schema_for_indicator = self.data_schema.get_schema_for_indicator(
                indicator)
            if data_schema_for_indicator is None:
                backup_schema_was_used = True
                data_schema_for_indicator = backup_data_schema.get_schema_for_indicator(
                    indicator)

            # Clone the schema so that it can be sorted and translated.
            data_schema_for_indicator = Schema(dict(data_schema_for_indicator))
            # We only sort the schema if it was not specified or if
            # the backup schema was used.
            if data_schema_not_specified or backup_schema_was_used:
                self.sort_data_schema(data_schema_for_indicator)
            if language is not None:
                self.translate_data_schema(data_schema_for_indicator, language)

            # Write the data.
            data_path = os.path.join(package_folder, 'data.csv')
            self.write_data(indicator.data, data_path)

            # Write the descriptor.
            descriptor_path = os.path.join(package_folder, 'datapackage.json')
            indicator_package = self.create_indicator_package(
                data_schema_for_indicator, data_path, indicator_id, indicator)
            self.write_indicator_package(indicator_package,
                                         descriptor_path,
                                         indicator,
                                         language=language)

            # Add to the top level package.
            top_level_data_path = indicator_id + '/data.csv'
            top_level_resource = self.create_resource(
                data_schema_for_indicator, data_path, indicator_id, indicator,
                top_level_data_path)
            self.add_to_top_level_package(top_level_resource)

        top_level_descriptor_path = os.path.join(self.output_folder,
                                                 self.get_base_folder(),
                                                 'all.json')
        self.write_top_level_package(top_level_descriptor_path,
                                     language=language)

        return status

    def create_top_level_package(self, name, title):
        self.top_level_package = Package(self.package_properties)
        self.top_level_package.name = name
        self.top_level_package.title = title

    def write_top_level_package(self, path, language=None):
        self.top_level_package.to_json(path)
        # Workaround for https://github.com/frictionlessdata/frictionless-py/issues/788
        os.chmod(path, 0o644)

    def add_to_top_level_package(self, resource):
        self.top_level_package.add_resource(resource)

    def write_data(self, df, path):
        df.to_csv(path, index=False)

    def apply_package_properties(self, package, indicator):
        for key in self.package_properties:
            package[key] = self.package_properties[key]
        indicator_props = self.get_properties_per_indicator(
            indicator, 'package_properties')
        for key in indicator_props:
            package[key] = indicator_props[key]

    def apply_resource_properties(self, resource, indicator):
        for key in self.resource_properties:
            resource[key] = self.resource_properties[key]
        indicator_props = self.get_properties_per_indicator(
            indicator, 'resource_properties')
        for key in indicator_props:
            resource[key] = indicator_props[key]

    def apply_field_properties(self, resource, indicator):
        for field_name in self.field_properties:
            field = resource.get_field(field_name)
            if field is not None:
                for key in self.field_properties[field_name]:
                    field[key] = self.field_properties[field_name][key]
        indicator_props = self.get_properties_per_indicator(
            indicator, 'field_properties')
        for field_name in indicator_props:
            field = resource.get_field(field_name)
            if field is not None:
                for key in indicator_props[field_name]:
                    field[key] = indicator_props[field_name][key]

    def create_resource(self,
                        schema,
                        data_path,
                        indicator_id,
                        indicator,
                        data_path_override=None):
        resource = describe_resource(data_path)
        self.apply_resource_properties(resource, indicator)
        resource.schema = schema
        resource.path = data_path_override if data_path_override is not None else data_path
        resource.name = indicator_id
        resource.title = indicator.get_name()
        self.apply_field_properties(resource, indicator)
        return resource

    def create_indicator_package(self, schema, data_path, indicator_id,
                                 indicator):
        package = describe_package(data_path)
        self.apply_package_properties(package, indicator)
        package.name = indicator_id
        package.title = indicator.get_name()
        resource = package.get_resource('data')
        self.apply_resource_properties(resource, indicator)
        resource.schema = schema
        resource.path = 'data.csv'
        return package

    def write_indicator_package(self,
                                package,
                                descriptor_path,
                                indicator,
                                language=None):
        package.to_json(descriptor_path)
        # Workaround for https://github.com/frictionlessdata/frictionless-py/issues/788
        os.chmod(descriptor_path, 0o644)

    def sort_data_schema(self, schema):
        if self.sorting == 'alphabetical':
            schema.fields.sort(key=lambda field: field.name)
            for field in schema.fields:
                if 'enum' in field.constraints:
                    field.constraints['enum'].sort()

    def translate_data_schema(self, schema, language):
        groups_common = ['data']
        for field in schema.fields:
            groups = groups_common + [field.name]
            # Translate the field titles.
            field.title = self.translation_helper.translate(
                field.name, language, groups)
            # Translate the values.
            if 'enum' in field.constraints:
                field.constraints['enum'] = [
                    self.translation_helper.translate(value, language, groups)
                    for value in field.constraints['enum']
                ]

    def get_properties_per_indicator(self, indicator, property_type):
        if indicator is None:
            return {}

        per_indicator = indicator.get_meta_field_value(
            self.per_indicator_metadata_field)

        if per_indicator is not None and property_type in per_indicator:
            return per_indicator[property_type]
        else:
            return {}

    def validate(self):
        """Validate the data for the indicators."""

        status = True
        if self.data_schema is not None:
            for inid in self.indicators:
                status = status & self.data_schema.validate(
                    self.indicators[inid])

        return status

    def get_documentation_title(self):
        return 'Data packages'

    def get_documentation_content(self, languages=None, baseurl=''):
        if languages is None:
            languages = ['']

        indicator_ids = self.get_documentation_indicator_ids()

        descriptor_endpoint = '{language}/{base_folder}/{indicator_id}/datapackage.json'
        data_endpoint = '{language}/{base_folder}/{indicator_id}/data.csv'
        all_endpoint = '{language}/{base_folder}/all.json'
        output = '<p>' + self.get_documentation_description() + ' Examples are below:</p>'
        output += '<ul>'
        for language in languages:
            for indicator_id in indicator_ids:
                descriptor_path = descriptor_endpoint.format(
                    language=language,
                    indicator_id=indicator_id,
                    base_folder=self.get_base_folder())
                data_path = data_endpoint.format(
                    language=language,
                    indicator_id=indicator_id,
                    base_folder=self.get_base_folder())
                output += '<li>' + indicator_id + ':<ul>'
                output += '<li><a href="' + baseurl + descriptor_path + '">' + descriptor_path + '</a></li>'
                output += '<li><a href="' + baseurl + data_path + '">' + data_path + '</a></li>'
                output += '</ul></li>'
            all_path = all_endpoint.format(language=language,
                                           base_folder=self.get_base_folder())
            output += '<li>All indicators: <a href="' + baseurl + all_path + '">' + all_path + '</a></li>'
        output += '<li>etc...</li>'
        output += '</ul>'
        return output

    def get_documentation_description(self):
        return """This output produces <a href="https://specs.frictionlessdata.io/data-package/">
Example #2
import tempfile
from datetime import datetime

from flask import jsonify, request, send_file
from flask_login import current_user
from frictionless import Package, Resource

# format_date, project_list, get_event_users, get_event_activities and
# get_event_categories are helpers from the surrounding dribdat application
# and are assumed to be importable here.


def package_event(event, format):
    if format not in ['zip', 'json']:
        return "Format not supported"
    # Set up a data package
    package = Package(
        name='event-%d' % event.id,
        title=event.name,
        description="Event and project details collected with dribdat",
        keywords=["dribdat", "hackathon", "co-creation"],
        sources=[{
            "title": "dribdat",
            "path": "https://dribdat.cc"
        }],
        licenses=[{
            "name": "ODC-PDDL-1.0",
            "path": "http://opendatacommons.org/licenses/pddl/",
            "title": "Open Data Commons Public Domain Dedication & License 1.0"
        }],
        contributors=[{
            "title": current_user.username,
            "path": current_user.webpage_url or '',
            "role": "author"
        }],
        homepage=request.host_url,
        created=format_date(datetime.now(), '%Y-%m-%dT%H:%M'),
        version="0.1.0",
    )

    # if False:  # as CSV
    #     fp_projects = tempfile.NamedTemporaryFile(
    #         mode='w+t', prefix='projects-', suffix='.csv')
    #     # print("Writing to temp CSV file", fp_projects.name)
    #     fp_projects.write(gen_csv(project_list(event.id)))
    #     resource = Resource(fp_projects.name)
    # if False:
    #     # print("Generating in-memory rowset")
    #     project_rows = gen_rows(project_list(event.id))
    #     resource = Resource(
    #         name='projects',
    #         data=project_rows,
    #     )

    # Generate resources
    # print("Generating in-memory JSON of event")
    package.add_resource(
        Resource(
            name='events',
            data=[event.get_full_data()],
        ))
    # print("Generating in-memory JSON of projects")
    package.add_resource(
        Resource(
            name='projects',
            data=project_list(event.id, True),
        ))
    if format == 'zip':
        # print("Generating in-memory JSON of participants")
        package.add_resource(
            Resource(
                name='users',
                data=get_event_users(event),
            ))
        # print("Generating in-memory JSON of activities")
        package.add_resource(
            Resource(
                name='activities',
                data=get_event_activities(event.id, 500),
            ))
        # print("Generating in-memory JSON of activities")
        package.add_resource(
            Resource(
                name='categories',
                data=get_event_categories(event.id),
            ))
        # print("Adding supplementary README")
        package.add_resource(Resource(
            name='readme',
            path='PACKAGE.txt',
        ))

    # Generate data package
    fp_package = tempfile.NamedTemporaryFile(prefix='datapackage-',
                                             suffix='.zip')
    # print("Saving at", fp_package.name)
    if format == 'json':
        return jsonify(package)
    elif format == 'zip':
        package.to_zip(fp_package.name)
        return send_file(fp_package.name, as_attachment=True)
Example #3
from frictionless import Package


def test_package_add_resource():
    package = Package({})
    resource = package.add_resource({"name": "name", "data": []})
    assert len(package.resources) == 1
    assert package.resources[0].name == "name"
    assert resource.name == "name"
Example #4
from datetime import datetime as dt

from frictionless import Package, Resource

# format_date, get_project_list, get_event_users, get_event_activities and
# get_event_categories are helpers from the surrounding dribdat application
# and are assumed to be importable here.


def PackageEvent(event, author, host_url='', full_contents=False):
    """ Creates a Data Package from the data of an event """

    # Set up a data package object
    package = Package(
        name='event-%d' % event.id,
        title=event.name,
        description="Event and project details collected with dribdat",
        keywords=["dribdat", "hackathon", "co-creation"],
        sources=[{
            "title": "dribdat",
            "path": "https://dribdat.cc"
        }],
        licenses=[{
            "name": "ODC-PDDL-1.0",
            "path": "http://opendatacommons.org/licenses/pddl/",
            "title": "Open Data Commons Public Domain Dedication & License 1.0"
        }],
        contributors=[{
            "title": author.username,
            "path": author.webpage_url or '',
            "role": "author"
        }],
        homepage=event.webpage_url or '',
        created=format_date(dt.now(), '%Y-%m-%dT%H:%M'),
        version="0.1.0",
    )

    # if False:  # as CSV
    #     fp_projects = tempfile.NamedTemporaryFile(
    #         mode='w+t', prefix='projects-', suffix='.csv')
    #     # print("Writing to temp CSV file", fp_projects.name)
    #     fp_projects.write(gen_csv(get_project_list(event.id)))
    #     resource = Resource(fp_projects.name)
    # if False:
    #     # print("Generating in-memory rowset")
    #     project_rows = gen_rows(get_project_list(event.id))
    #     resource = Resource(
    #         name='projects',
    #         data=project_rows,
    #     )

    # Generate resources

    # print("Generating in-memory JSON of event")
    package.add_resource(
        Resource(
            name='events',
            data=[event.get_full_data()],
        ))
    # print("Generating in-memory JSON of projects")
    package.add_resource(
        Resource(
            name='projects',
            data=get_project_list(event.id, host_url, True),
        ))
    if full_contents:
        # print("Generating in-memory JSON of participants")
        package.add_resource(
            Resource(
                name='users',
                data=get_event_users(event),
            ))
        # print("Generating in-memory JSON of activities")
        package.add_resource(
            Resource(
                name='activities',
                data=get_event_activities(event.id, 500),
            ))
        # print("Generating in-memory JSON of activities")
        package.add_resource(
            Resource(
                name='categories',
                data=get_event_categories(event.id),
            ))
        # print("Adding supplementary README")
        package.add_resource(Resource(
            name='readme',
            path='PACKAGE.txt',
        ))

    return package