Beispiel #1
0
 def test_parse(tag_str, expected_tags):
     """Assert that ``schema_gov.tags_parse(tag_str)`` equals ``expected_tags``."""
     parsed = schema_gov.tags_parse(tag_str)
     # Both values go into the failure message so a mismatch is easy to read.
     report_args = (parsed, expected_tags)
     assert parsed == expected_tags, 'Got %s not %s' % report_args
Beispiel #2
0
    def row_2_package(self, row_dict):
        """Build a package dict from a single parsed row.

        ``row_dict`` maps column titles (e.g. ``'Title'``, ``'Licence'``,
        ``'Dataset Ref#'``) to the values of one row.

        Returns an ``OrderedDict`` holding the package's core fields, its
        sorted ``tags``, the resources stored under the row's reference in
        ``self._resources_by_ref`` and an ``extras`` ``OrderedDict``.

        Raises ``RowParseError`` when the name or title is empty, the contact
        information is not exactly three lines, the licence is not recognised
        or the dataset reference does not start with ``'BIS-'``.
        """
        name = (row_dict.get('Identifier')
                or u'').replace('higher-education-statistics', 'hesa')
        name = self.name_munge(name)
        title = row_dict['Title']
        if not (name and title):
            raise RowParseError(
                'Both Name and Title fields must be filled: name=%r title=%r' %
                (name, title))

        # Contact information is expected as three lines; the middle line is
        # unpacked but not used.
        contact_information = row_dict['Contact information']
        contacts = contact_information.split('\n')
        if len(contacts) != 3:
            # The message needs one placeholder per argument, otherwise the
            # formatting itself fails and hides the real parse error.
            raise RowParseError(
                'Unknown contacts format with %i line(s) not 3: %r' %
                (len(contacts), contact_information))
        author, _phone, author_email = contacts

        # Correct a known misspelling before looking the licence up.
        license_name = row_dict['Licence'].replace('Statistcs', 'Statistics')
        license_id = self.license_2_license_id(license_name, self.log)
        if not license_id:
            raise RowParseError('No license recognised for: %r' % license_name)

        ref = row_dict['Dataset Ref#']
        if not ref.startswith('BIS-'):
            raise RowParseError('Reference must start with \'BIS-\': %s' % ref)
        resources = self._resources_by_ref[ref]

        geo_coverage = schema_gov.GeoCoverageType.get_instance().str_to_db(
            row_dict['Geographic Coverage'])

        # Date-like columns are stored as their string form.
        date_columns = [
            'Date Released', 'Date Updated', 'Temporal Coverage To',
            'Temporal Coverage From'
        ]
        munged_dates = {}
        for date_column in date_columns:
            munged_dates[date_column] = '%s' % row_dict[date_column]

        taxonomy_url = row_dict['Taxonomy url']
        if taxonomy_url and taxonomy_url != '-':
            taxonomy_url = self.tidy_url(taxonomy_url, self.log)

        # The stored value is always 'no'; a differing row value is only
        # reported through the log.
        national_statistic = u'no'
        if row_dict['National Statistic'] != national_statistic:
            self.log(
                'Warning: Ignoring national statistic for non-ONS data: %s' %
                row_dict['National Statistic'])

        extras = OrderedDict([
            ('external_reference', ref),
            ('date_released', munged_dates['Date Released']),
            ('date_updated', munged_dates['Date Updated']),
            ('temporal_granularity', row_dict['Temporal Granularity']),
            ('temporal_coverage_from',
             munged_dates['Temporal Coverage From']),
            ('temporal_coverage_to',
             munged_dates['Temporal Coverage To']),
            ('geographic_coverage', geo_coverage),
            ('geographical_granularity',
             row_dict['Geographic Granularity']),
            ('agency', row_dict['Agency']),
            ('precision', row_dict['Precision']),
            ('taxonomy_url', taxonomy_url),
            ('import_source',
             'BIS-%s' % os.path.basename(self._filepath)),
            ('department', row_dict['Department']),
            ('update_frequency', row_dict['Update Frequency']),
            ('national_statistic', national_statistic),
            ('categories', row_dict['Categories']),
        ])
        pkg_dict = OrderedDict([
            ('name', name),
            ('title', title),
            ('version',
             row_dict['Version'][:model.PACKAGE_VERSION_MAX_LENGTH]),
            ('url', None),
            ('author', author),
            ('author_email', author_email),
            ('maintainer', None),
            ('maintainer_email', None),
            ('notes', row_dict['Abstract']),
            ('license_id', license_id),
            ('tags', []),  # post-filled
            ('groups', ['ukgov']),
            ('resources', resources),
            ('extras', extras),
        ])

        # Suggested tags are combined with the tags given in the row.
        tags = schema_gov.TagSuggester.suggest_tags(pkg_dict)
        for tag in schema_gov.tags_parse(row_dict['Tags']):
            tags.add(tag)
        pkg_dict['tags'] = sorted(tags)

        # snap to suggestions
        field_suggestions = [
            ['temporal_granularity', schema_gov.temporal_granularity_options],
            [
                'geographical_granularity',
                schema_gov.geographic_granularity_options
            ],
            ['categories', schema_gov.category_options],
            ['department', schema_gov.government_depts],
        ]
        for field, suggestions in field_suggestions:
            val = pkg_dict['extras'][field]
            if val and val != '-' and val not in suggestions:
                # Try progressively looser matches against the suggestions.
                suggestions_lower = [sugg.lower() for sugg in suggestions]
                if val.lower() in suggestions_lower:
                    val = suggestions[suggestions_lower.index(val.lower())]
                elif schema_gov.expand_abbreviations(val) in suggestions:
                    val = schema_gov.expand_abbreviations(val)
                elif val.lower() + 's' in suggestions:
                    val = val.lower() + 's'
                elif val.replace('&', 'and').strip() in suggestions:
                    val = val.replace('&', 'and').strip()
            if val and val != '-' and val not in suggestions:
                # Report the field actually being checked.
                self.log(
                    "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'"
                    % (field, val, suggestions))
            pkg_dict['extras'][field] = val

        return pkg_dict
Beispiel #3
0
    def row_2_package(self, row_dict):
        """Build a package dict from a single parsed row.

        ``row_dict`` maps column titles (e.g. ``"Title"``, ``"Licence"``,
        ``"Dataset Ref#"``) to the values of one row.

        Returns an ``OrderedDict`` holding the package's core fields, its
        sorted ``tags``, the resources stored under the row's reference in
        ``self._resources_by_ref`` and an ``extras`` ``OrderedDict``.

        Raises ``RowParseError`` when the name or title is empty, the contact
        information is not exactly three lines, the licence is not recognised
        or the dataset reference does not start with ``"BIS-"``.
        """
        name = (row_dict.get("Identifier") or u"").replace("higher-education-statistics", "hesa")
        name = self.name_munge(name)
        title = row_dict["Title"]
        if not (name and title):
            raise RowParseError("Both Name and Title fields must be filled: name=%r title=%r" % (name, title))

        # Contact information is expected as three lines; the middle line is
        # unpacked but not used.
        contact_information = row_dict["Contact information"]
        contacts = contact_information.split("\n")
        if len(contacts) != 3:
            # The message needs one placeholder per argument, otherwise the
            # formatting itself fails and hides the real parse error.
            raise RowParseError(
                "Unknown contacts format with %i line(s) not 3: %r" % (len(contacts), contact_information)
            )
        author, _phone, author_email = contacts

        # Correct a known misspelling before looking the licence up.
        license_name = row_dict["Licence"].replace("Statistcs", "Statistics")
        license_id = self.license_2_license_id(license_name, self.log)
        if not license_id:
            raise RowParseError("No license recognised for: %r" % license_name)

        ref = row_dict["Dataset Ref#"]
        if not ref.startswith("BIS-"):
            raise RowParseError("Reference must start with 'BIS-': %s" % ref)
        resources = self._resources_by_ref[ref]

        geo_coverage = schema_gov.GeoCoverageType.get_instance().str_to_db(row_dict["Geographic Coverage"])

        # Date-like columns are stored as their string form.
        date_columns = ["Date Released", "Date Updated", "Temporal Coverage To", "Temporal Coverage From"]
        munged_dates = {}
        for date_column in date_columns:
            munged_dates[date_column] = "%s" % row_dict[date_column]

        taxonomy_url = row_dict["Taxonomy url"]
        if taxonomy_url and taxonomy_url != "-":
            taxonomy_url = self.tidy_url(taxonomy_url, self.log)

        # The stored value is always "no"; a differing row value is only
        # reported through the log.
        national_statistic = u"no"
        if row_dict["National Statistic"] != national_statistic:
            self.log("Warning: Ignoring national statistic for non-ONS data: %s" % row_dict["National Statistic"])

        extras = OrderedDict(
            [
                ("external_reference", ref),
                ("date_released", munged_dates["Date Released"]),
                ("date_updated", munged_dates["Date Updated"]),
                ("temporal_granularity", row_dict["Temporal Granularity"]),
                ("temporal_coverage_from", munged_dates["Temporal Coverage From"]),
                ("temporal_coverage_to", munged_dates["Temporal Coverage To"]),
                ("geographic_coverage", geo_coverage),
                ("geographical_granularity", row_dict["Geographic Granularity"]),
                ("agency", row_dict["Agency"]),
                ("precision", row_dict["Precision"]),
                ("taxonomy_url", taxonomy_url),
                ("import_source", "BIS-%s" % os.path.basename(self._filepath)),
                ("department", row_dict["Department"]),
                ("update_frequency", row_dict["Update Frequency"]),
                ("national_statistic", national_statistic),
                ("categories", row_dict["Categories"]),
            ]
        )
        pkg_dict = OrderedDict(
            [
                ("name", name),
                ("title", title),
                ("version", row_dict["Version"][: model.PACKAGE_VERSION_MAX_LENGTH]),
                ("url", None),
                ("author", author),
                ("author_email", author_email),
                ("maintainer", None),
                ("maintainer_email", None),
                ("notes", row_dict["Abstract"]),
                ("license_id", license_id),
                ("tags", []),  # post-filled
                ("groups", ["ukgov"]),
                ("resources", resources),
                ("extras", extras),
            ]
        )

        # Suggested tags are combined with the tags given in the row.
        tags = schema_gov.TagSuggester.suggest_tags(pkg_dict)
        for tag in schema_gov.tags_parse(row_dict["Tags"]):
            tags.add(tag)
        pkg_dict["tags"] = sorted(tags)

        # snap to suggestions
        field_suggestions = [
            ["temporal_granularity", schema_gov.temporal_granularity_options],
            ["geographical_granularity", schema_gov.geographic_granularity_options],
            ["categories", schema_gov.category_options],
            ["department", schema_gov.government_depts],
        ]
        for field, suggestions in field_suggestions:
            val = pkg_dict["extras"][field]
            if val and val != "-" and val not in suggestions:
                # Try progressively looser matches against the suggestions.
                suggestions_lower = [sugg.lower() for sugg in suggestions]
                if val.lower() in suggestions_lower:
                    val = suggestions[suggestions_lower.index(val.lower())]
                elif schema_gov.expand_abbreviations(val) in suggestions:
                    val = schema_gov.expand_abbreviations(val)
                elif val.lower() + "s" in suggestions:
                    val = val.lower() + "s"
                elif val.replace("&", "and").strip() in suggestions:
                    val = val.replace("&", "and").strip()
            if val and val != "-" and val not in suggestions:
                # Report the field actually being checked.
                self.log(
                    "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'" % (field, val, suggestions)
                )
            pkg_dict["extras"][field] = val

        return pkg_dict