Example #1
0
 def test_parse(self):
     # Each pair maps a raw tag string to the tags it should parse into.
     expectations = [
         ("pollution fish", ["pollution", "fish"]),
         ("dosh$money", ["doshmoney"]),
         ("ordnance survey", ["ordnance-survey"]),
     ]
     for raw, expected in expectations:
         assert_equal(tags_parse(raw), expected)
Example #2
0
 def test_parse(self):
     # Exercise tags_parse over a table of (input, expected tags) pairs.
     for raw_str, expected in (
         ('pollution fish', ['pollution', 'fish']),
         ('dosh$money', ['doshmoney']),
         ('ordnance survey', ['ordnance-survey']),
     ):
         assert_equal(tags_parse(raw_str), expected)
Example #3
0
 def test_parse(self):
     # Table-driven check: each input string must parse to the given tags.
     cases = [
         ('pollution fish', ['pollution', 'fish']),
         ('dosh$money', ['doshmoney']),
         ('ordnance survey', ['ordnance-survey']),
     ]
     for input_str, wanted in cases:
         got = tags_parse(input_str)
         assert_equal(got, wanted)
Example #4
0
class CospreadImporter(SpreadsheetPackageImporter):
    """Imports 'cospread' spreadsheet rows as package dicts.

    Each spreadsheet row (keyed by column title) is converted by
    record_2_package() into an OrderedDict suitable for package
    creation, normalising licences, dates, geographic coverage,
    organisation fields, resources and tags along the way.
    """

    # Licence text exactly as it appears in the spreadsheet, mapped to
    # the corresponding CKAN license id.
    license_map = {
        u'UK Crown Copyright with data.gov.uk rights': u'uk-ogl',
        u'\xa9 HESA. Not core Crown Copyright.': u'uk-ogl',
        u'Local Authority copyright with data.gov.uk rights': u'uk-ogl',
        u'Local Authority Copyright with data.gov.uk rights': u'uk-ogl',
        u'UK Crown Copyright': u'uk-ogl',
        u'Crown Copyright': u'uk-ogl',
        u'UK Open Government Licence (OGL)': u'uk-ogl',
        u'UK Open Government License (OGL)': u'uk-ogl',
        u'Met Office licence': u'met-office-cp',
        u'Met Office UK Climate Projections Licence Agreement':
        u'met-office-cp',
    }

    def __init__(self,
                 include_given_tags=False,
                 xmlrpc_settings=None,
                 generate_names=False,
                 **kwargs):
        """
        :param include_given_tags: when True, tags from the row's 'Tags'
            column are merged with the auto-suggested ones.
        :param xmlrpc_settings: passed through to schema.DrupalHelper,
            used for organisation lookups.
        :param generate_names: forwarded to CospreadDataRecords via
            record_params.
        """
        self.include_given_tags = include_given_tags
        self._drupal_helper = schema.DrupalHelper(xmlrpc_settings)
        super(CospreadImporter,
              self).__init__(record_params=[generate_names],
                             record_class=CospreadDataRecords,
                             **kwargs)

    @classmethod
    def log(cls, msg):
        # Record via the base class, and also warn on this module's
        # logger.  (First parameter renamed from the misleading 'self':
        # this is a classmethod, so it receives the class.)
        super(CospreadImporter, cls).log(msg)
        log.warn(msg)

    def record_2_package(self, row_dict):
        """Convert one spreadsheet row into a package dict.

        :param row_dict: dict of cell values keyed by column title,
            plus a 'resources' list of per-resource dicts.
        :returns: OrderedDict with the standard package fields, an
            'extras' OrderedDict and a 'resources' list.
        :raises RowParseError: if the name or title is missing.
        """
        pkg_dict = OrderedDict()
        pkg_dict['title'] = row_dict['Title']
        # Prefer an explicit 'Package name'; otherwise munge the title.
        pkg_dict['name'] = self.name_munge(
            row_dict.get('Package name') or u'') or self.munge(
                pkg_dict['title'])
        if not (pkg_dict['name'] and pkg_dict['title']):
            raise RowParseError(
                'Both Name and Title fields must be filled: name=%r title=%r' %
                (pkg_dict['name'], pkg_dict['title']))
        log.info('Package: %s' % pkg_dict['name'])
        pkg_dict['author'] = row_dict['Contact - Permanent contact point']
        pkg_dict['author_email'] = row_dict['Contact - E-mail address.']
        # Maintainer columns only exist in some spreadsheet variants.
        is_maintainer = bool('maintainer' in ' '.join(row_dict.keys()).lower())
        pkg_dict['maintainer'] = row_dict[
            'Maintainer - '] if is_maintainer else None
        pkg_dict['maintainer_email'] = row_dict[
            'Maintainer - E-mail address'] if is_maintainer else None
        notes = row_dict['Notes']
        license_id, additional_notes = self.get_license_id(row_dict['Licence'])
        if additional_notes:
            notes += additional_notes
        pkg_dict['license_id'] = license_id
        pkg_dict['url'] = self.tidy_url(row_dict['URL'])
        pkg_dict['notes'] = notes
        pkg_dict['version'] = u''
        pkg_dict['groups'] = [u'ukgov']

        pkg_dict['extras'] = OrderedDict()
        extras_dict = pkg_dict['extras']
        geo_cover = []
        geo_coverage_type = schema.GeoCoverageType.get_instance()
        spreadsheet_regions = ('England', 'N. Ireland', 'Scotland', 'Wales',
                               'Overseas', 'Global')
        for region in spreadsheet_regions:
            munged_region = region.lower().replace('n. ', 'northern_')
            field = 'Geographic coverage - %s' % region
            # Any cell value other than empty/'no'/'false' counts as
            # covered.  BUG FIX: the cell is lower-cased before the
            # comparison, so the negative marker must be 'false' (the
            # previous 'False' could never match); the dead None member
            # is dropped too, as .lower() never returns None.
            if (row_dict[field]
                    or '').lower() not in ('', 'no', 'false'):
                geo_cover.append(munged_region)
        extras_dict['geographic_coverage'] = geo_coverage_type.form_to_db(
            geo_cover)
        for column, extra_key in [
            ('Date released', 'date_released'),
            ('Date updated', 'date_updated'),
            ('Date update future', 'date_update_future'),
            ('Temporal Coverage - From', 'temporal_coverage-from'),
            ('Temporal Coverage - To', 'temporal_coverage-to'),
        ]:
            form_value = row_dict.get(column)
            if isinstance(form_value, datetime.date):
                val = field_types.DateType.date_to_db(form_value)
            else:
                if isinstance(form_value, int):
                    form_value = str(form_value)
                # Hack for CLG data to allow '2008/09' to mean '2008', or
                # '2009' if it is a 'To' field.  (Raw string so the \d
                # escapes are passed to the regex engine untouched.)
                match = re.match(r'(\d{4})/(\d{2})', form_value or '')
                if match:
                    years = [int(year_str) for year_str in match.groups()]
                    if extra_key.endswith('-to'):
                        form_value = str(
                            field_types.DateType.
                            add_centurys_to_two_digit_year(year=years[1],
                                                           near_year=years[0]))
                    else:
                        form_value = str(years[0])
                try:
                    val = field_types.DateType.form_to_db(form_value)
                except field_types.DateConvertError:
                    # Unparseable date - warn and keep the raw value.
                    self.log(
                        "WARNING: Value for column '%s' of %r is not understood as a date format."
                        % (column, form_value))
                    val = form_value
            extras_dict[extra_key] = val

        # Each entry: [column title] or [column title, suggestions list].
        field_map = [
            ['CO Identifier'],
            ['Update frequency', schema.update_frequency_options],
            ['Temporal Granularity', schema.temporal_granularity_options],
            [
                'Geographical Granularity',
                schema.geographic_granularity_options
            ],
            ['Taxonomy URL'],
            ['Agency responsible'],
            ['Precision'],
            ['Department', schema.government_depts],
            ['Published by'],
            ['Published via'],
            ['Mandate'],
        ]
        # Columns that are allowed to be absent from the spreadsheet.
        optional_fields = [
            'Categories',
            'CO Identifier',
            'Agency responsible',
            'Department',
            'Published by',
            'Published via',
            'Mandate',
        ]
        for field_mapping in field_map:
            column = field_mapping[0]
            extras_key = column.lower().replace(' ', '_')
            if column == 'Agency responsible':
                extras_key = 'agency'
            elif column in ('CO Identifier', 'CO Reference'):
                # Prefer the newer 'CO Reference' column name if present.
                if 'CO Reference' in row_dict:
                    column = 'CO Reference'
                extras_key = 'external_reference'
            if column in row_dict:
                val = row_dict[column]
            else:
                assert column in optional_fields, column
                val = None
            if len(field_mapping) > 1:
                # snap to suggestions: try a series of normalisations
                # until the value matches one of the suggested options.
                suggestions = field_mapping[1]
                if val and val not in suggestions:
                    val = val.strip()
                    suggestions_lower = [sugg.lower() for sugg in suggestions]
                    if val.lower() in suggestions_lower:
                        val = suggestions[suggestions_lower.index(val.lower())]
                    elif schema.canonise_organisation_name(val) in suggestions:
                        val = schema.canonise_organisation_name(val)
                    elif val.lower() + 's' in suggestions:
                        val = val.lower() + 's'
                    elif val.lower().rstrip('s') in suggestions:
                        val = val.lower().rstrip('s')
                    elif val.replace('&', 'and') in suggestions:
                        val = val.replace('&', 'and')
                    elif val.lower() == 'annually' and 'annual' in suggestions:
                        val = 'annual'
                    elif val.lower() == 'year' and 'annual' in suggestions:
                        val = 'annual'
                if val and val not in suggestions:
                    self.log(
                        "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'"
                        % (column, val, suggestions))
            extras_dict[extras_key] = val

        # Resolve any of the four organisation-bearing extras to Drupal
        # organisations; only the first two found are kept.
        orgs = []
        for key in ['published_by', 'published_via', 'department', 'agency']:
            org_name = extras_dict.get(key)
            if org_name:
                org = self._drupal_helper.cached_department_or_agency_to_organisation(
                    org_name)
                if org:
                    orgs.append(org)
        # limit/pad number of orgs to be 2
        orgs = (orgs + [u''] * 4)[:2]
        extras_dict['published_by'], extras_dict['published_via'] = orgs
        # do not have department/agency fields any more
        del extras_dict['department']
        del extras_dict['agency']

        extras_dict[
            'national_statistic'] = u''  # Ignored: row_dict['national statistic'].lower()
        extras_dict['import_source'] = 'COSPREAD-%s' % os.path.basename(
            self._filepath)

        resources = []
        for row_resource in row_dict['resources']:
            res_dict = OrderedDict([
                ('url', self.tidy_url(row_resource['Download URL'])),
                ('format', row_resource.get('File format', u'')),
                ('description', row_resource.get('Download Description', u'')),
            ])
            if '\n' in res_dict['url']:
                # multiple urls: emit one resource per whitespace-separated url
                for url in res_dict['url'].split():
                    res_dict_tmp = OrderedDict(
                        res_dict.items())  # i.e. deepcopy
                    res_dict_tmp['url'] = url
                    resources.append(res_dict_tmp)
            else:
                resources.append(res_dict)
        pkg_dict['resources'] = resources

        # Auto-suggest tags; optionally union in the spreadsheet's own.
        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        if self.include_given_tags:
            given_tags = schema.tags_parse(row_dict['Tags'])
            tags = tags | set(given_tags)
        pkg_dict['tags'] = sorted(list(tags))

        return pkg_dict
Example #5
0
 def test_parse(tag_str, expected_tags):
     """Assert that tags_parse(tag_str) yields expected_tags."""
     actual = tags_parse(tag_str)
     assert actual == expected_tags, "Got %s not %s" % (actual, expected_tags)
Example #6
0
 def test_parse(tag_str, expected_tags):
     # Parse the tag string and compare against the expected list.
     parsed = tags_parse(tag_str)
     assert parsed == expected_tags, 'Got %s not %s' % (parsed, expected_tags)