Example #1
 def test_parse(self):
     expected_data = [
         ('Annual Report', 'annual_report'),
         ('Annual Report: 2006', 'annual_report-2006'),
         ]
     for str_, name in expected_data:
         result_name = name_munge(str_)
         assert_equal(result_name, name)
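The test above only pins down the behaviour of name_munge: it should turn a human-readable dataset title into a URL-safe package name. The real schema.name_munge is not shown on this page; the following is just a minimal sketch (the function name and regexes are assumptions) that satisfies the two expected pairs used in the test:

import re

def name_munge_sketch(title):
    # Hypothetical stand-in for schema.name_munge: lowercase the title,
    # turn ': ' into '-', and collapse other non-alphanumeric runs to '_'.
    name = title.lower()
    name = re.sub(r':\s*', '-', name)
    name = re.sub(r'[^a-z0-9-]+', '_', name).strip('_')
    return name

assert name_munge_sketch('Annual Report') == 'annual_report'
assert name_munge_sketch('Annual Report: 2006') == 'annual_report-2006'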
Example #2
 def test_parse(self):
     expected_data = [
         ('Annual Report', 'annual_report'),
         ('Annual Report: 2006', 'annual_report-2006'),
     ]
     for str_, name in expected_data:
         result_name = name_munge(str_)
         assert_equal(result_name, name)
Example #3
    def record_2_package(self, item):
        assert isinstance(item, dict)

        # process item
        title, release = self._split_title(item['title'])
        munged_title = schema.name_munge(title)
        publisher_name = self._source_to_publisher(item['hub:source-agency'])
        if publisher_name:
            publishers = [publisher_name]
        else:
            publishers = []
            log.warn('Did not find publisher for source-agency: %s', item['hub:source-agency'])

        # Resources
        guid = item['guid'] or None
        if guid:
            if not guid.startswith(guid_prefix):
                raise RowParseError('GUID did not start with prefix %r: %r' % (guid_prefix, guid))
            guid = guid[len(guid_prefix):]
            if 'http' in guid:
                raise RowParseError('De-prefixed GUID should not still contain \'http\': %r' % guid)
        existing_resource = None
        download_url = item.get('link', None)

        notes_list = []
        if item['description']:
            notes_list.append(item['description'])
        for column, name in [('hub:source-agency', 'Source agency'),
                             ('hub:designation', 'Designation'),
                             ('hub:language', 'Language'),
                             ('hub:altTitle', 'Alternative title'),
                       ]:
            if item[column]:
                notes_list.append('%s: %s' % (name, item[column]))
        notes = '\n\n'.join(notes_list)

        extras = {
            'geographic_coverage': u'',
            'external_reference': u'',
            'temporal_granularity': u'',
            'date_updated': u'',
            'precision': u'',
            'geographic_granularity': u'',
            'temporal_coverage-from': u'',
            'temporal_coverage-to': u'',
            'national_statistic': u'',
            'update_frequency': u'',
            'date_released': u'',
            'categories': u'',
            'series': u'',
            }
        date_released = u''
        if item['pubDate']:
            date_released = date.parse(item["pubDate"])
            if date_released.qualifier:
                log.warn('Could not read format of publication (release) date: %r' %
                         item["pubDate"])
        # Guard against a missing pubDate - the empty default has no isoformat().
        extras['date_released'] = date_released.isoformat() if date_released else u''
        extras['categories'] = item['hub:theme']
        extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
        extras['national_statistic'] = 'yes' if item['hub:designation'] == 'National Statistics' else 'no'
        extras['geographic_granularity'] = item['hub:geographic-breakdown']
        extras['external_reference'] = u'ONSHUB'
        extras['series'] = title if release else u''
        # item_info does not change per iteration, so build it once.
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        for update_frequency_suggestion in schema.update_frequency_options:
            if update_frequency_suggestion in item_info:
                extras['update_frequency'] = update_frequency_suggestion
            elif update_frequency_suggestion.endswith('ly'):
                # Drop the 'ly' suffix; rstrip('ly') would strip any trailing
                # 'l'/'y' characters (e.g. 'annually' -> 'annua'), not the suffix.
                if update_frequency_suggestion[:-2] in item_info:
                    extras['update_frequency'] = update_frequency_suggestion
        extras['import_source'] = 'ONS-%s' % self._current_filename 

        resources = [{
            'url': download_url,
            'description': release,
            'hub-id': guid,
            'publish-date': date_released.as_datetime().strftime('%Y-%m-%d') if date_released else '',
            }]

        # update package
        pkg_dict = {
            'name': munged_title,
            'title': title,
            'version': None,
            'url': None,
            'maintainer': None,
            'maintainer_email': None,
            'notes': notes,
            'license_id': self._crown_license_id,
            'tags': [], # post-filled
            'groups': publishers,
            'resources': resources,
            'extras': extras,
            }

        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        for keyword in item['hub:ipsv'].split(';') + \
                item['hub:keywords'].split(';') + \
                item['hub:nscl'].split(';'):
            tag = schema.tag_munge(keyword)
            if tag and len(tag) > 1:
                tags.add(tag)
        tags = list(tags)
        tags.sort()
        pkg_dict['tags'] = tags

        return pkg_dict
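record_2_package takes a single feed record as a dict and returns a CKAN-style package dict. All the keys it reads are visible above; a minimal illustrative input could look like the following (every value here is invented for the sketch, and 'importer' simply stands in for whatever object the method is defined on):

item = {
    'title': 'Annual Report: 2006',
    'guid': '',                      # if non-empty, must start with the module's guid_prefix
    'link': 'http://example.com/report.csv',
    'description': 'Example description.',
    'pubDate': 'Thu, 15 Jul 2010 09:30:00 GMT',
    'hub:source-agency': 'Office for National Statistics',
    'hub:designation': 'National Statistics',
    'hub:language': 'English',
    'hub:altTitle': '',
    'hub:theme': 'Economy',
    'hub:coverage': 'UK',
    'hub:geographic-breakdown': 'UK and GB',
    'hub:ipsv': 'Economics and finance',
    'hub:keywords': 'economy;statistics',
    'hub:nscl': '',
}
# pkg_dict = importer.record_2_package(item)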
Example #4
 def test_parse(self):
     expected_data = [("Annual Report", "annual_report"), ("Annual Report: 2006", "annual_report-2006")]
     for str_, name in expected_data:
         result_name = name_munge(str_)
         assert_equal(result_name, name)
Example #5
    def record_2_package(self, item):
        assert isinstance(item, dict)

        # process item
        title, release = self._split_title(item['title'])
        munged_title = schema.name_munge(title)
        publisher_name = self._source_to_publisher(item['hub:source-agency'])
        if publisher_name:
            publishers = [publisher_name]
        else:
            publishers = []
            log.warn('Did not find publisher for source-agency: %s', item['hub:source-agency'])

        # Resources
        guid = item['guid'] or None
        if guid:
            if not guid.startswith(guid_prefix):
                raise RowParseError('GUID did not start with prefix %r: %r' % (guid_prefix, guid))
            guid = guid[len(guid_prefix):]
            if 'http' in guid:
                raise RowParseError('De-prefixed GUID should not still contain \'http\': %r' % guid)
        existing_resource = None
        download_url = item.get('link', None)

        notes_list = []
        if item['description']:
            notes_list.append(item['description'])
        for column, name in [('hub:source-agency', 'Source agency'),
                             ('hub:designation', 'Designation'),
                             ('hub:language', 'Language'),
                             ('hub:altTitle', 'Alternative title'),
                       ]:
            if item[column]:
                notes_list.append('%s: %s' % (name, item[column]))
        notes = '\n\n'.join(notes_list)

        extras = {
            'geographic_coverage': u'',
            'external_reference': u'',
            'temporal_granularity': u'',
            'date_updated': u'',
            'precision': u'',
            'geographic_granularity': u'',
            'temporal_coverage-from': u'',
            'temporal_coverage-to': u'',
            'national_statistic': u'',
            'update_frequency': u'',
            'date_released': u'',
            'categories': u'',
            'series': u'',
            }
        date_released = u''
        if item['pubDate']:
            date_released = date.parse(item["pubDate"])
            if date_released.qualifier:
                log.warn('Could not read format of publication (release) date: %r' %
                         item["pubDate"])
        # Guard against a missing pubDate - the empty default has no isoformat().
        extras['date_released'] = date_released.isoformat() if date_released else u''
        extras['categories'] = item['hub:theme']
        extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
        extras['national_statistic'] = 'yes' if item['hub:designation'] == 'National Statistics' else 'no'
        extras['geographic_granularity'] = item['hub:geographic-breakdown']
        extras['external_reference'] = u'ONSHUB'
        extras['series'] = title if release else u''
        # item_info does not change per iteration, so build it once.
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        for update_frequency_suggestion in schema.update_frequency_options:
            if update_frequency_suggestion in item_info:
                extras['update_frequency'] = update_frequency_suggestion
            elif update_frequency_suggestion.endswith('ly'):
                # Drop the 'ly' suffix; rstrip('ly') would strip any trailing
                # 'l'/'y' characters (e.g. 'annually' -> 'annua'), not the suffix.
                if update_frequency_suggestion[:-2] in item_info:
                    extras['update_frequency'] = update_frequency_suggestion
        extras['import_source'] = 'ONS-%s' % self._current_filename 

        resources = [{
            'url': download_url,
            'description': release,
            'hub-id': guid,
            }]

        # update package
        pkg_dict = {
            'name': munged_title,
            'title': title,
            'version': None,
            'url': None,
            'maintainer': None,
            'maintainer_email': None,
            'notes': notes,
            'license_id': self._crown_license_id,
            'tags': [], # post-filled
            'groups': publishers,
            'resources': resources,
            'extras': extras,
            }

        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        for keyword in item['hub:ipsv'].split(';') + \
                item['hub:keywords'].split(';') + \
                item['hub:nscl'].split(';'):
            tag = schema.tag_munge(keyword)
            if tag and len(tag) > 1:
                tags.add(tag)
        tags = list(tags)
        tags.sort()
        pkg_dict['tags'] = tags

        return pkg_dict