def test_parse(self):
    """Check that tags_parse splits and munges raw tag strings as expected."""
    cases = (
        ("pollution fish", ["pollution", "fish"]),
        ("dosh$money", ["doshmoney"]),
        ("ordnance survey", ["ordnance-survey"]),
    )
    for raw, expected in cases:
        assert_equal(tags_parse(raw), expected)
def test_parse(self):
    """tags_parse should normalise each raw string into its expected tag list."""
    for raw_str, expected_tags in [
        ('pollution fish', ['pollution', 'fish']),
        ('dosh$money', ['doshmoney']),
        ('ordnance survey', ['ordnance-survey']),
    ]:
        parsed = tags_parse(raw_str)
        assert_equal(parsed, expected_tags)
class CospreadImporter(SpreadsheetPackageImporter):
    """Imports UK-government 'cospread' spreadsheets into CKAN package dicts.

    Each spreadsheet record (a dict of column name -> cell value) is turned
    into an OrderedDict of package fields by record_2_package().  Column
    values that should come from a controlled vocabulary are snapped to the
    nearest suggestion from the schema module; unmatched values are kept but
    logged as warnings.
    """

    # Maps the free-text 'Licence' column onto CKAN licence ids.
    license_map = {
        u'UK Crown Copyright with data.gov.uk rights': u'uk-ogl',
        u'\xa9 HESA. Not core Crown Copyright.': u'uk-ogl',
        u'Local Authority copyright with data.gov.uk rights': u'uk-ogl',
        u'Local Authority Copyright with data.gov.uk rights': u'uk-ogl',
        u'UK Crown Copyright': u'uk-ogl',
        u'Crown Copyright': u'uk-ogl',
        u'UK Open Government Licence (OGL)': u'uk-ogl',
        u'UK Open Government License (OGL)': u'uk-ogl',
        u'Met Office licence': u'met-office-cp',
        u'Met Office UK Climate Projections Licence Agreement': u'met-office-cp',
    }

    def __init__(self, include_given_tags=False, xmlrpc_settings=None,
                 generate_names=False, **kwargs):
        """Create an importer.

        :param include_given_tags: if True, tags from the spreadsheet's
            'Tags' column are merged with the auto-suggested tags.
        :param xmlrpc_settings: passed to schema.DrupalHelper for looking up
            publisher organisations.
        :param generate_names: forwarded to CospreadDataRecords via
            record_params.
        :param kwargs: passed through to SpreadsheetPackageImporter.
        """
        self.include_given_tags = include_given_tags
        self._drupal_helper = schema.DrupalHelper(xmlrpc_settings)
        super(CospreadImporter, self).__init__(
            record_params=[generate_names],
            record_class=CospreadDataRecords,
            **kwargs)

    @classmethod
    def log(self, msg):
        """Log msg via the base class and also warn on the module logger.

        NOTE(review): declared @classmethod, so 'self' here is actually the
        class object; kept as-is since super() works either way and callers
        may rely on calling it on the class.
        """
        super(CospreadImporter, self).log(msg)
        log.warn(msg)

    def record_2_package(self, row_dict):
        """Convert one spreadsheet record into a CKAN package dict.

        :param row_dict: mapping of spreadsheet column names to cell values,
            plus a 'resources' list of per-resource dicts.
        :returns: OrderedDict of package fields ('title', 'name', 'extras',
            'resources', 'tags', ...).
        :raises RowParseError: if either the name or title ends up empty.
        """
        pkg_dict = OrderedDict()
        pkg_dict['title'] = row_dict['Title']
        # Prefer an explicit 'Package name' column; otherwise derive the
        # name from the title.
        pkg_dict['name'] = self.name_munge(
            row_dict.get('Package name') or u'') or self.munge(
                pkg_dict['title'])
        if not (pkg_dict['name'] and pkg_dict['title']):
            raise RowParseError(
                'Both Name and Title fields must be filled: name=%r title=%r'
                % (pkg_dict['name'], pkg_dict['title']))
        log.info('Package: %s' % pkg_dict['name'])
        pkg_dict['author'] = row_dict['Contact - Permanent contact point']
        pkg_dict['author_email'] = row_dict['Contact - E-mail address.']
        # Maintainer columns only exist in some spreadsheet layouts.
        is_maintainer = bool('maintainer' in ' '.join(row_dict.keys()).lower())
        pkg_dict['maintainer'] = row_dict[
            'Maintainer - '] if is_maintainer else None
        pkg_dict['maintainer_email'] = row_dict[
            'Maintainer - E-mail address'] if is_maintainer else None
        notes = row_dict['Notes']
        license_id, additional_notes = self.get_license_id(row_dict['Licence'])
        if additional_notes:
            notes += additional_notes
        pkg_dict['license_id'] = license_id
        pkg_dict['url'] = self.tidy_url(row_dict['URL'])
        pkg_dict['notes'] = notes
        pkg_dict['version'] = u''
        pkg_dict['groups'] = [u'ukgov']

        pkg_dict['extras'] = OrderedDict()
        extras_dict = pkg_dict['extras']

        # Geographic coverage: one yes/no column per region.
        geo_cover = []
        geo_coverage_type = schema.GeoCoverageType.get_instance()
        spreadsheet_regions = ('England', 'N. Ireland', 'Scotland', 'Wales',
                               'Overseas', 'Global')
        for region in spreadsheet_regions:
            munged_region = region.lower().replace('n. ', 'northern_')
            field = 'Geographic coverage - %s' % region
            # BUGFIX: the value is lowercased before comparison, so the old
            # check against 'False' could never match; compare 'false'.
            # (None was also unreachable here because of the `or ''`.)
            if (row_dict[field] or '').lower() not in ('', 'no', 'false'):
                geo_cover.append(munged_region)
        extras_dict['geographic_coverage'] = geo_coverage_type.form_to_db(
            geo_cover)

        # Date-typed columns: normalise to the DB date format, tolerating
        # date objects, plain ints and '2008/09'-style academic years.
        for column, extra_key in [
                ('Date released', 'date_released'),
                ('Date updated', 'date_updated'),
                ('Date update future', 'date_update_future'),
                ('Temporal Coverage - From', 'temporal_coverage-from'),
                ('Temporal Coverage - To', 'temporal_coverage-to'),
        ]:
            form_value = row_dict.get(column)
            if isinstance(form_value, datetime.date):
                val = field_types.DateType.date_to_db(form_value)
            else:
                if isinstance(form_value, int):
                    form_value = str(form_value)
                # Hack for CLG data to allow '2008/09' to mean '2008', or
                # '2009' if it is a 'To' field.
                match = re.match(r'(\d{4})/(\d{2})', form_value or '')
                if match:
                    years = [int(year_str) for year_str in match.groups()]
                    if extra_key.endswith('-to'):
                        form_value = str(
                            field_types.DateType.
                            add_centurys_to_two_digit_year(year=years[1],
                                                           near_year=years[0]))
                    else:
                        form_value = str(years[0])
                try:
                    val = field_types.DateType.form_to_db(form_value)
                except field_types.DateConvertError:
                    # Keep the raw value; the exception object was unused,
                    # so the Py2-only 'except X, e' syntax was dropped.
                    self.log(
                        "WARNING: Value for column '%s' of %r is not understood as a date format."
                        % (column, form_value))
                    val = form_value
            extras_dict[extra_key] = val

        # Remaining extras columns; a second list element, when present, is
        # the controlled vocabulary to snap the value to.
        field_map = [
            ['CO Identifier'],
            ['Update frequency', schema.update_frequency_options],
            ['Temporal Granularity', schema.temporal_granularity_options],
            [
                'Geographical Granularity',
                schema.geographic_granularity_options
            ],
            ['Taxonomy URL'],
            ['Agency responsible'],
            ['Precision'],
            ['Department', schema.government_depts],
            ['Published by'],
            ['Published via'],
            ['Mandate'],
        ]
        optional_fields = [
            'Categories',
            'CO Identifier',
            'Agency responsible',
            'Department',
            'Published by',
            'Published via',
            'Mandate',
        ]
        for field_mapping in field_map:
            column = field_mapping[0]
            extras_key = column.lower().replace(' ', '_')
            if column == 'Agency responsible':
                extras_key = 'agency'
            elif column in ('CO Identifier', 'CO Reference'):
                # Some layouts use 'CO Reference' instead of 'CO Identifier'.
                # (dict.has_key() replaced with the `in` operator.)
                if 'CO Reference' in row_dict:
                    column = 'CO Reference'
                extras_key = 'external_reference'
            if column in row_dict:
                val = row_dict[column]
            else:
                assert column in optional_fields, column
                val = None
            if len(field_mapping) > 1:
                # snap to suggestions
                suggestions = field_mapping[1]
                if val and val not in suggestions:
                    val = val.strip()
                    suggestions_lower = [sugg.lower() for sugg in suggestions]
                    if val.lower() in suggestions_lower:
                        val = suggestions[suggestions_lower.index(val.lower())]
                    elif schema.canonise_organisation_name(val) in suggestions:
                        val = schema.canonise_organisation_name(val)
                    elif val.lower() + 's' in suggestions:
                        val = val.lower() + 's'
                    elif val.lower().rstrip('s') in suggestions:
                        val = val.lower().rstrip('s')
                    elif val.replace('&', 'and') in suggestions:
                        val = val.replace('&', 'and')
                    elif val.lower() == 'annually' and 'annual' in suggestions:
                        val = 'annual'
                    elif val.lower() == 'year' and 'annual' in suggestions:
                        val = 'annual'
                if val and val not in suggestions:
                    self.log(
                        "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'"
                        % (column, val, suggestions))
            extras_dict[extras_key] = val

        # Resolve publisher organisations via Drupal, in priority order.
        orgs = []
        for key in ['published_by', 'published_via', 'department', 'agency']:
            org_name = extras_dict.get(key)
            if org_name:
                org = self._drupal_helper.cached_department_or_agency_to_organisation(
                    org_name)
                if org:
                    orgs.append(org)
        # limit/pad number of orgs to be 2
        orgs = (orgs + [u''] * 4)[:2]
        extras_dict['published_by'], extras_dict['published_via'] = orgs
        # do not have department/agency fields any more
        del extras_dict['department']
        del extras_dict['agency']

        extras_dict[
            'national_statistic'] = u''  # Ignored: row_dict['national statistic'].lower()
        extras_dict['import_source'] = 'COSPREAD-%s' % os.path.basename(
            self._filepath)

        # A resource cell may contain several whitespace-separated URLs on
        # multiple lines; expand each into its own resource dict.
        resources = []
        for row_resource in row_dict['resources']:
            res_dict = OrderedDict([
                ('url', self.tidy_url(row_resource['Download URL'])),
                ('format', row_resource.get('File format', u'')),
                ('description', row_resource.get('Download Description', u'')),
            ])
            if '\n' in res_dict['url']:
                # multiple urls
                for url in res_dict['url'].split():
                    res_dict_tmp = OrderedDict(
                        res_dict.items())  # i.e. deepcopy
                    res_dict_tmp['url'] = url
                    resources.append(res_dict_tmp)
            else:
                resources.append(res_dict)
        pkg_dict['resources'] = resources

        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        if self.include_given_tags:
            given_tags = schema.tags_parse(row_dict['Tags'])
            tags = tags | set(given_tags)
        pkg_dict['tags'] = sorted(list(tags))

        return pkg_dict
def test_parse(tag_str, expected_tags):
    """Parametrised check that tags_parse(tag_str) equals expected_tags."""
    actual = tags_parse(tag_str)
    failure_msg = "Got %s not %s" % (actual, expected_tags)
    assert actual == expected_tags, failure_msg
def test_parse(tag_str, expected_tags):
    """Assert that tags_parse(tag_str) yields exactly expected_tags."""
    parsed = tags_parse(tag_str)
    assert parsed == expected_tags, 'Got %s not %s' % (parsed, expected_tags)