def get_organisation(self, dept_or_agency):
    '''Match a department/agency name against Drupal's organisation list.

    Names that are already keys of ``self.organisations`` are left alone.
    Any other name is translated through ``self.publisher_map``, canonised
    with ``schema.canonise_organisation_name`` and then passed to the
    Drupal ``organisation.match`` XML-RPC method. The XML-RPC proxy is
    created on first use and stored as ``self.drupal``.

    :param dept_or_agency: organisation name as given in the source data.
    :raises ScriptError: if the XML-RPC call fails with a socket or
        protocol error.

    NOTE(review): the matched ``org_id`` is neither stored nor returned in
    the code visible here - the tail of this function looks to be missing
    from this view; confirm against the full file.
    '''
    if dept_or_agency not in self.organisations:
        # check for name mapping
        mapped_publisher = self.publisher_map.get(dept_or_agency.strip())
        if mapped_publisher:
            log.info('Mapping %r to %r', dept_or_agency, mapped_publisher)
            dept_or_agency = mapped_publisher

        # try canonical name
        dept_or_agency = schema.canonise_organisation_name(dept_or_agency)

        # look up with Drupal, connecting lazily on the first lookup
        if not hasattr(self, 'drupal'):
            domain = self.xmlrpc['domain']
            username = self.xmlrpc['username']
            password = self.xmlrpc['password']
            if username or password:
                server = '%s:%s@%s' % (username, password, domain)
            else:
                server = '%s' % domain
            self.xmlrpc_url = 'http://%s/services/xmlrpc' % server
            # NOTE(review): this URL embeds the username/password, so the
            # credentials end up in the log and in the ScriptError messages
            # below - consider masking them.
            log.info('XMLRPC connection to %s', self.xmlrpc_url)
            self.drupal = ServerProxy(self.xmlrpc_url)

        try:
            org_id = self.drupal.organisation.match(dept_or_agency)
        except socket.error:
            # Interpolate here: exceptions, unlike logging calls, do not
            # apply %-formatting to their arguments.
            raise ScriptError('Socket error connecting to %s'
                              % self.xmlrpc_url)
        except ProtocolError:
            raise ScriptError('XMLRPC error connecting to %s'
                              % self.xmlrpc_url)
def generate(cls, xmlrpc_settings):
    '''Look up organisations in Drupal and dump them to a JSON file.

    Every name in ``orgs_to_lookup`` is canonised and matched through a
    ``DrupalClient``. For each match, the organisation's name and parent
    department id are recorded under its id. The resulting dict is written
    as JSON to ``cls.lots_of_orgs_filepath`` (also when some lookups
    failed).

    :param xmlrpc_settings: passed straight to ``DrupalClient``.

    Prints a status line and, if any organisation could not be matched,
    exits the process with status 1.
    '''
    drupal = DrupalClient(xmlrpc_settings)
    orgs = {}
    has_errors = False
    orgs_to_lookup = set()
    orgs_to_lookup.add('Northern Ireland Executive')

    for org_name in orgs_to_lookup:
        org_name = canonise_organisation_name(org_name)
        org_id = drupal.match_organisation(org_name)
        # Equality (not identity) is kept deliberately so that whatever
        # falsy-equal value the client returned before is still treated
        # as "no match".
        if org_id == False:
            log.error('Could not find organisation %r', org_name)
            has_errors = True
            continue
        proper_org_name = drupal.get_organisation_name(org_id)
        parent_department_id = drupal.get_department_from_organisation(
            org_id)
        orgs[org_id] = {
            'name': proper_org_name,
            'parent_department_id': parent_department_id,
        }

    # The context manager closes the file even if serialisation fails.
    with open(cls.lots_of_orgs_filepath, 'w') as f:
        f.write(json.dumps(orgs))

    # Single-argument print(...) gives the same output on Python 2 and 3.
    if has_errors:
        print('Finished with ERRORS')
        sys.exit(1)
    else:
        print('Finished with SUCCESS')
def _source_to_publisher_(cls, source, ckanclient):
    '''Translate an ONS Source into the name of a DGU publisher.

    The source is canonised and looked up with the ``group_search`` action,
    first as an exact query and then, if that finds nothing, as an inexact
    one. The name of the first result is returned. When several publishers
    match, a warning listing them is logged. Returns None (after logging a
    warning) if neither search finds anything.
    '''
    canonical = schema.canonise_organisation_name(source)

    # Narrow (exact) search first, then broaden it out.
    for exact in (True, False):
        found = ckanclient.action('group_search', query=canonical,
                                  exact=exact)
        if found['count']:
            break
    else:
        log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")',
                 canonical, source)
        return None

    if found['count'] > 1:
        candidates = [(pub['name'], pub['title'])
                      for pub in found['results']]
        log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s',
                 canonical, source, candidates)
    else:
        log.info('..Publisher found: %s', found['results'][0]['name'])
    return found['results'][0]['name']
def get_organisation(self, dept_or_agency):
    '''Match a department/agency name against Drupal's organisation list.

    Names that are already keys of ``self.organisations`` are left alone.
    Any other name is translated through ``self.publisher_map``, canonised
    with ``schema.canonise_organisation_name`` and then passed to the
    Drupal ``organisation.match`` XML-RPC method. The XML-RPC proxy is
    created on first use and stored as ``self.drupal``.

    :param dept_or_agency: organisation name as given in the source data.
    :raises ScriptError: if the XML-RPC call fails with a socket or
        protocol error.

    NOTE(review): the matched ``org_id`` is neither stored nor returned in
    the code visible here - the tail of this function looks to be missing
    from this view; confirm against the full file.
    '''
    if dept_or_agency not in self.organisations:
        # check for name mapping
        mapped_publisher = self.publisher_map.get(dept_or_agency.strip())
        if mapped_publisher:
            log.info('Mapping %r to %r', dept_or_agency, mapped_publisher)
            dept_or_agency = mapped_publisher

        # try canonical name
        dept_or_agency = schema.canonise_organisation_name(dept_or_agency)

        # look up with Drupal, connecting lazily on the first lookup
        if not hasattr(self, 'drupal'):
            domain = self.xmlrpc['domain']
            username = self.xmlrpc['username']
            password = self.xmlrpc['password']
            if username or password:
                server = '%s:%s@%s' % (username, password, domain)
            else:
                server = '%s' % domain
            self.xmlrpc_url = 'http://%s/services/xmlrpc' % server
            # NOTE(review): this URL embeds the username/password, so the
            # credentials end up in the log and in the ScriptError messages
            # below - consider masking them.
            log.info('XMLRPC connection to %s', self.xmlrpc_url)
            self.drupal = ServerProxy(self.xmlrpc_url)

        try:
            org_id = self.drupal.organisation.match(dept_or_agency)
        except socket.error:
            # Interpolate here: exceptions, unlike logging calls, do not
            # apply %-formatting to their arguments.
            raise ScriptError('Socket error connecting to %s'
                              % self.xmlrpc_url)
        except ProtocolError:
            raise ScriptError('XMLRPC error connecting to %s'
                              % self.xmlrpc_url)
def generate(cls, xmlrpc_settings):
    '''Look up organisations in Drupal and dump them to a JSON file.

    Every name in ``orgs_to_lookup`` is canonised and matched through a
    ``DrupalClient``. For each match, the organisation's name and parent
    department id are recorded under its id. The resulting dict is written
    as JSON to ``cls.lots_of_orgs_filepath`` (also when some lookups
    failed).

    :param xmlrpc_settings: passed straight to ``DrupalClient``.

    Prints a status line and, if any organisation could not be matched,
    exits the process with status 1.
    '''
    drupal = DrupalClient(xmlrpc_settings)
    orgs = {}
    has_errors = False
    orgs_to_lookup = set()
    orgs_to_lookup.add('Northern Ireland Executive')

    for org_name in orgs_to_lookup:
        org_name = canonise_organisation_name(org_name)
        org_id = drupal.match_organisation(org_name)
        # Equality (not identity) is kept deliberately so that whatever
        # falsy-equal value the client returned before is still treated
        # as "no match".
        if org_id == False:
            log.error('Could not find organisation %r', org_name)
            has_errors = True
            continue
        proper_org_name = drupal.get_organisation_name(org_id)
        parent_department_id = drupal.get_department_from_organisation(
            org_id)
        orgs[org_id] = {
            'name': proper_org_name,
            'parent_department_id': parent_department_id,
        }

    # The context manager closes the file even if serialisation fails.
    with open(cls.lots_of_orgs_filepath, 'w') as f:
        f.write(json.dumps(orgs))

    # Single-argument print(...) gives the same output on Python 2 and 3.
    if has_errors:
        print('Finished with ERRORS')
        sys.exit(1)
    else:
        print('Finished with SUCCESS')
def _source_to_publisher_(cls, source, ckanclient):
    '''Translate an ONS Source into the name of a DGU publisher.

    The source is canonised and looked up with the ``group_search`` action,
    first as an exact query and then, if that finds nothing, as an inexact
    one. The name of the first result is returned. When several publishers
    match, a warning listing them is logged. Returns None (after logging a
    warning) if neither search finds anything.
    '''
    canonical = schema.canonise_organisation_name(source)

    # Narrow (exact) search first, then broaden it out.
    for exact in (True, False):
        found = ckanclient.action('group_search', query=canonical,
                                  exact=exact)
        if found['count']:
            break
    else:
        log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")',
                 canonical, source)
        return None

    if found['count'] > 1:
        candidates = [(pub['name'], pub['title'])
                      for pub in found['results']]
        log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s',
                 canonical, source, candidates)
    else:
        log.info('..Publisher found: %s', found['results'][0]['name'])
    return found['results'][0]['name']
def _source_to_publisher(self, source):
    '''
    For a given ONS Source, returns the equivalent DGU publisher.
    If it cannot find it, returns None.

    The source is canonised and searched for with the ``group_search``
    action of ``self._ckanclient``. The name of the first result is
    returned. If several publishers match, a warning listing them is
    logged.
    '''
    # map the name
    publisher_name = schema.canonise_organisation_name(source)

    # search for the name in live list of publishers
    result = self._ckanclient.action('group_search', query=publisher_name)
    if not result['count']:
        log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")',
                 publisher_name, source)
        # Return here as the docstring promises; falling through would
        # index into an empty result list.
        return None
    if result['count'] > 1:
        # Previously an undefined name was logged here (NameError). The
        # summary mirrors the (name, title) pairs logged by the sibling
        # _source_to_publisher_; results are assumed to be dicts, as there.
        publishers = [(pub['name'], pub.get('title'))
                      for pub in result['results']]
        log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s',
                 publisher_name, source, publishers)
    return result['results'][0]['name']
def _source_to_organisations(cls, source, drupal_helper=None):
    '''Work out the organisations behind an ONS source string.

    Returns the tuple ``(department, agency, published_by, published_via)``.
    ``department`` and ``agency`` are names (or None). The two publisher
    values are whatever the Drupal helper returns for the department and
    agency, in that order, padded with empty strings when fewer than two
    are known.

    A ``schema.DrupalHelper`` is created when no ``drupal_helper`` is given.
    '''
    canonical = schema.canonise_organisation_name(source)
    helper = drupal_helper or schema.DrupalHelper()
    department = agency = None

    # Special case: Northern Ireland bodies sit under the NI Executive.
    is_northern_irish = (
        '(Northern Ireland)' in source
        or canonical == 'Office of the First and Deputy First Minister')
    if is_northern_irish:
        department = u'Northern Ireland Executive'
        agency = helper.cached_department_or_agency_to_organisation(
            canonical, include_id=False)
        if not agency:
            log.warn('Could not find NI department: %s' % canonical)
            agency = canonical

    # Special cases: names with a fixed department.
    if canonical == 'Office for National Statistics':
        department = canonical
    elif canonical == 'Education':
        department = 'Department for Education'

    # General case: ask Drupal, then classify the answer.
    if not department:
        found = helper.cached_department_or_agency_to_organisation(
            canonical, include_id=False)
        if found in schema.government_depts:
            department = found
        elif found:
            agency = found

    # Nothing recognised: fall back to the given name as the agency.
    if canonical and not (department or agency):
        log.warn('Could not find organisation: %s' % canonical)
        agency = canonical

    # Publishers: one per known organisation, padded out to exactly two.
    publishers = []
    for name in (department, agency):
        if name:
            publishers.append(
                helper.cached_department_or_agency_to_organisation(name))
    while len(publishers) < 2:
        publishers.append(u'')
    published_by, published_via = publishers

    return department, agency, published_by, published_via
def _source_to_organisations(cls, source, drupal_helper=None):
    '''Work out the organisations behind an ONS source string.

    Returns the tuple ``(department, agency, published_by, published_via)``.
    ``department`` and ``agency`` are names (or None). The two publisher
    values are whatever the Drupal helper returns for the department and
    agency, in that order, padded with empty strings when fewer than two
    are known.

    A ``schema.DrupalHelper`` is created when no ``drupal_helper`` is given.
    '''
    canonical = schema.canonise_organisation_name(source)
    helper = drupal_helper or schema.DrupalHelper()
    department = agency = None

    # Special case: Northern Ireland bodies sit under the NI Executive.
    is_northern_irish = (
        '(Northern Ireland)' in source
        or canonical == 'Office of the First and Deputy First Minister')
    if is_northern_irish:
        department = u'Northern Ireland Executive'
        agency = helper.cached_department_or_agency_to_organisation(
            canonical, include_id=False)
        if not agency:
            log.warn('Could not find NI department: %s' % canonical)
            agency = canonical

    # Special cases: names with a fixed department.
    if canonical == 'Office for National Statistics':
        department = canonical
    elif canonical == 'Education':
        department = 'Department for Education'

    # General case: ask Drupal, then classify the answer.
    if not department:
        found = helper.cached_department_or_agency_to_organisation(
            canonical, include_id=False)
        if found in schema.government_depts:
            department = found
        elif found:
            agency = found

    # Nothing recognised: fall back to the given name as the agency.
    if canonical and not (department or agency):
        log.warn('Could not find organisation: %s' % canonical)
        agency = canonical

    # Publishers: one per known organisation, padded out to exactly two.
    publishers = []
    for name in (department, agency):
        if name:
            publishers.append(
                helper.cached_department_or_agency_to_organisation(name))
    while len(publishers) < 2:
        publishers.append(u'')
    published_by, published_via = publishers

    return department, agency, published_by, published_via
class CospreadImporter(SpreadsheetPackageImporter):
    '''Importer that converts rows of a COSPREAD spreadsheet into package
    dicts (see ``record_2_package``).
    '''

    # Licence text -> licence id.
    # NOTE(review): presumably consulted by get_license_id, which is not
    # defined in this class body - confirm.
    license_map = {
        u'UK Crown Copyright with data.gov.uk rights': u'uk-ogl',
        u'\xa9 HESA. Not core Crown Copyright.': u'uk-ogl',
        u'Local Authority copyright with data.gov.uk rights': u'uk-ogl',
        u'Local Authority Copyright with data.gov.uk rights': u'uk-ogl',
        u'UK Crown Copyright': u'uk-ogl',
        u'Crown Copyright': u'uk-ogl',
        u'UK Open Government Licence (OGL)': u'uk-ogl',
        u'UK Open Government License (OGL)': u'uk-ogl',
        u'Met Office licence': u'met-office-cp',
        u'Met Office UK Climate Projections Licence Agreement': u'met-office-cp',
    }

    def __init__(self, include_given_tags=False, xmlrpc_settings=None,
                 generate_names=False, **kwargs):
        '''
        :param include_given_tags: if true, tags parsed from the row's
            'Tags' column are added to the suggested tags.
        :param xmlrpc_settings: passed to ``schema.DrupalHelper``.
        :param generate_names: passed to the record class via
            ``record_params``.
        :param kwargs: passed on to the base importer.
        '''
        self.include_given_tags = include_given_tags
        self._drupal_helper = schema.DrupalHelper(xmlrpc_settings)
        super(CospreadImporter, self).__init__(
            record_params=[generate_names],
            record_class=CospreadDataRecords,
            **kwargs)

    @classmethod
    def log(cls, msg):
        '''Record ``msg`` with the base importer and also emit it as a
        warning on the module logger.'''
        super(CospreadImporter, cls).log(msg)
        log.warn(msg)

    def record_2_package(self, row_dict):
        '''Build a package dict from one spreadsheet row.

        :param row_dict: mapping of spreadsheet column title to cell value,
            plus a 'resources' list of per-download mappings.
        :returns: OrderedDict with the package's core fields, 'extras',
            'resources' and 'tags'.
        :raises RowParseError: if the name or the title ends up empty.
        '''
        pkg_dict = OrderedDict()
        pkg_dict['title'] = row_dict['Title']
        # Prefer an explicit package name; otherwise derive it from the title.
        pkg_dict['name'] = (
            self.name_munge(row_dict.get('Package name') or u'')
            or self.munge(pkg_dict['title']))
        if not (pkg_dict['name'] and pkg_dict['title']):
            raise RowParseError(
                'Both Name and Title fields must be filled: name=%r title=%r'
                % (pkg_dict['name'], pkg_dict['title']))
        log.info('Package: %s' % pkg_dict['name'])
        pkg_dict['author'] = row_dict['Contact - Permanent contact point']
        pkg_dict['author_email'] = row_dict['Contact - E-mail address.']
        # Maintainer columns are only read if some column title mentions
        # 'maintainer'.
        is_maintainer = 'maintainer' in ' '.join(row_dict.keys()).lower()
        pkg_dict['maintainer'] = (
            row_dict['Maintainer - '] if is_maintainer else None)
        pkg_dict['maintainer_email'] = (
            row_dict['Maintainer - E-mail address'] if is_maintainer else None)
        notes = row_dict['Notes']
        license_id, additional_notes = self.get_license_id(row_dict['Licence'])
        if additional_notes:
            notes += additional_notes
        pkg_dict['license_id'] = license_id
        pkg_dict['url'] = self.tidy_url(row_dict['URL'])
        pkg_dict['notes'] = notes
        pkg_dict['version'] = u''
        pkg_dict['groups'] = [u'ukgov']
        pkg_dict['extras'] = OrderedDict()
        extras_dict = pkg_dict['extras']

        # Geographic coverage: one column per region; a region counts as
        # covered unless its cell is empty or says no/false.
        geo_cover = []
        geo_coverage_type = schema.GeoCoverageType.get_instance()
        spreadsheet_regions = ('England', 'N. Ireland', 'Scotland', 'Wales',
                               'Overseas', 'Global')
        for region in spreadsheet_regions:
            munged_region = region.lower().replace('n. ', 'northern_')
            field = 'Geographic coverage - %s' % region
            # The cell is lower-cased, so it must be compared with 'false';
            # the previous 'False' could never match.
            if (row_dict[field] or '').lower() not in ('', 'no', 'false'):
                geo_cover.append(munged_region)
        extras_dict['geographic_coverage'] = geo_coverage_type.form_to_db(
            geo_cover)

        # Date columns
        for column, extra_key in [
                ('Date released', 'date_released'),
                ('Date updated', 'date_updated'),
                ('Date update future', 'date_update_future'),
                ('Temporal Coverage - From', 'temporal_coverage-from'),
                ('Temporal Coverage - To', 'temporal_coverage-to'),
                ]:
            form_value = row_dict.get(column)
            if isinstance(form_value, datetime.date):
                val = field_types.DateType.date_to_db(form_value)
            else:
                if isinstance(form_value, int):
                    form_value = str(form_value)
                # Hack for CLG data to allow '2008/09' to mean '2008', or
                # '2009' if it is a 'To' field.
                match = re.match(r'(\d{4})/(\d{2})', form_value or '')
                if match:
                    years = [int(year_str) for year_str in match.groups()]
                    if extra_key.endswith('-to'):
                        form_value = str(
                            field_types.DateType.add_centurys_to_two_digit_year(
                                year=years[1], near_year=years[0]))
                    else:
                        form_value = str(years[0])
                try:
                    val = field_types.DateType.form_to_db(form_value)
                except field_types.DateConvertError:
                    self.log(
                        "WARNING: Value for column '%s' of %r is not understood as a date format."
                        % (column, form_value))
                    # Keep the raw value rather than lose it.
                    val = form_value
            extras_dict[extra_key] = val

        # Remaining extras. Each entry is [column] or [column, suggestions];
        # values of the latter are snapped to one of the suggestions.
        field_map = [
            ['CO Identifier'],
            ['Update frequency', schema.update_frequency_options],
            ['Temporal Granularity', schema.temporal_granularity_options],
            ['Geographical Granularity',
             schema.geographic_granularity_options],
            ['Taxonomy URL'],
            ['Agency responsible'],
            ['Precision'],
            ['Department', schema.government_depts],
            ['Published by'],
            ['Published via'],
            ['Mandate'],
        ]
        optional_fields = [
            'Categories',
            'CO Identifier',
            'Agency responsible',
            'Department',
            'Published by',
            'Published via',
            'Mandate',
        ]
        for field_mapping in field_map:
            column = field_mapping[0]
            extras_key = column.lower().replace(' ', '_')
            if column == 'Agency responsible':
                extras_key = 'agency'
            elif column in ('CO Identifier', 'CO Reference'):
                # The column may appear under either title.
                if 'CO Reference' in row_dict:
                    column = 'CO Reference'
                extras_key = 'external_reference'

            if column in row_dict:
                val = row_dict[column]
            else:
                assert column in optional_fields, column
                val = None

            if len(field_mapping) > 1:
                # snap to suggestions
                suggestions = field_mapping[1]
                if val and val not in suggestions:
                    val = val.strip()
                    suggestions_lower = [sugg.lower() for sugg in suggestions]
                    if val.lower() in suggestions_lower:
                        val = suggestions[suggestions_lower.index(val.lower())]
                    elif schema.canonise_organisation_name(val) in suggestions:
                        val = schema.canonise_organisation_name(val)
                    elif val.lower() + 's' in suggestions:
                        val = val.lower() + 's'
                    elif val.lower().rstrip('s') in suggestions:
                        val = val.lower().rstrip('s')
                    elif val.replace('&', 'and') in suggestions:
                        val = val.replace('&', 'and')
                    elif val.lower() == 'annually' and 'annual' in suggestions:
                        val = 'annual'
                    elif val.lower() == 'year' and 'annual' in suggestions:
                        val = 'annual'
                if val and val not in suggestions:
                    self.log(
                        "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'"
                        % (column, val, suggestions))
            extras_dict[extras_key] = val

        # Resolve the organisation fields through Drupal, in priority order.
        orgs = []
        for key in ['published_by', 'published_via', 'department', 'agency']:
            org_name = extras_dict.get(key)
            if org_name:
                org = self._drupal_helper.cached_department_or_agency_to_organisation(
                    org_name)
                if org:
                    orgs.append(org)
        # limit/pad number of orgs to be 2
        orgs = (orgs + [u''] * 4)[:2]
        extras_dict['published_by'], extras_dict['published_via'] = orgs
        # do not have department/agency fields any more
        del extras_dict['department']
        del extras_dict['agency']

        extras_dict['national_statistic'] = u''  # Ignored: row_dict['national statistic'].lower()
        extras_dict['import_source'] = 'COSPREAD-%s' % os.path.basename(
            self._filepath)

        resources = []
        for row_resource in row_dict['resources']:
            res_dict = OrderedDict([
                ('url', self.tidy_url(row_resource['Download URL'])),
                ('format', row_resource.get('File format', u'')),
                ('description', row_resource.get('Download Description', u'')),
            ])
            if '\n' in res_dict['url']:
                # multiple urls: one resource per URL, each sharing the
                # format and description
                for url in res_dict['url'].split():
                    res_dict_tmp = OrderedDict(res_dict)  # shallow copy
                    res_dict_tmp['url'] = url
                    resources.append(res_dict_tmp)
            else:
                resources.append(res_dict)
        pkg_dict['resources'] = resources

        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        if self.include_given_tags:
            given_tags = schema.tags_parse(row_dict['Tags'])
            tags = tags | set(given_tags)
        pkg_dict['tags'] = sorted(tags)

        return pkg_dict
def test_basic(self):
    # The abbreviation 'MFA' must canonise to the full organisation name.
    expected = 'Marine and Fisheries Agency'
    actual = canonise_organisation_name('MFA')
    assert_equal(actual, expected)
def test_basic(self):
    # The abbreviation 'MFA' must canonise to the full organisation name.
    expected = 'Marine and Fisheries Agency'
    actual = canonise_organisation_name('MFA')
    assert_equal(actual, expected)
def test_basic(self):
    # The abbreviation 'MFA' must canonise to the full organisation name.
    expected = 'Marine and Fisheries Agency'
    actual = canonise_organisation_name('MFA')
    assert_equal(actual, expected)