def get_organisation(self, dept_or_agency):
    '''
    Matches a department/agency name against Drupal's organisations.

    Names that are already keys of self.organisations are skipped.
    Otherwise the name is mapped through self.publisher_map (if it has an
    entry for the stripped name), canonised, and passed to Drupal's
    organisation.match over XML-RPC. The XML-RPC proxy is built from
    self.xmlrpc on first use and kept on self.drupal.

    Raises ScriptError if the call fails with a socket error or an XML-RPC
    protocol error.

    NOTE(review): the code visible here ends straight after the lookup, so
    org_id is neither stored nor returned.
    '''
    if dept_or_agency not in self.organisations:
        # check for name mapping
        mapped_publisher = self.publisher_map.get(dept_or_agency.strip())
        if mapped_publisher:
            log.info('Mapping %r to %r', dept_or_agency, mapped_publisher)
            dept_or_agency = mapped_publisher

        # try canonical name
        dept_or_agency = schema.canonise_organisation_name(dept_or_agency)

        # look up with Drupal (the proxy is created lazily, once)
        if not hasattr(self, 'drupal'):
            domain = self.xmlrpc['domain']
            username = self.xmlrpc['username']
            password = self.xmlrpc['password']
            if username or password:
                server = '%s:%s@%s' % (username, password, domain)
            else:
                server = '%s' % domain
            self.xmlrpc_url = 'http://%s/services/xmlrpc' % server
            # NOTE(review): when credentials are set they are part of
            # self.xmlrpc_url, so this log line and the ScriptError
            # arguments below contain the password.
            log.info('XMLRPC connection to %s', self.xmlrpc_url)
            self.drupal = ServerProxy(self.xmlrpc_url)
        try:
            org_id = self.drupal.organisation.match(dept_or_agency)
        except socket.error:
            raise ScriptError('Socket error connecting to %s', self.xmlrpc_url)
        except ProtocolError:
            raise ScriptError('XMLRPC error connecting to %s', self.xmlrpc_url)
Exemple #2
0
    def generate(cls, xmlrpc_settings):
        '''
        Looks up the organisations of interest in Drupal and saves the
        results, as JSON, to cls.lots_of_orgs_filepath.

        The JSON is a dict keyed by organisation id; each value holds the
        organisation's 'name' and 'parent_department_id'. A summary line is
        printed at the end and the process exits with status 1 if any
        organisation could not be matched.
        '''
        client = DrupalClient(xmlrpc_settings)
        wanted_names = set()
        wanted_names.add('Northern Ireland Executive')

        found = {}
        failed = False
        for wanted in wanted_names:
            canonical = canonise_organisation_name(wanted)
            org_id = client.match_organisation(canonical)
            # Equality (not identity) comparison, exactly as before
            if org_id == False:
                log.error('Could not find organisation %r', canonical)
                failed = True
            else:
                proper_name = client.get_organisation_name(org_id)
                parent_id = client.get_department_from_organisation(org_id)
                found[org_id] = {'name': proper_name,
                                 'parent_department_id': parent_id}

        # The file is written even if some lookups failed
        with open(cls.lots_of_orgs_filepath, 'w') as out:
            out.write(json.dumps(found))

        if not failed:
            print('Finished with SUCCESS')
        else:
            print('Finished with ERRORS')
            sys.exit(1)
Exemple #3
0
    def _source_to_publisher_(cls, source, ckanclient):
        '''
        Maps an ONS Source onto the name of the matching DGU publisher.

        The source is canonised and then searched for among the live
        publishers - an exact search first, then an inexact one if that
        finds nothing. Returns None when nothing matches. When several
        publishers match, a warning is logged and the first one's name is
        returned.
        '''
        canonical = schema.canonise_organisation_name(source)

        # Narrow search first; only broaden it if that comes back empty
        for exact in (True, False):
            found = ckanclient.action('group_search', query=canonical, exact=exact)
            if found['count']:
                break

        if not found['count']:
            log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")', canonical, source)
            return None

        publishers = found['results']
        if found['count'] > 1:
            name_title_pairs = [(pub['name'], pub['title']) for pub in publishers]
            log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s', canonical, source,
                     name_title_pairs)
        else:
            log.info('..Publisher found: %s', publishers[0]['name'])

        return publishers[0]['name']
    def get_organisation(self, dept_or_agency):
        '''
        Matches a department/agency name against Drupal's organisations.

        Names that are already keys of self.organisations are skipped.
        Otherwise the name is mapped through self.publisher_map (if it has
        an entry for the stripped name), canonised, and passed to Drupal's
        organisation.match over XML-RPC. The XML-RPC proxy is built from
        self.xmlrpc on first use and kept on self.drupal.

        Raises ScriptError if the call fails with a socket error or an
        XML-RPC protocol error.

        NOTE(review): the code visible here ends straight after the lookup,
        so org_id is neither stored nor returned.
        '''
        if dept_or_agency not in self.organisations:
            # check for name mapping
            mapped_publisher = self.publisher_map.get(dept_or_agency.strip())
            if mapped_publisher:
                log.info('Mapping %r to %r', dept_or_agency, mapped_publisher)
                dept_or_agency = mapped_publisher

            # try canonical name
            dept_or_agency = schema.canonise_organisation_name(dept_or_agency)

            # look up with Drupal (the proxy is created lazily, once)
            if not hasattr(self, 'drupal'):
                domain = self.xmlrpc['domain']
                username = self.xmlrpc['username']
                password = self.xmlrpc['password']
                if username or password:
                    server = '%s:%s@%s' % (username, password, domain)
                else:
                    server = '%s' % domain
                self.xmlrpc_url = 'http://%s/services/xmlrpc' % server
                # NOTE(review): when credentials are set they are part of
                # self.xmlrpc_url, so this log line and the ScriptError
                # arguments below contain the password.
                log.info('XMLRPC connection to %s', self.xmlrpc_url)
                self.drupal = ServerProxy(self.xmlrpc_url)
            try:
                org_id = self.drupal.organisation.match(dept_or_agency)
            except socket.error:
                raise ScriptError('Socket error connecting to %s',
                                  self.xmlrpc_url)
            except ProtocolError:
                raise ScriptError('XMLRPC error connecting to %s',
                                  self.xmlrpc_url)
Exemple #5
0
    def generate(cls, xmlrpc_settings):
        '''
        Looks up the organisations of interest in Drupal and saves the
        results, as JSON, to cls.lots_of_orgs_filepath.

        The JSON is a dict keyed by organisation id; each value holds the
        organisation's 'name' and 'parent_department_id'. A summary line is
        printed at the end and the process exits with status 1 if any
        organisation could not be matched.
        '''
        client = DrupalClient(xmlrpc_settings)
        wanted_names = set()
        wanted_names.add('Northern Ireland Executive')

        found = {}
        failed = False
        for wanted in wanted_names:
            canonical = canonise_organisation_name(wanted)
            org_id = client.match_organisation(canonical)
            # Equality (not identity) comparison, exactly as before
            if org_id == False:
                log.error('Could not find organisation %r', canonical)
                failed = True
            else:
                proper_name = client.get_organisation_name(org_id)
                parent_id = client.get_department_from_organisation(org_id)
                found[org_id] = {
                    'name': proper_name,
                    'parent_department_id': parent_id
                }

        # The file is written even if some lookups failed
        with open(cls.lots_of_orgs_filepath, 'w') as out:
            out.write(json.dumps(found))

        if not failed:
            print('Finished with SUCCESS')
        else:
            print('Finished with ERRORS')
            sys.exit(1)
Exemple #6
0
    def _source_to_publisher_(cls, source, ckanclient):
        '''
        Maps an ONS Source onto the name of the matching DGU publisher.

        The source is canonised and then searched for among the live
        publishers - an exact search first, then an inexact one if that
        finds nothing. Returns None when nothing matches. When several
        publishers match, a warning is logged and the first one's name is
        returned.
        '''
        canonical = schema.canonise_organisation_name(source)

        # Narrow search first; only broaden it if that comes back empty
        for exact in (True, False):
            found = ckanclient.action('group_search', query=canonical, exact=exact)
            if found['count']:
                break

        if not found['count']:
            log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")', canonical, source)
            return None

        publishers = found['results']
        if found['count'] > 1:
            name_title_pairs = [(pub['name'], pub['title']) for pub in publishers]
            log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s', canonical, source,
                     name_title_pairs)
        else:
            log.info('..Publisher found: %s', publishers[0]['name'])

        return publishers[0]['name']
Exemple #7
0
    def _source_to_publisher(self, source):
        '''
        For a given ONS Source, returns the name of the equivalent DGU
        publisher. If it cannot find it, returns None.

        If several publishers match, a warning listing them is logged and
        the name of the first result is returned.
        '''
        # map the name
        publisher_name = schema.canonise_organisation_name(source)

        # search for the name in live list of publishers
        result = self._ckanclient.action('group_search', query=publisher_name)
        if not result['count']:
            log.warn('Could not find source in DGU publishers: "%s" (mapped from "%s")', publisher_name, source)
            # Nothing to index into - honour the documented contract
            return None
        if result['count'] > 1:
            # (name, title) of every match, for the log message
            publishers = [(pub['name'], pub['title']) for pub in result['results']]
            log.warn('Multiple publishers matched "%s" (mapped from "%s"): %s', publisher_name, source, publishers)

        return result['results'][0]['name']
Exemple #8
0
    def _source_to_organisations(cls, source, drupal_helper=None):
        '''
        Works out the organisations behind a source string.

        Returns a tuple (department, agency, published_by, published_via).
        department and agency are names (or None); published_by and
        published_via are what the Drupal helper returns for those names,
        padded with empty strings so there are always two.

        A schema.DrupalHelper is created if no drupal_helper is passed in.
        '''
        dept_given = schema.canonise_organisation_name(source)
        helper = drupal_helper if drupal_helper else schema.DrupalHelper()
        lookup = helper.cached_department_or_agency_to_organisation

        department = agency = None

        # special cases
        is_northern_ireland = (
            '(Northern Ireland)' in source
            or dept_given == 'Office of the First and Deputy First Minister')
        if is_northern_ireland:
            department = u'Northern Ireland Executive'
            agency = lookup(dept_given, include_id=False)
            if not agency:
                log.warn('Could not find NI department: %s' % dept_given)
                agency = dept_given
        if dept_given == 'Office for National Statistics':
            department = dept_given
        elif dept_given == 'Education':
            department = 'Department for Education'

        # search for department
        if not department:
            matched = lookup(dept_given, include_id=False)
            if matched in schema.government_depts:
                department = matched
            elif matched:
                agency = matched

        # fall back to the given name as the agency
        if dept_given and not department and not agency:
            log.warn('Could not find organisation: %s' % dept_given)
            agency = dept_given

        # publishers: look up whichever of the two were found, pad to two
        publishers = []
        for org_name in (department, agency):
            if org_name:
                publishers.append(lookup(org_name))
        while len(publishers) < 2:
            publishers.append(u'')
        published_by, published_via = publishers

        return department, agency, published_by, published_via
Exemple #9
0
    def _source_to_organisations(cls, source, drupal_helper=None):
        '''
        Works out the organisations behind a source string.

        Returns a tuple (department, agency, published_by, published_via).
        department and agency are names (or None); published_by and
        published_via are what the Drupal helper returns for those names,
        padded with empty strings so there are always two.

        A schema.DrupalHelper is created if no drupal_helper is passed in.
        '''
        dept_given = schema.canonise_organisation_name(source)
        helper = drupal_helper if drupal_helper else schema.DrupalHelper()
        lookup = helper.cached_department_or_agency_to_organisation

        department = agency = None

        # special cases
        is_northern_ireland = (
            '(Northern Ireland)' in source
            or dept_given == 'Office of the First and Deputy First Minister')
        if is_northern_ireland:
            department = u'Northern Ireland Executive'
            agency = lookup(dept_given, include_id=False)
            if not agency:
                log.warn('Could not find NI department: %s' % dept_given)
                agency = dept_given
        if dept_given == 'Office for National Statistics':
            department = dept_given
        elif dept_given == 'Education':
            department = 'Department for Education'

        # search for department
        if not department:
            matched = lookup(dept_given, include_id=False)
            if matched in schema.government_depts:
                department = matched
            elif matched:
                agency = matched

        # fall back to the given name as the agency
        if dept_given and not department and not agency:
            log.warn('Could not find organisation: %s' % dept_given)
            agency = dept_given

        # publishers: look up whichever of the two were found, pad to two
        publishers = []
        for org_name in (department, agency):
            if org_name:
                publishers.append(lookup(org_name))
        while len(publishers) < 2:
            publishers.append(u'')
        published_by, published_via = publishers

        return department, agency, published_by, published_via
Exemple #10
0
class CospreadImporter(SpreadsheetPackageImporter):
    '''
    Spreadsheet importer that converts each COSPREAD row into a package
    dict (see record_2_package).
    '''
    # Licence descriptions mapped to licence ids. get_license_id is not
    # defined in this class - presumably it consults this map.
    license_map = {
        u'UK Crown Copyright with data.gov.uk rights': u'uk-ogl',
        u'\xa9 HESA. Not core Crown Copyright.': u'uk-ogl',
        u'Local Authority copyright with data.gov.uk rights': u'uk-ogl',
        u'Local Authority Copyright with data.gov.uk rights': u'uk-ogl',
        u'UK Crown Copyright': u'uk-ogl',
        u'Crown Copyright': u'uk-ogl',
        u'UK Open Government Licence (OGL)': u'uk-ogl',
        u'UK Open Government License (OGL)': u'uk-ogl',
        u'Met Office licence': u'met-office-cp',
        u'Met Office UK Climate Projections Licence Agreement':
        u'met-office-cp',
    }

    def __init__(self,
                 include_given_tags=False,
                 xmlrpc_settings=None,
                 generate_names=False,
                 **kwargs):
        '''
        include_given_tags - if true, tags from the row's 'Tags' column are
                             added to the suggested tags.
        xmlrpc_settings - passed to schema.DrupalHelper.
        generate_names - passed on to the record class via record_params.
        Remaining keyword arguments go to the base class.
        '''
        self.include_given_tags = include_given_tags
        self._drupal_helper = schema.DrupalHelper(xmlrpc_settings)
        super(CospreadImporter,
              self).__init__(record_params=[generate_names],
                             record_class=CospreadDataRecords,
                             **kwargs)

    @classmethod
    def log(cls, msg):
        '''Passes msg to the base class's log and also logs it as a warning.'''
        super(CospreadImporter, cls).log(msg)
        # 'log' here is the module-level logger, not this method
        log.warn(msg)

    def record_2_package(self, row_dict):
        '''
        Builds and returns the package dict (an OrderedDict) for one
        spreadsheet row, given as a dict keyed by column title.

        Raises RowParseError if the row yields an empty name or title.
        '''
        pkg_dict = OrderedDict()
        pkg_dict['title'] = row_dict['Title']
        # Fall back to a name munged from the title if none is given
        pkg_dict['name'] = self.name_munge(
            row_dict.get('Package name') or u'') or self.munge(
                pkg_dict['title'])
        if not (pkg_dict['name'] and pkg_dict['title']):
            raise RowParseError(
                'Both Name and Title fields must be filled: name=%r title=%r' %
                (pkg_dict['name'], pkg_dict['title']))
        log.info('Package: %s' % pkg_dict['name'])
        pkg_dict['author'] = row_dict['Contact - Permanent contact point']
        pkg_dict['author_email'] = row_dict['Contact - E-mail address.']
        # Maintainer columns are only read if some column title mentions
        # 'maintainer'
        is_maintainer = 'maintainer' in ' '.join(row_dict.keys()).lower()
        pkg_dict['maintainer'] = row_dict[
            'Maintainer - '] if is_maintainer else None
        pkg_dict['maintainer_email'] = row_dict[
            'Maintainer - E-mail address'] if is_maintainer else None
        notes = row_dict['Notes']
        license_id, additional_notes = self.get_license_id(row_dict['Licence'])
        if additional_notes:
            notes += additional_notes
        pkg_dict['license_id'] = license_id
        pkg_dict['url'] = self.tidy_url(row_dict['URL'])
        pkg_dict['notes'] = notes
        pkg_dict['version'] = u''
        pkg_dict['groups'] = [u'ukgov']

        pkg_dict['extras'] = OrderedDict()
        extras_dict = pkg_dict['extras']

        # Geographic coverage: one column per region
        geo_cover = []
        geo_coverage_type = schema.GeoCoverageType.get_instance()
        spreadsheet_regions = ('England', 'N. Ireland', 'Scotland', 'Wales',
                               'Overseas', 'Global')
        for region in spreadsheet_regions:
            munged_region = region.lower().replace('n. ', 'northern_')
            field = 'Geographic coverage - %s' % region
            # The cell is lower-cased before the comparison, so the
            # "not covered" values must be lower case too
            if (row_dict[field] or '').lower() not in ('', 'no', 'false'):
                geo_cover.append(munged_region)
        extras_dict['geographic_coverage'] = geo_coverage_type.form_to_db(
            geo_cover)

        # Date fields
        for column, extra_key in [
            ('Date released', 'date_released'),
            ('Date updated', 'date_updated'),
            ('Date update future', 'date_update_future'),
            ('Temporal Coverage - From', 'temporal_coverage-from'),
            ('Temporal Coverage - To', 'temporal_coverage-to'),
        ]:
            form_value = row_dict.get(column)
            if isinstance(form_value, datetime.date):
                val = field_types.DateType.date_to_db(form_value)
            else:
                if isinstance(form_value, int):
                    form_value = str(form_value)
                # Hack for CLG data to allow '2008/09' to mean '2008', or
                # '2009' if it is a 'To' field.
                match = re.match(r'(\d{4})/(\d{2})', form_value or '')
                if match:
                    years = [int(year_str) for year_str in match.groups()]
                    if extra_key.endswith('-to'):
                        form_value = str(
                            field_types.DateType.
                            add_centurys_to_two_digit_year(year=years[1],
                                                           near_year=years[0]))
                    else:
                        form_value = str(years[0])
                try:
                    val = field_types.DateType.form_to_db(form_value)
                except field_types.DateConvertError:
                    # Keep the value as given, but warn about it
                    self.log(
                        "WARNING: Value for column '%s' of %r is not understood as a date format."
                        % (column, form_value))
                    val = form_value
            extras_dict[extra_key] = val

        # Remaining extras: [column title] or [column title, suggestions]
        field_map = [
            ['CO Identifier'],
            ['Update frequency', schema.update_frequency_options],
            ['Temporal Granularity', schema.temporal_granularity_options],
            [
                'Geographical Granularity',
                schema.geographic_granularity_options
            ],
            ['Taxonomy URL'],
            ['Agency responsible'],
            ['Precision'],
            ['Department', schema.government_depts],
            ['Published by'],
            ['Published via'],
            ['Mandate'],
        ]
        # Columns that are allowed to be missing from the row
        optional_fields = [
            'Categories',
            'CO Identifier',
            'Agency responsible',
            'Department',
            'Published by',
            'Published via',
            'Mandate',
        ]
        for field_mapping in field_map:
            column = field_mapping[0]
            extras_key = column.lower().replace(' ', '_')
            if column == 'Agency responsible':
                extras_key = 'agency'
            elif column in ('CO Identifier', 'CO Reference'):
                # 'CO Reference' takes over from 'CO Identifier' if present
                if 'CO Reference' in row_dict:
                    column = 'CO Reference'
                extras_key = 'external_reference'
            if column in row_dict:
                val = row_dict[column]
            else:
                assert column in optional_fields, column
                val = None
            if len(field_mapping) > 1:
                # snap to suggestions
                suggestions = field_mapping[1]
                if val and val not in suggestions:
                    val = val.strip()
                    suggestions_lower = [sugg.lower() for sugg in suggestions]
                    if val.lower() in suggestions_lower:
                        val = suggestions[suggestions_lower.index(val.lower())]
                    elif schema.canonise_organisation_name(val) in suggestions:
                        val = schema.canonise_organisation_name(val)
                    elif val.lower() + 's' in suggestions:
                        val = val.lower() + 's'
                    elif val.lower().rstrip('s') in suggestions:
                        val = val.lower().rstrip('s')
                    elif val.replace('&', 'and') in suggestions:
                        val = val.replace('&', 'and')
                    elif val.lower() == 'annually' and 'annual' in suggestions:
                        val = 'annual'
                    elif val.lower() == 'year' and 'annual' in suggestions:
                        val = 'annual'
                # Still not a suggestion: keep the value but warn
                if val and val not in suggestions:
                    self.log(
                        "WARNING: Value for column '%s' of '%s' is not in suggestions '%s'"
                        % (column, val, suggestions))
            extras_dict[extras_key] = val

        # Resolve the organisation names given in the row
        orgs = []
        for key in ['published_by', 'published_via', 'department', 'agency']:
            org_name = extras_dict.get(key)
            if org_name:
                org = self._drupal_helper.cached_department_or_agency_to_organisation(
                    org_name)
                if org:
                    orgs.append(org)
        # limit/pad number of orgs to be 2
        orgs = (orgs + [u''] * 4)[:2]
        extras_dict['published_by'], extras_dict['published_via'] = orgs
        # do not have department/agency fields any more
        del extras_dict['department']
        del extras_dict['agency']

        extras_dict[
            'national_statistic'] = u''  # Ignored: row_dict['national statistic'].lower()
        extras_dict['import_source'] = 'COSPREAD-%s' % os.path.basename(
            self._filepath)

        resources = []
        for row_resource in row_dict['resources']:
            res_dict = OrderedDict([
                ('url', self.tidy_url(row_resource['Download URL'])),
                ('format', row_resource.get('File format', u'')),
                ('description', row_resource.get('Download Description', u'')),
            ])
            if '\n' in res_dict['url']:
                # multiple urls: one resource per url, sharing the format
                # and description
                for url in res_dict['url'].split():
                    res_dict_tmp = OrderedDict(
                        res_dict.items())  # copy (shallow)
                    res_dict_tmp['url'] = url
                    resources.append(res_dict_tmp)
            else:
                resources.append(res_dict)
        pkg_dict['resources'] = resources

        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        if self.include_given_tags:
            given_tags = schema.tags_parse(row_dict['Tags'])
            tags = tags | set(given_tags)
        pkg_dict['tags'] = sorted(tags)

        return pkg_dict
Exemple #11
0
 def test_basic(self):
     '''The abbreviation 'MFA' canonises to the agency's full name.'''
     assert_equal(canonise_organisation_name("MFA"),
                  "Marine and Fisheries Agency")
Exemple #12
0
 def test_basic(self):
     '''The abbreviation 'MFA' canonises to the agency's full name.'''
     assert_equal(canonise_organisation_name('MFA'),
                  'Marine and Fisheries Agency')
Exemple #13
0
 def test_basic(self):
     '''The abbreviation 'MFA' canonises to the agency's full name.'''
     assert_equal(canonise_organisation_name('MFA'),
                  'Marine and Fisheries Agency')