def _create_or_update_organization(self, data_dict, harvest_job):
    """Create or update a CKAN organization from harvested data.

    :param data_dict: harvested organization data; must contain 'id',
        'name' and 'changed' keys.
    :param harvest_job: current harvest job, used to find the last
        finished job so unchanged organizations are not re-updated.
    :returns: the organization dict as returned by the action API.
    """
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
        'ignore_auth': True,
    }
    try:
        log.info("Finding organization..")
        log.info(data_dict['id'])
        org = p.toolkit.get_action('organization_show')(
            context, {'id': data_dict['id']})
        # BUG FIX: was log.info("found", org) — an invalid logging call
        # (an extra argument with no placeholder in the format string).
        log.info("found %s", org)
        last_finished_job = self._last_finished_job(harvest_job)
        log.info(last_finished_job)
        # Only update when the remote record changed after our last job.
        if last_finished_job and last_finished_job < data_dict['changed']:
            log.info("updating organization")
            org = p.toolkit.get_action('organization_update')(
                context,
                {'title': data_dict['name'],
                 'name': munge_title_to_name(data_dict['name']),
                 'id': data_dict['id']})
    except NotFound:
        log.info("Organization %s not found, creating...", data_dict['name'])
        # Get rid of auth audit on the context otherwise we'll get an
        # exception
        context.pop('__auth_audit', None)
        org = p.toolkit.get_action('organization_create')(
            context,
            {'title': data_dict['name'],
             'name': munge_title_to_name(data_dict['name']),
             'id': data_dict['id']})
    log.info(org)
    return org
def do_publisher(cls, publisher_nid):
    """Ensure the publisher's Drupal parent relationship exists in CKAN.

    Looks up the publisher group (by munged title) and, when Drupal
    records a parent publisher, creates the group-to-group Member row
    unless an active parent is already present.
    """
    from ckan import model
    from ckan.lib.munge import munge_title_to_name

    log = global_log
    pub = cls.get_cached_publisher_details(publisher_nid)
    title = pub["title"].strip()
    slug = munge_title_to_name(title)
    g = model.Group.get(slug)
    if g:
        log.info("Found publisher in db: %s", g.name)
    else:
        cls.status.record("Not found in CKAN db", slug, do_print=False)
        log.warn("Ignoring publisher that cannot be found in db: %s", slug)
        return
    if pub.get("parent_node"):
        parent_pub_title = cls.get_cached_publisher_details(pub["parent_node"])["title"]
        parent_name = munge_title_to_name(parent_pub_title)
        parent = model.Group.get(parent_name)
        if not parent:
            cls.status.record("Cannot find parent in CKAN db", g.name, do_print=False)
            # BUG FIX: `pub` is a dict, so `pub.name` raised AttributeError;
            # log the child group's name instead.
            log.warning("Cannot find parent %s of %s", parent_name, g.name)
            return
        existing_parents = [
            m.group
            for m in model.Session.query(model.Member)
            .filter(model.Member.table_name == "group")
            .filter(model.Member.table_id == g.id)
            .filter(model.Member.state == "active")
        ]
        if existing_parents:
            if len(existing_parents) > 1:
                log.warn("Multiple parents for %s: %r",
                         g.name, [p.name for p in existing_parents])
            if parent in existing_parents:
                cls.status.record("Correct parent already", g.name, do_print=False)
                log.info("Correct parent already: %s parent of %s",
                         parent.name, g.name)
                return
            else:
                cls.status.record("Has another parent", g.name, do_print=False)
                log.info(
                    "Has another parent: %r (instead of %s) parent of %s",
                    [p.name for p in existing_parents],
                    parent.name,
                    g.name,
                )
                return
        m = model.Member(group=parent, table_id=g.id, table_name="group")
        model.Session.add(m)
        model.Session.commit()
        cls.status.record("Parent added", slug, do_print=False)
        log.info("%s is made parent of %s", parent.name, g.name)
    else:
        log.info("%s has no parent in Drupal" % g.name)
        cls.status.record("Has no parent in Drupal", g.name, do_print=False)
def add_publisher(cls, publisher_nid):
    """Create or refresh a CKAN publisher group for a Drupal node id.

    Returns the Group object, or None when the publisher is on the
    ignore list. Recursively ensures the parent publisher exists and
    links it via a group-to-group Member row.
    """
    from ckan import model
    from ckan.lib.munge import munge_title_to_name

    if int(publisher_nid) in ignore_publishers:
        global_log.info('Publisher ignored: %s (%s)', publisher_nid,
                        cls.get_cached_publisher_details(publisher_nid))
        return

    pub = cls.get_cached_publisher_details(publisher_nid)
    title = pub['title'].strip()
    slug = munge_title_to_name(title)
    g = model.Group.get(slug)
    if g:
        global_log.info('Publisher already exists in db: %s', slug)
    else:
        g = model.Group(name=slug)
        model.Session.add(g)

    # Update core fields and contact/FOI extras from the Drupal record.
    g.title = title
    g.type = 'publisher'
    g.description = pub['body']
    field_pub_web_title = pub['field_pub_web'][0]['title'] if pub['field_pub_web'] else ''
    g.extras['contact-name'] = '%s contact' % field_pub_web_title if field_pub_web_title else ''
    g.extras['contact-email'] = pub['field_pub_email_display'][0]['email'] if pub['field_pub_email_display'] else ''
    g.extras['contact-phone'] = ''
    g.extras['foi-name'] = ''
    g.extras['foi-email'] = ''
    g.extras['foi-web'] = ''
    g.extras['foi-phone'] = ''
    acronym = pub['field_acronym'][0]['value'] if pub['field_acronym'] else ''
    g.extras['abbreviation'] = acronym or ''
    g.extras['website-url'] = (pub['field_pub_web'][0]['url'] or '') if pub['field_pub_web'] else ''
    g.extras['website-name'] = (pub['field_pub_web'][0]['title'] or '') if pub['field_pub_web'] else ''
    model.Session.commit()
    title_and_abbreviation = '%s (%s)' % (title, acronym) if acronym else title
    global_log.info('Added/edited publisher: %s <%s>',
                    title_and_abbreviation, publisher_nid)

    if pub.get('parent_node'):
        parent_pub_title = cls.get_cached_publisher_details(pub['parent_node'])['title']
        parent = model.Group.get(munge_title_to_name(parent_pub_title))
        if not parent:
            # Parent not in CKAN yet — create it first (recursive).
            parent = cls.add_publisher(pub['parent_node'])
        if model.Session.query(model.Member).\
                filter(model.Member.group == parent).\
                filter(model.Member.table_id == g.id).count() == 0:
            m = model.Member(group=parent, table_id=g.id, table_name='group')
            model.Session.add(m)
            # BUG FIX: the original format string was broken across a
            # source line ('%s is parent \n of %s'); reconstructed here.
            global_log.info('%s is parent of %s', parent.name, g.name)
        else:
            global_log.info('%s is already a parent of %s', parent.name, g.name)
        model.Session.commit()
    return g
def do_publisher(cls, publisher_nid):
    """Link a publisher group to its Drupal parent in CKAN.

    Duplicate of the double-quoted variant above; same fix applied.
    """
    from ckan import model
    from ckan.lib.munge import munge_title_to_name

    log = global_log
    pub = cls.get_cached_publisher_details(publisher_nid)
    title = pub['title'].strip()
    slug = munge_title_to_name(title)
    g = model.Group.get(slug)
    if g:
        log.info('Found publisher in db: %s', g.name)
    else:
        cls.status.record('Not found in CKAN db', slug, do_print=False)
        log.warn('Ignoring publisher that cannot be found in db: %s', slug)
        return
    if pub.get('parent_node'):
        parent_pub_title = cls.get_cached_publisher_details(pub['parent_node'])['title']
        parent_name = munge_title_to_name(parent_pub_title)
        parent = model.Group.get(parent_name)
        if not parent:
            cls.status.record('Cannot find parent in CKAN db', g.name, do_print=False)
            # BUG FIX: `pub` is a dict, so `pub.name` raised AttributeError;
            # log the child group's name instead.
            log.warning('Cannot find parent %s of %s', parent_name, g.name)
            return
        existing_parents = [m.group for m in
                            model.Session.query(model.Member).
                            filter(model.Member.table_name == 'group').
                            filter(model.Member.table_id == g.id).
                            filter(model.Member.state == 'active')]
        if existing_parents:
            if len(existing_parents) > 1:
                log.warn('Multiple parents for %s: %r',
                         g.name, [p.name for p in existing_parents])
            if parent in existing_parents:
                cls.status.record('Correct parent already', g.name, do_print=False)
                log.info('Correct parent already: %s parent of %s',
                         parent.name, g.name)
                return
            else:
                cls.status.record('Has another parent', g.name, do_print=False)
                log.info('Has another parent: %r (instead of %s) parent of %s',
                         [p.name for p in existing_parents],
                         parent.name, g.name)
                return
        m = model.Member(group=parent, table_id=g.id, table_name='group')
        model.Session.add(m)
        model.Session.commit()
        cls.status.record('Parent added', slug, do_print=False)
        log.info('%s is made parent of %s', parent.name, g.name)
    else:
        log.info('%s has no parent in Drupal' % g.name)
        cls.status.record('Has no parent in Drupal', g.name, do_print=False)
def _create_or_update_organization(self, data_dict, harvest_job):
    """Create, patch or delete a CKAN organization from harvested data.

    Returns the organization dict, or None when the remote record is
    flagged as removed.
    """
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
        'ignore_auth': True,
    }
    # Look the organization up first; NotFound means we must create it.
    try:
        org = p.toolkit.get_action('organization_show')(
            context, {'id': data_dict['id']})
    except NotFound:
        org = None

    org_data = {
        'title': data_dict['name'],
        'name': munge_title_to_name(data_dict['name']),
        'id': data_dict['id'],
    }

    if org is None:
        log.info("Organization %s not found, creating...", data_dict['name'])
        if data_dict['removed']:
            log.info("Organization was removed, not creating..")
            return None
        # Get rid of auth audit on the context otherwise we'll get an
        # exception
        context.pop('__auth_audit', None)
        org = p.toolkit.get_action('organization_create')(context, org_data)
    else:
        log.info("found %s", org)
        if data_dict['removed']:
            log.info("Organization was removed, removing from catalog..")
            p.toolkit.get_action('organization_delete')(context, org)
            return None
        if self.config.get('force_all', False) is True:
            last_time = "2011-01-01"
        else:
            last_time = self._last_error_free_job_time(harvest_job)
        # Patch only when the remote record changed since our last run.
        if last_time and last_time < data_dict['changed']:
            org = p.toolkit.get_action('organization_patch')(context, org_data)

    log.info(org)
    return org
def _gen_new_name(cls, title, existing_name=None, append_type='number-sequence'):
    '''Return a URL-friendly dataset name derived from the title.

    When the ideal name is taken, a suffix is appended to make it
    unique. Pass *existing_name* when renaming an existing dataset so
    the current name can be kept if it still fits.

    :param existing_name: the current name of the dataset - only specify
        this if the dataset exists
    :type existing_name: string
    :param append_type: the type of characters to add to make it unique -
        either 'number-sequence' or 'random-hex'.
    :type append_type: string
    '''
    candidate = re.sub('-+', '-', munge_title_to_name(title))
    return cls._ensure_name_is_unique(candidate,
                                      existing_name=existing_name,
                                      append_type=append_type)
def _gen_new_name(cls, title, existing_name=None, append_type=None):
    '''Return a URL-friendly dataset name derived from the title.

    When the ideal name is taken, a suffix is appended to make it
    unique. *append_type* defaults to the
    ``ckanext.harvest.default_dataset_name_append`` config option, or
    'number-sequence' when that is unset.

    :param existing_name: the current name of the dataset - only specify
        this if the dataset exists
    :type existing_name: string
    :param append_type: the type of characters to add to make it unique -
        either 'number-sequence' or 'random-hex'.
    :type append_type: string
    '''
    # Explicit argument wins; otherwise fall back to the configured default.
    chosen_append = append_type or config.get(
        'ckanext.harvest.default_dataset_name_append', 'number-sequence')
    candidate = re.sub('-+', '-', munge_title_to_name(title))
    return cls._ensure_name_is_unique(candidate,
                                      existing_name=existing_name,
                                      append_type=chosen_append)
def organization_import(data):
    """Import organizations from a JSON configuration string.

    ``data`` is a JSON document with a ``url`` key pointing at a JSON
    list of organizations (plain title strings or objects) and an
    optional ``public_organization`` flag. Existing organizations are
    left untouched; missing ones are created.
    """
    _load_config()
    context = _create_context()
    configuration = simplejson.loads(data)
    data_url = configuration.get('url')
    public_organization = configuration.get('public_organization', False)
    with closing(urllib2.urlopen(data_url)) as source:
        data = simplejson.load(source)
    for item in data:
        values = {}
        if isinstance(item, basestring):
            # Bare string: treat it as the organization title.
            values['title'] = item.strip()
            values['name'] = munge_title_to_name(values['title']).lower()
        else:
            values['name'] = item.pop('name')
            values['title'] = item.pop('title')
            values['description'] = item.pop('description', None)
            # Any remaining keys become CKAN extras.
            values['extras'] = [{'key': key, 'value': value}
                                for key, value in item.iteritems()]
        values['id'] = values['name']
        if public_organization:
            # NOTE(review): this overwrites any extras collected above —
            # confirm that discarding them is intended.
            values['extras'] = [{'key': 'public_adminstration_organization',
                                 'value': 'true'}]
        try:
            get_action('organization_show')(context, {'id': values['id']})
            # Do not override organizations
        except NotFound:
            get_action('organization_create')(context, values)
def make_package_name(self, title, exclude_existing_package):
    '''Return a URL-friendly package name for *title*.

    Reuses the existing package's name when updating, and appends a
    short random suffix when the slug is taken by another package.
    '''
    slug = munge_title_to_name(title).replace('_', '-')
    while '--' in slug:
        slug = slug.replace('--', '-')
    slug = slug[0:90]  # max length is 100

    # Is the slug taken by a *different* package?
    clash = (Session.query(Package)
             .filter(Package.name == slug)
             .filter(Package.id != exclude_existing_package)
             .first())
    if not clash:
        # The name is available. Note that if we're updating an existing
        # package its URL changes, so incoming links may break.
        return slug

    if exclude_existing_package:
        # Updating: keep the package's current name to avoid churning
        # the URL with a fresh random suffix on every update.
        existing = (Session.query(Package)
                    .filter(Package.id == exclude_existing_package)
                    .first())
        if existing:
            # May be absent when we were handed the desired GUID before
            # the package is instantiated.
            return existing.name

    # Append random text; five characters should avoid collisions.
    return slug + "-" + str(uuid.uuid4())[:5]
def set_dataset_info(self, package, dataset, harvester_config):
    """Populate *package* (a CKAN package dict) from a CMS *dataset*
    record.

    ``harvester_config`` is accepted for interface compatibility but is
    not used here.
    """
    extra(package, "Agency", "Department of Health & Human Services")
    package["author"] = "Centers for Medicare & Medicaid Services"
    extra(package, "author_id", "http://healthdata.gov/id/agency/cms")
    extra(package, "Bureau Code", "009:38")
    package["title"] = dataset["Name"].strip()
    package["notes"] = dataset.get("Description")
    package["url"] = dataset.get("Address")
    dataset_hd = dataset["HealthData"]
    extra(package, "Date Released", parsedate(dataset_hd.get("DateReleased")))
    extra(package, "Date Updated", parsedate(dataset_hd.get("DateUpdated")))
    extra(package, "Agency Program URL", dataset_hd.get("AgencyProgramURL"))
    extra(package, "Subject Area 1", "Medicare")
    extra(package, "Unit of Analysis", dataset_hd.get("UnitOfAnalysis"))
    extra(package, "Data Dictionary", dataset_hd.get("DataDictionaryURL"))
    # NOTE(review): the next two lookups use spaced keys ("Coverage
    # Period", "Collection Frequency") while every other HealthData key
    # is CamelCase — confirm against the source feed's schema.
    extra(package, "Coverage Period", dataset_hd.get("Coverage Period"))
    extra(package, "Collection Frequency", dataset_hd.get("Collection Frequency"))
    extra(package, "Geographic Scope", dataset_hd.get("GeographicScope"))
    #extra(package, "Contact Name", dataset_hd.get("GenericContactName", None) or dataset_hd.get("ContactName"))
    # 'X or Y' syntax returns Y if X is either None or the empty string
    #extra(package, "Contact Email", dataset_hd.get("GenericContactEmail", None) or dataset_hd.get("ContactEmail"))
    extra(package, "License Agreement", dataset_hd.get("DataLicenseAgreementURL"))
    from ckan.lib.munge import munge_title_to_name
    package["tags"] = [{"name": munge_title_to_name(t["Name"])}
                       for t in dataset.get("Keywords", [])]
def migrate(self):
    '''Migrate CKAN Related Items to Showcases.

    Preflight: all related items must have unique titles. For each
    related item without an existing showcase, create a showcase and,
    when the item is attached to a dataset, the package association.
    '''
    related_items = get_action('related_list')(data_dict={})
    # preflight:
    # related items must have unique titles before migration
    related_titles = [i['title'] for i in related_items]
    # make a list of duplicate titles
    duplicate_titles = self._find_duplicates(related_titles)
    if duplicate_titles:
        print(
            """All Related Items must have unique titles before migration.
The following Related Item titles are used more than once and need to
be corrected before migration can continue. Please correct and try
again:"""
        )
        for i in duplicate_titles:
            print(i)
        return
    for related in related_items:
        # Skip items already migrated (marked via original_related_item_id).
        existing_showcase = get_action('package_search')(
            data_dict={'fq': '+dataset_type:showcase original_related_item_id:{0}'.format(related['id'])})
        normalized_title = substitute_ascii_equivalents(related['title'])
        if existing_showcase['count'] > 0:
            print('Showcase for Related Item "{0}" already exists.'.format(
                normalized_title))
        else:
            data_dict = {
                'original_related_item_id': related.get('id'),
                'title': related.get('title'),
                'name': munge_title_to_name(related.get('title')),
                'notes': related.get('description'),
                'image_url': related.get('image_url'),
                'url': related.get('url'),
                'tags': [{"name": related.get('type').lower()}]
            }
            # make the showcase
            try:
                new_showcase = get_action('ckanext_showcase_create')(
                    data_dict=data_dict)
            except Exception as e:
                print('There was a problem migrating "{0}": {1}'.format(
                    normalized_title, e))
            else:
                print('Created Showcase from the Related Item "{0}"'.format(normalized_title))
                # make the showcase_package_association, if needed
                try:
                    related_pkg_id = self._get_related_dataset(
                        related['id'])
                    if related_pkg_id:
                        get_action('ckanext_showcase_package_association_create')(
                            data_dict={'showcase_id': new_showcase['id'],
                                       'package_id': related_pkg_id})
                except Exception as e:
                    print('There was a problem creating the showcase_package_association for "{0}": {1}'.format(
                        normalized_title, e))
def _gen_new_name(self, title):
    '''Create a URL friendly name from a title.'''
    slug = munge_title_to_name(title).replace('_', '-')
    # Collapse any runs of dashes left by the substitution above.
    while '--' in slug:
        slug = slug.replace('--', '-')
    return slug
def _gen_new_title(self, title, related_id):
    """Return *title*, prefixed to make it unique when the munged name
    is already taken by an existing package.

    :returns: 'duplicate_<title>_<related_id>' on a clash, else *title*.
    """
    name = munge_title_to_name(title)
    pkg_obj = model.Session.query(model.Package).filter_by(name=name).first()
    if pkg_obj:
        # BUG FIX: str.replace returns a new string; the original
        # discarded the result, so an already-prefixed title would gain
        # a second 'duplicate_' prefix.
        title = title.replace('duplicate_', '')
        return 'duplicate_' + title + '_' + related_id
    else:
        return title
def create_organization_dict(self, inventory_id, title):
    """Build the payload dict for creating an inventory organization."""
    org = {
        "title": title,
        "name": munge_title_to_name(title),
        "inventory_organization_id": inventory_id,
    }
    # Mark it as a CKAN organization rather than a plain group.
    org["is_organization"] = True
    org["type"] = "organization"
    return org
def setup_class(cls):
    # Create one test package per x-range fixture, each carrying a
    # GeoJSON 'spatial' extra built from the fixture's bbox.
    SpatialTestBase.setup_class()
    for x in cls.fixtures_x:
        geojson = bbox_2_geojson(cls.x_values_to_bbox(x))
        cls.create_package(name=munge_title_to_name(str(x)),
                           title=str(x),
                           extras=[{'key': 'spatial', 'value': geojson}])
def import_stage(self, harvest_object):
    """Import a harvested Swisstopo record into CKAN.

    Creates/updates the package, assigns groups and an organization,
    stores the license URL as an extra and submits term translations.
    Returns False when no harvest object was received; re-raises any
    exception after logging it.
    """
    log.debug('In SwisstopoHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    try:
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = harvest_object.guid
        package_dict['name'] = munge_title_to_name(
            package_dict['layer_name']
        )
        user = model.User.get(self.HARVEST_USER)
        context = {
            'model': model,
            'session': Session,
            'user': self.HARVEST_USER
        }
        # Find or create group the dataset should get assigned to
        package_dict['groups'] = self._find_or_create_groups(context)
        # Find or create the organization
        # the dataset should get assigned to
        package_dict['owner_org'] = self._find_or_create_organization(
            context,
            package_dict
        )
        # Save license url in extras
        extras = []
        if 'license_url' in package_dict:
            extras.append(('license_url', package_dict['license_url']))
        package_dict['extras'] = extras
        # Grant the harvest user admin rights on the package.
        package = model.Package.get(package_dict['id'])
        model.PackageRole(
            package=package,
            user=user,
            role=model.Role.ADMIN
        )
        log.debug(
            'Save or update package %s (%s)' % (
                package_dict['name'],
                package_dict['id']
            )
        )
        self._create_or_update_package(package_dict, harvest_object)
        log.debug('Save or update term translations')
        self._submit_term_translations(context, package_dict)
        Session.commit()
    except Exception, e:  # Python 2 syntax kept as-is
        log.exception(e)
        raise
def gen_new_name(title):
    '''Slugify *title*; append random hex when the name is taken.'''
    slug = munge_title_to_name(title).replace('_', '-')
    while '--' in slug:
        slug = slug.replace('--', '-')
    taken = Session.query(Package).filter(Package.name == slug).first()
    return slug + str(uuid.uuid4())[:5] if taken else slug
def gen_new_name(title):
    """Return a unique, URL-friendly name derived from *title*."""
    candidate = munge_title_to_name(title).replace("_", "-")
    while "--" in candidate:
        candidate = candidate.replace("--", "-")
    # Free slug: use it as-is; otherwise uniquify with random hex.
    if Session.query(Package).filter(Package.name == candidate).first() is None:
        return candidate
    return candidate + str(uuid.uuid4())[:5]
def _find_or_create_organization(self, context):
    """Return the id of the configured organization, creating it when
    the lookup fails."""
    try:
        data_dict = {
            'permission': 'edit_group',
            'id': munge_title_to_name(self.ORGANIZATION[u'de']['name']),
            'name': munge_title_to_name(self.ORGANIZATION[u'de']['name']),
            'title': self.ORGANIZATION[u'de']['name'],
            'description': self.ORGANIZATION[u'de']['description'],
            'extras': [
                {
                    'key': 'website',
                    'value': self.ORGANIZATION[u'de']['website']
                }
            ]
        }
        organization = get_action('organization_show')(context, data_dict)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Any lookup failure falls through to create.
        organization = get_action('organization_create')(context, data_dict)
    return organization['id']
def get_clean_name(s):
    """Return a munged name derived from *s* that no existing package
    uses, appending an incrementing counter until one is free."""
    candidate = s
    suffix = 1
    while True:
        candidate = munge.munge_title_to_name(candidate)
        if not _get_package(client, candidate):
            return candidate
        # Taken — try the next numbered variant of the original string.
        candidate = "{0}_{1}".format(s, suffix)
        suffix = suffix + 1
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in import stage: %s" % harvest_object.guid)
    if not harvest_object:
        logger.error('No harvest object received')
        self._save_object_error('No harvest object received')
        return False
    try:
        self._set_config(harvest_object.job.source.config)
        context = {'model': model, 'session': Session, 'user': self.user}
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = munge_title_to_name(harvest_object.guid)
        package_dict['name'] = package_dict['id']
        # add owner_org from the harvest source's dataset
        source_dataset = get_action('package_show')(
            {'ignore_auth': True},
            {'id': harvest_object.source.id})
        owner_org = source_dataset.get('owner_org')
        package_dict['owner_org'] = owner_org
        # logger.debug('Create/update package using dict: %s' % package_dict)
        self._create_or_update_package(package_dict, harvest_object,
                                       'package_show')
        Session.commit()
        logger.debug("Finished record")
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; errors are recorded against the object.
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in import stage', harvest_object)
        return False
    return True
def create_slug(self):
    """Return a JSON response with a munged slug for the requested
    title and whether that slug is still available."""
    title = request.params.get('title') or ''
    name = munge_title_to_name(title)
    valid = not package_exists(name)
    #response.content_type = 'application/javascript'
    response_data = dict(name=name.replace('_', '-'), valid=valid)
    return self._finish_ok(response_data)
def set_extras(self, package, extras):
    """Merge *extras* into the package dict.

    Core CKAN fields are assigned directly, tags are munged into valid
    tag names, everything else is stored as a package extra.
    """
    core_fields = ("title", "notes", "author", "url")
    for key, value in extras.items():
        if key in core_fields:
            package[key] = value
        elif key == "tags":
            package["tags"] = [{"name": munge_title_to_name(t)}
                               for t in value]
        else:
            DatasetHarvesterBase.set_extra(package, key, value)
def _validate_package_id(self, package_id):
    """Reject package ids containing HTML angle brackets; otherwise
    return the munged id."""
    if re.search('[<>]+', package_id):
        log.debug(
            'Package id %s contains disallowed characters' % package_id
        )
        return False
    return munge_title_to_name(package_id)
def set_extras(self, package, extras):
    """Route harvested *extras* into the package dict by key kind."""
    for field, val in extras.items():
        if field == "tags":
            # Tags get munged into CKAN-safe tag names.
            package["tags"] = [{"name": munge_title_to_name(t)} for t in val]
        elif field in ("title", "notes", "author", "url"):
            # Native CKAN package fields are set directly.
            package[field] = val
        else:
            # Everything else becomes a package extra.
            DatasetHarvesterBase.set_extra(package, field, val)
def setup_class(cls):
    # One package per fixture value, each with a GeoJSON spatial extra
    # derived from the fixture's bbox.
    SpatialTestBase.setup_class()
    for value in cls.fixtures_x:
        cls.create_package(
            name=munge_title_to_name(str(value)),
            title=str(value),
            extras=[{'key': 'spatial',
                     'value': bbox_2_geojson(cls.x_values_to_bbox(value))}],
        )
def _find_or_create_organization(self, context, package_dict):
    """Return the id of the organization owning the layer, creating the
    organization when the lookup fails."""
    org = self._find_owner(package_dict['layer_name'])
    try:
        name = self.ORGANIZATION[org]['de']['name']
        data_dict = {
            'permission': 'edit_group',
            'id': munge_title_to_name(name),
            'name': munge_title_to_name(name),
            'title': self.ORGANIZATION[org]['de']['name'],
            'description': self.ORGANIZATION[org]['de']['description'],
            'extras': [
                {
                    'key': 'website',
                    'value': self.ORGANIZATION[org]['de']['website']
                }
            ]
        }
        org = get_action('organization_show')(context, data_dict)
    except Exception:
        # FIX: was a bare `except:`. NOTE(review): if the ORGANIZATION
        # lookup itself raised, data_dict is undefined here and this line
        # raises NameError — confirm the owner key is always present.
        org = get_action('organization_create')(context, data_dict)
    return org['id']
def initial_data(self, spatial_clean_db):
    """Create one package per x-range fixture with a spatial extra."""
    for value in self.fixtures_x:
        label = six.text_type(value)
        spatial_extra = {
            "key": "spatial",
            "value": bbox_2_geojson(self.x_values_to_bbox(value)),
        }
        create_package(name=munge_title_to_name(label),
                       title=label,
                       extras=[spatial_extra])
def before_index(self, search_data):
    """Prepare a dataset dict for search indexing.

    Adds resource-level fields, a political_level facet and
    per-language title/description/keyword/text fields built from the
    multilingual validated data dict. Unsupported package types are
    returned unchanged.
    """
    if not self.is_supported_package_type(search_data):
        return search_data
    extract_title = LangToString('title')
    validated_dict = json.loads(search_data['validated_data_dict'])
    # log.debug(pprint.pformat(validated_dict))
    search_data['res_name'] = [extract_title(r) for r in validated_dict[u'resources']]  # noqa
    search_data['res_description'] = [LangToString('description')(r) for r in validated_dict[u'resources']]  # noqa
    search_data['res_format'] = self._prepare_formats_for_index(validated_dict[u'resources'])  # noqa
    search_data['res_rights'] = [simplify_terms_of_use(r['rights']) for r in validated_dict[u'resources']]  # noqa
    search_data['title_string'] = extract_title(validated_dict)
    search_data['description'] = LangToString('description')(validated_dict)  # noqa
    if 'political_level' in validated_dict[u'organization']:
        search_data['political_level'] = validated_dict[u'organization'][u'political_level']  # noqa
    try:
        # index language-specific values (or it's fallback)
        text_field_items = {}
        for lang_code in get_langs():
            search_data['title_' + lang_code] = get_localized_value(
                validated_dict['title'], lang_code
            )
            search_data['title_string_' + lang_code] = munge_title_to_name(
                get_localized_value(validated_dict['title'], lang_code)
            )
            search_data['description_' + lang_code] = get_localized_value(
                validated_dict['description'], lang_code
            )
            search_data['keywords_' + lang_code] = get_localized_value(
                validated_dict['keywords'], lang_code
            )
            # Free-text field: description + keywords + resource titles
            # and descriptions that exist in this language.
            text_field_items['text_' + lang_code] = [get_localized_value(validated_dict['description'], lang_code)]  # noqa
            text_field_items['text_' + lang_code].extend(search_data['keywords_' + lang_code])  # noqa
            text_field_items['text_' + lang_code].extend([r['title'][lang_code] for r in validated_dict['resources'] if r['title'][lang_code]])  # noqa
            text_field_items['text_' + lang_code].extend([r['description'][lang_code] for r in validated_dict['resources'] if r['description'][lang_code]])  # noqa
        # flatten values for text_* fields
        for key, value in text_field_items.iteritems():
            search_data[key] = ' '.join(value)
    except KeyError:
        # Missing language keys: keep whatever fields were built so far.
        pass
    # log.debug(pprint.pformat(search_data))
    return search_data
def _find_or_create_organization(self, package_dict, context):
    """Assign the configured organization to the package, creating the
    organization when the lookup fails."""
    # Find or create the organization the dataset should get assigned to.
    try:
        data_dict = {
            'id': munge_title_to_name(self.ORGANIZATION['de']),
        }
        package_dict['owner_org'] = get_action('organization_show')(
            context.copy(), data_dict
        )['id']
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Any lookup failure falls through to create.
        data_dict = {
            'permission': 'edit_group',
            'id': munge_title_to_name(self.ORGANIZATION['de']),
            'name': munge_title_to_name(self.ORGANIZATION['de']),
            'title': self.ORGANIZATION['de']
        }
        organization = get_action('organization_create')(
            context.copy(), data_dict
        )
        package_dict['owner_org'] = organization['id']
def _find_or_create_organization(self, context):
    """Return the id of the configured organization, creating it when
    the lookup fails."""
    try:
        data_dict = {
            'permission': 'edit_group',
            'id': munge_title_to_name(self.ORGANIZATION[u'de']['name']),
            'name': munge_title_to_name(self.ORGANIZATION[u'de']['name']),
            'title': self.ORGANIZATION[u'de']['name'],
            'description': self.ORGANIZATION[u'de']['description'],
            'extras': [{
                'key': 'website',
                'value': self.ORGANIZATION[u'de']['website']
            }]
        }
        organization = get_action('organization_show')(context, data_dict)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Any lookup failure falls through to create.
        organization = get_action('organization_create')(context, data_dict)
    return organization['id']
def _dropzone_get_groups(self, dataset_node):
    '''Get the groups from the node, normalize them and get the ids.'''
    categories = self._get(dataset_node, 'kategorie')
    if not categories:
        return []
    # The field is a comma-separated list of group titles.
    pairs = [(munge_title_to_name(t), t) for t in categories.split(', ')]
    return self._get_group_ids(pairs)
def before_index(self, search_data):
    """Enrich a dataset dict with multilingual search fields before
    indexing.

    Duplicate of the variant above (without the commented log lines);
    unsupported package types are returned unchanged.
    """
    if not self.is_supported_package_type(search_data):
        return search_data
    extract_title = LangToString('title')
    validated_dict = json.loads(search_data['validated_data_dict'])
    search_data['res_name'] = [extract_title(r) for r in validated_dict[u'resources']]  # noqa
    search_data['res_description'] = [LangToString('description')(r) for r in validated_dict[u'resources']]  # noqa
    search_data['res_format'] = self._prepare_formats_for_index(validated_dict[u'resources'])  # noqa
    search_data['res_rights'] = [simplify_terms_of_use(r['rights']) for r in validated_dict[u'resources']]  # noqa
    search_data['title_string'] = extract_title(validated_dict)
    search_data['description'] = LangToString('description')(validated_dict)  # noqa
    if 'political_level' in validated_dict[u'organization']:
        search_data['political_level'] = validated_dict[u'organization'][u'political_level']  # noqa
    try:
        # index language-specific values (or it's fallback)
        text_field_items = {}
        for lang_code in get_langs():
            search_data['title_' + lang_code] = get_localized_value(
                validated_dict['title'], lang_code
            )
            search_data['title_string_' + lang_code] = munge_title_to_name(
                get_localized_value(validated_dict['title'], lang_code)
            )
            search_data['description_' + lang_code] = get_localized_value(
                validated_dict['description'], lang_code
            )
            search_data['keywords_' + lang_code] = get_localized_value(
                validated_dict['keywords'], lang_code
            )
            # Free-text field: description + keywords + resource titles
            # and descriptions that exist in this language.
            text_field_items['text_' + lang_code] = [get_localized_value(validated_dict['description'], lang_code)]  # noqa
            text_field_items['text_' + lang_code].extend(search_data['keywords_' + lang_code])  # noqa
            text_field_items['text_' + lang_code].extend([r['title'][lang_code] for r in validated_dict['resources'] if r['title'][lang_code]])  # noqa
            text_field_items['text_' + lang_code].extend([r['description'][lang_code] for r in validated_dict['resources'] if r['description'][lang_code]])  # noqa
        # flatten values for text_* fields
        for key, value in text_field_items.iteritems():
            search_data[key] = ' '.join(value)
    except KeyError:
        # Missing language keys: keep whatever fields were built so far.
        pass
    return search_data
def make_package_name(self, title, exclude_existing_package):
    '''
    Creates a URL friendly name from a title; appends random characters
    when the name is already used by a different package.
    '''
    candidate = munge_title_to_name(title).replace('_', '-')
    while '--' in candidate:
        candidate = candidate.replace('--', '-')
    conflict = (Session.query(Package)
                .filter(Package.name == candidate)
                .filter(Package.id != exclude_existing_package)
                .first())
    if conflict:
        return candidate + str(uuid.uuid4())[:5]
    return candidate
def _find_or_create_groups(self, context):
    """Return a single-element list with the id of the configured group,
    creating the group when it does not exist yet."""
    group_name = self.GROUPS['de'][0]
    data_dict = {
        'id': group_name,
        'name': munge_title_to_name(group_name),
        'title': group_name
    }
    try:
        group = get_action('group_show')(context, data_dict)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Any lookup failure falls through to create.
        group = get_action('group_create')(context, data_dict)
        log.info('created the group ' + group['id'])
    group_ids = []
    group_ids.append(group['id'])
    return group_ids
def _find_or_create_groups(self, groups, context):
    """Return the ids of the given groups, creating any that do not
    exist yet."""
    log.debug("Group names: %s" % groups)
    group_ids = []
    for group_name in groups:
        data_dict = {"id": group_name,
                     "name": munge_title_to_name(group_name),
                     "title": group_name}
        try:
            group = get_action("group_show")(context, data_dict)
            log.info("found the group " + group["id"])
        except Exception:
            # FIX: was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt. Lookup failure falls through to create.
            group = get_action("group_create")(context, data_dict)
            log.info("created the group " + group["id"])
        group_ids.append(group["id"])
    log.debug("Group ids: %s" % group_ids)
    return group_ids
def _gen_new_name(self, title):
    '''
    Creates a URL friendly name from a title; adds some random
    characters at the end when the name is already in use.
    '''
    base = munge_title_to_name(title).replace('_', '-')
    while '--' in base:
        base = base.replace('--', '-')
    if Session.query(Package).filter(Package.name == base).first():
        return base + str(uuid.uuid4())[:5]
    return base
def make_package_name(self, title, exclude_existing_package, for_deletion):
    '''
    Creates a URL friendly name from a title; prefixes deleted datasets
    and adds random characters when the name is taken by another package.
    '''
    slug = munge_title_to_name(title).replace('_', '-')
    if for_deletion:
        slug = "deleted-" + slug
    while '--' in slug:
        slug = slug.replace('--', '-')
    slug = slug[0:90]  # max length is 100
    other = (Session.query(Package)
             .filter(Package.name == slug)
             .filter(Package.id != exclude_existing_package)
             .first())
    return slug + "-" + str(uuid.uuid4())[:5] if other else slug
def gen_new_name(self, title):
    '''
    Derive a unique package name from the title, appending an integer
    counter (1-100) when the plain slug is already taken.  Returns None
    if no free name could be found.
    '''
    base = munge_title_to_name(title).replace('_', '-')
    while '--' in base:
        base = base.replace('--', '-')
    like_q = u'%s%%' % base
    matches = Session.query(Package).filter(
        Package.name.ilike(like_q)).limit(100)
    taken = {pkg.name for pkg in matches}
    if base not in taken:
        return base
    for counter in range(1, 101):
        candidate = base + six.text_type(counter)
        if candidate not in taken:
            return candidate
    return None
def improve_pkg_dict(self, pkg_dict, params):
    '''
    Normalise a harvested package dict in place and return it.

    Ensures a valid ``name``, drops an empty ``url``, overrides ``id``
    with the name, and resolves the licence from *params* or config.

    :param pkg_dict: package dict to fix up (mutated in place)
    :param params: optional dict of harvester parameters
    :returns: the same (mutated) package dict
    '''
    if pkg_dict['name'] != '':
        pkg_dict['name'] = munge_name(pkg_dict['name']).replace('_', '-')
    else:
        pkg_dict['name'] = munge_title_to_name(pkg_dict['title'])
    if pkg_dict['url'] == '':
        pkg_dict.pop('url', None)
    # override the 'id' as this never matches the CKAN internal ID
    pkg_dict['id'] = pkg_dict['name']
    # BUG FIX: the original used params.get(license, None), i.e. looked up
    # the *builtin* ``license`` object as the key, so a licence supplied in
    # params was never found. Look up the string key instead.
    if params is not None and params.get('license') is not None:
        pkg_dict['license_id'] = params['license']
    else:
        pkg_dict['license_id'] = config.get('ckanext.ddi.default_license')
    return pkg_dict
def generate_name(data_dict):
    '''Generate a unique name based on the package's title and FIS-Broker guid.'''
    iso_values = data_dict['iso_values']
    package_dict = data_dict['package_dict']
    slug = re.sub('-+', '-', munge_title_to_name(package_dict['title']))
    # ensure we don't exceed the allowed name length of 100:
    # 91 chars of slug + '-' + first guid segment (100-len(guid_part)-1)
    slug = slug[:91].strip('-')
    guid_part = iso_values['guid'].split('-')[0]
    return "{0}-{1}".format(slug, guid_part)
def process(self, record):
    '''
    Ingest one Mendeley bibliography record as a CKAN 'publications'
    dataset, creating or updating the package as appropriate.

    :param record: parsed bibliography entry (dict-like, BibTeX fields)
    '''
    # (removed a dead no-op statement ``record = record`` from the original)
    data_dict = {
        'id': record['ID'],
        'title': record['title'].strip('{}'),
        'name': munge_title_to_name(record['ID'] + record['title']),
        'notes': record['abstract'],
        'harvest_source': 'MENDELEY',
        'creator': record['author'].replace(',', '').split(' and '),
        'tag_string': ','.join(
            munge_tag(tag) for tag in record['keywords'].split(',')),
        'owner_org': tk.config.get(
            'ckanext.ingestor.config.mendeley_bib.owner_org', 'iaea'),
        'type': 'publications'
    }
    # Collect whichever standard identifiers the record carries, in a
    # fixed order, as "scheme:value" strings.
    data_dict['identifier'] = [
        '{0}:{1}'.format(scheme, record[scheme])
        for scheme in ('doi', 'isbn', 'pmid') if scheme in record
    ]
    if 'editor' in record:
        data_dict['contributor'] = [record['editor']]
    if 'publisher' in record:
        data_dict['publisher'] = [record['publisher']]
    if 'language' in record:
        data_dict['language'] = [record['language']]
    data_dict['source'] = record.get('url')

    user = tk.get_action('get_site_user')({'ignore_auth': True})
    existing = model.Package.get(data_dict['id'])
    action = tk.get_action(
        'package_update' if existing else 'package_create')
    action({'ignore_auth': True, 'user': user['name']}, data_dict)
def get_harvested_package_dict(cls, harvest_object):
    '''
    Fetch the harvested package dict and convert its DKAN-isms into
    CKAN style; on failure, records an object error and returns None.

    :param harvest_object: HarvestObject being imported
    :returns: converted package dict, or None on error
    '''
    package = CKANHarvester.get_harvested_package_dict(harvest_object)
    # change the DKAN-isms into CKAN-style
    try:
        if 'extras' not in package:
            package['extras'] = {}
        if 'name' not in package:
            package['name'] = munge.munge_title_to_name(package['title'])
        if 'description' in package:
            package['notes'] = package['description']
        # Map the licence title back to a registered licence id.
        # (loop variable renamed from ``license`` to avoid shadowing the
        # builtin of that name)
        for licence in model.Package.get_license_register().values():
            if licence.title == package['license_title']:
                package['license_id'] = licence.id
                break
        else:
            package['license_id'] = 'notspecified'
        if 'resources' not in package:
            raise PackageDictError('Dataset has no resources')
        for resource in package['resources']:
            resource['description'] = resource['title']
            if 'revision_id' in resource:
                del resource['revision_id']
            if 'format' not in resource:
                resource['format'] = MIMETYPE_FORMATS.get(
                    resource.get('mimetype'), '')
        if 'private' in package:
            # DKAN appears to have datasets with private=True which are
            # still public: https://github.com/NuCivic/dkan/issues/950. If
            # they were really private then we'd not get be able to access
            # them, so assume they are not private.
            package['private'] = False
        return package
    except Exception as e:
        # Broad on purpose: any conversion failure is recorded against
        # the harvest object rather than aborting the whole job.
        cls._save_object_error(
            'Unable to get convert DKAN to CKAN package: %s' % e,
            harvest_object)
        return None
def _find_or_create_entity(self, entityType, entityNames, context):
    '''
    Resolve each name to an existing entity (group/organization etc.),
    creating any that are missing, and return the list of entity ids.
    '''
    log.debug(entityType + ' names: %s' % entityNames)
    entity_ids = []
    for entity_name in entityNames:
        data_dict = {
            'id': self._utf8_and_remove_diacritics(entity_name),
            'name': munge_title_to_name(entity_name),
            'title': entity_name
        }
        try:
            found = get_action(entityType + '_show')(context, data_dict)
            log.info('found the ' + entityType + ' with id' + found['id'])
        except Exception:
            found = self._create_entity(entityType, data_dict, context)
        entity_ids.append(found['id'])
    log.debug(entityType + ' ids: %s' % entity_ids)
    return entity_ids
def command(config_ini, nodepublisher_csv):
    # Entry point for the publisher-migration script: load the CKAN
    # config, read the node-id -> publisher-title CSV into the global
    # ``publishers`` mapping, then run the dataset/harvest updates.
    #
    # :param config_ini: path to the CKAN .ini config file
    # :param nodepublisher_csv: CSV file of (drupal node id, title) rows
    config_ini_filepath = os.path.abspath(config_ini)
    load_config(config_ini_filepath)
    engine = engine_from_config(config, 'sqlalchemy.')

    from ckan import model
    from ckan.lib.munge import munge_title_to_name

    logging.config.fileConfig(config_ini_filepath)
    log = logging.getLogger(os.path.basename(__file__))
    global global_log
    global_log = log

    model.init_model(engine)

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    model.repo.new_revision()

    log.info('Reading %s', nodepublisher_csv)
    # Populate the module-level ``publishers`` dict with munged names,
    # keyed by the (integer) drupal node id.
    with open(nodepublisher_csv, 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            nid, title = row
            publishers[int(nid)] = munge_title_to_name(title)

    # Mappings where we are getting rid of duplicate publishers
    publishers[16268] = publishers[11408]  # UKSA -> ONS
    publishers[11606] = publishers[11408]  # ONS
    publishers[20054] = publishers[16248]  # Met Office
    publishers[33036] = publishers[15255]  # Windsor & Maidenhead
    publishers[32619] = publishers[33245]  # Monmouthshire
    publishers[12662] = publishers[11567]  # NHS

    update_datasets()
    generate_harvest_publishers()

    log.info('Warnings: %r', warnings)
def _find_or_create_groups(self, groups, context):
    '''
    Look up each group name, creating any groups that are missing, and
    return the list of group ids. A copy of the context is passed to
    each action call so one call cannot pollute the next.

    :param groups: iterable of group names
    :param context: action context dict (copied per call)
    :returns: list of group ids (one per input name)
    '''
    log.debug('Group names: %s' % groups)
    group_ids = []
    for group_name in groups:
        data_dict = {
            'id': group_name,
            'name': munge_title_to_name(group_name),
            'title': group_name
        }
        try:
            group = get_action('group_show')(context.copy(), data_dict)
            log.info('found the group ' + group['id'])
        except Exception:
            # Narrowed from a bare `except:` so that SystemExit and
            # KeyboardInterrupt are not swallowed; any lookup failure
            # still falls back to creating the group.
            group = get_action('group_create')(context.copy(), data_dict)
            log.info('created the group ' + group['id'])
        group_ids.append(group['id'])
    log.debug('Group ids: %s' % group_ids)
    return group_ids
def test_organization_import_update(self):
    """ Test updating organization import from file """
    organization_url = tools.get_organization_test_source()
    expected_titles = (u"Kainuun ty\u00f6- ja elinkeinotoimisto",
                       u"Lapin ty\u00f6- ja elinkeinotoimisto",
                       u"Suomen ymp\u00e4rist\u00f6keskus")
    # Run the import twice: plain, then with the public-organization flag.
    for extras in (False, True):
        payload = {'url': organization_url}
        if extras:
            payload['public_organization'] = True
        result = organization_import.apply((simplejson.dumps(payload), ))
        self.assert_true(result.successful())
        for title in expected_titles:
            organization = tests.call_action_api(
                self.app, 'organization_show',
                id=munge_title_to_name(title).lower())
            self.assert_equal(organization['title'], title)
            # We do not want this to be updated
            self.assert_true(
                'public_adminstration_organization' not in organization)
def validator(key, data, errors, context):
    '''
    Autogenerate the dataset name (URL) from a localised title field
    when no value was supplied by the user.
    '''
    if errors[key]:
        return
    value = data[key]
    # A real (non-missing, truthy) value was supplied: keep it.
    if value is not missing and value:
        return
    output = {}  # retained from the original; not used below
    source_field = field['autogeneration_field']
    if not source_field:
        source_field = DEFAULT_TITLE_FIELD
    log.debug('[csc_multilanguage_url] Creating field using the field %s',
              source_field)
    prefix = source_field + '-'
    extras = data.get(key[:-1] + ('__extras', ), {})
    # Preferred locales, in order: the configured autogeneration locale,
    # then the site default.
    locales = []
    if field['autogeneration_locale']:
        locales.append(field['autogeneration_locale'])
    default_locale = config.get('ckan.locale_default', 'es')
    if default_locale:
        locales.append(default_locale)
    for locale in locales:
        localised_title = extras.get(prefix + locale)
        if localised_title:
            data[key] = munge.munge_title_to_name(localised_title)
            log.debug('[csc_multilanguage_url] Created name "%s" for package'
                      ' from language %s', data[key], locale)
            break
    return
def set_dataset_info(self, package, dataset, dataset_defaults):
    # Populate *package* (a CKAN package dict) from a HealthData.gov
    # CMS *dataset* record, mostly via the ``extra()`` helper which
    # stores key/value pairs on the package.
    # NOTE(review): ``dataset_defaults`` is not used in this method —
    # confirm whether the base class / callers require the parameter.
    extra(package, "Agency", "Department of Health & Human Services")
    package["author"] = "Centers for Medicare & Medicaid Services"
    extra(package, "author_id", "http://healthdata.gov/id/agency/cms")
    extra(package, "Bureau Code", "009:38")
    package["title"] = dataset["Name"].strip()
    package["notes"] = dataset.get("Description")
    package["url"] = dataset.get("Address")
    dataset_hd = dataset["HealthData"]
    extra(package, "Date Released", parsedate(dataset_hd.get("DateReleased")))
    extra(package, "Date Updated", parsedate(dataset_hd.get("DateUpdated")))
    extra(package, "Agency Program URL", dataset_hd.get("AgencyProgramURL"))
    extra(package, "Subject Area 1", "Medicare")
    extra(package, "Unit of Analysis", dataset_hd.get("UnitOfAnalysis"))
    extra(package, "Data Dictionary", dataset_hd.get("DataDictionaryURL"))
    # NOTE(review): these two lookup keys contain spaces, unlike the
    # CamelCase keys above — verify against the upstream feed schema.
    extra(package, "Coverage Period", dataset_hd.get("Coverage Period"))
    extra(package, "Collection Frequency", dataset_hd.get("Collection Frequency"))
    extra(package, "Geographic Scope", dataset_hd.get("GeographicScope"))
    extra(
        package,
        "Contact Name",
        dataset_hd.get("GenericContactName", None) or dataset_hd.get("ContactName")
    )  # 'X or Y' syntax returns Y if X is either None or the empty string
    extra(
        package,
        "Contact Email",
        dataset_hd.get("GenericContactEmail", None) or dataset_hd.get("ContactEmail"))
    extra(package, "License Agreement", dataset_hd.get("DataLicenseAgreementURL"))
    from ckan.lib.munge import munge_title_to_name
    # Keywords become CKAN tags, slugified to be tag-safe.
    package["tags"] = [{
        "name": munge_title_to_name(t["Name"])
    } for t in dataset.get("Keywords", [])]
def test_organization_import(self):
    """ Test organization import """
    organization_url = tools.get_organization_test_source()
    data = simplejson.dumps({
        'url': organization_url,
        'public_organization': True
    })
    # Run the import twice to check it is repeatable.
    for _ in xrange(2):
        result = organization_import.apply((data, ))
        self.assert_true(result.successful())
    expected_titles = (u"Kainuun ty\u00f6- ja elinkeinotoimisto",
                       u"Lapin ty\u00f6- ja elinkeinotoimisto",
                       u"Suomen ymp\u00e4rist\u00f6keskus")
    for title in expected_titles:
        organization = tests.call_action_api(
            self.app, 'organization_show',
            id=munge_title_to_name(title).lower())
        self.assert_equal(organization['title'], title)
        public_org = 'false'
        for extra in organization['extras']:
            if extra['key'] == 'public_adminstration_organization':
                public_org = 'true'
        self.assert_equal(public_org, 'true')
def fetch_stage(self, harvest_object):
    '''
    Fetch stage: enrich the harvest object's JSON content with an ``id``
    (copied from ``identifier``) and a munged ``name``, then save it.
    Any failure is recorded as a HarvestObjectError.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in fetch stage: %s" % harvest_object.guid)
    try:
        self._set_config(harvest_object.job.source.config)
        metadata = json.loads(harvest_object.content)
        metadata['id'] = metadata['identifier']
        # name = slugified title + id, capped at 99 chars (limit is 100)
        slug = munge_title_to_name(metadata['title'])
        metadata['name'] = (slug + metadata['id'])[:99]
        try:
            serialised = json.dumps(metadata)
        except Exception:
            logger.exception('Dumping the metadata failed!')
            self._save_object_error('Dumping the metadata failed!',
                                    harvest_object)
            return False
        harvest_object.content = serialised
        harvest_object.save()
    except Exception:
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in fetch stage', harvest_object)
        return False
    return True
def get_initial_package(self, user_obj):
    """ Get the initial package from the kobo asset.
        Require the user to get the token and to validate ownership on the asset.
        Return a pkg_dict or raises an error """
    kobo_api = self.get_kobo_api(user_obj)
    asset = kobo_api.get_asset(self.kobo_asset_id)
    settings = asset['settings']
    extras = [
        {'key': 'kobo_asset_id', 'value': self.kobo_asset_id},
        {'key': 'kobo_owner', 'value': asset['owner__username']},
        {'key': 'kobo_sector', 'value': settings.get('sector')},
        {'key': 'kobo_country', 'value': settings.get('country')},
    ]
    return {
        'title': asset['name'],
        'name': munge_title_to_name(asset['name']),
        'notes': self._build_asset_notes(asset),
        'original_id': asset['uid'],
        'extras': extras,
    }
def make_package_name(self, title, exclude_existing_package, for_deletion):
    '''
    Creates a URL friendly name from a title. If that slug is already in
    use by a different package, either reuse the updating package's
    existing name or fall back to a random suffix.
    '''
    slug = munge_title_to_name(title).replace('_', '-')
    if for_deletion:
        slug = "deleted-" + slug
    while '--' in slug:
        slug = slug.replace('--', '-')
    slug = slug[:90]  # max length is 100

    # Is this slug taken by a *different* package than the one (if any)
    # we are updating?
    clash = (Session.query(Package)
             .filter(Package.name == slug)
             .filter(Package.id != exclude_existing_package)
             .first())
    if clash is None:
        # Slug is free. Note that when updating an existing package its
        # URL changes, so incoming links may break.
        return slug

    if exclude_existing_package:
        # Updating a package whose slug is taken: chances are its name
        # already carries a random suffix from last time, so reuse it and
        # avoid churning the URL with fresh random text on every update.
        existing = Session.query(Package).filter(
            Package.id == exclude_existing_package).first()
        if existing:
            # the package may not exist yet because we may be passed the
            # desired package GUID before a new package is instantiated
            return existing.name

    # Append some random text; with five characters a collision is
    # unlikely.
    return slug + "-" + str(uuid.uuid4())[:5]
def gather_stage(self, harvest_job):
    # Gather stage for the Zurich GIS harvester: fetch CSW metadata for
    # each hard-coded dataset in self.DATASETS, fix/enrich it, and queue
    # one HarvestObject per dataset.
    #
    # :param harvest_job: the current HarvestJob
    # :returns: list of HarvestObject ids for the fetch/import stages
    log.debug('In ZhGisHarvester gather_stage')
    ids = []
    # NOTE(review): dict.iteritems() makes this Python 2 only.
    for dataset_id, dataset in self.DATASETS.iteritems():
        csw = ckan_csw.ZhGisCkanMetadata()
        # .copy() so edits below don't mutate any cached metadata dict
        metadata = csw.get_ckan_metadata_by_id(dataset_id).copy()
        log.debug(metadata)

        # Fix metadata information
        metadata['name'] = munge_title_to_name(metadata['name'])
        metadata['service_type'] = (metadata['service_type'].replace(
            'OGC:', ''))

        # Enrich metadata with hardcoded values
        metadata['url'] = dataset['geolion_url']
        metadata['tags'].extend(dataset['tags'])
        metadata['translations'] = self._generate_term_translations()
        log.debug("Translations: %s" % metadata['translations'])
        metadata['resources'] = (
            self._generate_resource_dict_array(metadata))
        log.debug(metadata['resources'])
        metadata['license_id'] = self.LICENSE['name']
        metadata['license_url'] = self.LICENSE['url']

        # Queue the dataset for the fetch/import stages.
        obj = HarvestObject(guid=metadata['id'], job=harvest_job,
                            content=json.dumps(metadata))
        obj.save()
        log.debug('adding ' + metadata['name'] + ' to the queue')
        ids.append(obj.id)
    return ids
def run_create(self, context, data_dict, resources_sheet, archive):
    """Create the dataset, deriving a unique name from its title."""
    data_dict['name'] = munge_title_to_name(data_dict['title'])
    try:
        package_id_or_name_exists(data_dict['name'], context)
    except Invalid:
        # Invalid means the name is free — keep it as-is.
        pass
    else:
        # Name taken: probe name-0, name-1, ... until one is free.
        suffix = 0
        while True:
            candidate = '{0}-{1}'.format(data_dict['name'], suffix)
            try:
                package_id_or_name_exists(candidate, context)
            except Invalid:
                data_dict['name'] = candidate
                break
            suffix += 1
    result = self.create_dataset(context, data_dict, resources_sheet,
                                 archive)
    if result:
        h.flash_success('Dataset was created!')