def iterate_datasets(self, package_ids):
    '''
    Helper which iterates over all datasets in package_ids, i.e.
    fetches the package for all IDs.

    :param package_ids: iterable of CKAN package IDs (duplicates are
                        collapsed before fetching)
    :yields: the full package dict for every ID that could be fetched,
             skipping packages of type 'harvest'
    '''
    package_show = tk.get_action('package_show')
    package_ids_unique = set(package_ids)
    progress_total = len(package_ids_unique)
    util.get_migrator_log().info('INFO migrating ' + str(progress_total) + ' datasets in total')
    progress_current = 0
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        for dataset_id in package_ids_unique:
            try:
                # write out status via UDP (see class doc for netcat cmd)
                progress_current += 1
                sock.sendto(
                    str(progress_current) + " / " + str(progress_total) + "\n",
                    (self.UDP_IP, self.UDP_PORT))

                dataset = package_show(self.create_context(),
                                       {'id': dataset_id.strip()})

                # ignore harvesters, which are in the list as well
                if dataset['type'] == 'harvest':
                    continue

                yield dataset
            except Exception:
                # best-effort: a missing/broken package must not abort
                # the whole migration run
                util.get_migrator_log().exception("Package '%s' was not found",
                                                  dataset_id)
    finally:
        # BUGFIX: close the UDP socket even when the consumer abandons
        # the generator early; previously the descriptor leaked.
        sock.close()
def migrate_datasets(self):
    '''
    Iterates over all datasets and migrates fields with
    'migration_functions'.
    '''
    # Check if all needed groups are present
    group_list = tk.get_action('group_list')
    available_groups = group_list(self.create_context(), {})
    if not self.executor.check_group_presence(available_groups):
        return

    util.get_migrator_log().info('Starting dataset migration' + (
        ' [dry run without saving]' if self.dry_run else ''))

    # Change the type of all datasets to 'dataset' via DB query, as
    # package_update() doesn't allow to set the type
    if not self.dry_run:
        legacy_type_filter = or_((model.Package.type == "datensatz"),
                                 (model.Package.type == "app"),
                                 (model.Package.type == "dokument"))
        model.Session.query(model.Package)\
            .filter(legacy_type_filter)\
            .update({"type": u'dataset'})
        model.repo.commit()

    # Apply every migration function to each local dataset and persist.
    for dataset in self.iterate_local_datasets():
        self.executor.apply_to(dataset)
        self.update_dataset(dataset)

    util.get_migrator_log().info('Dataset migration finished' + (
        ' [dry run, did not save]' if self.dry_run else ''))
def check_group_presence(self, ckan_group_dict):
    '''Checks if all groups from the category mapping are present in
    the given CKAN dict (obtained via API). Returns True if all groups
    are found, and False otherwise.'''
    for expected_group in self.functions.new_groups:
        if expected_group in ckan_group_dict:
            continue
        # First missing group aborts the check with an error hint.
        util.get_migrator_log().error(u'Group ' + unicode(expected_group) +
                                      u' not found. Did you run the ' +
                                      u' theme adder command?')
        return False
    return True
def apply_to(self, dataset):
    '''Applies all public migration functions (i.e. not starting
    with _) to the given dataset. If one a function fails, an error is
    logged and the next one is tried.'''
    all_members = inspect.getmembers(self.functions, inspect.ismethod)
    for member_name, migration_func in all_members:
        if member_name.startswith('_'):
            # private helpers are not migration steps
            continue
        try:
            migration_func(dataset)
        except Exception:
            # log and keep going so one broken step doesn't stop the rest
            util.get_migrator_log().error(
                util.log_dataset_prefix(dataset) + 'Error applying ' +
                member_name)
def update_dataset(self, dataset):
    ''' Updates dataset in CKAN. '''
    # dry runs never write anything back
    if self.dry_run:
        return
    try:
        update_context = self.create_context()
        update_context['schema'] = self.PACKAGE_UPDATE_SCHEMA
        update_context['return_id_only'] = True
        tk.get_action('package_update')(update_context, dataset)
    except Exception:
        # best-effort: a failed update is logged, not raised
        util.get_migrator_log().exception(
            util.log_dataset_prefix(dataset) + 'could not update')
def migrate_adms_identifier(self):
    '''Copies adms:identifier into dct:identifier for all affected
    datasets, skipping any dataset that already carries a
    dct:identifier.'''
    util.get_migrator_log().info(
        'Migrating adms:identifier to dct:identifier' +
        (' [dry run without saving]' if self.dry_run else ''))

    for dataset in self.iterate_adms_id_datasets():
        # only migrate if dct:identifier is not already present
        if dataset_utils.get_extras_field(dataset, EXTRA_KEY_DCT_IDENTIFIER):
            util.get_migrator_log().info(
                '%sSkipping package as it already has a dct:identifier',
                util.log_dataset_prefix(dataset))
            continue
        util.rename_extras_field_migration(
            dataset, EXTRA_KEY_ADMS_IDENTIFIER, EXTRA_KEY_DCT_IDENTIFIER,
            False)
        self.update_dataset(dataset)

    util.get_migrator_log().info(
        'Finished migration of adms:identifier to dct:identifier' +
        (' [dry run without saving]' if self.dry_run else ''))
def migrate_contributor_identifier(self): ''' Add govdata-contributor-IDs to datasets that are missing one ''' util.get_migrator_log().info('Migrating dcatde:contributorID' + ( ' [dry run without saving]' if self.dry_run else '')) starttime = time.time() package_obj_to_update = gather_dataset_ids() endtime = time.time() print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \ (len(package_obj_to_update), str(endtime - starttime)) organization_list = tk.get_action('organization_list')( self.create_context(), { 'all_fields': True, 'include_extras': True }) updated_count = created_count = 0 starttime = time.time() for dataset in self.iterate_datasets(package_obj_to_update.keys()): print u'Updating dataset: {}'.format(dataset['title']) dataset_org_id = dataset['organization']['id'] dataset_org = next((item for item in organization_list if item['id'] == dataset_org_id), None) if not dataset_org: print u'Did not find a Organization for ID: ' + dataset_org_id continue org_contributor_field = get_extras_field(dataset_org, EXTRA_KEY_CONTRIBUTOR_ID) if not org_contributor_field: print u'Did not find a contributor ID for Organization: ' + dataset_org_id continue try: org_contributor_id_list = json.loads( org_contributor_field['value']) except ValueError: # json.loads failed -> value is not an array but a single string org_contributor_id_list = [org_contributor_field['value']] dataset_contributor_field = get_extras_field( dataset, EXTRA_KEY_CONTRIBUTOR_ID) requires_update = False if not dataset_contributor_field: # Contributor-id field does not exist yet set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID, json.dumps(org_contributor_id_list)) created_count = created_count + 1 requires_update = True else: try: current_ids_list = json.loads( dataset_contributor_field['value']) except ValueError: # json.loads failed -> value is not an array but a single string current_ids_list = [dataset_contributor_field['value']] for contributor_id in 
org_contributor_id_list: if contributor_id not in current_ids_list: current_ids_list.append(contributor_id) requires_update = True if requires_update: updated_count = updated_count + 1 set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID, json.dumps(current_ids_list)) if requires_update: self.update_dataset(dataset) endtime = time.time() print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \ created_count print "INFO: %s datasets were updated. Total time: %s." % ( updated_count, str(endtime - starttime)) util.get_migrator_log().info( 'Finished migration of dcatde:contributorID' + (' [dry run without saving]' if self.dry_run else ''))