Example #1
    def iterate_datasets(self, package_ids):
        '''
        Helper that iterates over all datasets in package_ids, i.e. fetches the
        package for each ID
        '''
        package_show = tk.get_action('package_show')

        package_ids_unique = set(package_ids)
        progress_total = len(package_ids_unique)
        util.get_migrator_log().info('INFO migrating ' + str(progress_total) +
                                     ' datasets in total')
        progress_current = 0
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

        for dataset_id in package_ids_unique:
            try:
                # write out status via UDP (see class doc for netcat cmd)
                progress_current += 1
                sock.sendto(
                    str(progress_current) + " / " + str(progress_total) + "\n",
                    (self.UDP_IP, self.UDP_PORT))

                dataset = package_show(self.create_context(),
                                       {'id': dataset_id.strip()})

                # ignore harvesters, which are in the list as well
                if dataset['type'] == 'harvest':
                    continue

                yield dataset

            except Exception:
                util.get_migrator_log().exception("Package '%s' was not found",
                                                  dataset_id)
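The progress datagrams sent above can be watched from a second terminal. The class doc referenced in the comment suggests a netcat command; as a self-contained alternative, here is a minimal Python sketch of a UDP listener (the host and port below are placeholders, not the class's actual UDP_IP/UDP_PORT values):

import socket

def watch_progress(host='127.0.0.1', port=5005):
    '''Print each "current / total" progress line sent by the migrator.'''
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.bind((host, port))
    try:
        while True:
            data, _addr = sock.recvfrom(1024)
            print(data.decode('utf-8').strip())
    finally:
        sock.close()

if __name__ == '__main__':
    watch_progress()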
Example #2
    def migrate_datasets(self):
        '''
        Iterates over all datasets and migrates fields with 'migration_functions'
        '''
        # Check if all needed groups are present
        group_list = tk.get_action('group_list')
        if not self.executor.check_group_presence(
                group_list(self.create_context(), {})):
            return

        util.get_migrator_log().info('Starting dataset migration' + (
            ' [dry run without saving]' if self.dry_run else ''))

        # Change the type of all datasets to 'dataset' via a DB query, as
        # package_update() does not allow setting the type
        if not self.dry_run:
            model.Session.query(model.Package)\
               .filter(or_((model.Package.type == "datensatz"),
                           (model.Package.type == "app"),
                           (model.Package.type == "dokument")))\
               .update({"type": u'dataset'})
            model.repo.commit()

        for dataset in self.iterate_local_datasets():
            self.executor.apply_to(dataset)

            self.update_dataset(dataset)

        util.get_migrator_log().info('Dataset migration finished' + (
            ' [dry run, did not save]' if self.dry_run else ''))
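The direct DB update above deliberately bypasses package_update. A minimal, self-contained sketch of the same or_-filter bulk-update pattern, run against an in-memory SQLite database (the Package model here is a toy stand-in for CKAN's model, and the import path assumes SQLAlchemy 1.4+):

from sqlalchemy import Column, String, create_engine, or_
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Package(Base):
    # toy stand-in for CKAN's package table
    __tablename__ = 'package'
    id = Column(String, primary_key=True)
    type = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([Package(id='a', type='datensatz'),
                 Package(id='b', type='app'),
                 Package(id='c', type='dataset')])
session.commit()

# same shape as the migration query: rewrite every legacy type to 'dataset'
session.query(Package)\
    .filter(or_(Package.type == 'datensatz',
                Package.type == 'app',
                Package.type == 'dokument'))\
    .update({'type': 'dataset'}, synchronize_session=False)
session.commit()

print(sorted((p.id, p.type) for p in session.query(Package)))
# -> [('a', 'dataset'), ('b', 'dataset'), ('c', 'dataset')]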
Example #3
    def check_group_presence(self, ckan_group_dict):
        '''Checks if all groups from the category mapping are present
        in the given CKAN dict (obtained via API).
        Returns True if all groups are found, and False otherwise.'''
        for group in self.functions.new_groups:
            if group not in ckan_group_dict:
                util.get_migrator_log().error(u'Group ' + unicode(group) +
                                              u' not found. Did you run the ' +
                                              u'theme adder command?')
                return False
        return True
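For context, group_list without all_fields returns a plain list of group names, so the presence check above is simple membership. A tiny illustrative run with placeholder group names:

required_groups = ['transport_verkehr', 'umwelt_klima']    # e.g. self.functions.new_groups
available_groups = ['transport_verkehr', 'gesundheit']     # e.g. result of group_list
missing = [g for g in required_groups if g not in available_groups]
print(missing)  # -> ['umwelt_klima']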
Example #4
    def apply_to(self, dataset):
        '''Applies all public migration functions (i.e. those not starting
        with _) to the given dataset.
        If a function fails, an error is logged and the next one is
        tried.'''
        for name, func in inspect.getmembers(self.functions, inspect.ismethod):
            if not name.startswith('_'):
                try:
                    func(dataset)
                except Exception:
                    util.get_migrator_log().error(
                        util.log_dataset_prefix(dataset) + 'Error applying ' +
                        name)
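A self-contained sketch of the same dispatch pattern, runnable without CKAN: enumerate the public bound methods of a functions object and apply each one, isolating failures so one broken migration step does not abort the rest (the MigrationFunctions class below is purely illustrative):

import inspect

class MigrationFunctions(object):
    def add_flag(self, dataset):
        dataset['migrated'] = True

    def fail_sometimes(self, dataset):
        raise ValueError('simulated failure')

    def _private_helper(self, dataset):
        raise AssertionError('never called: name starts with an underscore')

def apply_all(functions, dataset):
    for name, func in inspect.getmembers(functions, inspect.ismethod):
        if not name.startswith('_'):
            try:
                func(dataset)
            except Exception as exc:
                print('Error applying {}: {}'.format(name, exc))

dataset = {'name': 'example'}
apply_all(MigrationFunctions(), dataset)
print(dataset)  # -> {'name': 'example', 'migrated': True}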
Example #5
    def update_dataset(self, dataset):
        '''
        Updates the given dataset in CKAN (skipped in dry-run mode).
        '''
        if not self.dry_run:
            try:
                package_update = tk.get_action('package_update')
                ctx = self.create_context()
                # use the migrator's update schema and only return the package id
                ctx['schema'] = self.PACKAGE_UPDATE_SCHEMA
                ctx['return_id_only'] = True
                package_update(ctx, dataset)
            except Exception:
                util.get_migrator_log().exception(
                    util.log_dataset_prefix(dataset) + 'could not update')
Example #6
    def migrate_adms_identifier(self):
        util.get_migrator_log().info(
            'Migrating adms:identifier to dct:identifier' +
            (' [dry run without saving]' if self.dry_run else ''))

        for dataset in self.iterate_adms_id_datasets():
            # only migrate if dct:identifier is not already present
            if not dataset_utils.get_extras_field(dataset,
                                                  EXTRA_KEY_DCT_IDENTIFIER):
                util.rename_extras_field_migration(dataset,
                                                   EXTRA_KEY_ADMS_IDENTIFIER,
                                                   EXTRA_KEY_DCT_IDENTIFIER,
                                                   False)
                self.update_dataset(dataset)
            else:
                util.get_migrator_log().info(
                    '%sSkipping package as it already has a dct:identifier',
                    util.log_dataset_prefix(dataset))

        util.get_migrator_log().info(
            'Finished migration of adms:identifier to dct:identifier' +
            (' [dry run without saving]' if self.dry_run else ''))
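CKAN stores dataset extras as a list of {'key': ..., 'value': ...} dicts, so the rename above is essentially a key swap in that list. The helpers below are illustrative stand-ins for dataset_utils.get_extras_field and util.rename_extras_field_migration, not their actual implementations, and the extras keys are placeholders as well:

def get_extras_field(dataset, key):
    # return the first extras entry with the given key, or None
    for field in dataset.get('extras', []):
        if field.get('key') == key:
            return field
    return None

def rename_extras_field(dataset, old_key, new_key):
    # swap the key of an existing extras entry; report whether it was found
    field = get_extras_field(dataset, old_key)
    if field is not None:
        field['key'] = new_key
    return field is not None

dataset = {'extras': [{'key': 'adms_identifier', 'value': 'abc-123'}]}
if not get_extras_field(dataset, 'dct_identifier'):
    rename_extras_field(dataset, 'adms_identifier', 'dct_identifier')
print(dataset['extras'])  # -> [{'key': 'dct_identifier', 'value': 'abc-123'}]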
Example #7
    def migrate_contributor_identifier(self):
        ''' Add govdata-contributor-IDs to datasets that are missing one '''
        util.get_migrator_log().info('Migrating dcatde:contributorID' + (
            ' [dry run without saving]' if self.dry_run else ''))

        starttime = time.time()
        package_obj_to_update = gather_dataset_ids()
        endtime = time.time()
        print "INFO: %s datasets found to check for contributor-ID. Total time: %s." % \
              (len(package_obj_to_update), str(endtime - starttime))

        organization_list = tk.get_action('organization_list')(
            self.create_context(), {
                'all_fields': True,
                'include_extras': True
            })
        updated_count = created_count = 0
        starttime = time.time()

        for dataset in self.iterate_datasets(package_obj_to_update.keys()):
            print u'Updating dataset: {}'.format(dataset['title'])

            dataset_org_id = dataset['organization']['id']
            dataset_org = next((item for item in organization_list
                                if item['id'] == dataset_org_id), None)
            if not dataset_org:
                print u'Did not find an organization for ID: ' + dataset_org_id
                continue

            org_contributor_field = get_extras_field(dataset_org,
                                                     EXTRA_KEY_CONTRIBUTOR_ID)
            if not org_contributor_field:
                print u'Did not find a contributor ID for Organization: ' + dataset_org_id
                continue

            try:
                org_contributor_id_list = json.loads(
                    org_contributor_field['value'])
            except ValueError:
                # json.loads failed -> value is not an array but a single string
                org_contributor_id_list = [org_contributor_field['value']]

            dataset_contributor_field = get_extras_field(
                dataset, EXTRA_KEY_CONTRIBUTOR_ID)
            requires_update = False
            if not dataset_contributor_field:
                # Contributor-id field does not exist yet
                set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                 json.dumps(org_contributor_id_list))
                created_count = created_count + 1
                requires_update = True
            else:
                try:
                    current_ids_list = json.loads(
                        dataset_contributor_field['value'])
                except ValueError:
                    # json.loads failed -> value is not an array but a single string
                    current_ids_list = [dataset_contributor_field['value']]

                for contributor_id in org_contributor_id_list:
                    if contributor_id not in current_ids_list:
                        current_ids_list.append(contributor_id)
                        requires_update = True
                if requires_update:
                    updated_count = updated_count + 1
                    set_extras_field(dataset, EXTRA_KEY_CONTRIBUTOR_ID,
                                     json.dumps(current_ids_list))

            if requires_update:
                self.update_dataset(dataset)

        endtime = time.time()
        print "INFO: A Contributor-ID was created for %s datasets that did not have one before." % \
              created_count
        print "INFO: %s datasets were updated. Total time: %s." % (
            updated_count, str(endtime - starttime))

        util.get_migrator_log().info(
            'Finished migration of dcatde:contributorID' +
            (' [dry run without saving]' if self.dry_run else ''))
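The try/except around json.loads above handles legacy values that were stored as a bare string rather than a JSON array. A small self-contained sketch of that normalize-and-merge step (the URLs are placeholders):

import json

def parse_id_list(raw_value):
    '''Return the stored contributor IDs as a list, whether the extras value
    is a JSON array or a bare string.'''
    try:
        return json.loads(raw_value)
    except ValueError:
        # not valid JSON -> treat the raw value as a single ID
        return [raw_value]

def merge_contributor_ids(dataset_raw, org_id_list):
    '''Merge the organization's IDs into the dataset's list; return the new
    JSON value and whether anything changed.'''
    current = parse_id_list(dataset_raw)
    merged = list(current)
    for contributor_id in org_id_list:
        if contributor_id not in merged:
            merged.append(contributor_id)
    return json.dumps(merged), merged != current

value, changed = merge_contributor_ids(
    'http://example.org/contributor/a',    # legacy bare-string value
    ['http://example.org/contributor/a', 'http://example.org/contributor/b'])
print(changed)  # -> True
print(value)    # -> ["http://example.org/contributor/a", "http://example.org/contributor/b"]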