Example #1
0
    def _is_it_sufficient_change_to_run_archiver(self, package, operation):
        ''' Returns True if in this revision any of these happened:
        * it is a new dataset
        * dataset licence changed (affects qa)
        * there are resources that have been added or deleted
        * resources have changed their URL or format (affects qa)
        * a resource upload finished (upload_in_progress extra changed)

        :param package: the Package model object being committed
        :param operation: one of 'new', 'changed' or 'deleted'
        :returns: bool - whether the archiver should be triggered
        '''
        if operation == 'new':
            log.debug('New package - will archive')
            # even if it has no resources, QA needs to show 0 stars against it
            return True
        elif operation == 'deleted':
            log.debug('Deleted package - won\'t archive')
            return False
        # therefore operation=changed

        # check to see if resources are added, deleted or URL changed

        # look for the latest revision
        rev_list = package.all_related_revisions
        if not rev_list:
            log.debug('No sign of previous revisions - will archive')
            return True
        # I am not confident we can rely on the info about the current
        # revision, because we are still in the 'before_commit' stage. So
        # simply ignore that if it's returned.
        if rev_list[0][0].id == model.Session.revision.id:
            rev_list = rev_list[1:]
        if not rev_list:
            # log.warn is deprecated; use log.warning
            log.warning('No sign of previous revisions - will archive')
            return True
        previous_revision = rev_list[0][0]
        log.debug('Comparing with revision: %s %s',
                  previous_revision.timestamp, previous_revision.id)

        # get the package as it was at that previous revision
        context = {'model': model, 'session': model.Session,
                   # 'user': c.user or c.author,
                   'ignore_auth': True,
                   'revision_id': previous_revision.id}
        data_dict = {'id': package.id}
        try:
            old_pkg_dict = p.toolkit.get_action('package_show')(
                context, data_dict)
        except p.toolkit.ObjectNotFound:
            # NOTE: the toolkit exposes ObjectNotFound, not NotFound -
            # catching the wrong name would itself raise at except-time
            log.warning('No sign of previous package - will archive anyway')
            return True

        # has the licence changed? Compare both the license_id field and the
        # 'licence' extra; normalise empty strings to None.
        old_licence = (old_pkg_dict['license_id'],
                       lib.get_extra_from_pkg_dict(old_pkg_dict, 'licence')
                       or None)
        new_licence = (package.license_id,
                       package.extras.get('licence') or None)
        if old_licence != new_licence:
            log.debug('Licence has changed - will archive: %r->%r',
                      old_licence, new_licence)
            return True

        # have any resources been added or deleted?
        old_resources = dict((res['id'], res)
                             for res in old_pkg_dict['resources'])
        old_res_ids = set(old_resources.keys())
        new_res_ids = set(res.id for res in package.resources)
        deleted_res_ids = old_res_ids - new_res_ids
        if deleted_res_ids:
            log.debug('Deleted resources - will archive. res_ids=%r',
                      deleted_res_ids)
            return True
        added_res_ids = new_res_ids - old_res_ids
        if added_res_ids:
            log.debug('Added resources - will archive. res_ids=%r',
                      added_res_ids)
            return True

        # have any resources' url/format changed?
        # (safe to index old_resources here - ids were proven equal above)
        for res in package.resources:
            for key in ('url', 'format'):
                old_res_value = old_resources[res.id][key]
                new_res_value = getattr(res, key)
                if old_res_value != new_res_value:
                    log.debug('Resource %s changed - will archive. '
                              'id=%s pos=%s url="%s"->"%s"',
                              key, res.id[:4], res.position,
                              old_res_value, new_res_value)
                    return True

            # a completed upload flips the 'upload_in_progress' extra
            was_in_progress = old_resources[res.id].get('upload_in_progress')
            is_in_progress = res.extras.get('upload_in_progress')
            if was_in_progress != is_in_progress:
                # fixed: the old message interpolated the literal
                # 'upload_finished' into itself, garbling the log line
                log.debug('Resource upload finished - will archive')
                return True

            log.debug('Resource unchanged. pos=%s id=%s',
                      res.position, res.id[:4])

        log.debug('No new, deleted or changed resources - won\'t archive')
        return False
Example #2
0
    def _is_it_sufficient_change_to_run_archiver(self, package, operation):
        """ Returns True if in this revision any of these happened:
        * it is a new dataset
        * dataset licence changed (affects qa)
        * there are resources that have been added or deleted
        * resources have changed their URL or format (affects qa)

        :param package: the Package model object being committed
        :param operation: one of "new", "changed" or "deleted"
        :returns: bool - whether the archiver should be triggered
        """
        if operation == "new":
            log.debug("New package - will archive")
            # even if it has no resources, QA needs to show 0 stars against it
            return True
        elif operation == "deleted":
            log.debug("Deleted package - won't archive")
            return False
        # therefore operation=changed

        # check to see if resources are added, deleted or URL changed

        # look for the latest revision
        rev_list = package.all_related_revisions
        if not rev_list:
            log.debug("No sign of previous revisions - will archive")
            return True
        # I am not confident we can rely on the info about the current
        # revision, because we are still in the 'before_commit' stage. So
        # simply ignore that if it's returned.
        if rev_list[0][0].id == model.Session.revision.id:
            rev_list = rev_list[1:]
        if not rev_list:
            # log.warn is deprecated; use log.warning
            log.warning("No sign of previous revisions - will archive")
            return True
        previous_revision = rev_list[0][0]
        log.debug("Comparing with revision: %s %s", previous_revision.timestamp, previous_revision.id)

        # get the package as it was at that previous revision
        context = {
            "model": model,
            "session": model.Session,
            #'user': c.user or c.author,
            "ignore_auth": True,
            "revision_id": previous_revision.id,
        }
        data_dict = {"id": package.id}
        try:
            old_pkg_dict = p.toolkit.get_action("package_show")(context, data_dict)
        except p.toolkit.ObjectNotFound:
            # NOTE: the toolkit exposes ObjectNotFound, not NotFound -
            # catching the wrong name would itself raise at except-time
            log.warning("No sign of previous package - will archive anyway")
            return True

        # has the licence changed? Compare both the license_id field and the
        # 'licence' extra; normalise empty strings to None.
        old_licence = (old_pkg_dict["license_id"], lib.get_extra_from_pkg_dict(old_pkg_dict, "licence") or None)
        new_licence = (package.license_id, package.extras.get("licence") or None)
        if old_licence != new_licence:
            log.debug("Licence has changed - will archive: %r->%r", old_licence, new_licence)
            return True

        # have any resources been added or deleted?
        old_resources = dict((res["id"], res) for res in old_pkg_dict["resources"])
        old_res_ids = set(old_resources.keys())
        new_res_ids = set(res.id for res in package.resources)
        deleted_res_ids = old_res_ids - new_res_ids
        if deleted_res_ids:
            log.debug("Deleted resources - will archive. res_ids=%r", deleted_res_ids)
            return True
        added_res_ids = new_res_ids - old_res_ids
        if added_res_ids:
            log.debug("Added resources - will archive. res_ids=%r", added_res_ids)
            return True

        # have any resources' url/format changed?
        # (safe to index old_resources here - ids were proven equal above)
        for res in package.resources:
            for key in ("url", "format"):
                old_res_value = old_resources[res.id][key]
                new_res_value = getattr(res, key)
                if old_res_value != new_res_value:
                    log.debug(
                        "Resource %s changed - will archive. " 'id=%s pos=%s url="%s"->"%s"',
                        key,
                        res.id[:4],
                        res.position,
                        old_res_value,
                        new_res_value,
                    )
                    return True
            log.debug("Resource unchanged. pos=%s id=%s", res.position, res.id[:4])

        log.debug("No new, deleted or changed resources - won't archive")
        return False
Example #3
0
    def _is_it_sufficient_change_to_run_archiver(self, package, operation):
        ''' Returns True if in this revision any of these happened:
        * it is a new dataset
        * dataset licence changed (affects qa)
        * there are resources that have been added or deleted
        * resources have changed their URL or format (affects qa)
        * a resource upload finished (upload_in_progress extra changed)

        :param package: the Package model object being committed
        :param operation: one of 'new', 'changed' or 'deleted'
        :returns: bool - whether the archiver should be triggered
        '''
        if operation == 'new':
            log.debug('New package - will archive')
            # even if it has no resources, QA needs to show 0 stars against it
            return True
        elif operation == 'deleted':
            log.debug('Deleted package - won\'t archive')
            return False
        # therefore operation=changed

        # check to see if resources are added, deleted or URL changed

        # look for the latest revision
        rev_list = package.all_related_revisions
        if not rev_list:
            log.debug('No sign of previous revisions - will archive')
            return True
        # I am not confident we can rely on the info about the current
        # revision, because we are still in the 'before_commit' stage. So
        # simply ignore that if it's returned.
        if rev_list[0][0].id == model.Session.revision.id:
            rev_list = rev_list[1:]
        if not rev_list:
            # log.warn is deprecated; use log.warning
            log.warning('No sign of previous revisions - will archive')
            return True
        previous_revision = rev_list[0][0]
        log.debug('Comparing with revision: %s %s',
                  previous_revision.timestamp, previous_revision.id)

        # get the package as it was at that previous revision
        context = {
            'model': model,
            'session': model.Session,
            # 'user': c.user or c.author,
            'ignore_auth': True,
            'revision_id': previous_revision.id
        }
        data_dict = {'id': package.id}
        try:
            old_pkg_dict = p.toolkit.get_action('package_show')(context,
                                                                data_dict)
        except p.toolkit.ObjectNotFound:
            # NOTE: the toolkit exposes ObjectNotFound, not NotFound -
            # catching the wrong name would itself raise at except-time
            log.warning('No sign of previous package - will archive anyway')
            return True

        # has the licence changed? Compare both the license_id field and the
        # 'licence' extra; normalise empty strings to None.
        old_licence = (old_pkg_dict['license_id'],
                       lib.get_extra_from_pkg_dict(old_pkg_dict, 'licence')
                       or None)
        new_licence = (package.license_id, package.extras.get('licence')
                       or None)
        if old_licence != new_licence:
            log.debug('Licence has changed - will archive: %r->%r',
                      old_licence, new_licence)
            return True

        # have any resources been added or deleted?
        old_resources = dict(
            (res['id'], res) for res in old_pkg_dict['resources'])
        old_res_ids = set(old_resources.keys())
        new_res_ids = set(res.id for res in package.resources)
        deleted_res_ids = old_res_ids - new_res_ids
        if deleted_res_ids:
            log.debug('Deleted resources - will archive. res_ids=%r',
                      deleted_res_ids)
            return True
        added_res_ids = new_res_ids - old_res_ids
        if added_res_ids:
            log.debug('Added resources - will archive. res_ids=%r',
                      added_res_ids)
            return True

        # have any resources' url/format changed?
        # (safe to index old_resources here - ids were proven equal above)
        for res in package.resources:
            for key in ('url', 'format'):
                old_res_value = old_resources[res.id][key]
                new_res_value = getattr(res, key)
                if old_res_value != new_res_value:
                    log.debug(
                        'Resource %s changed - will archive. '
                        'id=%s pos=%s url="%s"->"%s"', key, res.id[:4],
                        res.position, old_res_value, new_res_value)
                    return True

            # a completed upload flips the 'upload_in_progress' extra
            was_in_progress = old_resources[res.id].get(
                'upload_in_progress')
            is_in_progress = res.extras.get('upload_in_progress')
            if was_in_progress != is_in_progress:
                # fixed: the old message interpolated the literal
                # 'upload_finished' into itself, garbling the log line
                log.debug('Resource upload finished - will archive')
                return True

            log.debug('Resource unchanged. pos=%s id=%s', res.position,
                      res.id[:4])

        log.debug('No new, deleted or changed resources - won\'t archive')
        return False
Example #4
0
    def _is_it_sufficient_change_to_run_archiver(self, package, operation):
        ''' Returns True if in this revision any of these happened:
        * it is a new dataset
        * dataset licence changed (affects qa)
        * there are resources that have been added or deleted
        * resources have changed their URL or format (affects qa)
        * a resource upload finished (upload_in_progress extra changed)

        :param package: the Package model object being committed
        :param operation: one of 'new', 'changed' or 'deleted'
        :returns: bool - whether the archiver should be triggered
        '''
        if operation == 'new':
            log.debug('New package - will archive')
            # even if it has no resources, QA needs to show 0 stars against it
            return True
        elif operation == 'deleted':
            log.debug('Deleted package - won\'t archive')
            return False
        # therefore operation=changed

        # check to see if resources are added, deleted or URL changed

        # Since 'revisions' is a deprecated feature in CKAN,
        # try to use activity stream to check if dataset changed
        context = {'model': model, 'session': model.Session, 'ignore_auth': True, 'user': None}
        if p.toolkit.check_ckan_version(min_version='2.9.0'):
            # activity[0] is the current edit; activity[1] is the last
            # committed state of the dataset
            data_dict = {'id': package.id, 'limit': 2}

            activity_list = p.toolkit.get_action('package_activity_list')(context, data_dict)
            if len(activity_list) <= 1:
                # log.warn is deprecated; use log.warning
                log.warning('No sign of previous package - will archive anyway')
                return True
            old_act = p.toolkit.get_action('activity_data_show')(context, {'id': activity_list[1]['id']})
            old_pkg_dict = old_act['package']
        else:
            # look for the latest revision
            rev_list = package.all_related_revisions
            if not rev_list:
                log.debug('No sign of previous revisions - will archive')
                return True
            # I am not confident we can rely on the info about the current
            # revision, because we are still in the 'before_commit' stage. So
            # simply ignore that if it's returned.
            if hasattr(model.Session, 'revision') and rev_list[0][0].id == model.Session.revision.id:
                rev_list = rev_list[1:]
            if not rev_list:
                log.warning('No sign of previous revisions - will archive')
                return True
            previous_revision = rev_list[0][0]
            log.debug('Comparing with revision: %s %s',
                      previous_revision.timestamp, previous_revision.id)

            # get the package as it was at that previous revision
            context['revision_id'] = previous_revision.id
            data_dict = {'id': package.id}
            try:
                old_pkg_dict = p.toolkit.get_action('package_show')(
                    context, data_dict)
            except p.toolkit.ObjectNotFound:
                log.warning('No sign of previous package - will archive anyway')
                return True

        # has the licence changed? Compare both the license_id field and the
        # 'licence' extra; normalise empty strings to None.
        old_licence = (old_pkg_dict['license_id'],
                       lib.get_extra_from_pkg_dict(old_pkg_dict, 'licence')
                       or None)
        new_licence = (package.license_id,
                       package.extras.get('licence') or None)
        if old_licence != new_licence:
            log.debug('Licence has changed - will archive: %r->%r',
                      old_licence, new_licence)
            return True

        # have any resources been added or deleted?
        old_resources = dict((res['id'], res)
                             for res in old_pkg_dict['resources'])
        old_res_ids = set(old_resources.keys())
        new_res_ids = set(res.id for res in package.resources)
        deleted_res_ids = old_res_ids - new_res_ids
        if deleted_res_ids:
            log.debug('Deleted resources - will archive. res_ids=%r',
                      deleted_res_ids)
            return True
        added_res_ids = new_res_ids - old_res_ids
        if added_res_ids:
            log.debug('Added resources - will archive. res_ids=%r',
                      added_res_ids)
            return True

        # have any resources' url/format changed?
        # (safe to index old_resources here - ids were proven equal above)
        for res in package.resources:
            watched_keys = ['format']
            # Ignore URL changes in uploaded resources.
            # Otherwise we'll end up comparing 'example.txt' to
            # 'http://example.com/dataset/foo/resource/baz/download/example.txt'
            # and thinking that it's changed.
            # Use .get() since older package dicts may lack 'url_type'.
            if res.url_type != 'upload' \
                    or old_resources[res.id].get('url_type') != 'upload':
                watched_keys.append('url')
            for key in watched_keys:
                old_res_value = old_resources[res.id][key]
                new_res_value = getattr(res, key)
                if old_res_value != new_res_value:
                    log.debug('Resource %s changed - will archive. '
                              'id=%s pos=%s url="%s"->"%s"',
                              key, res.id[:4], res.position,
                              old_res_value, new_res_value)
                    return True

            # a completed upload flips the 'upload_in_progress' extra
            was_in_progress = old_resources[res.id].get('upload_in_progress')
            is_in_progress = res.extras.get('upload_in_progress')
            if was_in_progress != is_in_progress:
                # fixed: the old message interpolated the literal
                # 'upload_finished' into itself, garbling the log line
                log.debug('Resource upload finished - will archive')
                return True

            log.debug('Resource unchanged. pos=%s id=%s',
                      res.position, res.id[:4])

        log.debug('No new, deleted or changed resources - won\'t archive')
        return False