def sixodp_to_opendata_postprocess(package_dict):
    """
    Post-process a package dict in place (sixodp -> opendata conversion).

    - ``collection_type`` is forced to ``'Open Data'``.
    - ``maintainer`` and ``maintainer_email`` fall back to a single space
      when they are missing or empty.
    - When ``parse_datetime`` yields a value for ``date_released``, both
      ``date_released`` and ``metadata_created`` are set to its ISO string,
      cut at the first ``+`` and suffixed with ``.000000``.
    - Resource ``time_series_start`` / ``time_series_end`` values that
      ``isodate`` rejects with ``Invalid`` are removed from the resource.

    :param package_dict: package dictionary; the ``date_released`` and
        ``resources`` keys must be present
    :returns: None, ``package_dict`` is modified in place
    """
    package_dict['collection_type'] = 'Open Data'
    for contact_key in ('maintainer', 'maintainer_email'):
        package_dict[contact_key] = package_dict.get(contact_key, ' ') or ' '

    released = parse_datetime(package_dict['date_released'])
    if released:
        # Keep only the part of the ISO string before any "+hh:mm" offset
        without_offset = released.isoformat().split('+', 2)[0]
        stamp = "%s.000000" % without_offset
        package_dict['date_released'] = stamp
        package_dict['metadata_created'] = stamp

    for resource in package_dict['resources']:
        for bound_key in ('time_series_start', 'time_series_end'):
            raw_value = resource.get(bound_key)
            if not raw_value:
                continue
            try:
                isodate(raw_value, {})
            except Invalid:
                # Unparseable date: drop it rather than carry it along
                resource.pop(bound_key)
Example #2
0
def sixodp_to_opendata_postprocess(package_dict):
    """
    Post-process a package dict in place (sixodp -> opendata conversion).

    - ``collection_type`` is forced to ``'Open Data'``.
    - ``maintainer`` and ``maintainer_email`` fall back to a single space
      when they are missing or empty.
    - When ``parse_datetime`` yields a value for ``date_released``, both
      ``date_released`` and ``metadata_created`` are set to its ISO string,
      cut at the first ``+`` and suffixed with ``.000000``.
    - Resource ``time_series_start`` / ``time_series_end`` values that
      ``isodate`` rejects with ``Invalid`` are removed from the resource.

    :param package_dict: package dictionary; the ``date_released`` and
        ``resources`` keys must be present
    :returns: None, ``package_dict`` is modified in place
    """
    package_dict['collection_type'] = 'Open Data'
    for contact_key in ('maintainer', 'maintainer_email'):
        package_dict[contact_key] = package_dict.get(contact_key, ' ') or ' '

    released = parse_datetime(package_dict['date_released'])
    if released:
        # Keep only the part of the ISO string before any "+hh:mm" offset
        without_offset = released.isoformat().split('+', 2)[0]
        stamp = "%s.000000" % without_offset
        package_dict['date_released'] = stamp
        package_dict['metadata_created'] = stamp

    for resource in package_dict['resources']:
        for bound_key in ('time_series_start', 'time_series_end'):
            raw_value = resource.get(bound_key)
            if not raw_value:
                continue
            try:
                isodate(raw_value, {})
            except Invalid:
                # Unparseable date: drop it rather than carry it along
                resource.pop(bound_key)
Example #3
0
def taxonomy_update(context, data_dict):
    """
    Updates an existing taxonomy.

    ``id`` is required.  ``name``, ``title``, ``uri`` and ``last_modified``
    are optional; any that are omitted keep their current value.

    :param context: action context; ``context['model']`` supplies the
        session used to persist the change
    :param data_dict: the values to apply, keyed as described above
    :returns: The newly updated taxonomy
    :rtype: A dictionary
    :raises logic.NotFound: if no taxonomy matches ``id``
    """
    _check_access('taxonomy_update', context, data_dict)

    model = context['model']

    taxonomy_id = logic.get_or_bust(data_dict, 'id')

    tax = Taxonomy.get(taxonomy_id)
    if not tax:
        raise logic.NotFound()

    tax.name = data_dict.get('name', tax.name)
    tax.title = data_dict.get('title', tax.title)
    # Read the 'uri' key: reading 'name' here would overwrite the URI with
    # the taxonomy name on every update that supplies a name.
    tax.uri = data_dict.get('uri', tax.uri)
    last_modified = data_dict.get('last_modified', tax.last_modified)

    # Only parse when the caller supplied a value that differs from the
    # stored one
    if tax.last_modified != last_modified:
        tax.last_modified = isodate(last_modified, context)

    model.Session.add(tax)
    model.Session.commit()

    return tax.as_dict()
Example #4
0
    def _changed_packages_since(self, registry, since_time):
        """
        Query source ckan instance for packages changed since_time.
        returns (packages, next since_time to query) or (None, None)
        when no more changes are found.

        registry - LocalCKAN or RemoteCKAN instance
        since_time - local datetime to start looking for changes

        If all the package ids found were included in seen_id_set this
        function will return an empty list of package ids.  Note that
        this is different than when no more changes found and (None, None)
        is returned.
        """
        data = registry.action.changed_packages_activity_list_since(
            since_time=since_time.isoformat())

        if not data:
            return None, None

        packages = []
        for result in data:
            package_id = result['data']['package']['id']
            try:
                packages.append(json.dumps(registry.action.package_show(id=package_id)))
            except NotFound:
                pass

        if data:
            since_time = isodate(data[-1]['timestamp'], None)

        return packages, since_time
def _trim_package(pkg):
    """
    remove keys from pkg that we don't care about when comparing
    or updating/creating packages.  Also try to convert types and
    create missing fields that will be present in package_show.
    """
    # XXX full of custom hacks and deep knowledge of our schema :-(
    if not pkg:
        return
    for k in ['extras', 'metadata_modified', 'metadata_created',
            'revision_id', 'revision_timestamp', 'organization',
            'version', 'tracking_summary',
            'tags', # just because we don't use them
            'num_tags', 'num_resources', 'maintainer',
            'isopen', 'relationships_as_object', 'license_title',
            'license_title_fra', 'license_url_fra', 'license_url',
            'author',
            'groups', # just because we don't use them
            'relationships_as_subject', 'department_number',
            # FIXME: remove these when we can:
            'resource_type',
            # new in 2.3:
            'creator_user_id',
            ]:
        if k in pkg:
            del pkg[k]
    for r in pkg['resources']:
        for k in ['package_id', 'revision_id',
                'revision_timestamp', 'cache_last_updated',
                'webstore_last_updated', 'state', 'hash',
                'description', 'tracking_summary', 'mimetype_inner',
                'mimetype', 'cache_url', 'created', 'webstore_url',
                'last_modified', 'position']:
            if k in r:
                del r[k]
        for k in ['name', 'size']:
            if k not in r:
                r[k] = None
    for k in ['private']:
        pkg[k] = boolean_validator(unicode(pkg.get(k, '')), None)
    if 'name' not in pkg:
        pkg['name'] = pkg['id']
    if 'type' not in pkg:
        pkg['type'] = 'dataset'
    if 'state' not in pkg:
        pkg['state'] = 'active'
    for k in ['url']:
        if k not in pkg:
            pkg[k] = ''
    for name, lang, field in schema_description.dataset_field_iter():
        if field['type'] == 'date':
            try:
                pkg[name] = str(isodate(pkg[name], None)) if pkg.get(name) else ''
            except Invalid:
                pass # not for us to fail validation
        elif field['type'] == 'url':
            if not pkg.get(name): # be consistent about what an empty url is
                pkg[name] = ""
        elif field['type'] == 'fixed' and name in pkg:
            del pkg[name]
Example #6
0
    def portal_update_worker(self, source):
        """
        Worker loop: read package ids from stdin, one per line, and sync
        each one from the remote CKAN instance at ``source`` into the
        local CKAN instance.

        For every id the remote package_show result is compared with the
        local one (both passed through ``_trim_package``) and the local
        package is created, updated, deleted or left unchanged.  A source
        package with an empty portal_release_date, or one later than the
        time this worker started, is treated the same as a deleted one.

        One line is written to stdout per id: 'created', 'updated',
        'deleted' or 'unchanged'.  The loop ends at EOF on stdin or when
        flushing stdout raises IOError.
        """
        registry = RemoteCKAN(source)
        portal = LocalCKAN()
        now = datetime.now()

        for line in iter(sys.stdin.readline, ''):
            pkg_id = line.strip()

            try:
                source_pkg = registry.action.package_show(id=pkg_id)['result']
            except NotAuthorized:
                source_pkg = None

            _trim_package(source_pkg)

            if source_pkg:
                # treat unpublished packages same as deleted packages
                release_date = source_pkg['portal_release_date']
                if not release_date or isodate(release_date, None) > now:
                    source_pkg = None

            try:
                # don't pass user in context so deleted packages
                # raise NotAuthorized
                target_pkg = portal.call_action(
                    'package_show', {'id': pkg_id}, {})
            except (NotFound, NotAuthorized):
                target_pkg = None

            _trim_package(target_pkg)

            if target_pkg is None:
                if source_pkg is None:
                    result = 'unchanged'
                else:
                    # CREATE
                    portal.action.package_create(**source_pkg)
                    result = 'created'
            elif source_pkg is None:
                # DELETE
                portal.action.package_delete(id=pkg_id)
                result = 'deleted'
            elif source_pkg == target_pkg:
                result = 'unchanged'
            else:
                # UPDATE
                portal.action.package_update(**source_pkg)
                result = 'updated'

            sys.stdout.write(result + '\n')
            try:
                sys.stdout.flush()
            except IOError:
                # the reader went away, nothing left to report to
                break
Example #7
0
    def _changed_packages_since(self, registry, since_time):
        """
        Query source ckan instance for packages changed since_time.
        returns (packages, next since_time to query) or (None, None)
        when no more changes are found.

        registry - LocalCKAN or RemoteCKAN instance
        since_time - local datetime to start looking for changes

        If all the package ids found were included in seen_id_set this
        function will return an empty list of package ids.  Note that
        this is different than when no more changes found and (None, None)
        is returned.
        """
        data = registry.action.changed_packages_activity_list_since(
            since_time=since_time.isoformat())

        if not data:
            return None, None

        packages = []
        for result in data:
            package_id = result['data']['package']['id']
            try:
                packages.append(json.dumps(registry.action.package_show(id=package_id)))
            except NotFound:
                pass

        if data:
            since_time = isodate(data[-1]['timestamp'], None)

        return packages, since_time
Example #8
0
    def portal_update(self, source, activity_date=None):
        """
        collect batches of package ids modified at source since activity_date
        and apply the package updates to the local CKAN instance for all
        packages with published_date set to any time in the past.

        source - passed through to ``_changed_package_ids_since`` and to
                 the 'portal-update-worker' sub-command
        activity_date - ISO date string to start from; when empty the
                        start is seven days before now

        After each batch one line is printed: the next start date followed
        by the created/deleted/unchanged/updated counts for the batch.
        A KeyboardInterrupt while waiting for results stops the loop; an
        IOError with errno 32 is swallowed, any other IOError is re-raised.
        """
        if activity_date:
            # XXX local time :-(
            activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        seen_package_id_set = set()

        def changed_package_id_runs(start_date):
            # yields (package ids, next start date) until the source
            # reports no more changes
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    source, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        pool = worker_pool(
            [sys.argv[0], 'canada', 'portal-update-worker', source,
             '-c', self.options.config],
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        next(pool)  # advance generator so we may call send() below

        try:
            for package_ids, next_date in changed_package_id_runs(activity_date):
                stats = dict(created=0, updated=0, deleted=0, unchanged=0)

                jobs = ((i, i + '\n') for i in package_ids)
                try:
                    job_ids, finished, result = pool.send(jobs)
                    while result is not None:
                        stats[result.strip()] += 1
                        job_ids, finished, result = next(pool)
                except KeyboardInterrupt:
                    break

                # single-argument print() behaves the same on Python 2 and 3
                print("%s %s" % (
                    next_date.isoformat(),
                    " ".join("%s:%s" % kv for kv in sorted(stats.items()))))
        except IOError as e:
            # let pipe errors cause silent exit --
            # the worker will have provided the real traceback
            if e.errno != 32:
                raise
Example #9
0
def changed_packages_activity_list_since(context, data_dict):
    '''Return the activity stream of all recently added or changed packages.

    :param since_time: starting date/time

    Limited to 31 records (configurable via the
    ckan.activity_list_hard_limit setting) but may be called repeatedly
    with the timestamp of the last record to collect all activities.

    :rtype: list of dictionaries
    :raises ValidationError: if ``since_time`` cannot be parsed by isodate
    '''

    since = get_or_bust(data_dict, 'since_time')
    try:
        since_time = isodate(since, None)
    except Invalid as e:
        # report the parse failure against the offending parameter
        raise ValidationError({'since_time': e.error})
    # NOTE(review): the visible body stops after validating since_time; the
    # query that would use it is not shown here.
Example #10
0
def changed_packages_activity_list_since(context, data_dict):
    '''Return the activity stream of all recently added or changed packages.

    :param since_time: starting date/time

    Limited to 31 records (configurable via the
    ckan.activity_list_hard_limit setting) but may be called repeatedly
    with the timestamp of the last record to collect all activities.

    :rtype: list of dictionaries
    :raises ValidationError: if ``since_time`` cannot be parsed by isodate
    '''

    since = get_or_bust(data_dict, 'since_time')
    try:
        since_time = isodate(since, None)
    except Invalid as e:
        # report the parse failure against the offending parameter
        raise ValidationError({'since_time': e.error})
    # NOTE(review): the visible body stops after validating since_time; the
    # query that would use it is not shown here.
Example #11
0
    def changed_datasets(self, since_date):
        """
        Produce a list of dataset ids and requested dates. Each package
        id will appear at most once, showing the activity date closest
        to since_date. Requested dates are preceded with a "#"

        since_date - ISO date string to start looking for changes from

        Ids are printed one per line; the "# <date>" lines are omitted
        when the ``brief`` option is set.
        """
        since_date = isodate(since_date, None)
        seen_ids = set()

        while True:
            ids, since_date = self._changed_package_ids_since(
                self.options.server, since_date, seen_ids)
            # (None, None) is the end-of-changes signal.  An empty list only
            # means every id in this batch was already seen, so keep going.
            if ids is None:
                return
            for i in ids:
                print(i)
            if not self.options.brief:
                print("# {0}".format(since_date.isoformat()))
Example #12
0
    def portal_update(self, source, activity_date=None):
        """
        collect batches of package ids modified at source since activity_date
        and apply the package updates to the local CKAN instance for all
        packages with published_date set to any time in the past.

        source - passed through to ``_changed_package_ids_since`` and to
                 the 'portal-update-worker' sub-command
        activity_date - ISO date string to start from; when empty the
                        start is seven days before now

        After each batch one line is printed: the next start date followed
        by the created/deleted/unchanged/updated counts for the batch.
        """
        if activity_date:
            # XXX local time :-(
            activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        seen_package_id_set = set()

        def changed_package_id_runs(start_date):
            # yields (package ids, next start date) until the source
            # reports no more changes
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    source, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        pool = worker_pool(
            [sys.argv[0], 'canada', 'portal-update-worker', source,
             '-c', self.options.config],
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        next(pool)  # advance generator so we may call send() below

        with _quiet_int_pipe():
            for package_ids, next_date in changed_package_id_runs(activity_date):
                stats = dict(created=0, updated=0, deleted=0, unchanged=0)

                job_ids, finished, result = pool.send(enumerate(package_ids))
                while result is not None:
                    stats[result.strip()] += 1
                    job_ids, finished, result = next(pool)

                # single-argument print() behaves the same on Python 2 and 3
                print("%s %s" % (
                    next_date.isoformat(),
                    " ".join("%s:%s" % kv for kv in sorted(stats.items()))))
Example #13
0
def taxonomy_create(context, data_dict):
    """
    Creates a new taxonomy. Terms are not created here, they must be
    created using taxonomy_term_create with the taxonomy id from this
    call.

    Values read from ``data_dict``:

    - ``title``: required
    - ``uri``: required
    - ``name``: optional; derived from the title with munge_name when empty
    - ``last_modified``: optional; parsed with isodate when given,
      otherwise today's date is used

    :returns: The newly created taxonomy
    :rtype: A dictionary.
    :raises logic.ValidationError: if the name is already in use
    """
    _check_access('taxonomy_create', context, data_dict)

    model = context['model']

    title = logic.get_or_bust(data_dict, 'title')
    uri = logic.get_or_bust(data_dict, 'uri')
    name = data_dict.get('name') or munge_name(title)
    last_modified = data_dict.get('last_modified', "")

    # Check the name has not been used
    same_name = model.Session.query(Taxonomy).filter(Taxonomy.name == name)
    if same_name.count() > 0:
        raise logic.ValidationError("Name is already in use")

    if last_modified != "":
        modified_on = isodate(last_modified, context)
    else:
        modified_on = datetime.date.today()

    taxonomy = Taxonomy(name=name,
                        title=title,
                        uri=uri,
                        last_modified=modified_on)
    model.Session.add(taxonomy)
    model.Session.commit()

    return taxonomy.as_dict()
Example #14
0
    def _changed_package_ids_since(self,
                                   registry,
                                   since_time,
                                   seen_id_set=None):
        """
        Query source ckan instance for packages changed since_time.
        returns (package ids, next since_time to query) or (None, None)
        when no more changes are found.

        registry - LocalCKAN or RemoteCKAN instance
        since_time - local datetime to start looking for changes
        seen_id_set - set of package ids already processed, this set is
                      modified by calling this function

        If all the package ids found were included in seen_id_set this
        function will return an empty list of package ids.  Note that
        this is different than when no more changes found and (None, None)
        is returned.
        """
        data = registry.action.changed_packages_activity_list_since(
            since_time=since_time.isoformat())

        if seen_id_set is None:
            seen_id_set = set()

        if not data:
            return None, None

        package_ids = []
        for result in data:
            package_id = result['data']['package']['id']
            if package_id in seen_id_set:
                continue
            seen_id_set.add(package_id)
            package_ids.append(package_id)

        if data:
            since_time = isodate(data[-1]['timestamp'], None)

        return package_ids, since_time
Example #15
0
    def _changed_package_ids_since(self, registry, since_time,
                                   seen_id_set=None):
        """
        Query source ckan instance for packages changed since_time.
        returns (package ids, next since_time to query) or (None, None)
        when no more changes are found.

        registry - LocalCKAN or RemoteCKAN instance
        since_time - local datetime to start looking for changes
        seen_id_set - set of package ids already processed, this set is
                      modified by calling this function

        If all the package ids found were included in seen_id_set this
        function will return an empty list of package ids.  Note that
        this is different than when no more changes found and (None, None)
        is returned.
        """
        data = registry.action.changed_packages_activity_list_since(
            since_time=since_time.isoformat())

        if seen_id_set is None:
            seen_id_set = set()

        if not data:
            return None, None

        package_ids = []
        for result in data:
            package_id = result['data']['package']['id']
            if package_id in seen_id_set:
                continue
            seen_id_set.add(package_id)
            package_ids.append(package_id)

        if data:
            since_time = isodate(data[-1]['timestamp'], None)

        return package_ids, since_time
Example #16
0
    def changed_datasets(self, since_date):
        """
        Produce a list of dataset ids and requested dates. Each package
        id will appear at most once, showing the activity date closest
        to since_date. Requested dates are preceded with a "#"

        since_date - ISO date string to start looking for changes from

        Changes are read from a RemoteCKAN for the ``server`` option when
        it is set, otherwise from a LocalCKAN.  Ids are printed one per
        line; the "# <date>" lines are omitted when the ``brief`` option
        is set.
        """
        since_date = isodate(since_date, None)
        seen_ids = set()

        if self.options.server:
            registry = RemoteCKAN(self.options.server)
        else:
            registry = LocalCKAN()

        while True:
            ids, since_date = self._changed_package_ids_since(
                registry, since_date, seen_ids)
            # (None, None) is the end-of-changes signal.  An empty list only
            # means every id in this batch was already seen, so keep going.
            if ids is None:
                return
            for i in ids:
                print(i)
            if not self.options.brief:
                print("# {0}".format(since_date.isoformat()))
Example #17
0
    def copy_datasets(self, remote, package_ids=None):
        """
        a process that accepts packages on stdin (one JSON document per
        line) which are compared to the local version of the same package.
        The local package is then created, updated, deleted or left
        unchanged.

        For each package a JSON list ``[package_id, action, reason]`` is
        written to stdout, where action is 'created', 'updated', 'deleted'
        or 'unchanged' and reason may be null.  If looking up the local
        package fails with CKANAPIError or URLError, a
        ``[package_id, 'target error', ...]`` line is written and the
        exception is re-raised.

        ``remote`` and ``package_ids`` are accepted but not used here.
        """
        portal = LocalCKAN()

        now = datetime.now()

        packages = iter(sys.stdin.readline, '')

        for package in packages:
            source_pkg = json.loads(package)
            package_id = source_pkg['id']
            reason = None
            target_deleted = False
            if source_pkg and source_pkg['state'] == 'deleted':
                source_pkg = None

            if source_pkg and source_pkg['type'] not in DATASET_TYPES:
                # non-default dataset types ignored
                source_pkg = None

            _trim_package(source_pkg)

            # Unless mirroring, packages that are not released yet are
            # handled the same as packages deleted at the source
            if source_pkg and not self.options.mirror:
                if source_pkg.get('ready_to_publish') == 'false':
                    source_pkg = None
                    reason = 'marked not ready to publish'
                elif not source_pkg.get('portal_release_date'):
                    source_pkg = None
                    reason = 'release date not set'
                elif isodate(source_pkg['portal_release_date'], None) > now:
                    source_pkg = None
                    reason = 'release date in future'
                else:
                    # portal packages published public
                    source_pkg['private'] = False

            try:
                target_pkg = portal.call_action('package_show', {
                    'id': package_id
                })
            except (NotFound, NotAuthorized):
                target_pkg = None
            except (CKANAPIError, urllib2.URLError) as e:
                sys.stdout.write(
                    json.dumps([
                        package_id,
                        'target error',
                        unicode(e.args)
                    ]) + '\n'
                )
                raise
            if target_pkg and target_pkg['state'] == 'deleted':
                target_pkg = None
                target_deleted = True

            _trim_package(target_pkg)

            if target_pkg is None and source_pkg is None:
                action = 'unchanged'
                reason = reason or 'deleted on registry'
            elif target_deleted:
                # the package exists locally in deleted state, so it is
                # updated rather than created
                action = 'updated'
                reason = 'undeleting on target'
                portal.action.package_update(**source_pkg)
            elif target_pkg is None:
                action = 'created'
                portal.action.package_create(**source_pkg)
            elif source_pkg is None:
                action = 'deleted'
                portal.action.package_delete(id=package_id)
            elif source_pkg == target_pkg:
                action = 'unchanged'
                reason = 'no difference found'
            else:
                action = 'updated'
                portal.action.package_update(**source_pkg)

            sys.stdout.write(json.dumps([package_id, action, reason]) + '\n')
            sys.stdout.flush()
Example #18
0
    def copy_datasets(self, remote, package_ids=None):
        """
        a process that accepts package ids on stdin which are passed to
        the package_show API on the remote CKAN instance and compared
        to the local version of the same package.  The local package is
        then created, updated, deleted or left unchanged.  This process
        outputs that action as a string 'created', 'updated', 'deleted'
        or 'unchanged'

        remote - address handed to RemoteCKAN
        package_ids - optional iterable of ids; when empty, ids are read
                      from stdin one per line

        With the ``push_apikey`` option (and without ``fetch``) packages
        are read from the local instance and the remote one is the target;
        with ``fetch`` the direction is reversed.  If neither is set a
        message is printed and nothing is done.

        If either package_show call fails with CKANAPIError or URLError, a
        ``[package_id, 'source error' | 'target error', ...]`` JSON line is
        written to stdout and the exception is re-raised.
        """
        if self.options.push_apikey and not self.options.fetch:
            registry = LocalCKAN()
            portal = RemoteCKAN(remote, apikey=self.options.push_apikey)
        elif self.options.fetch:
            registry = RemoteCKAN(remote)
            portal = LocalCKAN()
        else:
            print("exactly one of -f or -a options must be specified")
            return

        now = datetime.now()

        if not package_ids:
            package_ids = iter(sys.stdin.readline, '')

        for package_id in package_ids:
            package_id = package_id.strip()
            reason = None
            target_deleted = False
            try:
                source_pkg = registry.action.package_show(id=package_id)
            except NotAuthorized:
                source_pkg = None
            except (CKANAPIError, urllib2.URLError) as e:
                sys.stdout.write(json.dumps([package_id, 'source error',
                    unicode(e.args)]) + '\n')
                raise
            if source_pkg and source_pkg['state'] == 'deleted':
                source_pkg = None

            if source_pkg and source_pkg['type'] != 'dataset':
                # non-default dataset types ignored
                source_pkg = None

            _trim_package(source_pkg)

            if source_pkg and not self.options.mirror:
                # treat unpublished packages same as deleted packages
                # (.get: a package without the key counts as "not set")
                if not source_pkg.get('portal_release_date'):
                    source_pkg = None
                    reason = 'release date not set'
                elif isodate(source_pkg['portal_release_date'], None) > now:
                    source_pkg = None
                    reason = 'release date in future'

            try:
                target_pkg = portal.call_action('package_show',
                    {'id':package_id})
            except (NotFound, NotAuthorized):
                target_pkg = None
            except (CKANAPIError, urllib2.URLError) as e:
                sys.stdout.write(json.dumps([package_id, 'target error',
                    unicode(e.args)]) + '\n')
                raise
Example #19
0
    def copy_datasets(self, remote, package_ids=None):
        """
        a process that accepts packages on stdin (one JSON document per
        line) which are compared to the local version of the same package.
        The local package is then created, updated, deleted or left
        unchanged.

        For each package a JSON list ``[package_id, action, reason]`` is
        written to stdout, where action is 'created', 'updated', 'deleted'
        or 'unchanged' and reason may be null.  If looking up the local
        package fails with CKANAPIError or URLError, a
        ``[package_id, 'target error', ...]`` line is written and the
        exception is re-raised.

        ``remote`` and ``package_ids`` are accepted but not used here.
        """
        portal = LocalCKAN()

        now = datetime.now()

        packages = iter(sys.stdin.readline, '')

        for package in packages:
            source_pkg = json.loads(package)
            package_id = source_pkg['id']
            reason = None
            target_deleted = False
            if source_pkg and source_pkg['state'] == 'deleted':
                source_pkg = None

            if source_pkg and source_pkg['type'] not in DATASET_TYPES:
                # non-default dataset types ignored
                source_pkg = None

            _trim_package(source_pkg)

            # Unless mirroring, packages that are not released yet are
            # handled the same as packages deleted at the source
            if source_pkg and not self.options.mirror:
                if source_pkg.get('ready_to_publish') == 'false':
                    source_pkg = None
                    reason = 'marked not ready to publish'
                elif not source_pkg.get('portal_release_date'):
                    source_pkg = None
                    reason = 'release date not set'
                elif isodate(source_pkg['portal_release_date'], None) > now:
                    source_pkg = None
                    reason = 'release date in future'
                else:
                    # portal packages published public
                    source_pkg['private'] = False

            try:
                target_pkg = portal.call_action('package_show', {
                    'id': package_id
                })
            except (NotFound, NotAuthorized):
                target_pkg = None
            except (CKANAPIError, urllib2.URLError) as e:
                sys.stdout.write(
                    json.dumps([
                        package_id,
                        'target error',
                        unicode(e.args)
                    ]) + '\n'
                )
                raise
            if target_pkg and target_pkg['state'] == 'deleted':
                target_pkg = None
                target_deleted = True

            _trim_package(target_pkg)

            if target_pkg is None and source_pkg is None:
                action = 'unchanged'
                reason = reason or 'deleted on registry'
            elif target_deleted:
                # the package exists locally in deleted state, so it is
                # updated rather than created
                action = 'updated'
                reason = 'undeleting on target'
                portal.action.package_update(**source_pkg)
            elif target_pkg is None:
                action = 'created'
                portal.action.package_create(**source_pkg)
            elif source_pkg is None:
                action = 'deleted'
                portal.action.package_delete(id=package_id)
            elif source_pkg == target_pkg:
                action = 'unchanged'
                reason = 'no difference found'
            else:
                action = 'updated'
                portal.action.package_update(**source_pkg)

            sys.stdout.write(json.dumps([package_id, action, reason]) + '\n')
            sys.stdout.flush()
Example #20
0
    def _portal_update(self, portal_ini, activity_date):
        """
        Feed batches of changed packages from the local registry to a pool
        of 'copy-datasets' worker processes.

        portal_ini - config file passed to each worker with '-c'
        activity_date - where to start: a string matching PAST_RE (read as
                        days, hours and minutes before now), any other
                        string parsed with isodate, or empty for seven
                        days before now

        Each worker result is printed and, when the ``log`` option is set,
        appended to that file as a JSON line.  After every batch
        ``self._portal_update_activity_date`` is set to the next start
        date; ``self._portal_update_completed`` is set to True once all
        batches have been processed.
        """
        if activity_date:
            past = re.match(PAST_RE, activity_date)
            if past:
                days, hours, minutes = (
                    int(x) if x else 0 for x in past.groups()
                )
                activity_date = datetime.now() - timedelta(
                    days=days,
                    seconds=(hours * 60 + minutes) * 60
                )
            else:
                activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        # try/finally so the log file is closed however this method exits
        try:
            registry = LocalCKAN()

            def changed_package_id_runs(start_date):
                # yields (packages, next start date) until the registry
                # reports no more changes
                while True:
                    packages, next_date = self._changed_packages_since(
                        registry, start_date)
                    if next_date is None:
                        return
                    yield packages, next_date
                    start_date = next_date

            cmd = [
                sys.argv[0],
                'canada',
                'copy-datasets',
                '-c',
                portal_ini
            ]
            if self.options.mirror:
                cmd.append('-m')

            pool = worker_pool(
                cmd,
                self.options.processes,
                [],
                stop_when_jobs_done=False,
                stop_on_keyboard_interrupt=False,
                )

            # Advance generator so we may call send() below
            next(pool)

            def append_log(finished, package_id, action, reason):
                if not log:
                    return
                log.write(json.dumps([
                    datetime.now().isoformat(),
                    finished,
                    package_id,
                    action,
                    reason,
                    ]) + '\n')
                log.flush()

            with _quiet_int_pipe():
                append_log(
                    None,
                    None,
                    "started updating from:",
                    activity_date.isoformat()
                )

                for packages, next_date in (
                        changed_package_id_runs(activity_date)):
                    job_ids, finished, result = pool.send(enumerate(packages))
                    stats = completion_stats(self.options.processes)
                    while result is not None:
                        package_id, action, reason = json.loads(result)
                        # single-argument print() behaves the same on
                        # Python 2 and 3
                        print("%s %s %s %s %s %s" % (
                            job_ids, next(stats), finished, package_id,
                            action, reason))
                        append_log(finished, package_id, action, reason)
                        job_ids, finished, result = next(pool)

                    print(" --- next batch starting at: "
                          + next_date.isoformat())
                    append_log(
                        None,
                        None,
                        "next batch starting at:",
                        next_date.isoformat()
                    )
                    self._portal_update_activity_date = next_date.isoformat()
                self._portal_update_completed = True
        finally:
            if log:
                log.close()
Example #21
0
def _trim_package(pkg):
    """
    remove keys from pkg that we don't care about when comparing
    or updating/creating packages.  Also try to convert types and
    create missing fields that will be present in package_show.
    """
    # XXX full of custom hacks and deep knowledge of our schema :-(
    if not pkg:
        return
    for k in ['extras', 'metadata_modified', 'metadata_created',
            'revision_id', 'revision_timestamp', 'organization',
            'version', 'tracking_summary',
            'tags', # just because we don't use them
            'num_tags', 'num_resources', 'maintainer',
            'isopen', 'relationships_as_object', 'license_title',
            'license_title_fra', 'license_url_fra', 'license_url',
            'maintainer_email', 'author',
            'groups', # just because we don't use them
            'relationships_as_subject', 'department_number',
            # FIXME: remove these when we can:
            'resource_type',
            ]:
        if k in pkg:
            del pkg[k]
    for r in pkg['resources']:
        for k in ['resource_group_id', 'revision_id',
                'revision_timestamp', 'cache_last_updated',
                'webstore_last_updated', 'id', 'state', 'hash',
                'description', 'tracking_summary', 'mimetype_inner',
                'mimetype', 'cache_url', 'created', 'webstore_url',
                'last_modified', 'position', ]:
            if k in r:
                del r[k]
        for k in ['name', 'size']:
            if k not in r:
                r[k] = None
    for k in ['ready_to_publish', 'private']:
        pkg[k] = boolean_validator(unicode(pkg.get(k, '')), None)
    if 'name' not in pkg:
        pkg['name'] = pkg['id']
    if 'type' not in pkg:
        pkg['type'] = 'dataset'
    if 'state' not in pkg:
        pkg['state'] = 'active'
    for k in ['url']:
        if k not in pkg:
            pkg[k] = ''
    for name, lang, field in schema_description.dataset_field_iter():
        if field['type'] == 'date':
            try:
                pkg[name] = str(isodate(pkg[name], None)) if pkg.get(name) else ''
            except Invalid:
                pass # not for us to fail validation
        elif field['type'] == 'tag_vocabulary' and not isinstance(
                pkg.get(name), list):
            pkg[name] = convert_pilot_uuid_list(field)(pkg.get(name, []))
        elif field['type'] == 'url':
            if not pkg.get(name): # be consistent about what an empty url is
                pkg[name] = ""
        elif field['type'] == 'fixed' and name in pkg:
            del pkg[name]
Example #22
0
    def _portal_update(self, source, activity_date):
        if activity_date:
            past = re.match(PAST_RE, activity_date)
            if past:
                days, hours, minutes = (int(x) if x else 0 for x in past.groups())
                activity_date = datetime.now() - timedelta(days=days,
                    seconds=(hours * 60 + minutes) * 60)
            else:
                activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        seen_package_id_set = set()

        if self.options.push_apikey and not self.options.fetch:
            registry = LocalCKAN()
        elif self.options.fetch:
            registry = RemoteCKAN(source)
        else:
            print "exactly one of -f or -a options must be specified"
            return

        def changed_package_id_runs(start_date):
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    registry, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
             '-c', self.options.config]
        if self.options.push_apikey:
            cmd.extend(['-a', self.options.push_apikey])
        else:
            cmd.append('-f')
        if self.options.mirror:
            cmd.append('-m')
        pool = worker_pool(
            cmd,
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        pool.next() # advance generator so we may call send() below

        def append_log(finished, package_id, action, reason):
            if not log:
                return
            log.write(json.dumps([
                datetime.now().isoformat(),
                finished,
                package_id,
                action,
                reason,
                ]) + '\n')
            log.flush()

        with _quiet_int_pipe():
            append_log(None, None, "started updating from:",
                activity_date.isoformat())

            for package_ids, next_date in changed_package_id_runs(activity_date):
                job_ids, finished, result = pool.send(enumerate(package_ids))
                stats = completion_stats(self.options.processes)
                while result is not None:
                    package_id, action, reason = json.loads(result)
                    print job_ids, stats.next(), finished, package_id, \
                        action, reason
                    append_log(finished, package_id, action, reason)
                    job_ids, finished, result = pool.next()

                print " --- next batch starting at: " + next_date.isoformat()
                append_log(None, None, "next batch starting at:",
                    next_date.isoformat())
                self._portal_update_activity_date = next_date.isoformat()
            self._portal_update_completed = True
Example #23
0
    def copy_datasets(self, remote, package_ids=None):
        """
        a process that accepts package ids on stdin which are passed to
        the package_show API on the remote CKAN instance and compared
        to the local version of the same package.  The local package is
        then created, updated, deleted or left unchanged.  This process
        outputs that action as a string 'created', 'updated', 'deleted'
        or 'unchanged'
        """
        if self.options.push_apikey and not self.options.fetch:
            registry = LocalCKAN()
            portal = RemoteCKAN(remote, apikey=self.options.push_apikey)
        elif self.options.fetch:
            registry = RemoteCKAN(remote)
            portal = LocalCKAN()
        else:
            print "exactly one of -f or -a options must be specified"
            return

        now = datetime.now()

        if not package_ids:
            package_ids = iter(sys.stdin.readline, '')

        for package_id in package_ids:
            package_id = package_id.strip()
            reason = None
            target_deleted = False
            try:
                source_pkg = registry.action.package_show(id=package_id)
            except NotAuthorized:
                source_pkg = None
            except (CKANAPIError, urllib2.URLError), e:
                sys.stdout.write(json.dumps([package_id, 'source error',
                    unicode(e.args)]) + '\n')
                raise
            if source_pkg and source_pkg['state'] == 'deleted':
                source_pkg = None

            if source_pkg and source_pkg['type'] != 'dataset':
                # non-default dataset types ignored
                source_pkg = None

            _trim_package(source_pkg)

            if source_pkg and not self.options.mirror:
                # treat unpublished packages same as deleted packages
                if not source_pkg['portal_release_date']:
                    source_pkg = None
                    reason = 'release date not set'
                elif isodate(source_pkg['portal_release_date'], None) > now:
                    source_pkg = None
                    reason = 'release date in future'

            try:
                target_pkg = portal.call_action('package_show',
                    {'id':package_id})
            except (NotFound, NotAuthorized):
                target_pkg = None
            except (CKANAPIError, urllib2.URLError), e:
                sys.stdout.write(json.dumps([package_id, 'target error',
                    unicode(e.args)]) + '\n')
                raise