Example #1
    def dump_datasets(self):
        """
        Output all public datasets as a .jl file
        """
        registry = LocalCKAN('visitor')
        package_names = registry.action.package_list()

        cmd = [sys.argv[0], 'canada', 'dump-datasets-worker',
            '-c', self.options.config]
        stats = completion_stats(self.options.processes)
        pool = worker_pool(cmd, self.options.processes,
            enumerate(package_names))

        sink = sys.stdout
        if self.options.gzip:
            sink = gzip.GzipFile(fileobj=sys.stdout, mode='wb')
        expecting_number = 0
        results = {}
        with _quiet_int_pipe():
            for job_ids, finished, result in pool:
                sys.stderr.write("%s %s %s\n" % (
                    job_ids, stats.next(), finished))
                results[finished] = result
                # keep the output in the same order as package_names
                while expecting_number in results:
                    sink.write(results.pop(expecting_number))
                    expecting_number += 1
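The expecting_number / results bookkeeping above is a buffer-and-flush pattern: worker results may arrive out of order, but output is written strictly in package_names order. A minimal standalone sketch of just that pattern (function and variable names here are illustrative, not from the project):

def emit_in_order(completed):
    # `completed` yields (job_number, result) pairs in any order;
    # results are buffered until their turn comes up, then flushed
    expecting = 0
    buffered = {}
    for number, result in completed:
        buffered[number] = result
        while expecting in buffered:
            yield expecting, buffered.pop(expecting)
            expecting += 1

print(list(emit_in_order([(1, 'b'), (2, 'c'), (0, 'a')])))
# -> [(0, 'a'), (1, 'b'), (2, 'c')]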
Example #2
    def dump_datasets(self):
        """
        Output all public datasets as a .jl file
        """
        registry = LocalCKAN('visitor')
        package_names = registry.action.package_list()

        cmd = [sys.argv[0], 'canada', 'dump-datasets-worker',
            '-c', self.options.config]
        stats = completion_stats(self.options.processes)
        pool = worker_pool(cmd, self.options.processes,
            enumerate(package_names))

        sink = sys.stdout
        if self.options.gzip:
            sink = gzip.GzipFile(fileobj=sys.stdout, mode='wb')
        expecting_number = 0
        results = {}
        with _quiet_int_pipe():
            for job_ids, finished, result in pool:
                sys.stderr.write("%s %s %s\n" % (
                    job_ids, stats.next(), finished))
                results[finished] = result
                # keep the output in the same order as package_names
                while expecting_number in results:
                    sink.write(results.pop(expecting_number))
                    expecting_number += 1
Example #3
    def portal_update(self, source, activity_date=None):
        """
        collect batches of package ids modified at source since activity_date
        and apply the package updates to the local CKAN instance for all
        packages with published_date set to any time in the past.
        """
        if activity_date:
            # XXX local time :-(
            activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        seen_package_id_set = set()

        def changed_package_id_runs(start_date):
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    source, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        pool = worker_pool(
            [sys.argv[0], 'canada', 'portal-update-worker', source,
             '-c', self.options.config],
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        pool.next() # advance generator so we may call send() below

        try:
            for package_ids, next_date in changed_package_id_runs(activity_date):
                stats = dict(created=0, updated=0, deleted=0, unchanged=0)

                jobs = ((i, i + '\n') for i in package_ids)
                try:
                    job_ids, finished, result = pool.send(jobs)
                    while result is not None:
                        stats[result.strip()] += 1
                        job_ids, finished, result = pool.next()
                except KeyboardInterrupt:
                    break

                print next_date.isoformat(),
                print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
        except IOError, e:
            # let pipe errors cause silent exit --
            # the worker will have provided the real traceback
            if e.errno != 32:
                raise
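Example #4 below is the same function with this manual except-clause replaced by the _quiet_int_pipe() context manager. Its definition is not shown on this page; the following is a plausible reconstruction inferred from the handlers above, an assumption rather than the project's verbatim code:

import errno
from contextlib import contextmanager

@contextmanager
def _quiet_int_pipe():
    # inferred behavior: exit silently on KeyboardInterrupt or a broken
    # pipe (errno 32); the worker will have provided the real traceback
    try:
        yield
    except KeyboardInterrupt:
        pass
    except IOError as e:
        if e.errno != errno.EPIPE:
            raise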
Example #4
    def portal_update(self, source, activity_date=None):
        """
        collect batches of package ids modified at source since activity_date
        and apply the package updates to the local CKAN instance for all
        packages with published_date set to any time in the past.
        """
        if activity_date:
            # XXX local time :-(
            activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        seen_package_id_set = set()

        def changed_package_id_runs(start_date):
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    source, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        pool = worker_pool(
            [sys.argv[0], 'canada', 'portal-update-worker', source,
             '-c', self.options.config],
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        pool.next() # advance generator so we may call send() below

        with _quiet_int_pipe():
            for package_ids, next_date in changed_package_id_runs(activity_date):
                stats = dict(created=0, updated=0, deleted=0, unchanged=0)

                job_ids, finished, result = pool.send(enumerate(package_ids))
                while result is not None:
                    stats[result.strip()] += 1
                    job_ids, finished, result = pool.next()

                print next_date.isoformat(),
                print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
Example #5
    def load_datasets(self, jl_source, start_line=1, max_count=None):
        start_line = int(start_line)
        if max_count is not None:
            max_count = int(max_count)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        def line_reader():
            for num, line in enumerate(open(jl_source), 1):
                if num < start_line:
                    continue
                if max_count is not None and num >= start_line + max_count:
                    break
                yield num, line.strip() + '\n'
        cmd = [sys.argv[0], 'canada', 'load-dataset-worker',
            '-c', self.options.config]
        if self.options.ckan_user:
            cmd += ['-u', self.options.ckan_user]
        if self.options.replace_datasets:
            cmd += ['-r']

        stats = completion_stats(self.options.processes)
        pool = worker_pool(cmd, self.options.processes, line_reader())
        try:
            for job_ids, finished, result in pool:
                timestamp, action, error, response = json.loads(result)
                print job_ids, stats.next(), finished, action,
                print json.dumps(response) if response else ''
                if log:
                    log.write(json.dumps([
                        timestamp,
                        finished,
                        action,
                        error,
                        response,
                        ]) + '\n')
                    log.flush()
        except IOError, e:
            # let pipe errors cause silent exit --
            # the worker will have provided the real traceback
            if e.errno != 32:
                raise
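line_reader() selects a half-open window of 1-indexed lines, [start_line, start_line + max_count), re-terminating each line so the workers receive one job per line. The same selection logic in isolation (names are illustrative):

def line_window(lines, start_line=1, max_count=None):
    for num, line in enumerate(lines, 1):
        if num < start_line:
            continue
        if max_count is not None and num >= start_line + max_count:
            break
        yield num, line

print(list(line_window(['a', 'b', 'c', 'd'], start_line=2, max_count=2)))
# -> [(2, 'b'), (3, 'c')]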
Example #6
    def load_datasets(self, jl_source, start_line=1, max_count=None):
        start_line = int(start_line)
        if max_count is not None:
            max_count = int(max_count)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        def line_reader():
            if self.options.gzip:
                source_file = gzip.GzipFile(jl_source)
            else:
                source_file = open(jl_source)
            for num, line in enumerate(source_file, 1):
                if num < start_line:
                    continue
                if max_count is not None and num >= start_line + max_count:
                    break
                yield num, line.strip()
        cmd = [sys.argv[0], 'canada', 'load-dataset-worker',
            '-c', self.options.config]
        if self.options.ckan_user:
            cmd += ['-u', self.options.ckan_user]
        if self.options.replace_datasets:
            cmd += ['-r']

        stats = completion_stats(self.options.processes)
        pool = worker_pool(cmd, self.options.processes, line_reader())
        with _quiet_int_pipe():
            for job_ids, finished, result in pool:
                timestamp, action, error, response = json.loads(result)
                print job_ids, stats.next(), finished, action,
                print json.dumps(response) if response else ''
                if log:
                    log.write(json.dumps([
                        timestamp,
                        finished,
                        action,
                        error,
                        response,
                        ]) + '\n')
                    log.flush()
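completion_stats(), consumed in Examples #1, #5 and #6 as one stats.next() per finished job, is another helper not defined on this page. Judging from how its value is printed as a progress column, it appears to be an infinite generator yielding a throughput figure; the stand-in below encodes that assumption and is purely illustrative:

import time
from itertools import count

def completion_stats(window=5):
    # assumed behavior: yield jobs-per-second over the last `window` completions
    times = []
    for _ in count():
        times.append(time.time())
        times = times[-window:]
        if len(times) < 2:
            yield '---/s'
        else:
            yield '%.1f/s' % ((len(times) - 1) / ((times[-1] - times[0]) or 1e-9))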
Example #7
    def load_datasets(self, jl_source, start_line=1, max_count=None):
        start_line = int(start_line)
        if max_count is not None:
            max_count = int(max_count)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        def line_reader():
            if self.options.gzip:
                source_file = gzip.GzipFile(jl_source)
            else:
                source_file = open(jl_source)
            for num, line in enumerate(source_file, 1):
                if num < start_line:
                    continue
                if max_count is not None and num >= start_line + max_count:
                    break
                yield num, line.strip()
        cmd = [sys.argv[0], 'canada', 'load-dataset-worker',
            '-c', self.options.config]
        if self.options.ckan_user:
            cmd += ['-u', self.options.ckan_user]
        if self.options.replace_datasets:
            cmd += ['-r']

        stats = completion_stats(self.options.processes)
        pool = worker_pool(cmd, self.options.processes, line_reader())
        with _quiet_int_pipe():
            for job_ids, finished, result in pool:
                timestamp, action, error, response = json.loads(result)
                print job_ids, stats.next(), finished, action,
                print json.dumps(response) if response else ''
                if log:
                    log.write(json.dumps([
                        timestamp,
                        finished,
                        action,
                        error,
                        response,
                        ]) + '\n')
                    log.flush()
Example #8
    def _portal_update(self, source, activity_date):
        if activity_date:
            past = re.match(PAST_RE, activity_date)
            if past:
                days, hours, minutes = (int(x) if x else 0 for x in past.groups())
                activity_date = datetime.now() - timedelta(days=days,
                    seconds=(hours * 60 + minutes) * 60)
            else:
                activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        seen_package_id_set = set()

        if self.options.push_apikey and not self.options.fetch:
            registry = LocalCKAN()
        elif self.options.fetch:
            registry = RemoteCKAN(source)
        else:
            print "exactly one of -f or -a options must be specified"
            return

        def changed_package_id_runs(start_date):
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    registry, start_date, seen_package_id_set)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
             '-c', self.options.config]
        if self.options.push_apikey:
            cmd.extend(['-a', self.options.push_apikey])
        else:
            cmd.append('-f')
        if self.options.mirror:
            cmd.append('-m')
        pool = worker_pool(
            cmd,
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        pool.next() # advance generator so we may call send() below

        def append_log(finished, package_id, action, reason):
            if not log:
                return
            log.write(json.dumps([
                datetime.now().isoformat(),
                finished,
                package_id,
                action,
                reason,
                ]) + '\n')
            log.flush()

        with _quiet_int_pipe():
            append_log(None, None, "started updating from:",
                activity_date.isoformat())

            for package_ids, next_date in changed_package_id_runs(activity_date):
                job_ids, finished, result = pool.send(enumerate(package_ids))
                stats = completion_stats(self.options.processes)
                while result is not None:
                    package_id, action, reason = json.loads(result)
                    print job_ids, stats.next(), finished, package_id, \
                        action, reason
                    append_log(finished, package_id, action, reason)
                    job_ids, finished, result = pool.next()

                print " --- next batch starting at: " + next_date.isoformat()
                append_log(None, None, "next batch starting at:",
                    next_date.isoformat())
                self._portal_update_activity_date = next_date.isoformat()
            self._portal_update_completed = True
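PAST_RE is defined elsewhere in the module. Since its three groups are unpacked as (days, hours, minutes), it presumably matches relative offsets such as '7d' or '3h30m'; the pattern below is a compatible guess, not the project's actual definition:

import re
from datetime import datetime, timedelta

PAST_RE = r'^(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)m)?$'  # assumed pattern

past = re.match(PAST_RE, '1d12h')
days, hours, minutes = (int(x) if x else 0 for x in past.groups())
activity_date = datetime.now() - timedelta(days=days,
    seconds=(hours * 60 + minutes) * 60)
print('%sd %sh %sm ago' % (days, hours, minutes))  # 1d 12h 0m ago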
Example #9
    def _portal_update(self, source, activity_date):
        if activity_date:
            past = re.match(PAST_RE, activity_date)
            if past:
                days, hours, minutes = (int(x) if x else 0 for x in past.groups())
                activity_date = datetime.now() - timedelta(days=days,
                    seconds=(hours * 60 + minutes) * 60)
            else:
                activity_date = isodate(activity_date, None)
        else:
            activity_date = datetime.now() - timedelta(days=7)

        log = None
        if self.options.log:
            log = open(self.options.log, 'a')

        if self.options.push_apikey and not self.options.fetch:
            registry = LocalCKAN()
        elif self.options.fetch:
            registry = RemoteCKAN(source)
        else:
            print "exactly one of -f or -a options must be specified"
            return

        def changed_package_id_runs(start_date):
            while True:
                package_ids, next_date = self._changed_package_ids_since(
                    registry, start_date)
                if next_date is None:
                    return
                yield package_ids, next_date
                start_date = next_date

        cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
             '-c', self.options.config]
        if self.options.push_apikey:
            cmd.extend(['-a', self.options.push_apikey])
        else:
            cmd.append('-f')
        if self.options.mirror:
            cmd.append('-m')
        pool = worker_pool(
            cmd,
            self.options.processes,
            [],
            stop_when_jobs_done=False,
            stop_on_keyboard_interrupt=False,
            )
        pool.next() # advance generator so we may call send() below

        def append_log(finished, package_id, action, reason):
            if not log:
                return
            log.write(json.dumps([
                datetime.now().isoformat(),
                finished,
                package_id,
                action,
                reason,
                ]) + '\n')
            log.flush()

        with _quiet_int_pipe():
            append_log(None, None, "started updating from:",
                activity_date.isoformat())

            for package_ids, next_date in changed_package_id_runs(activity_date):
                job_ids, finished, result = pool.send(enumerate(package_ids))
                stats = completion_stats(self.options.processes)
                while result is not None:
                    package_id, action, reason = json.loads(result)
                    print job_ids, stats.next(), finished, package_id, \
                        action, reason
                    append_log(finished, package_id, action, reason)
                    job_ids, finished, result = pool.next()

                print " --- next batch starting at: " + next_date.isoformat()
                append_log(None, None, "next batch starting at:",
                    next_date.isoformat())
                self._portal_update_activity_date = next_date.isoformat()
            self._portal_update_completed = True
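LocalCKAN and RemoteCKAN, used throughout these examples, are the client classes from the ckanapi package; both expose CKAN API actions through their .action attribute, which is what makes calls like registry.action.package_list() in Example #1 work. A minimal usage sketch (the site URL is a placeholder):

from ckanapi import RemoteCKAN

registry = RemoteCKAN('http://open.example.org')  # hypothetical CKAN site
package_names = registry.action.package_list()
print('%d public datasets' % len(package_names))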