Example #1
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=VALUE args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = json.loads(open(
            arguments['--input'], 'rb').read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=VALUE']:
            key, p, value = kv.partition('=')
            action_args[key] = value
    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
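
A minimal usage sketch for the generator above, assuming the ckanapi package is installed; the site URL, dataset id and docopt-style flags are illustrative, not taken from the original:

import sys
from ckanapi import RemoteCKAN

ckan = RemoteCKAN('https://demo.ckan.org')
arguments = {
    'ACTION_NAME': 'package_show',
    'KEY=VALUE': ['id=example-dataset'],
    '--input-json': False,
    '--input': None,
    '--output-json': True,
    '--output-jsonl': False,
}
for chunk in action(ckan, arguments):
    sys.stdout.buffer.write(chunk)  # each chunk is bytes ending in b'\n'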
Example #2
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    file_args = {}
    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        with open(expanduser(arguments['--input'])) as in_f:
            contents = in_f.read()
            if sys.version_info.major == 2:
                contents = contents.decode('utf-8')
            action_args = json.loads(contents)
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            fkey, p, fvalue = kv.partition('@')
            # partition() on a missing separator returns the whole string as
            # the "key", so the separator that occurs earliest in kv yields
            # the shortest key
            if len(jkey) > len(skey) < len(fkey):
                # KEY=STRING
                action_args[skey] = svalue
            elif len(skey) > len(jkey) < len(fkey):
                # KEY:JSON
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                        "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            elif len(jkey) > len(fkey) < len(skey):
                # KEY@FILE: pass as a file upload
                try:
                    f = open(expanduser(fvalue), 'rb')
                except IOError as e:
                    raise CLIError("Error opening %r: %s" %
                        (expanduser(fvalue), e.args[1]))
                file_args[fkey] = f
            else:
                raise CLIError("argument not in the form KEY=STRING, "
                    "KEY:JSON or KEY@FILE %r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args,
        files=file_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
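
The three partition() calls plus the length comparisons are the whole KEY=STRING / KEY:JSON / KEY@FILE dispatch: partition() on a missing separator returns the entire string as the "key", so the separator that occurs earliest yields the shortest key. The trick in isolation, as a standalone illustration:

def classify(kv):
    """Return (form, key, value) for a KEY=STRING, KEY:JSON or KEY@FILE arg."""
    skey, _, svalue = kv.partition('=')
    jkey, _, jvalue = kv.partition(':')
    fkey, _, fvalue = kv.partition('@')
    if len(jkey) > len(skey) < len(fkey):
        return '=', skey, svalue
    if len(skey) > len(jkey) < len(fkey):
        return ':', jkey, jvalue
    if len(jkey) > len(fkey) < len(skey):
        return '@', fkey, fvalue
    raise ValueError('no usable separator in %r' % kv)

assert classify('title=My data') == ('=', 'title', 'My data')
assert classify('extras:{"a": 1}') == (':', 'extras', '{"a": 1}')
assert classify('upload@~/report.csv') == ('@', 'upload', '~/report.csv')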
Example #3
def reply(error, record=None):
    """
    format messages to be sent back to parent process
    """
    stdout.write(
        compact_json([datetime.now().isoformat(), error, record]) + b'\n')
    stdout.flush()
Example #4
def reply(action, error, response):
    """
    format messages to be sent back to parent process
    """
    stdout.write(
        compact_json([datetime.now().isoformat(), action, error, response])
        + b'\n')
    stdout.flush()
Example #5
def reply(error, response):
    """
    format messages to be sent back to parent process
    """
    stdout.write(compact_json([
        datetime.now().isoformat(),
        error,
        response]) + b'\n')
    stdout.flush()
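
All three reply() variants write one compact JSON array per line to the worker's stdout, so the parent side can recover the fields with a single json.loads per line. A sketch with an illustrative line (datetime.fromisoformat needs Python 3.7+):

import json
from datetime import datetime

line = b'["2024-01-01T00:00:00", null, {"id": "abc"}]\n'
timestamp, error, response = json.loads(line.decode('utf-8'))
when = datetime.fromisoformat(timestamp)  # ISO timestamps round-trip
assert error is None and response['id'] == 'abc'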
Example #6
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = json.loads(
            open(arguments['--input'], 'rb').read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            # the separator that occurs first in kv yields the shorter key
            if len(skey) < len(jkey):
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
Example #7
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = json.loads(open(
            arguments['--input'], 'rb').read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            # the separator that occurs first in kv yields the shorter key
            if len(skey) < len(jkey):
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                        "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
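
Because stdin is injectable, the generator can be exercised without touching the real process streams. A test sketch with a hypothetical stub standing in for a CKAN client (compact_json is assumed to come from the same module as action):

import io
import json

class StubCKAN(object):
    """Hypothetical stand-in that echoes back the call it received."""
    def call_action(self, name, data_dict):
        return {'action': name, 'args': data_dict}

arguments = {
    'ACTION_NAME': 'package_create',
    'KEY=STRING': [],
    '--input-json': True,
    '--input': None,
    '--output-json': True,
    '--output-jsonl': False,
}
stdin = io.BytesIO(json.dumps({'name': 'new-dataset'}).encode('utf-8'))
output = b''.join(action(StubCKAN(), arguments, stdin=stdin))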
Example #8
    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)
        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)

        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)
Example #9
    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)
        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)

        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)
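
The windowing rule used by name_reader (and by line_reader further below) in isolation: records are numbered from 1, and the kept window is [start_record, start_record + max_records). A standalone illustration:

def window(numbered_items, start_record, max_records=None):
    for num, item in numbered_items:
        if num < start_record:
            continue
        if max_records is not None and num >= start_record + max_records:
            break
        yield num, item

items = enumerate(['a', 'b', 'c', 'd', 'e'], 1)
assert list(window(items, 2, 2)) == [(2, 'b'), (3, 'c')]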
Example #10
def dump_metadata(ckan,
                  arguments,
                  pagination=DEFAULT_PAGINATION,
                  stdout=None,
                  stderr=None):
    '''
    Dump all the JSON metadata records.
    This is often better than using dump_things with thing=datasets,
    as sites like catalog.data.gov do not support the package_list API.

    The package_search API is used with pagination.
    '''
    if pagination < 1:
        raise ValueError("Pagination size must be greater than or equal to 1")
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    jsonl_output = stdout
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    with quiet_int_pipe() as errors:
        count = 0
        total_count = 0
        total_known = False
        while not total_known or total_count > count:
            response = ckan.call_action(
                "package_search",
                dict(rows=pagination, start=count, sort="id asc"))
            total_count = response["count"]
            total_known = True
            for record in response["results"]:
                jsonl_output.write(
                    compact_json(record, sort_keys=True) + b'\n')
                count += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
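
A hedged usage sketch: dump every dataset on a site to a gzipped JSON-lines file, 500 records per page. The URL comes from the docstring; the flag names mirror the docopt keys used above:

from ckanapi import RemoteCKAN

ckan = RemoteCKAN('https://catalog.data.gov')
arguments = {'--output': 'metadata.jsonl.gz', '--gzip': True}
exit_code = dump_metadata(ckan, arguments, pagination=500)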
Example #11
def delete_things(ckan,
                  thing,
                  arguments,
                  worker_pool=None,
                  stdin=None,
                  stdout=None,
                  stderr=None):
    """
    delete datasets, groups, orgs, users, etc.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)
        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)

        for num, name in enumerate(
                chain.from_iterable(
                    extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(
            cmd, processes,
            enumerate(
                (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']), 1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' %
                              (finished, job_ids, next(stats), error,
                               compact_json(response).decode('utf-8')
                               if response else '')).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        response,
                    ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
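
quiet_int_pipe is a ckanapi helper; a plausible reading of its contract (this sketch is an assumption, not the project's implementation) is a context manager that converts Ctrl-C and broken pipes raised in the with-body into entries in the yielded set, which callers then map to exit codes 2 and 1:

from contextlib import contextmanager

@contextmanager
def quiet_int_pipe_sketch():
    errors = set()
    try:
        yield errors
    except KeyboardInterrupt:
        errors.add('interrupt')
    except BrokenPipeError:
        errors.add('pipe')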
Example #12
def delete_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    delete datasets, groups, orgs, users, etc.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)
        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)

        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(cmd, processes, enumerate(
            (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']), 1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                    )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    response,
                    ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
Example #13
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--datapackages']:  # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                        sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
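
The results buffer plus the expecting_number counter turn out-of-order worker completions back into input order. The same pattern as a standalone generator:

def in_input_order(completions):
    """Yield records in input order, given (input_index, record) pairs
    that may arrive in any order."""
    results = {}
    expecting = 0
    for index, record in completions:
        results[index] = record
        while expecting in results:
            yield results.pop(expecting)
            expecting += 1

assert list(in_input_order([(1, 'b'), (0, 'a'), (2, 'c')])) == ['a', 'b', 'c']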
Example #14
def dump_things(ckan,
                thing,
                arguments,
                worker_pool=None,
                stdout=None,
                stderr=None):
    """
    dump all datasets, groups, orgs or users accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    # TODO: do we want to just divert this to devnull?
    if arguments['--datapackages']:
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            'users': 'user_list',
            'related': 'related_list',
        }[thing]
        params = dict(
            all_fields=False,  # for user_list
        )
        names = ckan.call_action(get_thing_list, params)

    else:
        names = arguments['ID_OR_NAME']

    if names and isinstance(names[0], dict):
        names = [rec.get('name', rec.get('id')) for rec in names]

    if arguments['--datapackages']:
        arguments['--datastore-fields'] = True
    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
                       enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                ).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        record.get('name', '') if record else None,
                    ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
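
The --worker branch hands control to dump_things_worker, whose loop presumably pairs with the reply() helpers shown in Examples #3-#5. A plausible sketch of such a worker loop (assumptions: one JSON name per input line, package_show per name, compact_json from this module):

import json
from datetime import datetime

def worker_loop(ckan, stdin, stdout):
    for line in iter(stdin.readline, b''):
        name = json.loads(line.decode('utf-8'))
        try:
            record = ckan.call_action('package_show', {'id': name})
            error = None
        except Exception as e:
            record, error = None, str(e)
        # one reply line per job, flushed so the parent can schedule the next
        stdout.write(compact_json(
            [datetime.now().isoformat(), error, record]) + b'\n')
        stdout.flush()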
Example #15
def load_things(ckan,
                thing,
                arguments,
                worker_pool=None,
                stdin=None,
                stdout=None,
                stderr=None):
    """
    create and update datasets, groups, orgs and users

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' %
                              (finished, job_ids, next(stats), action, error,
                               compact_json(response).decode('utf-8')
                               if response else '')).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        action,
                        error,
                        response,
                    ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
Example #16
def load_things(ckan, thing, arguments, worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups and orgs

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, "buffer", sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, "buffer", sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, "buffer", sys.stderr)

    if arguments["--worker"]:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments["--log"]:
        log = open(arguments["--log"], "a")

    jsonl_input = stdin
    if arguments["--input"]:
        jsonl_input = open(arguments["--input"], "rb")
    if arguments["--gzip"]:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments["--start-record"])
        max_records = arguments["--max-records"]
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments["--processes"])
    if hasattr(ckan, "parallel_limit"):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(result.decode("utf-8"))

            if not arguments["--quiet"]:
                stderr.write(
                    (
                        "%s %s %s %s %s %s\n"
                        % (
                            finished,
                            job_ids,
                            next(stats),
                            action,
                            error,
                            compact_json(response).decode("utf-8") if response else "",
                        )
                    ).encode("utf-8")
                )

            if log:
                log.write(compact_json([timestamp, finished, action, error, response]) + b"\n")
                log.flush()
    if "pipe" in errors:
        return 1
    if "interrupt" in errors:
        return 2
Example #17
def search_datasets(ckan, arguments, stdin=None, stdout=None, stderr=None):
    """
    call package_search with KEY=STRING, KEY:JSON or JSON args,
    paginate over the results yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    requests_kwargs = None
    if arguments['--insecure']:
        requests_kwargs = {'verify': False}
    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        with open(expanduser(arguments['--input'])) as in_f:
            contents = in_f.read()
            if sys.version_info.major == 2:
                contents = contents.decode('utf-8')
            action_args = json.loads(contents)
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            # the separator that occurs first in kv yields the shorter key
            if len(jkey) > len(skey):
                action_args[skey] = svalue
            elif len(skey) > len(jkey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            else:
                raise CLIError("argument not in the form KEY=STRING, "
                               "or KEY:JSON %r" % kv)

    jsonl_output = stdout
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    start = int(action_args.get('start', 0))
    while True:
        args = action_args
        if 'rows' not in action_args:
            # caller did not fix a page size: paginate in ROWS_PER_QUERY steps
            args = dict(action_args, start=start, rows=ROWS_PER_QUERY)
        result = ckan.call_action('package_search',
                                  args,
                                  requests_kwargs=requests_kwargs)
        rows = result['results']
        for r in rows:
            jsonl_output.write(compact_json(r, sort_keys=True) + b'\n')
        if not rows or 'rows' in action_args:
            # stop when a page comes back empty, or after one call when the
            # caller set 'rows' explicitly
            break

        start += len(rows)
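
A hedged usage sketch, assuming a ckanapi version whose call_action accepts requests_kwargs: stream every dataset matching a query to stdout as JSON lines, letting the loop above paginate because 'rows' is not fixed. The URL and query are illustrative:

from ckanapi import RemoteCKAN

arguments = {
    'KEY=STRING': ['q=climate'],
    '--input-json': False, '--input': None,
    '--output': None, '--gzip': False, '--insecure': False,
}
search_datasets(RemoteCKAN('https://demo.ckan.org'), arguments)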
Example #18
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups, orgs and users

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1): # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    action,
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                    )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                    ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
Example #19
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--dp-output']:  # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            if arguments['--dp-output']:
                # TODO: decide which resources to leave alone; formats are
                # very inconsistent across instances, and nobody wants to
                # download a copy of, say, the API base endpoint
                resource_formats_to_ignore = ['API', 'api']
                dataset_name = record.get('name', '') if record else ''

                try:
                    base_path = arguments['--dp-output']
                except KeyError:
                    base_path = './'

                target_dir = '{base_path}/{name}/data'.format(
                    base_path=base_path, name=dataset_name)

                try:
                    os.makedirs(target_dir)
                except Exception as e:
                    # e.g. the directory already exists
                    stderr.write(str(e).encode('utf-8') + b'\n')

                for resource in record.get('resources', []):
                    # prefer the resource name, falling back to its id
                    resource_id = resource.get('name') or resource['id']

                    resource_filename = os.path.split(resource['url'])[1]

                    output = os.path.join(target_dir, resource_filename)

                    # Resources can have a free-form address and no internal
                    # info; in that case save them under the resource id
                    # (if they even exist)
                    if output.endswith('/'):
                        output = os.path.join(output, resource_id)

                    resource['path'] = output  # datapackage.json format explicitly requires a path to the resource

                    try:
                        if resource['format'] not in resource_formats_to_ignore:
                            r = requests.get(resource['url'], stream=True)
                            with open(output, 'wb') as f:
                                for chunk in r.iter_content(chunk_size=1024):
                                    if chunk: # filter out keep-alive new chunks
                                        f.write(chunk)
                                        f.flush()
                    except requests.ConnectionError:
                        stderr.write(
                            'URL {url} refused connection. The resource '
                            'will not be downloaded\n'.format(
                                url=resource['url']).encode('utf-8'))
                    except requests.exceptions.RequestException as e:
                        stderr.write(str(e).encode('utf-8'))
                        stderr.write(b'\n')


                record['version'] = '1.0-beta.10'
                datapackage_path = '{base_path}/{name}/datapackage.json'.format(
                    base_path=base_path, name=dataset_name)
                # pretty_json returns bytes, so open the file in binary mode
                with open(datapackage_path, 'wb') as datapackagejson_output:
                    datapackagejson_output.write(pretty_json(record))

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                        sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2