def action(ckan, arguments, stdin=None):
    """
    Call an API action with KEY=VALUE args and yield the result as
    JSON-encoded bytes (one or more lines depending on output options).

    :param ckan: LocalCKAN/RemoteCKAN-style object providing call_action()
    :param arguments: docopt-style options dict
    :param stdin: byte stream used for --input-json (defaults to sys.stdin)
    """
    if stdin is None:
        # prefer the byte-oriented buffer on py3; py2 stdin is already bytes
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # FIX: open in binary mode and close the handle. The original did
        # open(path).read().decode('utf-8'), which leaks the file object and
        # raises AttributeError on py3 (text-mode read() returns str,
        # which has no .decode()).
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=VALUE']:
            key, p, value = kv.partition('=')
            action_args[key] = value

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            # one JSON line per list element
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        # default: human-readable indented JSON
        yield pretty_json(result) + b'\n'
def action(ckan, arguments, stdin=None):
    """
    Call an action with KEY=STRING, KEY:JSON, KEY@FILE or JSON args and
    yield the result as JSON-encoded bytes.

    :param ckan: LocalCKAN/RemoteCKAN-style object providing call_action()
    :param arguments: docopt-style options dict
    :param stdin: byte stream used for --input-json (defaults to sys.stdin)
    """
    if stdin is None:
        # prefer the byte-oriented buffer on py3; py2 stdin is already bytes
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    # KEY@FILE uploads are collected here and passed as files= to call_action
    file_args = {}
    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = {}
        with open(expanduser(arguments['--input'])) as in_f:
            # py2 text-mode read() returns bytes and must be decoded;
            # py3 text-mode read() already returns str
            action_args = json.loads(
                in_f.read().decode('utf-8')
                if sys.version_info.major == 2
                else in_f.read())
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            # partition on each candidate separator; the separator that
            # appears earliest in the string yields the shortest key, so
            # the len() comparisons below pick which form this argument is
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            fkey, p, fvalue = kv.partition('@')
            if len(jkey) > len(skey) < len(fkey):
                # '=' comes first: plain string value
                action_args[skey] = svalue
            elif len(skey) > len(jkey) < len(fkey):
                # ':' comes first: JSON-encoded value
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            elif len(jkey) > len(fkey) < len(skey):
                # '@' comes first: open the named file for upload
                # NOTE(review): these handles are never explicitly closed;
                # presumably released when the process exits — confirm
                try:
                    f = open(expanduser(fvalue), 'rb')
                except IOError as e:
                    raise CLIError("Error opening %r: %s" %
                                   (expanduser(fvalue), e.args[1]))
                file_args[fkey] = f
            else:
                # none of the separators found in the argument
                raise CLIError("argument not in the form KEY=STRING, "
                               "KEY:JSON or KEY@FILE %r" % kv)
    result = ckan.call_action(arguments['ACTION_NAME'], action_args,
                              files=file_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            # one JSON line per list element
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        # default: human-readable indented JSON
        yield pretty_json(result) + b'\n'
def reply(error, record=None):
    """
    Send a status message back to the parent process as one JSON line.
    """
    payload = [datetime.now().isoformat(), error, record]
    line = compact_json(payload) + b'\n'
    stdout.write(line)
    stdout.flush()
def reply(action, error, response):
    """
    Report a completed task to the parent process as a single JSON line.
    """
    message = [datetime.now().isoformat(), action, error, response]
    stdout.write(compact_json(message) + b'\n')
    stdout.flush()
def reply(error, response):
    """
    Send one JSON line describing a finished task to the parent process.
    """
    fields = [datetime.now().isoformat(), error, response]
    encoded = compact_json(fields) + b'\n'
    stdout.write(encoded)
    stdout.flush()
def action(ckan, arguments, stdin=None):
    """
    Call an action with KEY=STRING, KEY:JSON or JSON args and yield the
    result as JSON-encoded bytes.

    :param ckan: LocalCKAN/RemoteCKAN-style object providing call_action()
    :param arguments: docopt-style options dict
    :param stdin: byte stream used for --input-json (defaults to sys.stdin)
    """
    if stdin is None:
        # prefer the byte-oriented buffer on py3; py2 stdin is already bytes
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # FIX: binary mode + context manager. The original did
        # open(path).read().decode('utf-8'), leaking the handle and raising
        # AttributeError on py3 (text-mode read() returns str, no .decode()).
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            # the separator appearing first gives the shorter key,
            # so the comparisons pick the argument form
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(skey) < len(jkey):
                # '=' comes first: plain string value
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                # ':' comes first: JSON-encoded value
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            # one JSON line per list element
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        # default: human-readable indented JSON
        yield pretty_json(result) + b'\n'
def action(ckan, arguments, stdin=None):
    """
    Call an action with KEY=STRING, KEY:JSON or JSON args, yielding the
    JSON-encoded result as bytes.

    :param ckan: LocalCKAN/RemoteCKAN-style object providing call_action()
    :param arguments: docopt-style options dict
    :param stdin: byte stream used for --input-json (defaults to sys.stdin)
    """
    if stdin is None:
        # use the byte-oriented buffer when available (py3)
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # FIX: read the file in binary mode inside a with-block; the
        # original called .decode('utf-8') on the str returned by a
        # text-mode read() (AttributeError on py3) and never closed the file
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            # whichever separator occurs first produces the shorter key
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(skey) < len(jkey):
                # '=' first: literal string value
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                # ':' first: value is JSON
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        # default output: indented, human-readable JSON
        yield pretty_json(result) + b'\n'
def name_reader():
    """
    Yield (record_number, json_bytes) for every id/name extracted from
    the input lines (e.g. package_search, package_show or package_list
    output), honouring --start-record and --max-records.

    Record numbers count extracted names/ids, not input lines.
    """
    first = int(arguments['--start-record'])
    limit = arguments['--max-records']
    # precompute the exclusive upper bound once instead of re-adding
    # start_record + max_records on every iteration
    last = None if limit is None else first + int(limit)
    all_names = chain.from_iterable(
        extract_ids_or_names(line) for line in jsonl_input)
    for num, name in enumerate(all_names, 1):
        if num < first:
            continue
        if last is not None and num >= last:
            break
        yield num, compact_json(name)
def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION,
                  stdout=None, stderr=None):
    '''
    Dump all the JSON metadata records using package_search pagination.

    This is often better than using dump_things with thing=datasets, as
    sites like catalog.data.gov do not support the package_list api.

    :param pagination: number of rows requested per package_search call
    :returns: None on success, 1 on broken pipe, 2 on interrupt
    :raises ValueError: if pagination < 1
    '''
    if pagination < 1:
        raise ValueError("Pagination size must be greater or equal to 1")
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    jsonl_output = stdout
    # FIX: track streams we open so they are always closed. The original
    # leaked them; an unclosed GzipFile never writes its trailer, leaving
    # a truncated/corrupt .gz file, and buffered data could be lost.
    opened = []
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
        opened.append(jsonl_output)
    if arguments['--gzip']:
        # explicit mode: GzipFile otherwise infers it from the underlying
        # fileobj, which fails for file-like objects without a .mode
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output, mode='wb')
        opened.append(jsonl_output)

    try:
        with quiet_int_pipe() as errors:
            count = 0
            total_count = 0
            total_known = False
            # first call establishes the total; keep paging until done
            while not total_known or total_count > count:
                response = ckan.call_action(
                    "package_search",
                    dict(rows=pagination, start=count, sort="id asc"))
                total_count = response["count"]
                total_known = True
                for record in response["results"]:
                    # sort keys so output can be diffed between dumps
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                    count += 1
        if 'pipe' in errors:
            return 1
        if 'interrupt' in errors:
            return 2
    finally:
        # close the gzip wrapper before the underlying file
        for f in reversed(opened):
            f.close()
def delete_things(ckan, thing, arguments,
                  worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Delete datasets, groups, orgs, users etc.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: name of the object type being deleted
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on worker failure/broken pipe,
        2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        # sys.__stdout__: the real stdout even if sys.stdout was rebound
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process tasks handed down by the parent
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        # NOTE(review): log and input handles are never explicitly closed;
        # presumably released on process exit — confirm
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        Handle start-record and max-records options and extract all ids
        or names from each line (e.g. package_search, package_show or
        package_list output). Record numbers here correspond to
        names/ids extracted, not lines.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, name in enumerate(
                chain.from_iterable(
                    extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        # no explicit ids on the command line: read them from input
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(
            cmd, processes,
            enumerate(
                (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']),
                1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), error,
                    compact_json(response).decode('utf-8')
                    if response else '')).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        response,
                    ]) + b'\n')
                log.flush()
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def delete_things(ckan, thing, arguments,
                  worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Delete datasets, groups, orgs, users etc.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: name of the object type being deleted
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on worker failure/broken pipe,
        2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process tasks handed down by the parent
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        # NOTE(review): log and input handles are never explicitly closed;
        # presumably released on process exit — confirm
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        Handle start-record and max-records options and extract all ids
        or names from each line (e.g. package_search, package_show or
        package_list output). Record numbers here correspond to
        names/ids extracted, not lines.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        # no explicit ids on the command line: read them from input
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(cmd, processes, enumerate(
            (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']), 1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), error,
                    compact_json(response).decode('utf-8')
                    if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
                worker_pool=None, stdout=None, stderr=None):
    """
    Dump all datasets, groups or orgs accessible by the connected user.

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: 'datasets', 'groups' or 'organizations'
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on broken pipe, 2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process ids handed down by the parent
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--datapackages']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        # map object type to its list action
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
                       enumerate(compact_json(n) + b'\n' for n in names))

    # results buffered by job number so output order matches input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            # NOTE(review): no guard for an empty result here (a crashed
            # worker), unlike the sibling variant — json.loads would fail;
            # confirm whether that case can occur
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                                                    sort_keys=True) + b'\n')
                expecting_number += 1
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
                worker_pool=None, stdout=None, stderr=None):
    """
    Dump all datasets, groups, orgs or users accessible by the
    connected user.

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: 'datasets', 'groups', 'organizations', 'users'
        or 'related'
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on worker failure/broken pipe,
        2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        # sys.__stdout__: the real stdout even if sys.stdout was rebound
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process ids handed down by the parent
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments[
            '--datapackages']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        # map object type to its list action
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            'users': 'user_list',
            'related': 'related_list',
            }[thing]
        params = dict(
            all_fields=False,  # for user_list
            )
        names = ckan.call_action(get_thing_list, params)
    else:
        names = arguments['ID_OR_NAME']

    # some list actions return dicts; reduce to the name (or id)
    if names and isinstance(names[0], dict):
        names = [rec.get('name', rec.get('id')) for rec in names]

    if arguments['--datapackages']:
        # datapackage output needs datastore field information
        arguments['--datastore-fields'] = True
    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
                       enumerate(compact_json(n) + b'\n' for n in names))

    # results buffered by job number so output order matches input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        record.get('name', '') if record else None,
                    ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
                worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Create and update datasets, groups, orgs and users.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: name of the object type being loaded
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on worker failure/broken pipe,
        2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process json lines handed down by the parent
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        Yield (record_number, line), honouring the start-record and
        max-records options.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), action, error,
                    compact_json(response).decode('utf-8')
                    if response else '')).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        action,
                        error,
                        response,
                    ]) + b'\n')
                log.flush()
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
                worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Create and update datasets, groups and orgs.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: name of the object type being loaded
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on broken pipe, 2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, "buffer", sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, "buffer", sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, "buffer", sys.stderr)
    if arguments["--worker"]:
        # child mode: process json lines handed down by the parent
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments["--log"]:
        log = open(arguments["--log"], "a")

    jsonl_input = stdin
    if arguments["--input"]:
        jsonl_input = open(arguments["--input"], "rb")
    if arguments["--gzip"]:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        Yield (record_number, line), honouring the start-record and
        max-records options.
        """
        start_record = int(arguments["--start-record"])
        max_records = arguments["--max-records"]
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments["--processes"])
    if hasattr(ckan, "parallel_limit"):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            # NOTE(review): no guard for an empty result (crashed worker)
            # here, unlike the sibling variant — json.loads would raise;
            # confirm whether that case can occur
            timestamp, action, error, response = json.loads(result.decode("utf-8"))

            if not arguments["--quiet"]:
                stderr.write(
                    (
                        "%s %s %s %s %s %s\n"
                        % (
                            finished,
                            job_ids,
                            next(stats),
                            action,
                            error,
                            compact_json(response).decode("utf-8") if response else "",
                        )
                    ).encode("utf-8")
                )

            if log:
                log.write(compact_json([timestamp, finished, action, error, response]) + b"\n")
                log.flush()
    # errors is populated by quiet_int_pipe when the block exits
    if "pipe" in errors:
        return 1
    if "interrupt" in errors:
        return 2
def search_datasets(ckan, arguments,
                    stdin=None, stdout=None, stderr=None):
    """
    Call package_search with KEY=STRING, KEY:JSON or JSON args,
    paginate over the results and write each dataset as one JSON line.

    :param ckan: LocalCKAN/RemoteCKAN-style object providing call_action()
    :param arguments: docopt-style options dict
    :param stdin: byte stream used for --input-json (defaults to sys.stdin)
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        # sys.__stdout__: the real stdout even if sys.stdout was rebound
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    requests_kwargs = None
    if arguments['--insecure']:
        # deliberate opt-in: --insecure disables TLS verification
        requests_kwargs = {'verify': False}

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = {}
        with open(expanduser(arguments['--input'])) as in_f:
            # py2 text read() returns bytes; py3 returns str
            action_args = json.loads(
                in_f.read().decode('utf-8')
                if sys.version_info.major == 2 else in_f.read())
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            # the separator appearing first gives the shorter key
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(jkey) > len(skey):
                # '=' comes first: plain string value
                action_args[skey] = svalue
            elif len(skey) > len(jkey):
                # ':' comes first: JSON-encoded value
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            else:
                raise CLIError("argument not in the form KEY=STRING, "
                               "or KEY:JSON %r" % kv)

    jsonl_output = stdout
    # FIX: track streams we open so they are always closed. The original
    # leaked them; an unclosed GzipFile never writes its trailer, leaving
    # a truncated/corrupt .gz file.
    opened = []
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
        opened.append(jsonl_output)
    if arguments['--gzip']:
        # explicit mode: GzipFile otherwise infers it from the underlying
        # fileobj, which fails for file-like objects without a .mode
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output, mode='wb')
        opened.append(jsonl_output)

    try:
        start = int(action_args.get('start', 0))
        while True:
            args = action_args
            if 'rows' not in action_args:
                # caller did not limit rows: paginate ourselves
                args = dict(action_args, start=start, rows=ROWS_PER_QUERY)
            result = ckan.call_action('package_search', args,
                                      requests_kwargs=requests_kwargs)

            rows = result['results']
            for r in rows:
                # sort keys so output can be diffed between runs
                jsonl_output.write(compact_json(r, sort_keys=True) + b'\n')
            if not rows or 'rows' in action_args:
                # no more results, or caller controlled paging explicitly
                break
            start += len(rows)
    finally:
        # close the gzip wrapper before the underlying file
        for f in reversed(opened):
            f.close()
def load_things(ckan, thing, arguments,
                worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Create and update datasets, groups, orgs and users.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: name of the object type being loaded
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on worker failure/broken pipe,
        2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        # sys.__stdout__: the real stdout even if sys.stdout was rebound
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process json lines handed down by the parent
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        Yield (record_number, line), honouring the start-record and
        max-records options.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), action, error,
                    compact_json(response).decode('utf-8')
                    if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
                worker_pool=None, stdout=None, stderr=None):
    """
    Dump all datasets, groups or orgs accessible by the connected user,
    optionally downloading resources into a datapackage layout
    (--dp-output).

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.

    :param ckan: LocalCKAN/RemoteCKAN-style object
    :param thing: 'datasets', 'groups' or 'organizations'
    :param arguments: docopt-style options dict
    :returns: None on success, 1 on broken pipe, 2 on interrupt
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)
    if arguments['--worker']:
        # child mode: process ids handed down by the parent
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--dp-output']:
        # TODO: do we want to just divert this to devnull?
        # NOTE(review): devnull is opened in text mode ('w') but receives
        # bytes from compact_json below — TypeError on py3; confirm
        jsonl_output = open(os.devnull, 'w')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        # map object type to its list action
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
                       enumerate(compact_json(n) + b'\n' for n in names))

    # results buffered by job number so output order matches input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            if arguments['--dp-output']:
                # TODO: how are we going to handle which resources to leave
                # alone? They're very inconsistent in some instances
                # And I can't imagine anyone wants to download a copy of,
                # for example, the API base endpoint
                resource_formats_to_ignore = ['API', 'api']

                dataset_name = record.get('name', '') if record else ''

                try:
                    base_path = arguments['--dp-output']
                except KeyError:
                    base_path = './'

                target_dir = '{base_path}/{name}/data'.format(
                    base_path=base_path, name=dataset_name)

                try:
                    os.makedirs(target_dir)
                except Exception as e:
                    # NOTE(review): e.message is py2-only (AttributeError
                    # on py3), and stderr expects bytes — confirm
                    stderr.write(e.message)

                for resource in record.get('resources', ''):
                    # prefer the human-readable name for the local id
                    if resource['name'] is not None:
                        resource_id = resource['name']
                    else:
                        resource_id = resource['id']

                    resource_filename = os.path.split(resource['url'])[1]

                    output = os.path.join(target_dir, resource_filename)

                    # Resources can have a free-form address and no internal
                    # info, so in those cases we're going to merely save
                    # them using the UID. (If they even exist)
                    if output.endswith('/'):
                        output = os.path.join(output, resource_id)

                    # datapackage.json format explicitly requires a path
                    # to the resource
                    resource['path'] = output

                    try:
                        if resource['format'] not in resource_formats_to_ignore:
                            # stream the download in 1KB chunks
                            r = requests.get(resource['url'], stream=True)
                            with open(output, 'wb') as f:
                                for chunk in r.iter_content(chunk_size=1024):
                                    if chunk:  # filter out keep-alive new chunks
                                        f.write(chunk)
                                        f.flush()
                    except requests.ConnectionError:
                        stderr.write('URL {url} refused connection. The resource will not be downloaded\n'.format(url=resource['url']))
                    except requests.exceptions.RequestException as e:
                        # NOTE(review): e.message is py2-only — confirm
                        stderr.write(e.message)
                        stderr.write('\n')

                # NOTE(review): this handle is never closed, and the path
                # format has no '/' between base_path and dataset_name —
                # presumably base_path carries a trailing slash; confirm
                datapackagejson_output = open('{base_path}{dataset_name}/datapackage.json'.format(base_path=base_path, dataset_name=dataset_name), 'w',)
                record['version'] = '1.0-beta.10'
                datapackagejson_output.write(pretty_json(record))

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                                                    sort_keys=True) + b'\n')
                expecting_number += 1
    # errors is populated by quiet_int_pipe when the block exits
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2