def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups, orgs and users

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    action,
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups, orgs and users

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    action,
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
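# Why load_things() (and the other commands below) wraps the standard streams
# in getattr(stream, 'buffer', stream): the parent and workers exchange raw
# JSON bytes, so a binary stream is needed on both Python 2 and 3.  A
# standalone sketch of the same idiom; the helper name is illustrative only.
import sys

def binary_stdout():
    # Python 3: sys.stdout is a text wrapper and .buffer is the underlying
    # byte stream.  Python 2: sys.stdout is already a byte stream and has no
    # .buffer attribute, so getattr() falls back to the stream itself.
    return getattr(sys.stdout, 'buffer', sys.stdout)

if __name__ == '__main__':
    binary_stdout().write(b'{"name": "example"}\n')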
def _portal_update(self, portal_ini, activity_date):
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            days, hours, minutes = (
                int(x) if x else 0 for x in past.groups()
            )
            activity_date = datetime.now() - timedelta(
                days=days,
                seconds=(hours * 60 + minutes) * 60
            )
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    registry = LocalCKAN()

    def changed_package_id_runs(start_date):
        while True:
            packages, next_date = self._changed_packages_since(
                registry, start_date)
            if next_date is None:
                return
            yield packages, next_date
            start_date = next_date

    cmd = [
        sys.argv[0],
        'canada',
        'copy-datasets',
        '-c',
        portal_ini
    ]
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
    )

    # Advance generator so we may call send() below
    pool.next()

    def append_log(finished, package_id, action, reason):
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
        ]) + '\n')
        log.flush()

    with _quiet_int_pipe():
        append_log(
            None, None, "started updating from:", activity_date.isoformat()
        )

        for packages, next_date in (
                changed_package_id_runs(activity_date)):
            job_ids, finished, result = pool.send(enumerate(packages))
            stats = completion_stats(self.options.processes)
            while result is not None:
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(
                None, None, "next batch starting at:", next_date.isoformat()
            )
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True
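# _portal_update() accepts a relative activity date matched by PAST_RE and
# converts it into a datetime offset.  PAST_RE itself is not shown in this
# file, so the pattern below is only a plausible stand-in that yields the same
# (days, hours, minutes) groups; the timedelta arithmetic mirrors the method
# above and is a sketch, not the extension's actual parser.
import re
from datetime import datetime, timedelta

EXAMPLE_PAST_RE = r'^(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)m)?$'  # e.g. "7d", "2d12h", "90m"

def example_parse_past(activity_date):
    past = re.match(EXAMPLE_PAST_RE, activity_date)
    if not past:
        return None
    days, hours, minutes = (int(x) if x else 0 for x in past.groups())
    return datetime.now() - timedelta(
        days=days, seconds=(hours * 60 + minutes) * 60)

# example_parse_past('2d12h') -> a datetime roughly two and a half days ago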
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--datapackages']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
        }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def delete_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    delete datasets, groups, orgs, users etc,

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)

        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(cmd, processes, enumerate(
            (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']), 1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups, orgs or users accessible by the
    connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--datapackages']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            'users': 'user_list',
            'related': 'related_list',
        }[thing]
        params = dict(
            all_fields=False,  # for user_list
        )
        names = ckan.call_action(get_thing_list, params)
    else:
        names = arguments['ID_OR_NAME']

    if names and isinstance(names[0], dict):
        names = [rec.get('name', rec.get('id')) for rec in names]

    if arguments['--datapackages']:
        arguments['--datastore-fields'] = True

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
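# dump_things() above writes records to jsonl_output in the same order as the
# input names even though workers finish out of order.  The trick is a small
# reorder buffer keyed by job number; here it is isolated as a generic
# generator, a sketch with plain tuples standing in for worker results.
def reorder(completed_jobs):
    """Yield (number, record) pairs in ascending number order.

    ``completed_jobs`` may yield pairs in any order, as a worker pool would.
    """
    results = {}
    expecting_number = 0
    for number, record in completed_jobs:
        results[number] = record
        # flush every record we were waiting for, in order
        while expecting_number in results:
            yield expecting_number, results.pop(expecting_number)
            expecting_number += 1

# list(reorder([(1, 'b'), (0, 'a'), (2, 'c')])) == [(0, 'a'), (1, 'b'), (2, 'c')]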
def delete_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    delete datasets, groups, orgs, users etc,

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        handle start-record and max-records options and extract all
        ids or names from each line (e.g. package_search, package_show
        or package_list output)

        record numbers here correspond to names/ids extracted not lines
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, name in enumerate(chain.from_iterable(
                extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if not arguments['ID_OR_NAME']:
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(cmd, processes, enumerate(
            (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']), 1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    response,
                ]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
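# line_reader() and name_reader() above both apply the same 1-based
# --start-record / --max-records window to a stream of items.  The windowing
# logic in isolation, as a generic generator (the helper name is illustrative,
# not part of the CLI):
def record_window(items, start_record=1, max_records=None):
    for num, item in enumerate(items, 1):  # records are numbered from 1
        if num < start_record:
            continue
        if max_records is not None and num >= start_record + max_records:
            break
        yield num, item

# list(record_window('abcdef', start_record=2, max_records=3))
#   == [(2, 'b'), (3, 'c'), (4, 'd')]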
def _portal_update(self, source, activity_date):
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            days, hours, minutes = (
                int(x) if x else 0 for x in past.groups()
            )
            activity_date = datetime.now() - timedelta(
                days=days,
                seconds=(hours * 60 + minutes) * 60
            )
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    if self.options.push_apikey and not self.options.fetch:
        registry = LocalCKAN()
    elif self.options.fetch:
        registry = RemoteCKAN(source)
    else:
        print "exactly one of -f or -a options must be specified"
        return

    def changed_package_id_runs(start_date):
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                registry, start_date)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    cmd = [
        sys.argv[0],
        'canada',
        'copy-datasets',
        source,
        '-c',
        self.options.config
    ]
    if self.options.push_apikey:
        cmd.extend(['-a', self.options.push_apikey])
    else:
        cmd.append('-f')
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
    )

    # Advance generator so we may call send() below
    pool.next()

    def append_log(finished, package_id, action, reason):
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
        ]) + '\n')
        log.flush()

    with _quiet_int_pipe():
        append_log(
            None, None, "started updating from:", activity_date.isoformat()
        )

        for package_ids, next_date in (
                changed_package_id_runs(activity_date)):
            job_ids, finished, result = pool.send(enumerate(package_ids))
            stats = completion_stats(self.options.processes)
            while result is not None:
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(
                None, None, "next batch starting at:", next_date.isoformat()
            )
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups and orgs

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, "buffer", sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, "buffer", sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, "buffer", sys.stderr)

    if arguments["--worker"]:
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments["--log"]:
        log = open(arguments["--log"], "a")

    jsonl_input = stdin
    if arguments["--input"]:
        jsonl_input = open(arguments["--input"], "rb")
    if arguments["--gzip"]:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments["--start-record"])
        max_records = arguments["--max-records"]
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments["--processes"])
    if hasattr(ckan, "parallel_limit"):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, action, error, response = json.loads(
                result.decode("utf-8"))

            if not arguments["--quiet"]:
                stderr.write((
                    "%s %s %s %s %s %s\n" % (
                        finished,
                        job_ids,
                        next(stats),
                        action,
                        error,
                        compact_json(response).decode("utf-8")
                        if response else "",
                    )
                ).encode("utf-8"))

            if log:
                log.write(compact_json(
                    [timestamp, finished, action, error, response]) + b"\n")
                log.flush()
    if "pipe" in errors:
        return 1
    if "interrupt" in errors:
        return 2
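# Each worker in load_things() reports one finished record per line as a
# compact JSON array, which the parent unpacks as
# (timestamp, action, error, response).  A hand-built example line, decoded
# the same way the loop above does it; the field values are illustrative only.
import json

result = (b'["2024-01-01T12:00:00", "package_create", null, '
          b'{"name": "example-dataset"}]')
timestamp, action, error, response = json.loads(result.decode("utf-8"))
assert action == "package_create" and error is None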
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--dp-output']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'w')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
        }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                ]) + b'\n')

            if arguments['--dp-output']:
                # TODO: how are we going to handle which resources to leave
                # alone? They're very inconsistent in some instances
                # And I can't imagine anyone wants to download a copy of,
                # for example, the API base endpoint
                resource_formats_to_ignore = ['API', 'api']

                dataset_name = record.get('name', '') if record else ''

                try:
                    base_path = arguments['--dp-output']
                except KeyError:
                    base_path = './'

                target_dir = '{base_path}/{name}/data'.format(
                    base_path=base_path, name=dataset_name)

                try:
                    os.makedirs(target_dir)
                except Exception as e:
                    stderr.write(e.message)

                for resource in record.get('resources', ''):
                    if resource['name'] is not None:
                        resource_id = resource['name']
                    else:
                        resource_id = resource['id']

                    resource_filename = os.path.split(resource['url'])[1]

                    output = os.path.join(target_dir, resource_filename)

                    # Resources can have a free-form address and no
                    # internal info, so in those cases we're going to
                    # merely save them using the UID. (If they even exist)
                    if output.endswith('/'):
                        output = os.path.join(output, resource_id)

                    # datapackage.json format explicitly requires a path
                    # to the resource
                    resource['path'] = output

                    try:
                        if resource['format'] not in resource_formats_to_ignore:
                            r = requests.get(resource['url'], stream=True)
                            with open(output, 'wb') as f:
                                for chunk in r.iter_content(chunk_size=1024):
                                    if chunk:  # filter out keep-alive new chunks
                                        f.write(chunk)
                                        f.flush()
                    except requests.ConnectionError:
                        stderr.write('URL {url} refused connection. The '
                                     'resource will not be downloaded\n'
                                     .format(url=resource['url']))
                    except requests.exceptions.RequestException as e:
                        stderr.write(e.message)
                        stderr.write('\n')

                datapackagejson_output = open(
                    '{base_path}{dataset_name}/datapackage.json'.format(
                        base_path=base_path, dataset_name=dataset_name),
                    'w',)

                record['version'] = '1.0-beta.10'
                datapackagejson_output.write(pretty_json(record))

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
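# The per-resource download in the --dp-output branch above mixes filename
# selection, path handling and streaming in one loop.  The same idea isolated
# as a helper: requests is used exactly as above (stream=True plus
# iter_content), while the function and argument names are hypothetical, a
# sketch rather than the function the CLI actually calls.
import os
import requests

def download_resource(url, target_dir, fallback_name, chunk_size=1024):
    # free-form resource URLs may end in '/', leaving no filename to use
    filename = os.path.split(url)[1] or fallback_name
    output = os.path.join(target_dir, filename)
    r = requests.get(url, stream=True)
    with open(output, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return output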