def update(endpoint, **kwargs):
    """ Updates the database

    Generates records with `gen_data` for the selected package ids and sends
    them to `endpoint` with `patch_or_post`, one chunk at a time.

    Args:
        endpoint (str): The api resource url.
        kwargs (dict): passed to CKAN constructor.

    Kwargs:
        pid (str): Package id to process (default: the packages of every
            organization returned by `organization_list(permission='read')`).
        chunk_size (int): Number of rows to process at a time (default: All).
        row_limit (int): Total number of rows to process (default: All).
        err_limit (int): Number of errors to encounter before failing
            (default: Inf).
        mock_freq: Forwarded to `gen_data` (default: None). It is read with
            `kwargs.get`, so it is also passed on to the CKAN constructor.

    Returns:
        (dict): Update details, with the keys `rows_added`, `errors` (failed
            responses' JSON keyed by `dataset_id`) and `elapsed_time`.

    Raises:
        Exception: If `err_limit` is set and at least that many entries have
            accumulated in the error mapping.
    """
    start = timer()
    pid = kwargs.pop('pid', None)
    chunk_size = kwargs.pop('chunk_size', 0)
    row_limit = kwargs.pop('row_limit', None)
    err_limit = kwargs.pop('err_limit', None)

    ckan = CKAN(**kwargs)

    if pid:
        pids = [pid]
    else:
        # Lazily walk every readable organization and flatten the ids of
        # their packages; nothing is fetched until `pids` is consumed.
        orgs_basic = ckan.organization_list(permission='read')
        orgs = (
            ckan.organization_show(id=org['id'], include_datasets=True)
            for org in orgs_basic)

        pids = (
            package['id'] for org in orgs for package in org['packages'])

    data = gen_data(ckan, pids, kwargs.get('mock_freq'))

    # A falsy value means "no limit". Only take the minimum when both limits
    # are set; otherwise use whichever one is set. This avoids comparing an
    # int with a string and keeps `row_limit` effective when `chunk_size` is
    # left at its default of 0.
    if row_limit and chunk_size:
        chunksize = min(row_limit, chunk_size)
    else:
        chunksize = row_limit or chunk_size

    rows = 0
    errors = {}

    for records in tup.chunk(data, chunksize):
        # Materialize the responses: they are traversed more than once.
        responses = [patch_or_post(endpoint, record) for record in records]
        rows += sum(1 for response in responses if response.ok)

        errors.update(
            (record['dataset_id'], response.json())
            for record, response in zip(records, responses)
            if not response.ok)

        # Only successful rows count towards `row_limit`.
        if row_limit and rows >= row_limit:
            break

        if err_limit and len(errors) >= err_limit:
            raise Exception(errors)

    # NOTE(review): the separator is ' ,' (space before the comma), which
    # looks like a typo for ', '. Left unchanged because it is runtime output.
    elapsed_time = ' ,'.join(fmt_elapsed(timer() - start))
    return {'rows_added': rows, 'errors': errors, 'elapsed_time': elapsed_time}
def insert_records(self, resource_id, records, **kwargs):
    """Inserts records into a datastore table.

    The records are split into chunks with `tup.chunk` and each chunk is
    written with a separate `datastore_upsert` call.

    Args:
        resource_id (str): The datastore resource id.
        records (List[dict]): The records to insert.
        **kwargs: Keyword arguments that are passed to datastore_upsert.

    Kwargs:
        method (str): Insert method. One of ['update', 'insert', 'upsert']
            (default: 'insert').
        force (bool): Create resource even if read-only
            (default: `self.force`).
        start (int): Row number to start from (zero indexed).
        stop (int): Row number to stop at (zero indexed).
        chunksize (int): Number of rows to write at a time.

    Returns:
        int: Number of records inserted. 0 is returned when a chunk fails
            with a 'Broken pipe' connection error, even if earlier chunks
            were already written.

    Raises:
        NotFound: If unable to find the resource.
        requests.exceptions.ConnectionError: On any connection error other
            than a broken pipe.

    Examples:
        >>> CKAN(quiet=True).insert_records('rid', [{'field': 'value'}])
        Traceback (most recent call last):
        NotFound: Resource `rid` was not found in filestore.
    """
    chunksize = kwargs.pop('chunksize', 0)
    start = kwargs.pop('start', 0)
    stop = kwargs.pop('stop', None)

    kwargs.setdefault('force', self.force)
    kwargs.setdefault('method', 'insert')
    kwargs['resource_id'] = resource_id

    # Count from zero so the returned total is exact; the progress message
    # below converts to a 1-based, inclusive range for display.
    inserted = 0

    for chunk in tup.chunk(records, chunksize, start=start, stop=stop):
        length = len(chunk)

        if self.verbose:
            print(
                'Adding records %i - %i to resource %s...' % (
                    inserted + 1, inserted + length, resource_id))

        kwargs['records'] = chunk

        try:
            self.datastore_upsert(**kwargs)
        except requests.exceptions.ConnectionError as err:
            # Inspect the rendered exception instead of indexing into
            # `err.message`, which can itself raise and hide the real error.
            if 'Broken pipe' in str(err):
                print('Chunksize too large. Try using a smaller chunksize.')
                return 0

            # Bare raise keeps the original traceback.
            raise
        except NotFound:
            # Keep exception message consistent with the others
            raise NotFound(
                'Resource `%s` was not found in filestore.' % resource_id)

        inserted += length

    return inserted