def func(package):
    # 'resources', '_find_replace' and 'fields' (like similar free names in
    # the snippets below) are expected to be bound in the enclosing scope.
    matcher = ResourceMatcher(resources, package.pkg)
    yield package.pkg
    for rows in package:
        if matcher.match(rows.res.name):
            yield _find_replace(rows, fields)
        else:
            yield rows
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            print(resource[key])
    yield package.pkg
    for res in package:
        yield res
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    yield package.pkg
    for rows in package:
        if matcher.match(rows.res.name):
            yield (row.inner if isinstance(row, LazyJsonLine) else row
                   for row in rows)
        else:
            yield rows
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            resource['schema']['fields'].append(
                dict(name=name, type=type, **options))
    yield package.pkg
    for res in package:
        if matcher.match(res.res.name):
            yield column_adder(res, name, default)
        else:
            yield res
def func(package: PackageWrapper):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            resource.update(props)
    yield package.pkg
    res_iter = iter(package)
    for r in res_iter:
        if matcher.match(r.res.name):
            yield r.it
        else:
            yield r
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            for i in resource['schema']['fields']:
                i['name'] = i['name'].lower().replace(' ', '_')
    yield package.pkg
    for res in package:
        if matcher.match(res.res.name):
            yield rename_in_row(res)
        else:
            yield res
def func(package: PackageWrapper):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            resource.setdefault('schema', {})['primaryKey'] = primary_key
    yield package.pkg
    res_iter = iter(package)
    for r in res_iter:
        if matcher.match(r.res.name):
            yield r.it
        else:
            yield r
class set_type(DataStreamProcessor):
    # Updates schema fields whose names match the given pattern with the
    # supplied options (e.g. a new type), then re-validates affected rows.

    def __init__(self, name, resources=-1, **options):
        super(set_type, self).__init__()
        self.name = re.compile(f'^{name}$')
        self.options = options
        self.resources = resources
        self.field_names = []

    def process_resources(self, resources):
        for res in resources:
            if self.matcher.match(res.res.name):
                if len(self.field_names) > 0:
                    yield schema_validator(res.res, res,
                                           field_names=self.field_names)
                else:
                    yield res
            else:
                yield res

    def process_datapackage(self, dp):
        dp = super(set_type, self).process_datapackage(dp)
        self.matcher = ResourceMatcher(self.resources, dp)
        added = False
        for res in dp.descriptor['resources']:
            if self.matcher.match(res['name']):
                for field in res['schema']['fields']:
                    if self.name.match(field['name']):
                        field.update(self.options)
                        self.field_names.append(field['name'])
                        added = True
        # assert added, 'Failed to find field {} in schema'.format(self.name)
        return dp
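# A minimal usage sketch for the class above, assuming the standard
# dataflows Flow API; the CSV path, resource name and field name are
# hypothetical.
from dataflows import Flow, load

Flow(
    load('my-data.csv'),                                    # hypothetical source
    set_type('price', type='number', resources='my-data'),  # cast 'price' to number
).process()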
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    try:
        for resource in package.pkg.descriptor['resources']:
            if matcher.match(resource['name']):
                for i in resource['schema']['fields']:
                    if i['name'] == name:
                        i['name'] = new_name
    except Exception:
        # Tolerate descriptors without a schema; rows are still renamed below.
        pass
    yield package.pkg
    for res in package:
        if matcher.match(res.res.name):
            yield rename_in_row(res, name, new_name)
        else:
            yield res
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)

    # Meta (pre)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            fields = resource['schema']['fields']
            fields = list(
                filter(lambda field: field['name'] != key_field, fields))
            fields = list(
                filter(lambda field: field['name'] != value_field, fields))
            resource['schema']['fields'] = fields
    package.pkg.commit()
    yield package.pkg

    # Data
    for resource in package:
        if not matcher.match(resource.res.name):
            yield resource
        else:
            groups = {}
            new_field_names = set()
            for row in resource:
                groups.setdefault(row[join_field], {})
                groups[row[join_field]][row[key_field]] = row[value_field]
                new_field_names.add('_'.join([key_field, row[key_field]]))
            rows = []
            for group_name, group_data in groups.items():
                row = {join_field: group_name}
                for group_data_key, group_data_value in group_data.items():
                    row['_'.join([key_field, group_data_key])] = group_data_value
                rows.append(row)
            yield iter(rows)

    # Meta (post)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            fields = resource['schema']['fields']
            for new_field_name in new_field_names:
                fields.append({
                    'name': new_field_name,
                    'type': 'string',
                    'format': 'default'
                })
            resource['schema']['fields'] = fields
    package.pkg.commit()
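# Illustrative input/output for the collation above, assuming the
# hypothetical bindings join_field='id', key_field='key', value_field='value':
#
#   rows in:  {'id': 1, 'key': 'height', 'value': '10'}
#             {'id': 1, 'key': 'width',  'value': '20'}
#   rows out: {'id': 1, 'key_height': '10', 'key_width': '20'}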
class ResourceFilterProcessor(object):

    def __init__(self, ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = \
            ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"], self.datapackage)
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }

    def set_default_parameters(self, default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        self.parameters.setdefault("input_resource", default_input_resource)
        self.parameters.setdefault("output_resource", default_output_resource)
        self.parameters.setdefault("replace_resource", default_replace_resource)

    def filter_data(self):
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        if self.parameters["replace_resource"]:
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        return resource_descriptor["name"] == self.parameters[
            "output_resource"]
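# A minimal sketch of a concrete processor built on the class above;
# the resource names, schema and row filter are hypothetical.
def drop_empty_rows(rows, parameters):
    for row in rows:
        if row.get('value'):  # keep only rows with a non-empty 'value'
            yield row


if __name__ == '__main__':
    ResourceFilterProcessor.main(
        default_input_resource='raw',
        default_output_resource='filtered',
        table_schema={'fields': [{'name': 'value', 'type': 'string'}]},
        resource_filter=drop_empty_rows)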
from datetime import date
from decimal import Decimal

from datapackage_pipelines.wrapper import ingest, spew
from dataflows.helpers.resource_matcher import ResourceMatcher

parameters, dp, res_iter = ingest()

resource_matcher = ResourceMatcher(parameters.get('resource'), dp)
key = parameters['key']
collated_field_name = parameters['collated-field-name']
assert isinstance(key, list)

for res in dp['resources']:
    if resource_matcher.match(res['name']):
        outer_fields = []
        inner_fields = []
        for field in res['schema']['fields']:
            if field['name'] in key:
                outer_fields.append(field)
            else:
                inner_fields.append(field)
        outer_fields.append({
            'name': collated_field_name,
            'type': 'object',
            'es:schema': {
                'fields': inner_fields
            }
        })
        schema = {
            'fields': outer_fields,
def __call__(self):
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    log_progress_rows = self.parameters.get('log-progress-rows')
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    stream = self.parameters.get('stream', True)
    required = self.parameters.get('required', True)
    resource = self.parameters.get('resource')
    resources = self.parameters.get('resources')
    if resource is not None:
        assert not resources
        resource_index = resource if isinstance(resource, int) else None
    else:
        assert resources
        resource_index = None
        resource = list(resources.keys())
    name_matcher = (ResourceMatcher(resource, self.dp)
                    if isinstance(resource, (str, list))
                    else None)
    selected_resources = []
    found = False
    try:
        dp = datapackage.DataPackage(url)
    except Exception:
        if required:
            raise
        else:
            dp = None
    if dp:
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and
                     name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Avoid duplication checks
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                if resources:
                    desc.update(resources[desc['name']])
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    orig_res_iter = orig_res.iter(keyed=True)
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    if log_progress_rows:
                        orig_res_iter = progress_logger(
                            orig_res_iter, log_progress_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False
    assert found or not required, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
    close()
    del stream
    return itertools \
        .islice(
            _reader(
                get_opener(_url, _resource, columns),
                _url,
                max_row=limit_rows),
            1, None)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:
    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]
        name = resource['name']
        if not resources.match(name):
            continue
        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
def process_resources(res_iter, resource_matcher, afield, tfield):
    for res in res_iter:
        if resource_matcher.match(res.spec['name']):
            yield process_resource(res, afield, tfield)
        else:
            yield res


def modify_datapackage(dp, resource_matcher, afield, tfield):
    for res in dp['resources']:
        if not resource_matcher.match(res['name']):
            continue
        field = [f for f in res['schema']['fields'] if f['name'] == afield][0]
        fields = [f for f in res['schema']['fields'] if f['name'] != afield]
        fields.append({
            'name': tfield,
            'type': field['es:itemType'] if 'es:itemType' in field else 'string'
        })
        res['schema']['fields'] = fields
    return dp


if __name__ == '__main__':
    parameters, dp, res_iter = ingest()
    resource_matcher = ResourceMatcher(parameters.get('resource'), dp)
    afield, tfield = parameters['array-field'], parameters['unwound-field']
    spew(modify_datapackage(dp, resource_matcher, afield, tfield),
         process_resources(res_iter, resource_matcher, afield, tfield))
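# Example parameters for the unwind step above (the resource and field
# names are hypothetical):
#
#   {'resource': 'my-resource',
#    'array-field': 'tags',       # field holding an array per row
#    'unwound-field': 'tag'}      # scalar field emitted per array item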