def __call__(self):
    """Fetch an external datapackage and graft the selected resource(s)
    onto the current one, streaming tabular rows when requested.

    Parameters (from ``self.parameters``):
        url       -- datapackage URL, or ``dependency://<name>`` to resolve
                     to that dependency's output datapackage.
        resource  -- resource name/pattern (str) or positional index (int).
        stream    -- when True (default), stream rows of tabular resources.
    """
    url = self.parameters['url']

    # 'dependency://<name>' URLs resolve to that dependency's output package.
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency

    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)

    # Select either by name (string spec) or by positional index (int spec).
    matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    wanted_index = resource if isinstance(resource, int) else None

    streamed = []
    matched_any = False
    source_dp = datapackage.DataPackage(url)
    source_dp = self.process_datapackage(source_dp)
    for index, res in enumerate(source_dp.resources):
        if wanted_index == index or (
                matcher is not None and
                matcher.match(res.descriptor.get('name'))):
            matched_any = True
            res.descriptor[PROP_STREAMED_FROM] = res.source
            self.dp['resources'].append(res.descriptor)
            if tabular(res.descriptor) and stream:
                res.descriptor[PROP_STREAMING] = True
                streamed.append(res.iter(keyed=True))
            else:
                res.descriptor[PROP_STREAMING] = False

    assert matched_any, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, streamed))
def process_row(row, row_index, spec, resource_index, parameters, stats):
    """Fuzzy-match a raw country value against known country names and
    store the cleaned ISO alpha-3 code and canonical name on the row.

    Non-matching resources pass through untouched.
    """
    matcher = ResourceMatcher(parameters['resource_name'])
    if matcher.match(spec['name']):
        code_field = parameters['clean_field_code']
        name_field = parameters['clean_field_name']
        raw_value = row[parameters['raw_field']]
        if not raw_value:
            # NOTE(review): bare return yields None here — in a
            # datapackage-pipelines row processor that drops the row;
            # confirm dropping rows with empty raw values is intended.
            return
        code, name = None, None
        best = fw_process.extractOne(raw_value, all_country_names,
                                     score_cutoff=80)
        if best is not None:
            country, _score = best
            # Expand known initials/abbreviations to full names first.
            if country in all_country_initials:
                country = all_country_initials[country]
            try:
                looked_up = pycountry.countries.lookup(country)
                code = looked_up.alpha_3
                name = looked_up.name
            except LookupError:
                # Ignore values we don't know how to clean
                pass
        row[code_field] = code
        row[name_field] = name
    return row
def process_row(row, row_index, spec, resource_index, parameters, stats):
    """Add a slugified fingerprint of the name field to matching rows."""
    if ResourceMatcher(parameters['resource-name']).match(spec['name']):
        source = row[parameters['name-field']]
        row[parameters['fingerprint-field']] = slugify(source, to_lower=True)
    return row
def modify_datapackage(dp, parameters, stats):
    """Declare the fingerprint field (string) on every matching resource's
    schema so the field written by ``process_row`` is part of the contract.
    """
    matcher = ResourceMatcher(parameters['resource-name'])
    for res in dp['resources']:
        if not matcher.match(res['name']):
            continue
        # Fresh dict per resource to avoid sharing one mutable descriptor.
        res['schema']['fields'].append({
            'name': parameters['fingerprint-field'],
            'type': 'string',
        })
    return dp
def __call__(self):
    """Fetch an external datapackage and append the selected resource(s)
    to the current one, optionally limiting streamed rows.

    Differs from the simple loader by deep-copying descriptors so that
    ``primaryKey`` can be dropped from the source resource (skipping
    duplication checks) without altering the published descriptor, and by
    honouring a ``limit-rows`` parameter.
    """
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')

    # 'dependency://<name>' URLs resolve to that dependency's output package.
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency

    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)

    # Select either by name (string spec) or by positional index (int spec).
    matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
    wanted_index = resource if isinstance(resource, int) else None

    streamed = []
    matched_any = False
    source_dp = self.process_datapackage(datapackage.DataPackage(url))
    for index, res in enumerate(source_dp.resources):
        if wanted_index == index or (
                matcher is not None and
                matcher.match(res.descriptor.get('name'))):
            matched_any = True
            descriptor = copy.deepcopy(res.descriptor)
            if 'primaryKey' in descriptor.get('schema', {}):
                # Avoid duplication checks
                del res.descriptor['schema']['primaryKey']
                res.commit()
            descriptor[PROP_STREAMED_FROM] = res.source
            self.dp['resources'].append(descriptor)
            if tabular(descriptor) and stream:
                descriptor[PROP_STREAMING] = True
                rows = res.iter(keyed=True)
                if limit_rows:
                    rows = itertools.islice(rows, limit_rows)
                streamed.append(rows)
            else:
                descriptor[PROP_STREAMING] = False

    assert matched_any, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, streamed))
from datetime import date from datapackage_pipelines.wrapper import ingest, spew from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher from decimal import Decimal parameters, dp, res_iter = ingest() resource_matcher = ResourceMatcher(parameters.get('resource')) key = parameters['key'] collated_field_name = parameters['collated-field-name'] assert isinstance(key, list) for res in dp['resources']: if resource_matcher.match(res['name']): outer_fields = [] inner_fields = [] for field in res['schema']['fields']: if field['name'] in key: outer_fields.append(field) else: inner_fields.append(field) outer_fields.append({ 'name': collated_field_name, 'type': 'object', 'es:schema': { 'fields': inner_fields } }) schema = { 'fields': outer_fields,
import collections
import logging

from datapackage_pipelines.wrapper import spew, ingest
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

resource_name = parameters['name']
resources_matcher = ResourceMatcher(resource_name)

# Remove matching resources from the datapackage descriptor.
datapackage['resources'] = [res for res in datapackage['resources']
                            if not resources_matcher.match(res['name'])]


def process_resources(res_iter_):
    """Yield the resources that are kept; fully drain the deleted ones so
    the row stream stays aligned with the descriptor.

    FIX: the original used ``while True: resource_ = next(res_iter_)``,
    which lets StopIteration escape inside a generator — under PEP 479
    (Python 3.7+) that is converted to RuntimeError instead of ending the
    generator. Iterating with ``for`` terminates cleanly.
    """
    for resource_ in res_iter_:
        if resources_matcher.match(resource_.spec['name']):
            # This is the one we're deleting, empty the iterator.
            collections.deque(resource_, maxlen=0)
        else:
            yield resource_


spew(datapackage, process_resources(res_iter))
1, None) parameters, datapackage, resource_iterator = ingest() resources = ResourceMatcher(parameters.get('resources')) ignore_missing = parameters.get('ignore-missing', False) new_resource_iterator = [] for resource in datapackage['resources']: if streamable(resource): url = resource[PROP_STREAMED_FROM] name = resource['name'] if not resources.match(name): continue path = get_path(resource) if path is None or path == PATH_PLACEHOLDER: path = os.path.join('data', name + '.csv') resource['path'] = path resource[PROP_STREAMING] = True rows = stream_reader(resource, url, ignore_missing or url == "") new_resource_iterator.append(rows) elif streaming(resource): new_resource_iterator.append(next(resource_iterator))
if source_fields is not None: for source_field in source_fields: if source_field in field_mapping: raise RuntimeError('Duplicate appearance of %s (%r)' % (source_field, field_mapping)) field_mapping[source_field] = target_field if target_field in field_mapping: raise RuntimeError('Duplicate appearance of %s' % target_field) field_mapping[target_field] = target_field # Create the schema for the target resource needed_fields = sorted(fields.keys()) for resource in datapackage['resources']: if not sources.match(resource['name']): continue schema = resource.get('schema', {}) pk = schema.get('primaryKey', []) for field in schema.get('fields', []): orig_name = field['name'] if orig_name in field_mapping: name = field_mapping[orig_name] if name not in needed_fields: continue if orig_name in pk: target['schema']['primaryKey'].append(name) target['schema']['fields'].append(field) field['name'] = name needed_fields.remove(name)
class ResourceFilterProcessor(object):
    """Streams a datapackage through a row-filtering callback.

    Wraps one ingest/spew cycle: resolves input/output resource parameters
    (falling back to the supplied defaults), rewrites the datapackage
    descriptor (replacing the input resource or appending a new one), and
    pipes every matching resource's rows through ``resource_filter``.
    """

    def __init__(self, ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        (self.parameters,
         self.datapackage,
         self.resource_iterator) = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"])
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema,
        }

    def set_default_parameters(self, default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        """Fill in parameters the pipeline spec did not provide."""
        defaults = (("input_resource", default_input_resource),
                    ("output_resource", default_output_resource),
                    ("replace_resource", default_replace_resource))
        for key, value in defaults:
            self.parameters.setdefault(key, value)

    def filter_data(self):
        # Descriptor order mirrors iterator order: consume one resource's
        # rows per descriptor, filtering only the matching one(s).
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        if self.parameters["replace_resource"]:
            # Overwrite the matching input resource(s) in place.
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        # NOTE(review): matches on the *output* resource name, not the input
        # matcher — presumably correct because replace_resource renames the
        # input resource before rows are filtered; confirm against callers.
        return resource_descriptor["name"] == self.parameters[
            "output_resource"]
def __call__(self):
    """Load resource(s) from an external datapackage — optionally replayed
    from / recorded into a persistent key-value cache — and append them to
    the current datapackage.

    Selection is either a single ``resource`` (name or index, via the
    ``resource-name`` parameter) or a ``resources`` mapping of name to
    descriptor overrides; the two are mutually exclusive.
    """
    self.parameters['resource'] = self.parameters['resource-name']
    kv_cache = self.parameters.get('kv-cache', False)
    kv_path = self.parameters['kv-path']
    url = self.parameters['url']
    limit_rows = self.parameters.get('limit-rows')
    log_progress_rows = self.parameters.get('log-progress-rows')

    # 'dependency://<name>' URLs resolve to that dependency's output package.
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency

    stream = self.parameters.get('stream', True)
    required = self.parameters.get('required', True)
    resource = self.parameters.get('resource')
    resources = self.parameters.get('resources')
    if resource is None:
        assert resources
        resource_index = None
        resource = list(resources.keys())
    else:
        assert not resources
        resource_index = resource if isinstance(resource, int) else None
    matcher = (ResourceMatcher(resource)
               if isinstance(resource, (str, list)) else None)

    streamed = []
    matched_any = False
    try:
        dp = datapackage.DataPackage(url)
    except Exception:
        # A missing/broken source is fatal only when 'required' is set.
        if required:
            raise
        dp = None

    if dp:
        dp = self.process_datapackage(dp)
        for index, res in enumerate(dp.resources):
            # Short-circuit: the name matcher only runs when the index
            # did not match, mirroring the selection spec.
            if not (resource_index == index or
                    (matcher is not None and
                     matcher.match(res.descriptor.get('name')))):
                continue
            matched_any = True
            descriptor = copy.deepcopy(res.descriptor)
            if 'primaryKey' in descriptor.get('schema', {}):
                # Avoid duplication checks
                del res.descriptor['schema']['primaryKey']
                res.commit()
            descriptor[PROP_STREAMED_FROM] = res.source
            if resources:
                descriptor.update(resources[descriptor['name']])
            self.dp['resources'].append(descriptor)
            if tabular(descriptor) and stream:
                descriptor[PROP_STREAMING] = True
                # Decide replay-vs-record *before* opening the KV file,
                # since opening it may create the path.
                replay_cache = kv_cache and os.path.exists(kv_path)
                kv = PersistentKVFile(kv_path, concurrent=True)
                if replay_cache:
                    rows = kv_res_iter(
                        kv, kv_key=self.parameters.get('kv-key'))
                else:
                    rows = kv_res_iter(
                        kv, res.iter(keyed=True),
                        kv_key=self.parameters.get('kv-key'))
                if limit_rows:
                    rows = itertools.islice(rows, limit_rows)
                if log_progress_rows:
                    rows = progress_logger(rows, log_progress_rows)
                streamed.append(rows)
            else:
                descriptor[PROP_STREAMING] = False

    assert matched_any or not required, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, streamed))
import itertools

import datapackage
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING

parameters, dp, res_iter = ingest()

url = parameters['url']
resource = parameters['resource']

# Select either by name (string spec) or by positional index (int spec).
name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
resource_index = resource if isinstance(resource, int) else None

selected_resources = []
found = False
# FIX: the original rebound the name `datapackage` to the loaded package,
# shadowing the imported `datapackage` module for the rest of the script.
source_dp = datapackage.DataPackage(url)
for i, orig_res in enumerate(source_dp.resources):
    if resource_index == i or \
            (name_matcher is not None and
             name_matcher.match(orig_res.descriptor.get('name'))):
        found = True
        dp['resources'].append(orig_res.descriptor)
        if tabular(orig_res.descriptor):
            orig_res.descriptor[PROP_STREAMING] = True
            selected_resources.append(orig_res.iter(keyed=True))

assert found, "Failed to find resource with index or name matching %r" % resource
spew(dp, itertools.chain(res_iter, selected_resources))