Example 1
    def __call__(self):
        url = self.parameters['url']
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(orig_res.descriptor)
                if tabular(orig_res.descriptor) and stream:
                    orig_res.descriptor[PROP_STREAMING] = True
                    selected_resources.append(orig_res.iter(keyed=True))
                else:
                    orig_res.descriptor[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
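Every example on this page funnels resource selection through ResourceMatcher. As a point of reference, here is a minimal sketch of the one-argument form used throughout (assuming datapackage-pipelines is installed; the exact matching rules live in the library):

from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

matcher = ResourceMatcher('my-resource')
assert matcher.match('my-resource')
assert not matcher.match('another-resource')
# A None matcher conventionally matches everything, which is why some
# examples build it from parameters.get('resource').
assert ResourceMatcher(None).match('anything')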
Example 2

def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource_name'])
    if resource_matcher.match(spec['name']):
        clean_field_code = parameters['clean_field_code']
        clean_field_name = parameters['clean_field_name']
        raw_field = parameters['raw_field']
        raw_field_value = row[raw_field]
        if not raw_field_value:
            return
        clean_value_code = None
        clean_value_name = None
        ret = fw_process.extractOne(raw_field_value,
                                    all_country_names,
                                    score_cutoff=80)
        if ret is not None:
            country, score = ret
            if country in all_country_initials:
                country = all_country_initials[country]
            try:
                country = pycountry.countries.lookup(country)
                clean_value_code = country.alpha_3
                clean_value_name = country.name
            except LookupError:
                # Ignore values we don't know how to clean
                pass
        row[clean_field_code] = clean_value_code
        row[clean_field_name] = clean_value_name

    return row
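The fuzzy country lookup above hinges on extractOne, which returns the best (match, score) pair at or above the cutoff, or None. A small standalone sketch, assuming fw_process is fuzzywuzzy's process module (the import sits outside the excerpt):

from fuzzywuzzy import process as fw_process

candidates = ['Germany', 'France', 'Italy']
ret = fw_process.extractOne('Germny', candidates, score_cutoff=80)
if ret is not None:
    match, score = ret  # e.g. ('Germany', 92)
    print(match, score)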
Example 3

def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    if resource_matcher.match(spec['name']):
        fingerprint_field = parameters['fingerprint-field']
        name_field = parameters['name-field']
        row[fingerprint_field] = slugify(row[name_field], to_lower=True)

    return row
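The fingerprint is just a lowercased slug of the name field. A quick sketch, assuming the slugify above comes from awesome-slugify (whose slugify accepts a to_lower keyword, matching the call in the example):

from slugify import slugify

print(slugify('ACME Corp Ltd', to_lower=True))  # e.g. 'acme-corp-ltd'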
Example 4

def modify_datapackage(dp, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])

    for res in dp['resources']:
        if resource_matcher.match(res['name']):
            res['schema']['fields'].append({
                'name': parameters['fingerprint-field'],
                'type': 'string'
            })

    # Return the datapackage even when no resource matched.
    return dp
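Examples 3 and 4 read like the two halves of a single processor: process_row fills in the fingerprint value and modify_datapackage declares the new field in the schema. In datapackage-pipelines such row processors are typically wired up with the process helper; a minimal sketch:

from datapackage_pipelines.wrapper import process

# Runs modify_datapackage once on the datapackage, then process_row on
# every row of every resource.
process(modify_datapackage=modify_datapackage, process_row=process_row)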
Example 5
    def __call__(self):
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Avoid duplication checks
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    orig_res_iter = orig_res.iter(keyed=True)
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
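The limit-rows handling above is what keeps large sources cheap: itertools.islice caps the stream lazily instead of materializing it. In isolation:

import itertools

rows = ({'n': n} for n in range(1000000))
limited = itertools.islice(rows, 10)  # a lazy view of the first 10 rows
assert len(list(limited)) == 10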
Example 6
from datetime import date

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from decimal import Decimal

parameters, dp, res_iter = ingest()

resource_matcher = ResourceMatcher(parameters.get('resource'))
key = parameters['key']
collated_field_name = parameters['collated-field-name']
assert isinstance(key, list)

for res in dp['resources']:
    if resource_matcher.match(res['name']):
        outer_fields = []
        inner_fields = []
        for field in res['schema']['fields']:
            if field['name'] in key:
                outer_fields.append(field)
            else:
                inner_fields.append(field)
        outer_fields.append({
            'name': collated_field_name,
            'type': 'object',
            'es:schema': {
                'fields': inner_fields
            }
        })
        schema = {
            'fields': outer_fields,
            'primaryKey': key
        }
        # Keep the collation key as the primary key of the reshaped resource.
        res['schema'] = schema
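To make the reshaping concrete, here is a hypothetical before/after of the schema transformation, assuming key == ['year'] and collated-field-name == 'data':

# Before: flat fields.
before = [{'name': 'year', 'type': 'integer'},
          {'name': 'month', 'type': 'integer'},
          {'name': 'value', 'type': 'number'}]

# After: key fields stay at the top level; everything else is collated
# into a single object-typed field.
after = [{'name': 'year', 'type': 'integer'},
         {'name': 'data', 'type': 'object',
          'es:schema': {'fields': [{'name': 'month', 'type': 'integer'},
                                   {'name': 'value', 'type': 'number'}]}}]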
Example 7
import collections

from datapackage_pipelines.wrapper import spew, ingest
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

import logging
log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

resource_name = parameters['name']
resources_matcher = ResourceMatcher(resource_name)


datapackage['resources'] = [res for res in datapackage['resources']
                            if not resources_matcher.match(res['name'])]


def process_resources(res_iter_):
    # Iterate with a for-loop: under PEP 479, letting next() raise
    # StopIteration inside a generator turns into a RuntimeError.
    for resource_ in res_iter_:
        if resources_matcher.match(resource_.spec['name']):
            # This is the one we're deleting; exhaust its rows so the
            # stream stays aligned with the remaining resources.
            collections.deque(resource_, maxlen=0)
        else:
            yield resource_


spew(datapackage, process_resources(res_iter))
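The collections.deque(..., maxlen=0) call is the standard trick for draining an iterator without storing anything: the deleted resource must still be consumed so the row stream stays in step with the datapackage. In isolation:

import collections

it = iter(range(5))
collections.deque(it, maxlen=0)   # consumes the iterator, keeps nothing
print(next(it, 'exhausted'))      # 'exhausted'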
Example 8

parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'))
ignore_missing = parameters.get('ignore-missing', False)

new_resource_iterator = []
for resource in datapackage['resources']:

    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
            resource['path'] = path

        resource[PROP_STREAMING] = True

        rows = stream_reader(resource, url, ignore_missing or url == "")

        new_resource_iterator.append(rows)

    elif streaming(resource):
        new_resource_iterator.append(next(resource_iterator))
Example 9
    if source_fields is not None:
        for source_field in source_fields:
            if source_field in field_mapping:
                raise RuntimeError('Duplicate appearance of %s (%r)' %
                                   (source_field, field_mapping))
            field_mapping[source_field] = target_field

    if target_field in field_mapping:
        raise RuntimeError('Duplicate appearance of %s' % target_field)

    field_mapping[target_field] = target_field

# Create the schema for the target resource
needed_fields = sorted(fields.keys())
for resource in datapackage['resources']:
    if not sources.match(resource['name']):
        continue

    schema = resource.get('schema', {})
    pk = schema.get('primaryKey', [])
    for field in schema.get('fields', []):
        orig_name = field['name']
        if orig_name in field_mapping:
            name = field_mapping[orig_name]
            if name not in needed_fields:
                continue
            if orig_name in pk:
                target['schema']['primaryKey'].append(name)
            target['schema']['fields'].append(field)
            field['name'] = name
            needed_fields.remove(name)
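A worked illustration (with hypothetical field names) of the field_mapping that the duplicate-appearance checks above protect: every source field, plus the target field itself, maps to the target field's name:

field_mapping = {}
target_field = 'country_code'
source_fields = ['iso3', 'cc']
for source_field in source_fields:
    field_mapping[source_field] = target_field
field_mapping[target_field] = target_field

assert field_mapping == {'iso3': 'country_code',
                         'cc': 'country_code',
                         'country_code': 'country_code'}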
Example 10

class ResourceFilterProcessor(object):
    def __init__(self,
                 ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"])
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }

    def set_default_parameters(self, default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        self.parameters.setdefault("input_resource", default_input_resource)
        self.parameters.setdefault("output_resource", default_output_resource)
        self.parameters.setdefault("replace_resource",
                                   default_replace_resource)

    def filter_data(self):
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        if self.parameters["replace_resource"]:
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        return (resource_descriptor["name"] ==
                self.parameters["output_resource"])
Example 11
    def __call__(self):
        self.parameters['resource'] = self.parameters['resource-name']
        kv_cache = self.parameters.get('kv-cache', False)
        kv_path = self.parameters['kv-path']
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        log_progress_rows = self.parameters.get('log-progress-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        stream = self.parameters.get('stream', True)
        required = self.parameters.get('required', True)
        resource = self.parameters.get('resource')
        resources = self.parameters.get('resources')
        if resource is not None:
            assert not resources
            resource_index = resource if isinstance(resource, int) else None
        else:
            assert resources
            resource_index = None
            resource = list(resources.keys())
        name_matcher = ResourceMatcher(resource) if isinstance(resource, (str, list)) else None

        selected_resources = []
        found = False
        try:
            dp = datapackage.DataPackage(url)
        except Exception:
            if required:
                raise
            else:
                dp = None
        if dp:
            dp = self.process_datapackage(dp)
            for i, orig_res in enumerate(dp.resources):
                if resource_index == i or \
                        (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                    found = True
                    desc = copy.deepcopy(orig_res.descriptor)
                    if 'primaryKey' in desc.get('schema', {}):
                        # Avoid duplication checks
                        del orig_res.descriptor['schema']['primaryKey']
                        orig_res.commit()
                    desc[PROP_STREAMED_FROM] = orig_res.source
                    if resources:
                        desc.update(resources[desc['name']])
                    self.dp['resources'].append(desc)
                    if tabular(desc) and stream:
                        desc[PROP_STREAMING] = True
                        if kv_cache and os.path.exists(kv_path):
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, kv_key=self.parameters.get('kv-key'))
                        else:
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, orig_res.iter(keyed=True), kv_key=self.parameters.get('kv-key'))
                        if limit_rows:
                            orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                        if log_progress_rows:
                            orig_res_iter = progress_logger(orig_res_iter, log_progress_rows)
                        selected_resources.append(orig_res_iter)
                    else:
                        desc[PROP_STREAMING] = False

        assert found or not required, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Example 12
import itertools

import datapackage

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING

parameters, dp, res_iter = ingest()

url = parameters['url']
resource = parameters['resource']
name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
resource_index = resource if isinstance(resource, int) else None

selected_resources = []
found = False
source_dp = datapackage.DataPackage(url)  # avoid shadowing the datapackage module
for i, orig_res in enumerate(source_dp.resources):
    if resource_index == i or \
          (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
        found = True
        dp['resources'].append(orig_res.descriptor)
        if tabular(orig_res.descriptor):
            orig_res.descriptor[PROP_STREAMING] = True
            selected_resources.append(orig_res.iter(keyed=True))

assert found, "Failed to find resource with index or name matching %r" % resource

spew(dp, itertools.chain(res_iter, selected_resources))