def process_row(row, row_index,
                spec, resource_index,
                parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource_name'])
    if resource_matcher.match(spec['name']):
        clean_field_code = parameters['clean_field_code']
        clean_field_name = parameters['clean_field_name']
        raw_field = parameters['raw_field']
        raw_field_value = row[raw_field]
        if not raw_field_value:
            return
        clean_value_code = None
        clean_value_name = None
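        # Fuzzy-match the raw value against the known country names
        # (fw_process is presumably fuzzywuzzy's `process` module;
        # matches scoring below 80 are discarded).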
        ret = fw_process.extractOne(raw_field_value,
                                    all_country_names,
                                    score_cutoff=80)
        if ret is not None:
            country, score = ret
            if country in all_country_initials:
                country = all_country_initials[country]
            try:
                country = pycountry.countries.lookup(country)
                clean_value_code = country.alpha_3
                clean_value_name = country.name
            except LookupError:
                # Ignore values we don't know how to clean
                pass
        row[clean_field_code] = clean_value_code
        row[clean_field_name] = clean_value_name

    return row
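
Every example here funnels a `resource`/`resources` parameter into ResourceMatcher and then calls `match(name)` per resource. A minimal sketch of that contract (the match-all behaviour for a None spec is an assumption inferred from the `parameters.get('resources')` calls in the examples below, not something these snippets state):

from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

only_sales = ResourceMatcher('sales')   # match a single resource by name
match_all = ResourceMatcher(None)       # assumed to match every resource

for name in ('sales', 'inventory'):
    print(name, only_sales.match(name))  # True for 'sales', False otherwise
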
Example #2
    def __call__(self):
        url = self.parameters['url']
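        # A 'dependency://<pipeline-id>' URL resolves to the output
        # datapackage of that upstream pipeline.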
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource,
                                                               str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(orig_res.descriptor)
                if tabular(orig_res.descriptor) and stream:
                    orig_res.descriptor[PROP_STREAMING] = True
                    selected_resources.append(orig_res.iter(keyed=True))
                else:
                    orig_res.descriptor[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
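        # Emit the augmented datapackage, appending the selected row
        # iterators after the upstream resources.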
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Example #3
def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    if resource_matcher.match(spec['name']):
        fingerprint_field = parameters['fingerprint-field']
        name_field = parameters['name-field']
        row[fingerprint_field] = slugify(row[name_field], to_lower=True)

    return row
Example #4
def modify_datapackage(dp, parameters, stats):

    resource_matcher = ResourceMatcher(parameters['resource-name'])

    for res in dp['resources']:
        if resource_matcher.match(res['name']):
            res['schema']['fields'].append({
                'name': parameters['fingerprint-field'],
                'type': 'string'
            })

    # Return the datapackage even when no resource matched, after all
    # matching resources have had the fingerprint field added.
    return dp
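
Hook pairs like the process_row/modify_datapackage duo above are usually wired up by the high-level `process` entry point in datapackage_pipelines.wrapper; a minimal sketch of that processor file layout (assuming both functions are defined as in Examples #3 and #4):

from datapackage_pipelines.wrapper import process

process(modify_datapackage=modify_datapackage,
        process_row=process_row)
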
Example #5
    def __call__(self):
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource,
                                                               str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Avoid duplication checks
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    orig_res_iter = orig_res.iter(keyed=True)
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Example #6
    def __init__(self,
                 ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"])
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }
Example #7
import itertools

import datapackage

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING

parameters, dp, res_iter = ingest()

url = parameters['url']
resource = parameters['resource']
name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
resource_index = resource if isinstance(resource, int) else None

selected_resources = []
found = False
source_dp = datapackage.DataPackage(url)  # avoid shadowing the datapackage module
for i, orig_res in enumerate(source_dp.resources):
    if resource_index == i or \
          (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
        found = True
        dp['resources'].append(orig_res.descriptor)
        if tabular(orig_res.descriptor):
            orig_res.descriptor[PROP_STREAMING] = True
            selected_resources.append(orig_res.iter(keyed=True))

assert found, "Failed to find resource with index or name matching %r" % resource

spew(dp, itertools.chain(res_iter, selected_resources))
Example #8
from datetime import date

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher
from decimal import Decimal

parameters, dp, res_iter = ingest()

resource_matcher = ResourceMatcher(parameters.get('resource'))
key = parameters['key']
collated_field_name = parameters['collated-field-name']
assert isinstance(key, list)

for res in dp['resources']:
    if resource_matcher.match(res['name']):
        outer_fields = []
        inner_fields = []
        for field in res['schema']['fields']:
            if field['name'] in key:
                outer_fields.append(field)
            else:
                inner_fields.append(field)
        outer_fields.append({
            'name': collated_field_name,
            'type': 'object',
            'es:schema': {
                'fields': inner_fields
            }
        })
        schema = {
            'fields': outer_fields,
Example #9
import copy
import re

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'))
unpivot_fields = parameters.get('unpivot')
extra_keys = parameters.get('extraKeyFields')
extra_value = parameters.get('extraValueField')


def match_fields(field_name_re, expected):
    def filt(field):
        return (field_name_re.fullmatch(field['name']) is not None) is expected
    return filt


def process_datapackage(datapackage_):
    unpivot_fields_without_regex = []
    for resource in datapackage_['resources']:
        name = resource['name']
        if not resources.match(name):
            continue

        if 'schema' not in resource:
            continue

        fields = resource['schema'].get('fields', [])
Example #10
    def __call__(self):
        self.parameters['resource'] = self.parameters['resource-name']
        kv_cache = self.parameters.get('kv-cache', False)
        kv_path = self.parameters['kv-path']
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        log_progress_rows = self.parameters.get('log-progress-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        stream = self.parameters.get('stream', True)
        required = self.parameters.get('required', True)
        resource = self.parameters.get('resource')
        resources = self.parameters.get('resources')
        if resource is not None:
            assert not resources
            resource_index = resource if isinstance(resource, int) else None
        else:
            assert resources
            resource_index = None
            resource = list(resources.keys())
        name_matcher = ResourceMatcher(resource) if isinstance(resource, (str, list)) else None

        selected_resources = []
        found = False
        try:
            dp = datapackage.DataPackage(url)
        except Exception:
            if required:
                raise
            else:
                dp = None
        if dp:
            dp = self.process_datapackage(dp)
            for i, orig_res in enumerate(dp.resources):
                if resource_index == i or \
                        (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                    found = True
                    desc = copy.deepcopy(orig_res.descriptor)
                    if 'primaryKey' in desc.get('schema', {}):
                        # Avoid duplication checks
                        del orig_res.descriptor['schema']['primaryKey']
                        orig_res.commit()
                    desc[PROP_STREAMED_FROM] = orig_res.source
                    if resources:
                        desc.update(resources[desc['name']])
                    self.dp['resources'].append(desc)
                    if tabular(desc) and stream:
                        desc[PROP_STREAMING] = True
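                        # Replay rows from an existing key-value cache when
                        # allowed; otherwise stream from the source while
                        # writing rows through to the cache.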
                        if kv_cache and os.path.exists(kv_path):
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, kv_key=self.parameters.get('kv-key'))
                        else:
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, orig_res.iter(keyed=True), kv_key=self.parameters.get('kv-key'))
                        if limit_rows:
                            orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                        if log_progress_rows:
                            orig_res_iter = progress_logger(orig_res_iter, log_progress_rows)
                        selected_resources.append(orig_res_iter)
                    else:
                        desc[PROP_STREAMING] = False

        assert found or not required, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Example #11
import collections

from datapackage_pipelines.wrapper import spew, ingest
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

import logging
log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

resource_name = parameters['name']
resources_matcher = ResourceMatcher(resource_name)


datapackage['resources'] = [res for res in datapackage['resources']
                            if not resources_matcher.match(res['name'])]


def process_resources(res_iter_):
    # A plain for-loop rather than `while True: next(...)`: letting
    # StopIteration escape a generator raises RuntimeError under PEP 479
    # (Python 3.7+).
    for resource_ in res_iter_:
        if resources_matcher.match(resource_.spec['name']):
            # This is the one we're deleting, empty the iterator.
            collections.deque(resource_, maxlen=0)
        else:
            yield resource_


spew(datapackage, process_resources(res_iter))
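
The `collections.deque(iterator, maxlen=0)` idiom above is the standard way to drain an iterator without retaining any items; a quick self-contained illustration:

import collections

rows = iter(range(5))
collections.deque(rows, maxlen=0)   # consumes every item, stores none
print(next(rows, 'exhausted'))      # -> 'exhausted'
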
Example #12
        _resource['schema'] = schema

    close()
    del stream

    return itertools.islice(
        _reader(get_opener(_url, _resource), _url),
        1, None)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'))
ignore_missing = parameters.get('ignore-missing', False)

new_resource_iterator = []
for resource in datapackage['resources']:

    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
Example #13
import itertools

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, datapackage, resource_iterator = ingest()

sources = ResourceMatcher(parameters.get('sources'))

target = parameters.get('target', {})
if 'name' not in target:
    target['name'] = 'concat'
if 'path' not in target:
    target['path'] = 'data/' + target['name'] + '.csv'
target.update(dict(mediatype='text/csv', schema=dict(fields=[],
                                                     primaryKey=[])))

fields = parameters['fields']

# Create mapping between source field names to target field names
field_mapping = {}
for target_field, source_fields in fields.items():
    if source_fields is not None:
        for source_field in source_fields:
            if source_field in field_mapping:
                raise RuntimeError('Duplicate appearance of %s (%r)' %
                                   (source_field, field_mapping))
            field_mapping[source_field] = target_field

    if target_field in field_mapping:
        raise RuntimeError('Duplicate appearance of %s' % target_field)
Example #14
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.kvstore import KVStore
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher


class KeyCalc(object):
    def __init__(self, key_spec):
        self.key_spec = key_spec

    def __call__(self, row):
        return self.key_spec.format(**row)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters['resources'])
key_calc = KeyCalc(parameters['sort-by'])


def sorter(resource):
    db = KVStore()
    for row_num, row in enumerate(resource):
        key = key_calc(row) + "{:08x}".format(row_num)
        db[key] = row
    for key in db.keys():
        yield db[key]


def new_resource_iterator(resource_iterator_):
    for resource in resource_iterator_:
        if resources.match(resource.spec['name']):
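
KeyCalc above builds a sort key by interpolating row values into the format string, and the "{:08x}" row-number suffix keeps the sort stable for equal keys. Given the KeyCalc class defined in this example (field names hypothetical):

key_calc = KeyCalc('{country}:{year}')
print(key_calc({'country': 'FRA', 'year': 2001, 'value': 1.5}))  # -> 'FRA:2001'
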
Example #15
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.generators import slugify
from datapackage_pipelines.utilities.resource_matcher import ResourceMatcher

parameters, dp, res_iter = ingest()

resource_name = parameters['resource-name']
resource_matcher = ResourceMatcher(resource_name)
source_fields = parameters['source-fields']
name_field = parameters['name-field']
fingerprint_field = parameters['fingerprint-field']


def process_resource(res):
    all_fingerprints = set()
    for row in res:
        name = None
        for src_field in source_fields:
            src_value = row[src_field]
            if src_value:
                if name is None:
                    name = src_value
                fingerprint = slugify(src_value, to_lower=True)
                if fingerprint in all_fingerprints:
                    continue
                all_fingerprints.add(fingerprint)
                yield {name_field: name, fingerprint_field: fingerprint}


def process_resources(resources):
    for res in resources:
Example #16
            yield process_resource(res, afield, tfield)
        else:
            yield res


def modify_datapackage(dp, resource_matcher, afield, tfield):
    for res in dp['resources']:
        if not resource_matcher.match(res['name']):
            continue
        field = [
            f for f in res['schema']['fields']
            if f['name'] == afield
        ][0]
        fields = [
            f for f in res['schema']['fields']
            if f['name'] != afield
        ]
        fields.append({
            'name': tfield,
            'type': field.get('es:itemType', 'string')
        })
        res['schema']['fields'] = fields
    return dp

if __name__ == '__main__':
    parameters, dp, res_iter = ingest()
    resource_matcher = ResourceMatcher(parameters.get('resource'))
    afield, tfield = parameters['array-field'], parameters['unwound-field']
    spew(modify_datapackage(dp, resource_matcher, afield, tfield),
         process_resources(res_iter, resource_matcher,
                           afield, tfield))
Example #17
class ResourceFilterProcessor(object):
    def __init__(self,
                 ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"])
        self.output_resource_name = self.parameters["output_resource"]
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }

    def set_default_parameters(self, default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        self.parameters.setdefault("input_resource", default_input_resource)
        self.parameters.setdefault("output_resource", default_output_resource)
        self.parameters.setdefault("replace_resource",
                                   default_replace_resource)

    def filter_data(self):
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        if self.parameters["replace_resource"]:
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        return resource_descriptor["name"] == self.parameters[
            "output_resource"]