コード例 #1
0
 def func(package):
     """Yield the datapackage, then every resource stream; rows of
     matched resources are routed through _find_replace with *fields*."""
     selector = ResourceMatcher(resources, package.pkg)
     yield package.pkg
     for resource_rows in package:
         if not selector.match(resource_rows.res.name):
             yield resource_rows
             continue
         yield _find_replace(resource_rows, fields)
コード例 #2
0
    def func(package):
        """Print the *key* property of every matched resource descriptor,
        then pass the datapackage and all resources through unchanged."""
        selector = ResourceMatcher(resources, package.pkg)
        for descriptor in package.pkg.descriptor['resources']:
            if selector.match(descriptor['name']):
                print(descriptor[key])

        yield package.pkg
        yield from package
コード例 #3
0
 def func(package):
     """Yield the datapackage, then every resource; in matched resources
     each LazyJsonLine row is unwrapped to its .inner payload."""
     selector = ResourceMatcher(resources, package.pkg)
     yield package.pkg
     for resource_rows in package:
         if not selector.match(resource_rows.res.name):
             yield resource_rows
             continue
         yield (r.inner if isinstance(r, LazyJsonLine) else r
                for r in resource_rows)
コード例 #4
0
File: add_field.py  Project: vitaly-am/dataflows
 def func(package):
     """Append a new field (*name*/*type* plus **options) to each matched
     resource's schema, then add the column with *default* to its rows."""
     selector = ResourceMatcher(resources, package.pkg)
     for descriptor in package.pkg.descriptor['resources']:
         if selector.match(descriptor['name']):
             # A fresh dict per resource so schemas never alias one object.
             descriptor['schema']['fields'].append(
                 dict(name=name, type=type, **options))
     yield package.pkg
     for resource_rows in package:
         if not selector.match(resource_rows.res.name):
             yield resource_rows
         else:
             yield column_adder(resource_rows, name, default)
コード例 #5
0
 def process_datapackage(self, dp):
     """Apply self.options to every schema field whose name matches
     self.name within matched resources; record each updated field name."""
     dp = super(set_type, self).process_datapackage(dp)
     self.matcher = ResourceMatcher(self.resources, dp)
     added = False
     for descriptor in dp.descriptor['resources']:
         if not self.matcher.match(descriptor['name']):
             continue
         for field in descriptor['schema']['fields']:
             if self.name.match(field['name']):
                 field.update(self.options)
                 self.field_names.append(field['name'])
                 added = True
     # assert added, 'Failed to find field {} in schema'.format(self.name)
     return dp
コード例 #6
0
    def func(package: PackageWrapper):
        """Merge *props* into every matched resource descriptor, then stream
        the package through, unwrapping matched resources to raw iterators."""
        selector = ResourceMatcher(resources, package.pkg)
        for descriptor in package.pkg.descriptor['resources']:
            if selector.match(descriptor['name']):
                descriptor.update(props)
        yield package.pkg

        for resource in iter(package):
            yield resource.it if selector.match(resource.res.name) else resource
コード例 #7
0
    def func(package):
        """Normalize field names (lowercase, spaces to underscores) in the
        schemas of matched resources and rename keys in their rows to match."""
        selector = ResourceMatcher(resources, package.pkg)
        for descriptor in package.pkg.descriptor['resources']:
            if not selector.match(descriptor['name']):
                continue
            for field in descriptor['schema']['fields']:
                field['name'] = field['name'].lower().replace(' ', '_')

        yield package.pkg
        for resource_rows in package:
            if not selector.match(resource_rows.res.name):
                yield resource_rows
            else:
                yield rename_in_row(resource_rows)
コード例 #8
0
    def func(package: PackageWrapper):
        """Set *primary_key* on each matched resource's schema, then stream
        the package through (matched resources unwrapped to raw iterators)."""
        selector = ResourceMatcher(resources, package.pkg)
        for descriptor in package.pkg.descriptor['resources']:
            if selector.match(descriptor['name']):
                schema = descriptor.setdefault('schema', {})
                schema['primaryKey'] = primary_key
        yield package.pkg

        for resource in iter(package):
            yield resource.it if selector.match(resource.res.name) else resource
コード例 #9
0
class set_type(DataStreamProcessor):
    """Update schema fields whose name matches *name* with *options*, and
    run a schema validator over the rows of the affected resources."""

    def __init__(self, name, resources=-1, **options):
        super(set_type, self).__init__()
        # Anchor the pattern so a bare field name must match exactly.
        self.name = re.compile(f'^{name}$')
        self.options = options
        self.resources = resources
        self.field_names = []

    def process_resources(self, resources):
        for res in resources:
            # Only validate matched resources that actually had fields updated.
            if self.matcher.match(res.res.name) and self.field_names:
                yield schema_validator(res.res, res,
                                       field_names=self.field_names)
            else:
                yield res

    def process_datapackage(self, dp):
        dp = super(set_type, self).process_datapackage(dp)
        self.matcher = ResourceMatcher(self.resources, dp)
        added = False
        for descriptor in dp.descriptor['resources']:
            if not self.matcher.match(descriptor['name']):
                continue
            for field in descriptor['schema']['fields']:
                if self.name.match(field['name']):
                    field.update(self.options)
                    self.field_names.append(field['name'])
                    added = True
        # assert added, 'Failed to find field {} in schema'.format(self.name)
        return dp
コード例 #10
0
    def func(package):
        """Rename field *name* to *new_name* in matched resources' schemas,
        then rename the corresponding key in every row via rename_in_row.

        The schema pass is best-effort: descriptors missing the expected
        'resources'/'schema'/'fields' structure are tolerated and skipped.
        """
        matcher = ResourceMatcher(resources, package.pkg)
        try:
            for resource in package.pkg.descriptor['resources']:
                if matcher.match(resource['name']):
                    for field in resource['schema']['fields']:
                        if field['name'] == name:
                            field['name'] = new_name
        except (KeyError, TypeError):
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit/GeneratorExit; malformed
            # descriptors are still tolerated as before.
            pass

        yield package.pkg
        for res in package:
            if matcher.match(res.res.name):
                yield rename_in_row(res, name, new_name)
            else:
                yield res
コード例 #11
0
 def __init__(self,
              ingest_response=None,
              default_input_resource=None,
              default_output_resource=None,
              default_replace_resource=True,
              table_schema=None,
              resource_filter=None):
     """Wire the processor from an ingest() response, filling parameter
     defaults and preparing the output resource descriptor."""
     ingest_response = ingest_response or ingest()
     (self.parameters,
      self.datapackage,
      self.resource_iterator) = ingest_response
     self.set_default_parameters(default_input_resource,
                                 default_output_resource,
                                 default_replace_resource)
     self._resource_filter_param = resource_filter
     self.input_resource_matcher = ResourceMatcher(
         self.parameters["input_resource"], self.datapackage)
     out_name = self.parameters["output_resource"]
     self.output_resource_name = out_name
     self.output_resource_descriptor = {
         "name": out_name,
         PROP_STREAMING: True,
         "path": "data/{}.csv".format(out_name),
         "schema": table_schema,
     }
コード例 #12
0
    def func(package):
        """Pivot matched resources: group rows by *join_field* and spread
        (*key_field*, *value_field*) pairs into '<key_field>_<key>' columns.

        NOTE(review): `new_field_names` is assigned only inside the matched
        branch of the data loop — if no resource matches, the "Meta (post)"
        section raises NameError. The post-yield schema mutation also relies
        on the consumer fully driving this generator. Confirm both.
        """
        matcher = ResourceMatcher(resources, package.pkg)

        # Meta (pre)
        for resource in package.pkg.descriptor['resources']:
            if matcher.match(resource['name']):
                # Drop the key/value columns from the schema; replacement
                # columns are appended in the "Meta (post)" pass below.
                fields = resource['schema']['fields']
                fields = list(
                    filter(lambda field: field['name'] != key_field, fields))
                fields = list(
                    filter(lambda field: field['name'] != value_field, fields))
                resource['schema']['fields'] = fields
        package.pkg.commit()
        yield package.pkg

        # Data
        for resource in package:
            if not matcher.match(resource.res.name):
                yield resource
            if matcher.match(resource.res.name):
                groups = {}
                new_field_names = set()
                for row in resource:
                    # One output row per join value; each key becomes a column.
                    groups.setdefault(row[join_field], {})
                    groups[row[join_field]][row[key_field]] = row[value_field]
                    new_field_names.add('_'.join([key_field, row[key_field]]))
                rows = []
                for group_name, group_data in groups.items():
                    row = {join_field: group_name}
                    for group_data_key, group_data_value in group_data.items():
                        row['_'.join([key_field,
                                      group_data_key])] = group_data_value
                    rows.append(row)
                yield iter(rows)

        # Meta (post)
        for resource in package.pkg.descriptor['resources']:
            if matcher.match(resource['name']):
                # Register the dynamically discovered columns as strings.
                fields = resource['schema']['fields']
                for new_field_name in new_field_names:
                    fields.append({
                        'name': new_field_name,
                        'type': 'string',
                        'format': 'default'
                    })
                resource['schema']['fields'] = fields
        package.pkg.commit()
コード例 #13
0
class ResourceFilterProcessor(object):
    """datapackage-pipelines processor that replaces (or appends) one
    resource in the stream, filtering its data through a callback.

    The callback (``resource_filter``) receives ``(data, parameters)``
    and returns the filtered row iterator.
    """

    def __init__(self,
                 ingest_response=None,
                 default_input_resource=None,
                 default_output_resource=None,
                 default_replace_resource=True,
                 table_schema=None,
                 resource_filter=None):
        if not ingest_response:
            ingest_response = ingest()
        self.parameters, self.datapackage, self.resource_iterator = ingest_response
        self.set_default_parameters(default_input_resource,
                                    default_output_resource,
                                    default_replace_resource)
        self._resource_filter_param = resource_filter
        self.input_resource_matcher = ResourceMatcher(
            self.parameters["input_resource"], self.datapackage)
        self.output_resource_name = self.parameters["output_resource"]
        # Descriptor that replaces (or is appended to) the datapackage.
        self.output_resource_descriptor = {
            "name": self.output_resource_name,
            PROP_STREAMING: True,
            "path": "data/{}.csv".format(self.output_resource_name),
            "schema": table_schema
        }

    def set_default_parameters(self, default_input_resource,
                               default_output_resource,
                               default_replace_resource):
        # Fill in any parameters the pipeline spec did not provide.
        self.parameters.setdefault("input_resource", default_input_resource)
        self.parameters.setdefault("output_resource", default_output_resource)
        self.parameters.setdefault("replace_resource",
                                   default_replace_resource)

    def filter_data(self):
        # Resources are consumed positionally from the iterator, so the
        # descriptor order must stay aligned with the stream order.
        for resource_descriptor in self.datapackage["resources"]:
            resource_data = next(self.resource_iterator)
            if self._is_matching_resource(resource_descriptor):
                yield self.filter_resource_data(resource_data, self.parameters)
            else:
                yield resource_data

    def filter_datapackage(self):
        # Either overwrite matched descriptors in place or append a new one.
        if self.parameters["replace_resource"]:
            for resource in self.datapackage["resources"]:
                if self.input_resource_matcher.match(resource["name"]):
                    resource.update(self.output_resource_descriptor)
        else:
            self.datapackage["resources"].append(
                self.output_resource_descriptor)
        return self.datapackage

    def filter_resource_data(self, data, parameters):
        # Delegate to the user-supplied callback.
        return self._resource_filter_param(data, parameters)

    def spew(self):
        spew(*self._get_spew_params())

    def get_stats(self):
        # Subclasses may override to report processing stats.
        return {}

    @classmethod
    def main(cls, **kwargs):
        cls(ingest_response=ingest(), **kwargs).spew()

    def _get_spew_params(self):
        # (datapackage, data-iterator, stats) triple expected by spew().
        datapackage = self.filter_datapackage()
        return datapackage, self.filter_data(), self.get_stats()

    def _is_matching_resource(self, resource_descriptor):
        # NOTE(review): matches by the *output* resource name here, while
        # filter_datapackage matches via input_resource_matcher — confirm
        # this asymmetry is intentional.
        return resource_descriptor["name"] == self.parameters[
            "output_resource"]
コード例 #14
0
# Pipeline step setup: presumably collates non-key row fields into a single
# object-typed field named by 'collated-field-name' — the processing loop is
# continued below.
from datetime import date

from datapackage_pipelines.wrapper import ingest, spew
from dataflows.helpers.resource_matcher import ResourceMatcher
from decimal import Decimal

# Standard datapackage-pipelines entry point: parameters, datapackage, rows.
parameters, dp, res_iter = ingest()

# 'key' lists the field names kept at the top level; everything else is
# collated. It must be a list.
resource_matcher = ResourceMatcher(parameters.get('resource'), dp)
key = parameters['key']
collated_field_name = parameters['collated-field-name']
assert isinstance(key, list)

for res in dp['resources']:
    if resource_matcher.match(res['name']):
        outer_fields = []
        inner_fields = []
        for field in res['schema']['fields']:
            if field['name'] in key:
                outer_fields.append(field)
            else:
                inner_fields.append(field)
        outer_fields.append({
            'name': collated_field_name,
            'type': 'object',
            'es:schema': {
                'fields': inner_fields
            }
        })
        schema = {
            'fields': outer_fields,
コード例 #15
0
    def __call__(self):
        """Load an external datapackage and splice selected resources into
        the current pipeline's datapackage and resource stream."""
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        log_progress_rows = self.parameters.get('log-progress-rows')
        # 'dependency://<name>' URLs resolve to another pipeline's output.
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        stream = self.parameters.get('stream', True)
        required = self.parameters.get('required', True)
        # Exactly one of 'resource' (index or name) / 'resources'
        # (name -> descriptor overrides) must be provided.
        resource = self.parameters.get('resource')
        resources = self.parameters.get('resources')
        if resource is not None:
            assert not resources
            resource_index = resource if isinstance(resource, int) else None
        else:
            assert resources
            resource_index = None
            resource = list(resources.keys())
        name_matcher = (ResourceMatcher(resource, self.dp) if isinstance(
            resource, (str, list)) else None)

        selected_resources = []
        found = False
        try:
            dp = datapackage.DataPackage(url)
        except Exception:
            # A missing/broken source is tolerated unless 'required'.
            if required:
                raise
            else:
                dp = None
        if dp:
            dp = self.process_datapackage(dp)
            for i, orig_res in enumerate(dp.resources):
                # Select by positional index or by name match.
                if resource_index == i or \
                        (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                    found = True
                    desc = copy.deepcopy(orig_res.descriptor)
                    if 'primaryKey' in desc.get('schema', {}):
                        # Avoid duplication checks
                        del orig_res.descriptor['schema']['primaryKey']
                        orig_res.commit()
                    desc[PROP_STREAMED_FROM] = orig_res.source
                    if resources:
                        # Apply the per-resource descriptor overrides.
                        desc.update(resources[desc['name']])
                    self.dp['resources'].append(desc)
                    if tabular(desc) and stream:
                        desc[PROP_STREAMING] = True
                        orig_res_iter = orig_res.iter(keyed=True)
                        if limit_rows:
                            orig_res_iter = itertools.islice(
                                orig_res_iter, limit_rows)
                        if log_progress_rows:
                            orig_res_iter = progress_logger(
                                orig_res_iter, log_progress_rows)
                        selected_resources.append(orig_res_iter)
                    else:
                        # Descriptor-only: data is referenced, not streamed.
                        desc[PROP_STREAMING] = False

        assert found or not required, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
コード例 #16
0
    close()
    del stream

    return itertools\
        .islice(
            _reader(
                get_opener(_url, _resource, columns),
                _url,
                max_row=limit_rows),
            1, None)


# Pipeline step setup; the resource-streaming loop continues below.
parameters, datapackage, resource_iterator = ingest()

# Which resources to (re)stream, and optional row limiting.
resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:

    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
コード例 #17
0
def process_resources(res_iter, resource_matcher, afield, tfield):
    """Stream resources through, unwinding the array field on matched ones.

    Resources whose spec name matches are transformed by process_resource();
    all others are yielded untouched.
    """
    for resource in res_iter:
        if not resource_matcher.match(resource.spec['name']):
            yield resource
        else:
            yield process_resource(resource, afield, tfield)


def modify_datapackage(dp, resource_matcher, afield, tfield):
    """Replace the array field *afield* with an unwound field *tfield*.

    For every matched resource schema, drop the field named *afield* and
    append a field named *tfield* whose type is taken from the array
    field's 'es:itemType' (defaulting to 'string').

    Returns the (mutated) datapackage descriptor. Raises IndexError if a
    matched resource has no field named *afield*, as before.
    """
    for res in dp['resources']:
        if not resource_matcher.match(res['name']):
            continue
        fields = res['schema']['fields']
        array_field = [f for f in fields if f['name'] == afield][0]
        remaining = [f for f in fields if f['name'] != afield]
        remaining.append({
            'name': tfield,
            # .get() replaces the double-lookup `x if k in d else default`.
            'type': array_field.get('es:itemType', 'string'),
        })
        res['schema']['fields'] = remaining
    return dp


if __name__ == '__main__':
    # Standard datapackage-pipelines entry point.
    parameters, dp, res_iter = ingest()
    resource_matcher = ResourceMatcher(parameters.get('resource'), dp)
    afield = parameters['array-field']
    tfield = parameters['unwound-field']
    spew(
        modify_datapackage(dp, resource_matcher, afield, tfield),
        process_resources(res_iter, resource_matcher, afield, tfield),
    )