Code example #1
    def __call__(self):
        url = self.parameters['url']
        dep_prefix = 'dependency://'
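        # resolve 'dependency://<pipeline-id>' to the upstream pipeline's output datapackage URL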
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(orig_res.descriptor)
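                # stream tabular resources row by row; non-tabular ones are only referenced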
                if tabular(orig_res.descriptor) and stream:
                    orig_res.descriptor[PROP_STREAMING] = True
                    selected_resources.append(orig_res.iter(keyed=True))
                else:
                    orig_res.descriptor[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Code example #2
    def __call__(self):
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource, str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Drop the primaryKey so streaming does not trigger duplicate-key validation
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    orig_res_iter = orig_res.iter(keyed=True)
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Code example #3
    for writer in router.values():
        csv_format.finalize_file(writer)


def process_resources(res_iter, fields, router):
    first = next(res_iter)
    yield split_to_years(first, fields, router)
    for res in res_iter:
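        # drain the remaining resource iterators without retaining any rows (maxlen=0)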
        collections.deque(res, 0)


if __name__ == '__main__':
    parameters, datapackage, res_iter = ingest()

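    # [13:] strips the 13-character 'dependency://' prefix to get the pipeline ID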
    denormalized_pkg = Package(
        get_dependency_datapackage_url(parameters['source-pipeline'][13:]))
    denormalized = denormalized_pkg.resources[0]
    fiscal_years = list(
        filter(lambda r: r.name == 'fiscal-years', denormalized_pkg.resources))

    if len(fiscal_years) == 0:
        spew(datapackage, res_iter)
    else:
        fiscal_years = fiscal_years[0]
        fiscal_years = list(map(lambda x: x[0], fiscal_years.iter()))
        name_prefix = denormalized.name
        datapackage['resources'] = datapackage['resources'][:1]
        fields = denormalized.descriptor['schema']['fields']
        headers = [f['name'] for f in fields]
        fields = dict(zip(headers, fields))
        router = {}
Code example #4
import os
import tabulator
from tabulator.exceptions import SourceError
import logging
import time

from sqlalchemy import create_engine

from datapackage_pipelines_budgetkey.common.object_storage import temp_file, object_storage
from datapackage_pipelines.utilities.resources import PATH_PLACEHOLDER, PROP_STREAMING, PROP_STREAMED_FROM
from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url
from decimal import Decimal

from tableschema.exceptions import CastError

parameters, dp, res_iter = ingest()
input_file = get_dependency_datapackage_url(parameters['input-pipeline'])
db_table = parameters['db-table']
errors_db_table = parameters['error-db-table']

REVISION = 1

engine = create_engine(os.environ['DPP_DB_ENGINE'])
try:
    rp = engine.execute("""SELECT "report-url" from {} 
                        where "load-error" is not null
                        and "revision"={}""".format(errors_db_table, REVISION))
    errd_urls = set(r[0] for r in rp)
    rp = engine.execute("""SELECT distinct "report-url" from {}
                        where "revision"={}""".format(db_table, REVISION))
    all_good = set(r[0] for r in rp)
    logging.info('Got %d good reports, %d failed ones', len(all_good),
                 len(errd_urls))
Code example #5
    def __call__(self):
        self.parameters['resource'] = self.parameters['resource-name']
        kv_cache = self.parameters.get('kv-cache', False)
        kv_path = self.parameters['kv-path']
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        log_progress_rows = self.parameters.get('log-progress-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        stream = self.parameters.get('stream', True)
        required = self.parameters.get('required', True)
        resource = self.parameters.get('resource')
        resources = self.parameters.get('resources')
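        # exactly one of 'resource' (name or index) and 'resources' (name -> descriptor overrides) is expected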
        if resource is not None:
            assert not resources
            resource_index = resource if isinstance(resource, int) else None
        else:
            assert resources
            resource_index = None
            resource = list(resources.keys())
        name_matcher = ResourceMatcher(resource) if isinstance(resource, (str, list)) else None

        selected_resources = []
        found = False
        try:
            dp = datapackage.DataPackage(url)
        except Exception:
            if required:
                raise
            else:
                dp = None
        if dp:
            dp = self.process_datapackage(dp)
            for i, orig_res in enumerate(dp.resources):
                if resource_index == i or \
                        (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                    found = True
                    desc = copy.deepcopy(orig_res.descriptor)
                    if 'primaryKey' in desc.get('schema', {}):
                        # Avoid duplication checks
                        del orig_res.descriptor['schema']['primaryKey']
                        orig_res.commit()
                    desc[PROP_STREAMED_FROM] = orig_res.source
                    if resources:
                        desc.update(resources[desc['name']])
                    self.dp['resources'].append(desc)
                    if tabular(desc) and stream:
                        desc[PROP_STREAMING] = True
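                        # reuse the on-disk key-value cache when allowed and present;
                        # otherwise fill it while streaming from the source resource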
                        if kv_cache and os.path.exists(kv_path):
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, kv_key=self.parameters.get('kv-key'))
                        else:
                            kv = PersistentKVFile(kv_path, concurrent=True)
                            orig_res_iter = kv_res_iter(kv, orig_res.iter(keyed=True), kv_key=self.parameters.get('kv-key'))
                        if limit_rows:
                            orig_res_iter = itertools.islice(orig_res_iter, limit_rows)
                        if log_progress_rows:
                            orig_res_iter = progress_logger(orig_res_iter, log_progress_rows)
                        selected_resources.append(orig_res_iter)
                    else:
                        desc[PROP_STREAMING] = False

        assert found or not required, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Code example #6
import datapackage

from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url

dep_prefix = 'dependency://'

parameters, dp, res_iter = ingest()

url = parameters['url']
if url.startswith(dep_prefix):
    dependency = url[len(dep_prefix):].strip()
    url = get_dependency_datapackage_url(dependency)
    assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency

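# copy all top-level metadata from the dependency's datapackage, except its resource list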
datapackage = datapackage.DataPackage(url)
for k, v in datapackage.descriptor.items():
    if k != 'resources':
        dp[k] = v

spew(dp, res_iter)
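
All six examples share one idiom: a parameter URL of the form dependency://<pipeline-id> is resolved to the upstream pipeline's output datapackage via get_dependency_datapackage_url before anything is loaded. The sketch below factors that idiom into a helper. The helper name resolve_dependency_url and the pipeline ID ./my/upstream-pipeline are hypothetical illustrations; ingest, spew, get_dependency_datapackage_url and datapackage.DataPackage are used exactly as in the examples above.

import datapackage

from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url

DEP_PREFIX = 'dependency://'


def resolve_dependency_url(url):
    # Turn 'dependency://<pipeline-id>' into the URL of that pipeline's
    # output datapackage; pass any other URL through unchanged.
    if url.startswith(DEP_PREFIX):
        dependency = url[len(DEP_PREFIX):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    return url


parameters, dp, res_iter = ingest()
# 'dependency://./my/upstream-pipeline' is a hypothetical default for illustration
url = resolve_dependency_url(parameters.get('url', 'dependency://./my/upstream-pipeline'))
source = datapackage.DataPackage(url)
# e.g. merge the dependency's metadata, as code example #6 does
for k, v in source.descriptor.items():
    if k != 'resources':
        dp[k] = v
spew(dp, res_iter)

Factoring the prefix handling into one helper avoids the copy-paste drift visible between examples #1, #2 and #5, which inline the same block with small variations.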