def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    fields = [{
        "name": "manifest_label",
        "type": "string"
    }, {
        "name": "manifest_sysnum",
        "type": "string"
    }, {
        "name": "resource_id",
        "type": "string"
    }, {
        "name": "resource_type",
        "type": "string"
    }, {
        "name": "resource_format",
        "type": "string"
    }, {
        "name": "resource_width",
        "type": "number"
    }, {
        "name": "resource_height",
        "type": "number"
    }, {
        "name": "resource_filepath",
        "type": "string"
    }, {
        "name": "url",
        "type": "string"
    }, {
        "name": "downloaded",
        "type": "boolean"
    }]
    output_resources = []
    output_descriptors = []
    for resource, descriptor in zip(resources, datapackage["resources"]):
        logging.info("creating images archive for collection {}".format(
            descriptor["name"]))
        output_resources.append(
            get_resource(resource, aggregations, descriptor["name"]))
        output_descriptors.append({
            PROP_STREAMING: True,
            "name": descriptor["name"],
            "path": "{}.csv".format(descriptor["name"]),
            "schema": {
                "fields": fields
            }
        })
    datapackage["resources"] = output_descriptors
    spew(datapackage, output_resources, aggregations["stats"])
Example #2
def main():
    parameters, datapackage, resources = ingest()
    for resource in datapackage["resources"]:
        if resource["name"] == "manifests":
            for field in resource["schema"]["fields"]:
                if field["name"] in ["attribution", "subject", "alternative_title", "title", "the_creator",
                                     "publisher", "label", "description"]:
                    field["es:type"] = "text"
                elif field["name"] in ["map", "sysnum", "language", "collection", "base"]:
                    field["es:type"] = "keyword"
                else:
                    field["es:type"] = "text"

    spew(datapackage, resources)
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int), )
    max_year = parameters.get('max-year')
    file_path_template = parameters.get('file-path-template')
    missing_image = parameters.get('missing-image')
    datapackage['resources'] = []
    for resource in resources:
        for rownum, row in enumerate(resource):
            if max_year and row['year'] > max_year:
                stats['invalid year'] += 1
                continue
            if parameters.get('download-thumbnails'):
                if not row['thumb_url']:
                    stats['missing thumb_url'] += 1
                    continue
                name = 'rownum_{}'.format(rownum)
                if file_path_template:
                    photo_filename = file_path_template.format(rownum=rownum)
                    if not path.exists(photo_filename):
                        stats['full size photo missing'] += 1
                        continue
                    if missing_image:
                        if filecmp.cmp(photo_filename,
                                       missing_image,
                                       shallow=False):
                            stats['photo is missing_image photo'] += 1
                            continue
                stats['valid thumbnail'] += 1
                url = row['thumb_url']
                datapackage['resources'].append({
                    PROP_STREAMED_FROM: url,
                    'name': name,
                    'path': ['files/' + name + '.jpg'],
                })
            else:
                if row['image_url']:
                    url = parameters['image_url_prefix'] + row['image_url']
                    name = 'rownum_{}'.format(rownum)
                    datapackage['resources'].append({
                        PROP_STREAMED_FROM: url,
                        'name': name,
                        'path': ['files/' + name + '.png'],
                    })

    spew(datapackage, [], stats)
Example #4
    def __call__(self):
        url = self.parameters['url']
        limit_rows = self.parameters.get('limit-rows')
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource,
                                                               str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                desc = copy.deepcopy(orig_res.descriptor)
                if 'primaryKey' in desc.get('schema', {}):
                    # Avoid duplication checks
                    del orig_res.descriptor['schema']['primaryKey']
                    orig_res.commit()
                desc[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(desc)
                if tabular(desc) and stream:
                    desc[PROP_STREAMING] = True
                    orig_res_iter = orig_res.iter(keyed=True)
                    if limit_rows:
                        orig_res_iter = itertools.islice(
                            orig_res_iter, limit_rows)
                    selected_resources.append(orig_res_iter)
                else:
                    desc[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
    def test_spew_finalizer_runs_before_we_signal_that_were_done(self):
        '''Assert that the finalizer param is executed before spew is finished.

        We signal to other processors that we're done by writing an empty line
        to STDOUT. The finalizer parameter to spew() must be executed before that,
        as there can be processors that depend on us finishing our processing
        before they're able to run. For example, a processor that depends on
        `dump_to_zip` must wait until it has finished writing to the local
        filesystem.
        '''
        datapackage = {}
        resources_iterator = iter([])

        with mock.patch(
                'datapackage_pipelines.wrapper.wrapper.stdout') as stdout_mock:

            def finalizer():
                last_call_args = stdout_mock.write.call_args_list[-1]
                assert last_call_args != mock.call('\n')

            spew(datapackage, resources_iterator, finalizer=finalizer)
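
Since spew() executes the finalizer before writing that trailing empty line, a processor that writes artifacts to the local filesystem can close them inside the finalizer. The sketch below is hypothetical: the output file name and the pass-through row handling are illustrative, not taken from the examples in this listing.

import csv

from datapackage_pipelines.wrapper import ingest, spew


def main():
    parameters, datapackage, resources = ingest()
    out = open('rows-copy.csv', 'w', newline='')  # hypothetical local artifact
    writer = csv.writer(out)

    def copy_rows(resource):
        # Pass rows through unchanged while writing a local copy.
        for row in resource:
            writer.writerow(list(row.values()))
            yield row

    def finalizer():
        # Runs before spew emits the empty line that signals "done",
        # so downstream processors only ever see a fully written file.
        out.close()

    spew(datapackage,
         (copy_rows(resource) for resource in resources),
         finalizer=finalizer)


if __name__ == '__main__':
    main()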
Example #6
def main():
    parameters, datapackage, resources = ingest()
    resources = list(resources)
    stats = {}

    mk_individuals = _get_resource_from_datapackage(datapackage, resources,
                                                    'mk_individual')
    votes = _get_resource_from_datapackage(datapackage, resources,
                                           'vote_rslts_kmmbr_shadow')

    mk_individuals = list(mk_individuals)

    stats["total votes"] = 0
    datapackage["resources"][1]["schema"]["fields"].append({
        "name": "mk_individual_id",
        "type": "integer"
    })

    spew(datapackage,
         [mk_individuals,
          get_resource(votes, mk_individuals, stats)], stats)
def main():
    params, datapackage, res_iter = ingest()

    key = params['key']
    url_key = params['url-key']
    resource_name = params['resource-name']

    resource = {
        'name': resource_name,
        PROP_STREAMING: True,
        'path': 'data/{}.csv'.format(resource_name),
        'schema': {
            'fields': [
                {'name': '{}_Number'.format(key), 'type': 'string'},
                {'name': '{}_Name'.format(key), 'type': 'string'},
                {'name': '{}_Registration_Date'.format(key), 'type': 'string'},
            ]
        }
    }
    datapackage['resources'].append(resource)

    spew(datapackage, [get_entities(url_key)])
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}, "sysnum_images": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"] = get_resource_row_image_schema()

    def get_resource(resource, descriptor):
        init_resource_stats(aggregations["stats"], descriptor)
        bucket = get_bucket(*list(
            map(os.environ.get, [
                "GCS_SERVICE_ACCOUNT_B64_KEY", "GCS_IMAGES_BUCKET",
                "GCS_PROJECT"
            ])))
        queue, threads = None, None
        if not os.environ.get("GCS_DISABLE_DOWNLOAD"):
            numthreads = int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS",
                                            "5"))
            poolsize = 20 if numthreads < 50 else int(numthreads / 2)
            logging.info("poolsize={}, numthreads={}".format(
                poolsize, numthreads))
            queue, threads = start_downloader(
                poolsize,
                numthreads,
                worker=partial(download_blob, bucket, aggregations,
                               descriptor["name"]),
                max_retries=5)
        yield from get_images(resource, aggregations, descriptor["name"],
                              bucket, queue)
        if queue:
            stop_downloader(
                queue, threads,
                int(os.environ.get("DOWNLOAD_IMAGES_NUM_THREADS", "5")))

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield get_resource(resource, descriptor)

    spew(datapackage, get_resources(), aggregations["stats"])
Example #9
def main():
    parameters, datapackage, resources, stats = ingest() + ({}, )
    datapackage['resources'] = [{
        PROP_STREAMING: True,
        "name": "zio",
        "path": "zio.csv",
        "schema": {
            "fields": [{
                "name": "description",
                "type": "string"
            }, {
                "name": "year",
                "type": "year"
            }, {
                "name": "id",
                "type": "string"
            }, {
                "name": "thumb_url",
                "type": "string"
            }, {
                "name": "details_url",
                "type": "string"
            }, {
                "name": "scrape_year",
                "type": "year"
            }, {
                "name": "page_number",
                "type": "integer"
            }, {
                "name": "rownum",
                "type": "integer"
            }, {
                'name': 'error',
                'type': 'string'
            }]
        }
    }]
    spew(datapackage, [get_resource(parameters)], stats)
Example #10
def main():
    parameters, datapackage, resources = ingest()
    stats = {}
    aggregations = {"stats": stats}
    jinja_env = get_jinja_env()
    committees = {}
    committees_descriptor = None
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "kns_committee":
            committees_descriptor = descriptor
            for committee in resource:
                committees[int(committee["CommitteeID"])] = committee
        elif descriptor["name"] == "kns_committeesession":
            build_meeting_templates(resource, committees, jinja_env,
                                    descriptor, committees_descriptor,
                                    aggregations)
            build_committee_templates(jinja_env, committees,
                                      committees_descriptor, aggregations)
            build_committee_knessets_list_template(jinja_env, committees,
                                                   aggregations)
            build_committees_index_template(jinja_env, committees,
                                            aggregations)
    spew({}, [], stats)
def main():
    params, dp, res_iter = ingest()

    dp['name'] = 'category-explanations'
    dp['resources'] = [{
        'name': 'category-explanations',
        'path': 'data/category-explanations.csv',
        PROP_STREAMING: True,
        'schema': {
            'fields': [
                {
                    'name': 'budget_code',
                    'type': 'string'
                },
                {
                    'name': 'explanation',
                    'type': 'string'
                },
                {
                    'name': 'explanation_short',
                    'type': 'string'
                },
                {
                    'name': 'source',
                    'type': 'string'
                },
            ]
        }
    }]

    spew(dp, [
        itertools.chain(
            process_file('category-explanations.md', 'explanation'),
            process_file('category-explanations-short.md',
                         'explanation_short'),
        )
    ])
def main():
    parameters, datapackage, resources, stats = ingest() + ({}, )
    bills = {}
    israel_law_bill_ids = {}
    for bill in next(resources):
        bill['law_ministry_ids'] = []
        bills[bill['BillID']] = bill
        if bill['IsraelLawID']:
            for israel_law_id in bill['IsraelLawID']:
                israel_law_bill_ids.setdefault(israel_law_id, [])
                israel_law_bill_ids[israel_law_id].append(bill['BillID'])
    for law_ministry in next(resources):
        for bill_id in israel_law_bill_ids.get(law_ministry['IsraelLawID'],
                                               []):
            if law_ministry['GovMinistryID'] not in bills[bill_id][
                    'law_ministry_ids']:
                bills[bill_id]['law_ministry_ids'].append(
                    law_ministry['GovMinistryID'])
    gov_ministries = {}
    for gov_ministry in next(resources):
        gov_ministries[gov_ministry['GovMinistryID']] = gov_ministry['Name']
    for bill in bills.values():
        ministry_names = set()
        for ministry_id in bill['law_ministry_ids']:
            ministry_names.add(gov_ministries[ministry_id])
        bill['law_ministry_names'] = ', '.join(ministry_names)
    datapackage["resources"] = [datapackage['resources'][0]]
    fields = [{
        'name': 'law_ministry_ids',
        'type': 'array'
    }, {
        'name': 'law_ministry_names',
        'type': 'string'
    }]
    datapackage["resources"][0]['schema']['fields'] += fields
    spew(datapackage, [bills.values()], stats)
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    collections = {}
    for descriptor, resource in zip(datapackage["resources"], resources):
        if descriptor["name"] == "collections":
            collections = list(resource)
        else:
            list(resource)
    datapackage["resources"] = []
    for collection in collections:
        datapackage["resources"].append({
            PROP_STREAMING:
            True,
            "name":
            collection["id"],
            "path":
            "{}.csv".format(collection["id"]),
            "schema": {
                "fields": [{
                    "name": "label",
                    "type": "string"
                }, {
                    "name": "manifest",
                    "type": "string"
                }]
            }
        })

    def get_resource(collection):
        for member in json.loads(requests.get(
                collection["json"]).content)["members"]:
            yield {"label": member["label"], "manifest": member["@id"]}

    spew(datapackage, (get_resource(collection) for collection in collections),
         aggregations["stats"])
Example #14
def main():
    parameters, datapackage, resources, stats = ingest() + (defaultdict(int),)
    resource_names = [r['name'] for r in datapackage['resources']]
    datapackage['resources'] = [r for r in datapackage['resources'] if r['name'] == 'foi_offices']
    datapackage['resources'][0]['schema']['fields'] += [{'name': 'update_type', 'type': 'string'},
                                                        {'name': 'update_title', 'type': 'string'},
                                                        {'name': 'entity_id', 'type': 'string'},]

    def get_resources():
        existing_entities = {}
        for resource_name, resource in zip(resource_names, resources):
            if resource_name == 'existing_entities':
                for row in get_existing_entities(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi-groups-matching':
                for row in get_foi_groups_matching(resource, existing_entities, stats):
                    pass
            elif resource_name == 'foi_offices':
                yield get_foi_offices_resource(resource, existing_entities, stats, parameters.get('dry-run'))
            else:
                for row in resource:
                    pass

    spew(datapackage, get_resources(), stats)
Example #15
def main():
    params, dp, res_iter = ingest()

    os.makedirs('/var/datapackages/sitemaps', exist_ok=True)

    kind = params['kind']
    db_table = params['db-table']
    doc_id = params['doc-id']
    page_title = params['page-title']

    if not dp.get('resources'):
        dp['resources'] = [{
            'name': 'sitemaps',
            'path': 'sitemaps.csv',
            PROP_STREAMING: True,
            'schema': {
                'fields': [{
                    'name': 'filename',
                    'type': 'string'
                }]
            }
        }]

    spew(dp, [process_rows(res_iter, kind, db_table, doc_id, page_title)])
Example #16
def main():
    parameters, dp, res_iter = ingest()

    connection_string = get_connection_string()

    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')

    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name'] for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)

            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields', input_hash_fields)

            existing_ids = \
                get_all_existing_ids(connection_string,
                                     parameters['db-table'],
                                     db_key_fields,
                                     db_hash_fields)
            break

    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))

    spew(dp, process_resources(res_iter,
                               resource_name,
                               input_key_fields,
                               input_hash_fields,
                               existing_ids))
def main():
    parameters, datapackage, resources = ingest()
    aggregations = {"stats": {}}
    resources = list(resources)
    for descriptor in datapackage["resources"]:
        descriptor["schema"]["fields"] = [{
            "name": "doc_id",
            "type": "string"
        }, {
            "name": "system_number",
            "type": "string"
        }, {
            "name": "manifest_url",
            "type": "string"
        }, {
            "name": "manifest_file",
            "type": "string"
        }]

    def get_resources():
        for resource, descriptor in zip(resources, datapackage["resources"]):
            yield (parse_row(row) for row in resource)

    spew(datapackage, get_resources(), aggregations["stats"])
def process(resources):
    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError(
                            '%s: Got %r whereas allowed values for this column are %r'
                            % (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100 * nones[column]) // counter
            if ratio_percent < thresholds[column]:
                raise ValueError(
                    '%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)'
                    % (column, counter - nones[column], counter, ratio_percent,
                       thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
import json
import logging

from datapackage_pipelines.wrapper import spew, ingest

parameters, datapackage, res_iter = ingest()
res_name = parameters.get('resource', datapackage['resources'][0]['name'])


def show_sample(res):
    logging.info('SAMPLE OF LINES from %s', res.spec['name'])
    for i, row in enumerate(res):
        if i < 10:
            if isinstance(row, LazyJsonLine):
                logging.info('#%s: %s', i, row._evaluate())
            else:
                logging.info('#%s: %r', i, row)
        yield row


def process_resources(res_iter_):
    for res in res_iter_:
        logging.info('? from %s', res.spec['name'])
        if res.spec['name'] == res_name:
            yield show_sample(res)
        else:
            yield res


logging.info(json.dumps(datapackage, indent=2))

spew(datapackage, process_resources(res_iter))
Example #20
def run_shell_command(command_line_args):
    logging.info('Subprocess: "' + ' '.join(command_line_args) + '"')

    try:
        command_line_process = subprocess.Popen(
            command_line_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )

        with command_line_process.stdout:
            log_subprocess_output(command_line_process.stdout)
    except (OSError, subprocess.CalledProcessError) as exception:
        logging.info('Exception occurred: ' + str(exception))
        logging.info('Subprocess failed')
        raise exception
    else:
        # no exception was raised
        logging.info('Subprocess finished')

    return True


parameters, datapackage, res_iter = ingest()

run_shell_command(
    parameters["arguments"]
)

spew(datapackage, res_iter)
Example #21
        fields_to_keep = [f['name'] for f in fields]
        fields.extend(extra_keys)
        fields.append(extra_value)
        resource['schema']['fields'] = fields
    return unpivot_fields_without_regex, fields_to_keep


def unpivot(rows, fields_to_unpivot_, fields_to_keep_):
    for row in rows:
        for unpivot_field in fields_to_unpivot_:
            new_row = copy.deepcopy(unpivot_field['keys'])
            for field in fields_to_keep_:
                new_row[field] = row[field]
            new_row[extra_value['name']] = row.get(unpivot_field['name'])
            yield new_row


def process_resources(resource_iterator_, fields_to_unpivot, fields_to_keep):
    for resource in resource_iterator_:
        spec = resource.spec
        if not resources.match(spec['name']):
            yield resource
        else:
            yield unpivot(resource, fields_to_unpivot, fields_to_keep)


old_fields, keep_fields = process_datapackage(datapackage)

spew(datapackage, process_resources(resource_iterator, old_fields, keep_fields))
Example #22
from datapackage_pipelines.wrapper import spew, ingest
import time, logging, datetime, sys


def filter_resource(resource, sleep_seconds):
    yield from resource
    time.sleep(sleep_seconds)


def filter_resources(datapackage, resources, parameters):
    input_resource_name = parameters.get("resource")
    sleep_seconds = float(parameters.get(
        "sleep-seconds", 2))  # sleep 2 seconds between resources
    for resource_descriptor, resource in zip(datapackage["resources"],
                                             resources):
        if not input_resource_name or input_resource_name == resource_descriptor[
                "name"]:
            logging.info("throttling resource {}: sleep_seconds={}".format(
                resource_descriptor["name"], sleep_seconds))
            yield filter_resource(resource, sleep_seconds)
        else:
            yield resource


parameters, datapackage, resources = ingest()
spew(datapackage, filter_resources(datapackage, resources, parameters))
userid = gobble.user.User().id
for dirpath, dirnames, filenames in os.walk('.'):
    if dirpath == '.':
        continue
    if FILENAME in filenames:
        pipeline = yaml.load(open(os.path.join(dirpath, FILENAME)))
        dataset_name = pipeline[list(pipeline.keys())[0]]['pipeline'][0]['parameters']['datapackage']['name']
        url_base = 'http://datastore.openspending.org/{}/{}'.format(userid, dataset_name)
        resp = requests.get(url_base+'/datapackage.json')
        if resp.status_code == 200:
            datapackage_json = resp.json()
            if len(country) > 0:
                if datapackage_json.get('geo', {}).get('country_code', 'xx').lower() != country:
                    continue
            resource = datapackage_json['resources'][0]
            resource_url = '{}/{}'.format(url_base, resource['path'])
            resources.append({
                PROP_STREAMED_FROM: resource_url,
                'path': PATH_PLACEHOLDER,
                'name': dataset_name,
                'encoding': 'utf-8',
                'delimiter': ',',
                'doublequote': True,
                'quotechar': '"',
                'skipinitialspace': False
            })
            logging.error(resource_url)

spew(datapackage, [])
    def main(cls):
        from datapackage_pipelines.wrapper import ingest, spew
        spew(*cls(*ingest()).spew())
"""Map the raw columns names to fiscal fields where indicated."""

import logging
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import get_fiscal_field_names


def update_datapackage(datapackage):
    valid_fiscal_fields = get_fiscal_field_names()
    for resource in datapackage['resources']:
        for field in resource['schema']['fields']:
            if field['maps_to'] in valid_fiscal_fields:
                field['name'] = field.pop('maps_to')
            else:
                logging.info('Unmapped = %s', field['name'])
    return datapackage


_, datapackage_, resources_ = ingest()
spew(update_datapackage(datapackage_), resources_)
    return html


def filter_resource(descriptor, data, stats):
    for row in data:
        stats[descriptor["name"]] += 1
        yield row

def filter_resources(datapackage, resources, parameters, stats):
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"], schema["fields"], schema["primaryKey"]))

        yield filter_resource(resource_descriptor, resource_data, stats)

    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(tables="".join(tables))

    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        save_schema_html = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="html")
        save_schema_json = DEFAULT_SAVE_SCHEMA.format(table_name=datapackage["name"], ext="json")

        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html, public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json, json.dumps(datapackage["resources"], indent=2, ensure_ascii=False), public_bucket=True)

stats = {}
spew(datapackage, filter_resources(datapackage, resources, parameters, stats), stats)
    for key, lookup in lookup_tables.items():
        if row[key] in lookup:
            row[key] = lookup[row[key]]
        else:
            warning('%s mapped to None because no alias was found', row[key])
            row[key] = None
    return row


def build_lookup_tables(mappings):
    """Build the lookup tables."""

    def lookup_table(mapping):
        for key, aliases in mapping.items():
            for alias in aliases:
                yield alias, key

    return {
        mapping['field']:
            dict(lookup_table(mapping['mapping']))
        for mapping in mappings
        }


if __name__ == '__main__':
    parameters, _, resources = ingest()
    lookup_tables_ = build_lookup_tables(parameters['mappings'])
    new_resources = process(resources, map_aliases,
                            lookup_tables=lookup_tables_)
    spew(_, new_resources)
                mk_attendance.append(mk_aggs)
            except Exception:
                logging.exception("Failed to process mk_individual name {}".format(mk_name))
                raise
        else:
            raise Exception("Failed to find mk_individual name for mk_individual id {}".format(mk_individual["mk_individual_id"]))

meeting_aggs_fields = [{"name": "knesset_num", "type": "integer"},
                       {"name": "committee_id", "type": "integer"},
                       {"name": "committee_name", "type": "string"},
                       {"name": "meeting_start_date", "type": "datetime"},
                       {"name": "meeting_topics", "type": "string"}, ]

datapackage["resources"] = []

datapackage["resources"] += [{"name": "errors", "path": "errors.csv", PROP_STREAMING: True,
                              "schema": {"fields": [{"name": "error", "type": "string"}, ]}}]

datapackage["resources"] += [{PROP_STREAMING: True,
                              "name": "mk_attendance",
                              "path": "mk_attendance.csv",
                              "schema": {"fields": meeting_aggs_fields + [{"name": "mk_id", "type": "integer"},
                                                                          {"name": "mk_name", "type": "string"},
                                                                          {"name": "mk_membership_committee_names",
                                                                           "type": "string"},
                                                                          {"name": "mk_faction_id", "type": "integer"},
                                                                          {"name": "mk_faction_name", "type": "string"},
                                                                          ]}}]

spew(datapackage, [errors, mk_attendance])

def get_project_urls():
    """Return the complete list of project URLS."""

    counter = 0
    paths = []

    while True:
        counter += 1

        project = PAGINATION_URL.format(counter=counter)
        response = session.get(project)

        if response.text:
            doc = fromstring(response.content)
            more_links = doc.findall(PROJECT_URLS_XPATH)
            more_paths = list(map(lambda x: x.get('href'), more_links))
            paths.extend(more_paths)
            info('Collected %s urls on page %s', len(more_paths), counter)

        else:
            return paths


if __name__ == '__main__':
    _, datapackage, _ = ingest()
    project_paths = get_project_urls()
    project_rows = scrape_projects(project_paths)
    spew(datapackage, [project_rows])
"""A processor to inject constant values into the data."""

from datapackage_pipelines.wrapper import ingest, spew

row_count = 0


def process_rows(prefix, rows):
    global row_count
    for row in rows:
        row['internal_id'] = '{}-{}'.format(prefix, row_count)
        yield row
        row_count += 1


def process(prefix, resources):
    for resource in resources:
        yield process_rows(prefix, resource)


if __name__ == '__main__':
    """Ingest, process and spew out."""

    parameters_, datapackage_, resources_ = ingest()

    spew(datapackage_, process(parameters_['prefix'], resources_))
"""Grab the source description and convert it into a datapackage"""

import json
import logging

from datapackage_pipelines.wrapper import ingest, spew
from common.config import JSON_FORMAT
from common.utilities import get_fiscal_datapackage


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    fiscal_datapackage = get_fiscal_datapackage(source=datapackage)
    fiscal_datapackage_as_json = json.dumps(fiscal_datapackage, **JSON_FORMAT)
    logging.debug('Loaded fiscal datapackage:\n%s', fiscal_datapackage_as_json)
    spew(fiscal_datapackage, resources)
"""The template for writing PDF and web scrapers."""

from datapackage_pipelines.wrapper import ingest, spew
from logging import debug


def scrape_beneficiaries(**params):
    """Return a generator of beneficiaries.

    Each beneficiary is a dictionary whose keys match the fields described
    in source.description.yaml. Parameters come from pipeline-specs.yaml.
    """

    debug('%s', params)
    beneficiaries = [
        {'field1': 'foo', 'field2': 'spam'},
        {'field1': 'bar', 'field2': 'eggs'},
    ]
    for beneficiary in beneficiaries:
        yield beneficiary


if __name__ == '__main__':
    parameters, datapackage, _ = ingest()
    rows = scrape_beneficiaries(**parameters)
    spew(datapackage, [rows])
import arrow
import logging

from arrow.parser import ParserError
from datapackage_pipelines.wrapper import ingest, spew
from common.utilities import process


def parse_currencies(row):
    """Clean up and convert currency fields to floats."""

    date_columns = (
        'Datum van laatste bijwerking',
        'Einddatum',
        'Begindatum'
    )
    for key in date_columns:
        try:
            row[key] = arrow.get(row[key], 'DD.MM.YYYY HH:mm')
        except ParserError:
            if row[key] != '0000-00-00 00:00:00':
                message = 'Could not parse %s to a date, returning None'
                logging.warning(message, row[key])

            row[key] = None

    return row


if __name__ == '__main__':
    parameters, datapackage_, resources = ingest()
    new_resources_ = process(resources, parse_currencies)
    spew(datapackage_, new_resources_)
Example #34
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}

datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, domain):
    def get_latest_row(first):
        latest_row = None
        my_rows = []
        for row in first:
            if row['domain'] == domain and row['source'] == 'discourse':
                latest_row = row
            my_rows.append(row)
        return latest_row, iter(my_rows)

    latest_row = None
    if len(datapackage['resources']):
        if datapackage['resources'][0]['name'] == 'latest-project-entries':
            latest_row, latest_iter = get_latest_row(next(res_iter))
            yield latest_iter
    yield from res_iter
    yield discourse_collector(domain, latest_row)


spew(datapackage, process_resources(res_iter, datapackage, domain))
        yield field_['name'], converters[field_['type']]


converter = dict(get_fiscal_types())
dump = {k: v.__name__ for k, v in converter.items()}
logging.debug('Fiscal type casting: \n%s', json.dumps(dump, indent=4))


def cast_values(row):
    """Cast values to fiscal types."""

    for key, value in row.items():
        if value:
            try:
                if value is None or (type(value) is str and len(value.strip()) == 0):
                    row[key] = None
                else:
                    row[key] = converter[key](value)
            except (ValueError, arrow.parser.ParserError):
                message = 'Could not cast %s = %s to %s' % (key, row[key], converter[key])
                logging.warning(message)
                assert False, message

    return row


if __name__ == '__main__':
    _, datapackage, resources = ingest()
    new_resources = process(resources, cast_values)
    spew(datapackage, new_resources)
Example #36
        all_attendee_names = set()
        for attendee_names in (meeting["mks"], meeting["invitees"], meeting["legal_advisors"], meeting["manager"]):
            if attendee_names and len(attendee_names) > 0:
                for attendee_name in attendee_names:
                    if type(attendee_name) == str:
                        all_attendee_names.add(attendee_name)
                    else:
                        all_attendee_names.add(attendee_name["name"])
        attended_mk_individual_ids = set()
        for attendee_name in all_attendee_names:
            for mk_individual in filter(lambda mk: meeting["KnessetNum"] in mk["knesset_nums"],
                                        map(get_mk_individual, mk_individuals)):
                if meeting["KnessetNum"] in mk_individual["knesset_nums"]:
                    name_equals, name_in = 0, 0
                    for name in mk_individual["mk_names"]:
                        if name == attendee_name:
                            name_equals += 1
                        if name in attendee_name:
                            name_in += 1
                    if name_equals or name_in:
                        attended_mk_individual_ids.add(mk_individual["mk_individual_id"])
        meeting["attended_mk_individual_ids"] = list(attended_mk_individual_ids)
        yield meeting


datapackage["resources"] = [datapackage["resources"][1]]
datapackage["resources"][0]["schema"]["fields"] += [{"name": "attended_mk_individual_ids", "type": "array"}]


spew(datapackage, [get_resource()])
            args = filename, format_data_sample(stream)
            info('Concatenated %s:\n%s', *args)

    info('Done concatenating %s files', nb_files)


def assemble_fiscal_datapackage():
    """Assemble the fiscal datapackage for the concatenated dataset."""

    with open(FISCAL_METADATA_FILE) as stream:
        fdp = yaml.load(stream.read())

    with open(FISCAL_MODEL_FILE) as stream:
        fdp['model'] = yaml.load(stream.read())

    with open(FISCAL_SCHEMA_FILE) as stream:
        fdp['resources'][0]['schema'] = yaml.load(stream.read())

    message = 'Fiscal datapackage: \n%s'
    info(message, format_to_json(fdp))

    return fdp


if __name__ == '__main__':
    parameters, datapackage, _ = ingest()
    datapackage = assemble_fiscal_datapackage()
    datasets = collect_local_datasets(**parameters)
    resource = concatenate(datasets, **parameters)
    spew(datapackage, [resource])
Example #38
                                          collated_field_name)] = {
                                              'fields': inner_fields
                                          }


def val(v):
    if isinstance(v, Decimal):
        v = float(v)
    elif isinstance(v, date):
        v = v.isoformat()
    return v


def process_resource(res):
    for row in res:
        inner = dict((k, val(v)) for k, v in row.items() if k not in key)
        outer = dict((k, v) for k, v in row.items() if k in key)
        outer[collated_field_name] = inner
        yield outer


def process_resources(res_iter_):
    for res in res_iter_:
        if resource_matcher.match(res.spec['name']):
            yield process_resource(res)
        else:
            yield res


spew(dp, process_resources(res_iter))
            if index != 1:
                yield index, headers, values

    @staticmethod
    def _fixed_points(rows):
        """Convert floats to 2-digit fixed precision strings"""

        for index, headers, values in rows:
            values = [
                '%.2f' % value if type(value) is float else value
                for value in values
            ]
            yield index, headers, values


XLSXIngestor = XLSIngestor


def ingest_resources(datapackage):
    """Ingest each resource one by one into the pipeline."""

    for resource in datapackage['resources']:
        ingestor = BaseIngestor.load(resource)
        yield ingestor.rows


if __name__ == '__main__':
    _, datapackage_, _ = ingest()
    resources = list(ingest_resources(datapackage_))
    spew(datapackage_, resources)
Example #40
    headers = [hdr_num, hdr_name, hdr_reg_date]
    for data in datums:
        yield dict(zip(headers, treat(data)))


resource = {
    'name': resource_name,
    PROP_STREAMING: True,
    'path': 'data/{}.csv'.format(resource_name),
    'schema': {
        'fields': [
            {
                'name': hdr_num,
                'type': 'string'
            },
            {
                'name': hdr_name,
                'type': 'string'
            },
            {
                'name': hdr_reg_date,
                'type': 'string'
            },
        ]
    }
}

datapackage['resources'].append(resource)

spew(datapackage, [get_entities()])
"""A processor to concatenate resources that have a common set of fields."""

from datapackage_pipelines.wrapper import ingest, spew


def concatenate(resources):
    """Concatenate multiple resources."""

    for resource in resources:
        for row in resource:
            yield row


if __name__ == '__main__':
    _, datapackage, resources_ = ingest()
    single_resource = concatenate(resources_)
    datapackage['resources'] = [datapackage['resources'][0]]
    spew(datapackage, [single_resource])
Example #42
resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:

    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
            resource['path'] = path

        resource[PROP_STREAMING] = True

        rows = stream_reader(resource, url, ignore_missing or url == "",
                             limit_rows, resource.pop('http_headers', None))

        new_resource_iterator.append(rows)

    elif streaming(resource):
        new_resource_iterator.append(next(resource_iterator))

spew(datapackage, new_resource_iterator)
Example #43
                                           "UnitText1_En": doc["_source"]["UnitText1"].get("En") if doc["_source"].get("UnitText1") else "",
                                           "UnitText1_He": doc["_source"]["UnitText1"].get("He") if doc["_source"].get("UnitText1") else "",
                                           "Header_En": doc["_source"]["Header"].get("En") if doc["_source"].get("Header") else "",
                                           "Header_He": doc["_source"]["Header"].get("He") if doc["_source"].get("Header") else "",
                                           })
                yield from filtered_row
            else:
                break


datapackage = {"name": "_",
               "resources": [{"name": "es_data", "path": "es_data.csv", PROP_STREAMING: True,
                              "schema": {"fields": [{"name": "index", "type": "string"},
                                                    {"name": "doc_type", "type": "string"},
                                                    {"name": "doc_id", "type": "string"},
                                                    {"name": "UnitId", "type": "string"},
                                                    {"name": "RightsCode", "type": "string"},
                                                    {"name": "RightsDesc", "type": "string"},
                                                    {"name": "StatusDesc", "type": "string"},
                                                    {"name": "DisplayStatusDesc", "type": "string"},
                                                    {"name": "UnitType", "type": "string"},
                                                    {"name": "Slug_En", "type": "string"},
                                                    {"name": "Slug_He", "type": "string"},
                                                    {"name": "UnitText1_En", "type": "string"},
                                                    {"name": "UnitText1_He", "type": "string"},
                                                    {"name": "Header_En", "type": "string"},
                                                    {"name": "Header_He", "type": "string"}]}}]}


spew(datapackage, get_resources(), stats)
def main():
    parameters, dp, res_iter = ingest()

    connection_string = get_connection_string()

    existing_ids = None
    resource_name = parameters['resource-name']
    input_key_fields = parameters['key-fields']
    input_hash_fields = parameters.get('hash-fields')
    prefix = parameters.get('prefix', '')

    STATUS_FIELDS = [
        {
            'name': prefix + '__last_updated_at',
            'type': 'datetime'
        },
        {
            'name': prefix + '__last_modified_at',
            'type': 'datetime'
        },
        {
            'name': prefix + '__created_at',
            'type': 'datetime'
        },
        {
            'name': prefix + '__is_new',
            'type': 'boolean'
        },
        {
            'name': prefix + '__is_stale',
            'type': 'boolean'
        },
        {
            'name': prefix + '__staleness',
            'type': 'integer'
        },
        {
            'name': prefix + '__next_update_days',
            'type': 'integer'
        },
        {
            'name': prefix + '__hash',
            'type': 'string'
        },
    ]
    STATUS_FIELD_NAMES = list(f['name'] for f in STATUS_FIELDS)

    for res in dp['resources']:
        if resource_name == res['name']:
            if input_hash_fields is None:
                input_hash_fields = set(f['name']
                                        for f in res['schema']['fields'])
            input_hash_fields = set(input_hash_fields) - set(input_key_fields)
            if len(input_hash_fields.intersection(STATUS_FIELD_NAMES)) == 0:
                res['schema']['fields'].extend(STATUS_FIELDS)
            input_hash_fields = set(input_hash_fields) - set(
                STATUS_FIELD_NAMES)

            db_key_fields = parameters.get('db-key-fields', input_key_fields)
            db_hash_fields = parameters.get('db-hash-fields',
                                            input_hash_fields)

            existing_ids = \
                get_all_existing_ids(connection_string,
                                     parameters['db-table'],
                                     db_key_fields,
                                     [
                                         prefix + '__last_updated_at',
                                         prefix + '__next_update_days',
                                         prefix + '__hash',
                                         prefix + '__created_at',
                                     ]
                                    )
            break

    assert existing_ids is not None
    logging.info('Found %d ids', len(list(existing_ids.keys())))

    spew(
        dp,
        process_resources(res_iter, resource_name, input_key_fields,
                          input_hash_fields, existing_ids, prefix))

def process(resources):
    def process_single(resource):
        counter = 0
        nones = dict((c, 0) for c in threshold_columns)
        for row in resource:
            counter += 1
            for column in threshold_columns:
                value = row.get(column)
                if is_empty(value):
                    nones[column] += 1
            for column in allowed_value_columns:
                value = row.get(column)
                if not is_empty(value) and value != 'unknown':
                    if value not in allowed_values[column]:
                        raise ValueError('%s: Got %r whereas allowed values for this column are %r' %
                                         (column, value, allowed_values[column]))
            yield row
        for column in threshold_columns:
            ratio_percent = 100 - (100*nones[column])//counter
            if ratio_percent < thresholds[column]:
                raise ValueError('%s: Got %d good values (out of %d), which is %d%% (below the threshold of %d%%)' %
                                 (column, counter-nones[column], counter, ratio_percent, thresholds[column]))

    for resource_ in resources:
        yield process_single(resource_)


spew(datapackage_, process(resources_))
# ISSUES & PR COUNTS
base_issue_url = '{}/search/issues?q=repo:{}'.format(base_url,
                                                     current_repo_name)

row['open_prs'] = _get_issue_count_for_request(
    '{}%20state:open%20is:pr'.format(base_issue_url))
row['closed_prs'] = _get_issue_count_for_request(
    '{}%20state:closed%20is:pr'.format(base_issue_url))
row['open_issues'] = _get_issue_count_for_request(
    '{}%20state:open%20is:issue'.format(base_issue_url))
row['closed_issues'] = _get_issue_count_for_request(
    '{}%20state:closed%20is:issue'.format(base_issue_url))

resource = {'name': name, 'path': 'data/{}.csv'.format(name)}

# Temporarily set all types to string, will use `set_types` processor in
# pipeline to assign correct types
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in row.keys()]
}

datapackage['resources'].append(resource)

resource_content.append(row)

spew(datapackage, itertools.chain(res_iter, [resource_content]))
    def main(cls):
        # can be used like this in datapackage processor files:
        # if __name__ == '__main__':
        #     Processor.main()
        spew(*cls(*ingest()).spew())
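
For reference, here is a hedged sketch (not part of the listing above) of a processor class that fits the `spew(*cls(*ingest()).spew())` entry point shown in the last two snippets: the class is constructed with the three values returned by ingest(), and its spew() method returns the tuple of arguments that the module-level spew() expects. All names are illustrative.

from datapackage_pipelines.wrapper import ingest, spew


class Processor:

    def __init__(self, parameters, datapackage, res_iter):
        self.parameters = parameters
        self.datapackage = datapackage
        self.res_iter = res_iter
        self.stats = {}

    def process_rows(self, resource):
        # Count rows while passing them through unchanged.
        for row in resource:
            self.stats['rows'] = self.stats.get('rows', 0) + 1
            yield row

    def spew(self):
        # Return (datapackage, resource iterator, stats) so that
        # `spew(*cls(*ingest()).spew())` unpacks them correctly.
        return (self.datapackage,
                (self.process_rows(resource) for resource in self.res_iter),
                self.stats)

    @classmethod
    def main(cls):
        spew(*cls(*ingest()).spew())


if __name__ == '__main__':
    Processor.main()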