Example #1
def flow(self):
    flows = [step.flow() for step in self.steps]
    return Flow(*[f for f in flows if f is not None])
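For reference, here is a minimal, hypothetical sketch of the same pattern (the step classes and sample rows are invented; only the dataflows package is assumed): each step may contribute a processor or None, and the None entries are filtered out before the composed Flow is built.

from dataflows import Flow, printer


class AddGreeting:
    def flow(self):
        def add_greeting(row):
            row['greeting'] = 'hello'  # row processors mutate rows in place
        return add_greeting


class NoOpStep:
    def flow(self):
        return None  # a step with nothing to contribute is filtered out


steps = [AddGreeting(), NoOpStep()]
flows = [step.flow() for step in steps]
Flow([{'name': 'a'}, {'name': 'b'}],
     *[f for f in flows if f is not None],
     printer()).process()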
Example #2
def main():
    with tempfile.TemporaryDirectory() as tempdir:
        with open(os.path.join(tempdir, ".netrc"), "w") as f:
            f.write("machine %s\nlogin %s\npassword %s\n" %
                    (DOMAIN, AUTH_USER, AUTH_PASSWORD))
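        # Point HOME at the temp dir so tools that read ~/.netrc pick up the credentials written above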
        HOME = os.environ["HOME"]
        os.environ["HOME"] = tempdir
        os.makedirs("data/corona_data_collector/gps_data_cache", exist_ok=True)
        utils.http_stream_download(
            "data/corona_data_collector/gps_data_cache/datapackage.json", {
                "url":
                "https://%s/data/corona_data_collector/gps_data_cache/datapackage.json"
                % DOMAIN
            })
        utils.http_stream_download(
            "data/corona_data_collector/gps_data_cache/gps_data.csv", {
                "url":
                "https://%s/data/corona_data_collector/gps_data_cache/gps_data.csv"
                % DOMAIN
            })
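        # Chain the collector steps: download gdrive files, load DB rows, add GPS coordinates, export bot answers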
        Flow(
            download_gdrive_data.flow({
                "limit_rows": 50000,
                "files_dump_to_path": "data/corona_data_collector/gdrive_data",
                "google_drive_csv_folder_id":
                "1pzAyk-uXy__bt1tCX4rpTiPZNmrehTOz",
                "file_sources": {
                    "COVID-19-English.csv": "google",
                    "COVID-19-Russian.csv": "google",
                    "COVID-19-Hebrew.csv": "hebrew_google",
                    "maccabi_updated.csv": "maccabi",
                }
            }),
            load_from_db.flow({
                "where":
                "(id > 500 and id < 1000) or (id > 180000 and id < 185000) or (id > 600000 and id < 601000) or (id > 640000 and id < 641000) or (id > 670000)",
                # "filter_db_row_callback": _mock_version_28
            }),
            # _mock_gender_other,
            add_gps_coordinates.flow({
                "source_fields":
                utils.get_parameters_from_pipeline_spec(
                    "pipeline-spec.yaml", "corona_data_collector",
                    "corona_data_collector.add_gps_coordinates")
                ["source_fields"],
                "workplace_source_fields":
                utils.get_parameters_from_pipeline_spec(
                    "pipeline-spec.yaml", "corona_data_collector",
                    "corona_data_collector.add_gps_coordinates")
                ["workplace_source_fields"],
                "dump_to_path":
                "data/corona_data_collector/with_gps_data",
                "gps_datapackage_path":
                "data/corona_data_collector/gps_data_cache",
                "get-coords-callback":
                lambda street, city: (random.uniform(
                    29, 34), random.uniform(34, 36), int(street != city))
            }),
            export_corona_bot_answers.flow({
                "destination_output":
                "data/corona_data_collector/corona_bot_answers"
            }),
            export_corona_bot_answers.flow({
                "unsupported":
                True,
                "destination_output":
                "data/corona_data_collector/corona_bot_answers_unsupported"
            })).process()
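    # Restore the original HOME now that the temporary .netrc is no longer needed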
    os.environ["HOME"] = HOME
    subprocess.check_call(
        ["python3", "-m", "src.utils.get_raw_data"],
        cwd="../COVID19-ISRAEL",
        env={
            **os.environ, "GOOGLE_SERVICE_ACCOUNT_FILE":
            os.environ["GOOGLE_SERVICE_ACCOUNT_FILE"],
            "AVIDCOVIDER_LOCAL_PATH":
            os.getcwd()
        })
    subprocess.check_call(["python3", "-m", "src.utils.preprocess_raw_data"],
                          cwd="../COVID19-ISRAEL",
                          env={**os.environ})
    logging.info("Great Success!")
Example #3
    for router in yaml.safe_load(routers):
        for route in router['routes']:
            routes.append({'router_name': router['name'], **route})
        yield {
            'name': router['name'],
            'ready': router['ready'],
            'routes': len(router['routes']),
            'deployment_created_at': router['deployment'].get('created_at'),
            'deployment_generation': router['deployment'].get('generation'),
            'deployment_namespace': router['deployment'].get('namespace'),
            'deployment_ready': router['deployment'].get('ready'),
            'type': router['type'],
            'dns': router['dns'],
        }


def get_routes():
    yield from routes


with open('output.html', 'w') as f:
    with redirect_stdout(f):
        Flow(
            get_routers(),
            update_resource('res_1', name='routers', path='routers.csv'),
            get_routes(),
            update_resource('res_2', name='routes', path='routes.csv'),
            dump_to_path(),
            printer(num_rows=9999999, tablefmt='html'),
        ).process()
Example #4
household_us = Flow(
    add_metadata(
        name="household-income-us-historical",
        title=
        "Income Limits for Each Fifth and Top 5 Percent of All Households:  1967 to 2016",
        description=
        "Households as of March of the following year. Income in current and 2016 CPI-U-RS adjusted dollars.",
        sources=[{
            "path": "https://www2.census.gov",
            "title": "United States Census Bureau"
        }],
        licenses=[{
            "id":
            "odc-pddl",
            "path":
            "http://opendatacommons.org/licenses/pddl/",
            "title":
            "Open Data Commons Public Domain Dedication and License v1.0",
            'name':
            "open_data_commons_public_domain_dedication_and_license_v1.0"
        }],
        version="0.3.0",
        views=[{
            "name":
            "comparison-of-upper-limit-of-each-fifth-and-lower-limit-of-top-5-percent",
            "title":
            "Comparison of upper limit of each fifth and lower limit of top 5 percent (2016 dollars)",
            "resources": ["household-income-us-historical"],
            "specType": "simple",
            "spec": {
                "type": "line",
                "group": "Year",
                "series":
                ["Lowest", "Second", "Third", "Fourth", "Top 5 percent"]
            }
        }, {
            "name":
            "lowest-fifth-vs-top-5-percent",
            "title":
            "Ratio of lower limit of top 5 percent to upper limit of lowest fifth (2016 dollars)",
            "resources": [{
                "name":
                "household-income-us-historical",
                "transform": [{
                    "type":
                    "formula",
                    "expressions": ["data['Top 5 percent']/data['Lowest']"],
                    "asFields": ["Ratio"]
                }]
            }],
            "specType":
            "simple",
            "spec": {
                "type": "line",
                "group": "Year",
                "series": ["Ratio"]
            }
        }],
        readme=readme()),
    load(
        load_source=
        'https://www2.census.gov/programs-surveys/cps/tables/time-series/historical-income-households/h01ar.xls',
        format='xls',
        sheet=1,
        encoding='utf-8',
        # Skip the first 6 rows, the rows containing data from 1967 through last year
        # plus the 3 rows after them, and finally the last row
        skip_rows=[
            i + 1 for i in range(6 + datetime.datetime.now().year - 1966 + 3)
        ] + [-1],
        headers=[
            'Year', 'Number (thousands)', 'Lowest', 'Second', 'Third',
            'Fourth', 'Top 5 percent'
        ],
    ),
    find_replace(fields=[{
        'name':
        'Year',
        'patterns': [{
            'find': r'(\s?\(\d+\))|(\.0)',
            'replace': ''
        }]
    }, {
        'name': 'Fourth',
        'patterns': [{
            'find': r'\+|',
            'replace': ''
        }]
    }],
                 resources=0),
    update_resource(
        0, **{
            'name': 'household-income-us-historical',
            'path': 'data/household-income-us-historical.csv',
            'dpp:streaming': True
        }),
    set_type('Year', type='year'),
    set_type('^(?!Y).+', type='number'),
    validate())
Example #5
import logging
import random

from avid_covider_pipelines.utils import get_parameters_from_pipeline_spec
from dataflows import printer, Flow, load
from .common import test_corona_bot_answers


logging.basicConfig(level=logging.INFO)


Flow(
    load_from_db.flow({
        "where": "id in (180074, 180075, 676579, 676580)"
    }),
    add_gps_coordinates.flow({
        "source_fields": get_parameters_from_pipeline_spec("pipeline-spec.yaml", "corona_data_collector", "corona_data_collector.add_gps_coordinates")["source_fields"],
        "get-coords-callback": lambda street, city: (random.uniform(29, 34), random.uniform(34, 36), int(street != city))
    }),
    export_corona_bot_answers.flow({
        "destination_output": "data/corona_data_collector/destination_output"
    }),
    printer(fields=[
        "__id", "__created", "main_age", "medical_staff_member", "engagement_source", "alias", "layout"
    ])
).process()

Flow(
    load("data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv"),
    load("data/corona_data_collector/destination_output/corona_bot_answers_29_4_2020_with_coords.csv"),
    test_corona_bot_answers(
        lambda row: (str(row["medical_staff_member"]), str(row["engagement_source"]), str(row["layout"])),
        {
            "180074": ["corona_bot_answers_25_3_2020_with_coords", "", "", ""],
Example #6
def flow(parameters):
    return Flow(
        load_lazy_json(parameters.get('source')),
        duplicate(parameters.get('source'), parameters.get('target-name'),
                  parameters.get('target-path'),
                  parameters.get('batch_size', 1000)))
Example #7
        'name': 'serie-a',
        'path': 'italym.php',
        'key': 'I1',
        'links': [],
        'dataset-name': 'italian-serie-a',
        'dataset-title': 'Italian Serie A (football)'
    }, {
        'name': 'ligue-1',
        'path': 'francem.php',
        'key': 'F1',
        'links': [],
        'dataset-name': 'french-ligue-1',
        'dataset-title': 'French Ligue 1 (football)'
    }]
    for league in leagues:
        meta = get_league_meta(league)
        processors = get_processors(meta)
        processors.append(set_type('Date', type='date', format='%d/%m/%y'))
        processors.append(dump_to_path(out_path='datasets/' + league['name']))
        processors.append(printer())
        processors = [
            add_metadata(name=league['dataset-name'],
                         title=league['dataset-title'],
                         licenses=licenses,
                         sources=sources,
                         related=related_datasets,
                         readme=readme %
                         league['dataset-title'].replace(' (football)', ''))
        ] + processors
        Flow(*processors).process()
Example #8
def london_gva(link):
    Flow(load(link, sheet=3), filter_gva,
         unpivot(unpivoting_fields, extra_keys, extra_value),
         remove_duplicates, set_format_and_name, dump_to_path(),
         printer(num_rows=1)).process()
Example #9
def test_add_metadata():
    from dataflows import add_metadata
    f = Flow(data, add_metadata(author='Adam Kariv'))
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'
Example #10
def test_load_empty_headers():
    from dataflows import Flow, load, printer

    def ensure_type(t):
        def func(row):
            assert isinstance(row['a'], t)

        return func

    results, dp, stats = Flow(load('data/empty_headers.csv'),
                              ensure_type(str)).results()
    assert results[0] == [{
        'a': 1,
        'b': 2
    }, {
        'a': 2,
        'b': 3
    }, {
        'a': 3,
        'b': 4
    }, {
        'a': 5,
        'b': 6
    }]
    assert len(dp.resources[0].schema.fields) == 2

    results, dp, stats = Flow(load('data/empty_headers.csv', validate=True),
                              ensure_type(int)).results()
    assert results[0] == [{
        'a': 1,
        'b': 2
    }, {
        'a': 2,
        'b': 3
    }, {
        'a': 3,
        'b': 4
    }, {
        'a': 5,
        'b': 6
    }]

    results, dp, stats = Flow(
        load('data/empty_headers.csv', force_strings=True),
        ensure_type(str)).results()
    assert results[0] == [{
        'a': '1',
        'b': '2'
    }, {
        'a': '2',
        'b': '3'
    }, {
        'a': '3',
        'b': '4'
    }, {
        'a': '5',
        'b': '6'
    }]
    assert len(dp.resources[0].schema.fields) == 2

    results, dp, stats = Flow(
        load('data/empty_headers.csv', force_strings=True, validate=True),
        ensure_type(str)).results()
    assert results[0] == [{
        'a': '1',
        'b': '2'
    }, {
        'a': '2',
        'b': '3'
    }, {
        'a': '3',
        'b': '4'
    }, {
        'a': '5',
        'b': '6'
    }]
    assert len(dp.resources[0].schema.fields) == 2
Example #11
def parse_dockerfiles():
    gitlab_repos = {}

    def _parse_gitlab_repos(rows):
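        # Track which instances use each gitlab repo while streaming the 'ckan-cloud-instances' resource through unchanged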
        if rows.res.name == 'ckan-cloud-instances':
            for row in rows:
                gitlab_repo = row['gitlab_repo']
                if gitlab_repo in gitlab_repos:
                    gitlab_repos[gitlab_repo]['instances'].append(row)
                else:
                    gitlab_repos[gitlab_repo] = {'instances': [row]}
                yield row
        else:
            yield from rows

    def _get_dockerfile_from(dockerfile):
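        # Return the base image from the first FROM line, or None when there is no Dockerfile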
        if dockerfile:
            return [
                line.replace('FROM ', '') for line in dockerfile.split('\n')
                if line.startswith('FROM')
            ][0]
        else:
            return None

    def _parse_ckan_extensions(rows):
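        # Collect pip-installed CKAN extensions (github URLs with #egg=) from each Dockerfile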
        if rows.res.name == 'dockerfiles':
            for row in rows:
                row['ckan_exts'] = []
                if row['dockerfile']:
                    for line in row['dockerfile'].split('\n'):
                        if 'https://github.com/' in line and '.git@' in line and '#egg=' in line:
                            ext = line.split('https://github.com/')[1].split(
                                '#egg=')[0].replace('.git@', '@')
                            row['ckan_exts'].append(ext)
                            if 'ckanext-s3filestore' in ext:
                                row['ckanext-s3filestore'] = ext
                yield row
        else:
            yield from rows

    def _get_dockerfile_row(gitlab_repo_name, gitlab_repo):
        try:
            dockerfile = CkanGitlab()._get_file(gitlab_repo_name, 'Dockerfile')
        except Exception:
            dockerfile = None
        return {
            'gitlab_repo': gitlab_repo_name,
            'instances': [i['name'] for i in gitlab_repo['instances']],
            'from': _get_dockerfile_from(dockerfile),
            'dockerfile': dockerfile
        }

    def _parse_dockerfiles(package):
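        # Declare the new 'dockerfiles' resource, pass existing resources through, then emit its rows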
        package.pkg.add_resource({
            'name': 'dockerfiles',
            'path': 'dockerfiles.csv',
            'schema': {
                'fields': [{
                    'name': 'gitlab_repo',
                    'type': 'string'
                }, {
                    'name': 'instances',
                    'type': 'array'
                }, {
                    'name': 'from',
                    'type': 'string'
                }, {
                    'name': 'dockerfile',
                    'type': 'string'
                }]
            }
        })
        yield package.pkg
        yield from package
        yield (_get_dockerfile_row(gitlab_repo_name, gitlab_repo)
               for gitlab_repo_name, gitlab_repo in gitlab_repos.items())

    return Flow(
        _parse_gitlab_repos,
        _parse_dockerfiles,
        checkpoint('ckan_images_dockerfiles'),
        add_field('ckan_exts', 'array'),
        add_field('ckanext-s3filestore', 'string'),
        _parse_ckan_extensions,
    )
Example #12
        yield from package
        yield (_get_dockerfile_row(gitlab_repo_name, gitlab_repo)
               for gitlab_repo_name, gitlab_repo in gitlab_repos.items())

    return Flow(
        _parse_gitlab_repos,
        _parse_dockerfiles,
        checkpoint('ckan_images_dockerfiles'),
        add_field('ckan_exts', 'array'),
        add_field('ckanext-s3filestore', 'string'),
        _parse_ckan_extensions,
    )


def main_flow(prefix, operator):
    return Flow(
        load(f'data/{prefix}/resources/datapackage.json',
             resources=['ckan-cloud-instances']),
        add_field('gitlab_repo', 'string'),
        get_gitlab_repo,
        parse_dockerfiles(),
    )


if __name__ == '__main__':
    prefix = os.environ['DATAPACKAGE_PREFIX']
    operator = os.environ.get('CKAN_CLOUD_OPERATOR_BIN', 'ckan-cloud-operator')
    Flow(main_flow(prefix, operator),
         printer(num_rows=1, fields=['name', 'image', 'gitlab_repo', 'from']),
         dump_to_path(f'data/{prefix}/ckan_images')).process()
Example #13
Flow(
      load(f'{BASE_URL}{CONFIRMED}'),
      load(f'{BASE_URL}{RECOVERED}'),
      load(f'{BASE_URL}{DEATH}'),
      unpivot(unpivoting_fields, extra_keys, extra_value),
      find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
      to_normal_date,
      set_type('Date', type='date', format='%d-%m-%y', resources=None),
      set_type('Case', type='number', resources=None),
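      # Merge the confirmed-cases resource into the deaths resource, keyed by province and date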
      join(
        source_name='time_series_19-covid-Confirmed',
        source_key=['Province/State', 'Date'],
        source_delete=True,
        target_name='time_series_19-covid-Deaths',
        target_key=['Province/State', 'Date'],
        fields=dict(Confirmed={
            'name': 'Case',
            'aggregate': 'first'
        })
      ),
      join(
        source_name='time_series_19-covid-Recovered',
        source_key=['Province/State', 'Date'],
        source_delete=True,
        target_name='time_series_19-covid-Deaths',
        target_key=['Province/State', 'Date'],
        fields=dict(Recovered={
            'name': 'Case',
            'aggregate': 'first'
        })
      ),
      add_computed_field(
        target={'name': 'Deaths', 'type': 'number'},
        operation='format',
        with_='{Case}'
      ),
      delete_fields(['Case']),
      update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'),
      dump_to_path()
).results()[0]
Example #14
                    'image': deployment['spec']['template']['spec']['containers'][0]['image'],
                    'datastore': '',
                    'db': '',
                    'solr': '',
                    'storage': '',
                    'creationTimestamp': str(deployment['metadata']['creationTimestamp']),
                    'generation': '',
                }

    return _processor


def main_flow(prefix, operator):
    return Flow(
        cluster_info(operator),
        update_resource(['res_1'], name='cluster-info', path='cluster-info.csv'),
        checkpoint(f'{prefix}-cluster-info'),
        ckan_cloud_instances(operator),
        update_resource(['res_2'], name='ckan-cloud-instances', path='ckan-cloud-instances.csv'),
    )


if __name__ == '__main__':
    prefix = os.environ['DATAPACKAGE_PREFIX']
    operator = os.environ.get('CKAN_CLOUD_OPERATOR_BIN', 'ckan-cloud-operator')
    Flow(
        main_flow(prefix, operator),
        printer(num_rows=1),
        dump_to_path(f'data/{prefix}/resources')
    ).process()
Example #15
def flow():
    data_path = 'data{}/'.format(
        '_samples' if os.environ.get('KNESSET_DATA_SAMPLES') else '')
    kns_knessetdates_sorted = []
    mk_individual_factions = {}
    vote_discipline = {}
    all_mk_ids = set()
    aggregates = {}

    def get_vote_discipline_mk_ids(vote_id):
        return vote_discipline.get(vote_id, [set(), set()])

    def process_voted_against_majority(rows):
        for row in rows:
            undisciplined_mk_ids, disciplined_mk_ids = vote_discipline.setdefault(
                row['vote_id'], [set(), set()])
            if row['vote_majority'] in ['against', 'pro']:
                if row['voted_against_majority']:
                    undisciplined_mk_ids.add(row['mk_id'])
                else:
                    disciplined_mk_ids.add(row['mk_id'])

    def process_votes(votes):
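        # Aggregate per-MK discipline counts by knesset, plenum, assembly, pagra and faction for every approved vote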
        for vote in rows_counter('view_vote_rslts_hdr_approved', votes):
            vote_date = vote['vote_date']
            undisciplined_mk_ids, disciplined_mk_ids = get_vote_discipline_mk_ids(
                vote['id'])
            for mk_id, faction_id in get_mk_faction_ids(
                    all_mk_ids, mk_individual_factions, vote_date):
                knessetdate = get_knessetdate(kns_knessetdates_sorted,
                                              vote_date)
                agg = aggregates.setdefault(knessetdate['knesset'], {})\
                                .setdefault(knessetdate['plenum'], {})\
                                .setdefault(knessetdate['assembly'], {})\
                                .setdefault(knessetdate['pagra'], {})\
                                .setdefault(faction_id, {})\
                                .setdefault(mk_id, defaultdict(int))
                if mk_id in undisciplined_mk_ids:
                    agg['undisciplined_votes'] += 1
                elif mk_id in disciplined_mk_ids:
                    agg['disciplined_votes'] += 1
                agg['total_votes'] += 1

    def get_all_aggregates():
        for knesset, aggs in aggregates.items():
            for plenum, aggs in aggs.items():
                for assembly, aggs in aggs.items():
                    for pagra, aggs in aggs.items():
                        for faction_id, aggs in aggs.items():
                            for mk_id, agg in aggs.items():
                                yield (knesset, plenum, assembly, pagra,
                                       faction_id, mk_id), agg

    def get_mk_aggregates():
        for agg_key, agg in get_all_aggregates():
            total_votes = agg.get('total_votes', 0)
            if total_votes > 0:
                undisciplined_votes_percent = int(
                    agg.get('undisciplined_votes', 0) / total_votes * 100)
                disciplined_votes_percent = int(
                    agg.get('disciplined_votes', 0) / total_votes * 100)
                knesset, plenum, assembly, pagra, faction_id, mk_id = agg_key
                yield dict(
                    {
                        'undisciplined_votes': 0,
                        'disciplined_votes': 0,
                        'total_votes': 0,
                    },
                    **agg,
                    undisciplined_votes_percent=undisciplined_votes_percent,
                    disciplined_votes_percent=disciplined_votes_percent,
                    knesset=knesset,
                    plenum=plenum,
                    assembly=assembly,
                    pagra=int(pagra),
                    faction_id=faction_id,
                    mk_id=mk_id)

    def get_aggregates(package: PackageWrapper):
        schema_fields = [
            {
                'name': 'knesset',
                'type': 'integer'
            },
            {
                'name': 'plenum',
                'type': 'integer'
            },
            {
                'name': 'assembly',
                'type': 'integer'
            },
            {
                'name': 'pagra',
                'type': 'integer'
            },
            {
                'name': 'faction_id',
                'type': 'integer'
            },
            {
                'name': 'mk_id',
                'type': 'integer'
            },
            {
                'name': 'undisciplined_votes',
                'type': 'integer'
            },
            {
                'name': 'disciplined_votes',
                'type': 'integer'
            },
            {
                'name': 'total_votes',
                'type': 'integer'
            },
            {
                'name': 'undisciplined_votes_percent',
                'type': 'integer'
            },
            {
                'name': 'disciplined_votes_percent',
                'type': 'integer'
            },
        ]
        package.pkg.add_resource({
            'name': 'mk_party_discipline_stats',
            'path': 'mk_party_discipline_stats.csv',
            'schema': {
                'fields': schema_fields
            }
        })
        yield package.pkg
        yield from package
        yield get_mk_aggregates()

    return Flow(
        load(data_path + 'members/mk_individual/datapackage.json',
             resources=['mk_individual_names']),
        process_rows_remove_resource(
            'mk_individual_names', mk_individual_names_processor(all_mk_ids)),
        load(data_path + 'members/mk_individual/datapackage.json',
             resources=['mk_individual_factions']),
        process_rows_remove_resource(
            'mk_individual_factions',
            mk_individual_factions_processor(mk_individual_factions)),
        load(data_path + 'knesset/kns_knessetdates/datapackage.json',
             resources=['kns_knessetdates']),
        process_rows_remove_resource(
            'kns_knessetdates',
            kns_knessetdates_processor(kns_knessetdates_sorted)),
        load(data_path + 'people/mk_voted_against_majority/datapackage.json',
             resources=['mk_voted_against_majority']),
        process_rows_remove_resource('mk_voted_against_majority',
                                     process_voted_against_majority),
        load(data_path + 'votes/view_vote_rslts_hdr_approved/datapackage.json',
             resources=['view_vote_rslts_hdr_approved']),
        process_rows_remove_resource('view_vote_rslts_hdr_approved',
                                     process_votes),
        get_aggregates,
        dump_to_path('data/people/mk_party_discipline_stats'),
    )
Example #16
from dataflows import Flow, add_computed_field, printer


def flow(parameters, *args):
    return Flow(
        add_computed_field(
            target=dict(name='geotype',
                        type='string',
                        constraints=dict(enum=["state", "nation"])),
            operation=lambda row: 'nation' if row['geo'] == 'Germany' else 'state',
            resources=parameters["resources"]))


# Entrypoint for running the flow directly, without Datapackage Pipelines
if __name__ == '__main__':
    # Add a printer step and run the flow; resources=None (assumed here for a direct run)
    # applies the computed field to every resource
    Flow(flow({'resources': None}), printer(num_rows=1, tablefmt='html')).process()
Example #17
import os
from dataflows import Flow
from dataflows_xlsx import dump_to_path


def get_data():
    for i in range(10):
        yield {'i': i, 'foo': 'bar{}'.format(i)}


Flow([{
    'i': i,
    'foo': 'bar{}'.format(i)
} for i in range(10)],
     dump_to_path('tests/data/test_dump_to_xlsx', format='xlsx')).process()

assert os.path.isfile('tests/data/test_dump_to_xlsx/res_1.xlsx')
assert os.path.isfile('tests/data/test_dump_to_xlsx/datapackage.json')
assert os.path.getsize('tests/data/test_dump_to_xlsx/datapackage.json') > 200
assert os.path.getsize('tests/data/test_dump_to_xlsx/res_1.xlsx') > 2000

print('OK')
Example #18
OIL_PRICES = Flow(
    add_metadata(
        name="oil-prices",
        title="Brent and WTI Spot Prices",
        description=(
            "A variety of temporal granularities for Europe Brent and WTI "
            "(West Texas Intermediate) Spot Prices."),
        sources=[
            {
                "name": "Daily Europe Brent Spot Price",
                "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEd.xls",
                "title": "Daily Europe Brent Spot Price",
            },
            {
                "name": "Weekly Europe Brent Spot Price",
                "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEw.xls",
                "title": "Weekly Europe Brent Spot Price",
            },
            {
                "name": "Monthly Europe Brent Spot Price",
                "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEm.xls",
                "title": "Monthly Europe Brent Spot Price",
            },
            {
                "name": "Annual Europe Brent Spot Price",
                "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEa.xls",
                "title": "Annual Europe Brent Spot Price",
            },
            {
                "name": "Daily Cushing, OK WTI Spot Price",
                "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCd.xls",
                "title": "Daily Cushing, OK WTI Spot Price",
            },
            {
                "name": "Weekly Cushing, OK WTI Spot Price",
                "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCw.xls",
                "title": "Weekly Cushing, OK WTI Spot Price",
            },
            {
                "name": "Monthly Cushing, OK WTI Spot Price",
                "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCm.xls",
                "title": "Monthly Cushing, OK WTI Spot Price",
            },
            {
                "name": "Annual Cushing, OK WTI Spot Price",
                "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCa.xls",
                "title": "Annual Cushing, OK WTI Spot Price",
            },
        ],
        licenses=[{
            "name":
            "ODC-PDDL-1.0",
            "path":
            "http://opendatacommons.org/licenses/pddl/",
            "title":
            "Open Data Commons Public Domain Dedication and License v1.0",
        }],
        keywords=["Oil", "Brent", "WTI", "Oil Prices", "eia", "oil eia"],
        views=[{
            "name": "graph",
            "title": "Europe Brent Spot Price FOB (Dollars per Barrel)",
            "resourceName": "brent-day",
            "specType": "simple",
            "spec": {
                "type": "line",
                "group": "Date",
                "series": ["Brent Spot Price"],
            },
        }],
    ),
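    # Each EIA spreadsheet keeps its data on sheet 2; skip the three header rows and name the columns explicitly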
    load(
        load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEd.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEw.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEm.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEa.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCd.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCw.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCm.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCa.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    rename_resources,
    set_type("Date", resources=None, type="date", format="any"),
    validate(),
    printer(),
    filter_out_empty_rows,
    dump_to_path(),
)
Example #19
def postflow(self):
    return Flow(self.work())
Example #20
Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{
            'find': '/',
            'replace': '-'
        }]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
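    # Fold the confirmed-cases resource into the deaths resource, keyed by province, country and date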
    join(source_name='time_series_covid19_confirmed_global',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={
             'name': 'Case',
             'aggregate': 'first'
         })),
    join(source_name='time_series_19-covid-Recovered',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={
             'name': 'Case',
             'aggregate': 'first'
         })),
    add_computed_field(target={
        'name': 'Deaths',
        'type': 'number'
    },
                       operation='format',
                       with_='{Case}'),
    delete_fields(['Case']),
    update_resource('time_series_covid19_deaths_global',
                    name='time-series-19-covid-combined',
                    path='data/time-series-19-covid-combined.csv'),
    update_schema('time-series-19-covid-combined',
                  missingValues=['None', ''],
                  fields=[{
                      "format": "%Y-%m-%d",
                      "name": "Date",
                      "type": "date"
                  }, {
                      "format": "default",
                      "name": "Country/Region",
                      "type": "string"
                  }, {
                      "format": "default",
                      "name": "Province/State",
                      "type": "string"
                  }, {
                      "decimalChar": ".",
                      "format": "default",
                      "groupChar": "",
                      "name": "Lat",
                      "type": "number"
                  }, {
                      "decimalChar": ".",
                      "format": "default",
                      "groupChar": "",
                      "name": "Long",
                      "type": "number"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Confirmed",
                      "title": "Cumulative total confirmed cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Recovered",
                      "title": "Cumulative total recovered cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Deaths",
                      "title": "Cumulative total deaths to date",
                      "type": "integer"
                  }]),
    checkpoint('processed_data'),
    # Duplicate the stream to create aggregated data
    duplicate(source='time-series-19-covid-combined',
              target_name='worldwide-aggregated',
              target_path='data/worldwide-aggregated.csv'),
    join_with_self(resource_name='worldwide-aggregated',
                   join_key=['Date'],
                   fields=dict(Date={'name': 'Date'},
                               Confirmed={
                                   'name': 'Confirmed',
                                   'aggregate': 'sum'
                               },
                               Recovered={
                                   'name': 'Recovered',
                                   'aggregate': 'sum'
                               },
                               Deaths={
                                   'name': 'Deaths',
                                   'aggregate': 'sum'
                               })),
    update_schema('worldwide-aggregated',
                  missingValues=['None', ''],
                  fields=[{
                      "format": "%Y-%m-%d",
                      "name": "Date",
                      "type": "date"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Confirmed",
                      "title": "Cumulative total confirmed cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Recovered",
                      "title": "Cumulative total recovered cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Deaths",
                      "title": "Cumulative total deaths to date",
                      "type": "integer"
                  }]),
    checkpoint('processed_worldwide_data'),
    # Add daily increase rate field in the worldwide data
    calculate_increase_rate,
    # Create another resource with key countries pivoted
    duplicate(source='time-series-19-covid-combined',
              target_name='key-countries-pivoted',
              target_path='data/key-countries-pivoted.csv'),
    join_with_self(resource_name='key-countries-pivoted',
                   join_key=['Date', 'Country/Region'],
                   fields=dict(Date={'name': 'Date'},
                               Country={'name': 'Country/Region'},
                               Confirmed={
                                   'name': 'Confirmed',
                                   'aggregate': 'sum'
                               },
                               Recovered={
                                   'name': 'Recovered',
                                   'aggregate': 'sum'
                               },
                               Deaths={
                                   'name': 'Deaths',
                                   'aggregate': 'sum'
                               })),
    update_schema('key-countries-pivoted',
                  missingValues=['None', ''],
                  fields=[{
                      "format": "%Y-%m-%d",
                      "name": "Date",
                      "type": "date"
                  }, {
                      "format": "default",
                      "name": "Country",
                      "type": "string"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Confirmed",
                      "title": "Cumulative total confirmed cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Recovered",
                      "title": "Cumulative total recovered cases to date",
                      "type": "integer"
                  }, {
                      "format": "default",
                      "groupChar": "",
                      "name": "Deaths",
                      "title": "Cumulative total deaths to date",
                      "type": "integer"
                  }]),
    checkpoint('processed_country_data'),
    # All countries aggregated
    duplicate(source='key-countries-pivoted',
              target_name='countries-aggregated',
              target_path='data/countries-aggregated.csv'),
    pivot_key_countries,
    delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'],
                  resources='key-countries-pivoted'),
    # Prepare data package (name, title) and add views
    update_package(
        name='covid-19',
        title='Novel Coronavirus 2019',
        views=[{
            "title": "Total world to date",
            "resources": ["worldwide-aggregated"],
            "specType": "simple",
            "spec": {
                "group": "Date",
                "series": ["Confirmed", "Recovered", "Deaths"],
                "type": "line"
            }
        }, {
            "title": "Number of confirmed cases in key countries",
            "resources": ["key-countries-pivoted"],
            "specType": "simple",
            "spec": {
                "group":
                "Date",
                "series": [
                    "China", "US", "United_Kingdom", "Italy",
                    "France", "Germany", "Spain", "Iran"
                ],
                "type":
                "line"
            }
        }, {
            "title":
            "Mortality rate in percentage",
            "resources": [{
                "name":
                "worldwide-aggregated",
                "transform": [{
                    "type":
                    "formula",
                    "expressions":
                    ["data['Deaths'] / data['Confirmed'] * 100 + '%'"],
                    "asFields": ["Mortality rate"]
                }]
            }],
            "specType":
            "simple",
            "spec": {
                "group": "Date",
                "series": ["Mortality rate"],
                "type": "bar"
            }
        }, {
            "title":
            "Increase rate from previous day in confirmed cases worldwide",
            "resources": ["worldwide-aggregated"],
            "specType": "simple",
            "spec": {
                "group": "Date",
                "series": ["Increase rate"],
                "type": "bar"
            }
        }]),
    dump_to_path()).results()[0]
Example #21
def flow(parameters):
    return Flow(
        add_computed_field(parameters.get('fields', []),
                           resources=parameters.get('resources')), )
Example #22
Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{
            'find': '/',
            'replace': '-'
        }]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={
             'name': 'Case',
             'aggregate': 'first'
         })),
    join(source_name='time_series_19-covid-Recovered',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={
             'name': 'Case',
             'aggregate': 'first'
         })),
    add_computed_field(target={
        'name': 'Deaths',
        'type': 'number'
    },
                       operation='format',
                       with_='{Case}'),
    delete_fields(['Case']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='data/time-series-19-covid-combined.csv'),
    update_package(name='covid-19', title='Novel Coronavirus 2019'),
    dump_to_path(),
    checkpoint('processed_data'),
    # Duplicate the stream to create aggregated data
    duplicate(source='time-series-19-covid-combined',
              target_name='worldwide-aggregated',
              target_path='worldwide-aggregated.csv'),
    join_with_self(resource_name='worldwide-aggregated',
                   join_key=['Date'],
                   fields=dict(Date={'name': 'Date'},
                               Confirmed={
                                   'name': 'Confirmed',
                                   'aggregate': 'sum'
                               },
                               Recovered={
                                   'name': 'Recovered',
                                   'aggregate': 'sum'
                               },
                               Deaths={
                                   'name': 'Deaths',
                                   'aggregate': 'sum'
                               })),
    dump_to_path()).results()[0]
Example #23
def flow(parameters):
    return Flow(update_package(**parameters))
Example #24
        doc = row['document']
        for (k, v) in RENAME_FIELDS.items():
            doc[v] = doc.get(k, [])
        yield row


def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields,
        add_fields(FIELDS, 'string'),
        add_fields(ADDITIONAL_FIELDS, 'string'),
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files', 'parser_version',
            'source', 's3_object_name'
        ]),
    )


if __name__ == '__main__':
    csv.field_size_limit(512 * 1024)

    Flow(
        load(
            '/var/datapackages/maya/maya_complete_notification_list/datapackage.json'
        ),
        flow(),
        printer(),
    ).process()
Example #25
def postflow(self):
    return Flow(self.address_fixer())
Example #26
def flow(parameters):
    return Flow(
        update_resource(parameters['source'], name=parameters['target']))
Example #27
    'קולות קוראים': 'קול קורא',
    'תמיכות': 'מבחן תמיכה',
}


def process_kind(row):
    row['tender_type'] = KIND_MAPPING.get(row['tender_type_he'],
                                          row['tender_type_he'])
    row['tender_type_he'] = KIND_HE_MAPPING.get(row['tender_type_he'],
                                                row['tender_type_he'])


def flow(*_):
    return Flow(
        fetch_results(),
        set_type('start_date', type='date', format='%d.%m.%Y'),
        set_type('tender_id', type='string'),
        set_type('tender_type', type='string'),
        process_kind,
        calculate_publication_id(2),
        set_primary_key(['publication_id']),
        update_resource(-1, name='jobiz', **{PROP_STREAMING: True}),
    )


if __name__ == '__main__':
    Flow(
        flow(),
        printer(),
    ).process()
Example #28
def batch_flow(parameters):
    return Flow(*[flow(p) for p in parameters['batch']])
Example #29
from pprint import pprint
from dataflows import Flow, load
from processors.pivot import pivot

# Run flow
flow = Flow(
    load('layouts/long.csv'),
    pivot(join_field='name', key_field='treatment', value_field='result'),
)
results, package, stats = flow.results()
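# results[0] holds the pivoted rows; package.descriptor is the generated datapackage metadata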
print('[Data]\n')
pprint(results[0])
print('\n[Meta]\n')
pprint(package.descriptor)