Example #1
def test_validate():
    from dataflows import Flow, validate, set_type
    from dataflows.base.schema_validator import ignore
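    # `ignore` keeps rows whose values fail the cast in set_type, so the
    # downstream validate() step is the one that sees and reports them.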
    data = [
        {
            'a': 1,
            'b': 1
        },
        {
            'a': 2,
            'b': 2
        },
        {
            'a': 3,
            'b': 3
        },
        {
            'a': 4,
            'b': 'a'
        },
    ]

    class on_error():
        def __init__(self):
            self.bad_row, self.bad_index = None, None

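        # validate() calls this handler with the resource name, the failing
        # row, its index and the exception; returning False drops the row.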
        def __call__(self, name, row, i, e):
            self.bad_row, self.bad_index = row, i
            return False

    # Schema validator
    handler = on_error()
    res, *_ = Flow(data, set_type('b', type='integer', on_error=ignore),
                   validate(on_error=handler)).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3

    # Field validator
    handler = on_error()
    res, *_ = Flow(data, set_type('b', type='integer', on_error=ignore),
                   validate('a', lambda v: v < 4, on_error=handler)).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3

    # Row validator
    handler = on_error()
    res, *_ = Flow(data, set_type('b', type='integer', on_error=ignore),
                   validate(lambda v: v['a'] < 4, on_error=handler)).results()
    assert len(res[0]) == 3
    assert handler.bad_row == {'a': 4, 'b': 'a'}
    assert handler.bad_index == 3
Example #2
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
Example #3
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array', lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare'),
    ).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
Example #4
def flow(*_):
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id',
                    type='string',
                    transform=lambda v: v or 'none'),
        DF.set_type('.+_date',
                    type='date',
                    format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
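        # schema_validator.clear keeps the row but nulls out the value that
        # failed to parse, instead of dropping the whole row.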
        DF.set_type('subjects',
                    type='string',
                    transform=lambda v: ';'.join(x.strip() for x in v.split(',')) if v else ''),
        DF.set_type('claim_date',
                    type='datetime',
                    transform=lambda v, field_name, row:
                    datetime.datetime.combine(v, row['claim_time'] or datetime.time(0))
                    if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string', lambda r:
            f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
Example #5
def test_unpivot_any_resources():
    from dataflows import unpivot, validate
    data1 = [
        dict([('name', 'ike{}'.format(i))] +
             [(str(year), year + i) for year in range(1990, 2020, 10)])
        for i in range(5)
    ]
    data2 = [
        dict([('city', 'mike{}'.format(i))] +
             [(str(year), year + i) for year in range(2050, 2080, 10)])
        for i in range(5)
    ]
    f = Flow(
        data1, data2,
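        # Columns matching the regex '([0-9]+)' are unpivoted: the captured
        # digits become the 'year' key and the cell value becomes 'amount'.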
        unpivot([dict(name='([0-9]+)', keys=dict(year='\\1'))],
                [dict(name='year', type='integer')],
                dict(name='amount', type='integer')), validate())
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r)) for r in [
            ['ike0', 1990, 1990],
            ['ike0', 2000, 2000],
            ['ike0', 2010, 2010],
            ['ike1', 1990, 1991],
            ['ike1', 2000, 2001],
            ['ike1', 2010, 2011],
            ['ike2', 1990, 1992],
            ['ike2', 2000, 2002],
            ['ike2', 2010, 2012],
            ['ike3', 1990, 1993],
            ['ike3', 2000, 2003],
            ['ike3', 2010, 2013],
            ['ike4', 1990, 1994],
            ['ike4', 2000, 2004],
            ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r)) for r in [
            ['mike0', 2050, 2050],
            ['mike0', 2060, 2060],
            ['mike0', 2070, 2070],
            ['mike1', 2050, 2051],
            ['mike1', 2060, 2061],
            ['mike1', 2070, 2071],
            ['mike2', 2050, 2052],
            ['mike2', 2060, 2062],
            ['mike2', 2070, 2072],
            ['mike3', 2050, 2053],
            ['mike3', 2060, 2063],
            ['mike3', 2070, 2073],
            ['mike4', 2050, 2054],
            ['mike4', 2060, 2064],
            ['mike4', 2070, 2074],
        ]
    ]
Example #6
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='Opendata.ch/2019 Forum',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
Example #7
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate

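    # Declaring '-' as a missing value lets validate() cast it to None
    # instead of failing the integer check on col1.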
    f = Flow([['a', '-'], ['a', 0]], update_schema(-1, missingValues=['-']),
             validate(), printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
Example #8
def flow(parameters):
    resources = parameters.get('resources')
    regex = parameters.get('regex', True)
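    # Each entry in 'types' maps a field name to set_type() options;
    # a None value means the field should be deleted instead.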
    if 'types' in parameters:
        return Flow(*[
            set_type(name, resources=resources, regex=regex, **options) if
            options is not None else delete_fields([name], resources=resources)
            for name, options in parameters['types'].items()
        ])
    else:
        return Flow(validate())
Example #9
def test_validate():
    import pytest
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

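    # adder breaks the declared 'integer' type by storing a string back into
    # the field; validate() then fails with a ValidationError, which the flow
    # surfaces wrapped in a ProcessorError.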
    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())

    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
Example #10
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
Example #11
def test_set_type_resources():
    from dataflows import Flow, set_type, validate

    f = Flow([dict(a=str(i)) for i in range(10)],
             [dict(b=str(i)) for i in range(10)],
             [dict(c='0_' + str(i)) for i in range(10)],
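             # 'resources' selects by name regex ('res_[1]'), by list of names,
             # or by index (-1 = last); groupChar='_' makes '0_8' parse as 8.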
             set_type('a', resources='res_[1]', type='integer'),
             set_type('b', resources=['res_2'], type='integer'),
             set_type('[cd]', resources=-1, type='number', groupChar='_'),
             validate())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0][1]['a'] == 1
    assert results[1][3]['b'] == 3
    assert results[2][8]['c'] == 8.0
Example #12
def flow(*_):
    return DF.Flow(
        scraper(),
        DF.filter_rows(lambda row: row['page_title'] and
                       row['page_title'].startswith('קול קורא'),
                       resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y',
                    resources=-1),
        DF.set_type('claim_date',
                    type='datetime',
                    format='%d/%m/%Y',
                    resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(-1, name='negev_galil', **{PROP_STREAMING: True}),
    )
Example #13
def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages in the
    process, one for each file."""
    global FILE_NAME
    FILE_NAME = f"{location}-{filename}"
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/{filename}"
    _ = Flow(
        load(
            f"{processing_directory}/{filename}.csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]
Example #14
def flow(*_):
    return DF.Flow(
        scraper(),
        DF.filter_rows(lambda row: row['page_title'] and row['page_title'].startswith('קול קורא'), resources=-1),
        page_parser(),
        DF.add_field('decision', 'string',
                     default=lambda row: row['parsed']['decision'], resources=-1),
        DF.add_field('start_date', 'date', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['start_date'], resources=-1),
        DF.add_field('claim_date', 'datetime', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['claim_date'], resources=-1),
        DF.add_field('documents', 'array',
                     default=lambda row: row['parsed']['documents'], resources=-1),
        DF.delete_fields(['parsed'], resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(
            -1, name='negev_galil',
            **{
                PROP_STREAMING: True
            }
        ),
    )
Example #15
    def flow(self):
        if len(self.errors) == 0:
            primaryKey = [
                self.ct_to_fn(f) for f in self.config.get(CONFIG_PRIMARY_KEY)
            ]

            fieldOptions = {}
            dataTypes = dict(
                (ct['name'], dict(ct.get('options', {}), type=ct['dataType']))
                for ct in self.config.get(CONFIG_TAXONOMY_CT)
                if 'dataType' in ct)
            for mf in self.config.get(CONFIG_MODEL_MAPPING):
                ct = mf.get('columnType')
                name = mf['name']
                fieldOptions[name] = {}
                if ct is not None:
                    fieldOptions[name].update(dataTypes.get(ct, {}))
                fieldOptions[name].update(mf.get('options', {}))
                fieldOptions[name]['columnType'] = ct

            extraFieldDefs = self.join_mapping_taxonomy('extra', fieldOptions)
            normalizeFieldDef = self.join_mapping_taxonomy(
                'normalize', fieldOptions)
            unpivotFields = [
                dict(
                    name=f['name'],
                    keys=f['normalize'],
                ) for f in self.config.get(CONFIG_MODEL_MAPPING)
                if 'normalize' in f
            ]
            if len(normalizeFieldDef) > 0:
                normalizeFieldDef = normalizeFieldDef[0]
            else:
                normalizeFieldDef = None

            steps = [
                self.create_fdp(),
                self.datetime_handler(),
                self.set_consts(fieldOptions),
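                # ignore: keep rows even when a value fails the schema check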
                validate(on_error=ignore),
            ] + ([
                unpivot(unpivotFields,
                        extraFieldDefs,
                        normalizeFieldDef,
                        regex=False,
                        resources=RESOURCE_NAME),
            ] if normalizeFieldDef else []) + [
                self.copy_names_to_titles(),
                self.rename([(self.ct_to_fn(f['columnType']), f['name'])
                             for f in self.config.get(CONFIG_MODEL_MAPPING)
                             if f.get('columnType') is not None]),
                update_resource(RESOURCE_NAME, path='out.csv'),
                # *[
                #     set_type(
                #         self.ct_to_fn(f['columnType']),
                #         columnType=f['columnType'],
                #         **fieldOptions.get(f['columnType'], {}),
                #         resources=RESOURCE_NAME,
                #         on_error=ignore
                #     )
                #     for f in self.config.get(CONFIG_MODEL_MAPPING)
                #     if f.get('columnType') is not None
                # ],
                set_primary_key(primaryKey, resources=RESOURCE_NAME)
                if len(primaryKey) else None
                # printer()
            ]
            f = Flow(*steps)
            return f
Example #16
def flow(parameters, *_):
    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]

        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if row.get(field):
                row[field] = row[field].date()

        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
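            # Keep the row unless the configured 'filter-out' phrase appears
            # in its title or description.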
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True

        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type',
                  'string',
                  default=parameters['tender_type'],
                  resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url',
                  'string',
                  default=lambda row:
                  'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
Example #17
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:        
        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39',
                  '40-44', '45-49', '50-54', '55-59')): 'adults',
            }
            
            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
Example #18
            ' ', '_', 'Country', '2017', '2018', '2019', '-', 'Q417', '1Q18',
            '2Q18', '3Q18', '4Q18', '1Q19', '2Q19', '3Q19', '4Q19'
        ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=5,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2016', '2017', '2018', '3Q19', '4Q19', '2019',
            'Change 19/18', '1Q20', '2Q20', '3Q20', '4Q20', 'Change 20/19'
        ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=6,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2017', '2018', '2019', 'Change 19/18', '2Q19',
            '3Q19', '4Q19', '1Q20', 'Feb20', 'Mar20', 'Change Mar/Feb'
        ]),
    rename_resources,
    validate(),
    printer(),
    dump_to_path('opec'),
)

oil_prices.process()
Example #19
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'
        
        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:        
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
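            # '--' cells now count as missing values, so the number cast of
            # the unpivoted 'demand_pct' field below does not fail on them.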
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)
                    
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
Example #20
            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granuarlity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us


if __name__ == '__main__':
    bond_us.process()
Example #21
            resources=resource_names[1:]),
    add_computed_field(fields=[{
        "operation": "format",
        "target": "Region",
        "with": "{Region, subregion, country or area *}"
    }, {
        "operation": "format",
        "target": "Country Code",
        "with": "{Country code}"
    }, {
        "operation": "format",
        "target": "Year",
        "with": "{year}"
    }, {
        "operation": "format",
        "target": "Population",
        "with": "{population}"
    }]),
    delete_fields(fields=[
        'Region, subregion, country or area *', 'Country code', 'year',
        'population'
    ]),
    validate())


def flow(parameters, datapackage, resources, stats):
    return population_estimates


if __name__ == '__main__':
    population_estimates.process()
Example #22
 def postflow(self):
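     # schema_validator.drop silently discards any row that fails validation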
     return Flow(validate(on_error=schema_validator.drop))
Example #23
    set_type(
        'Rate',
        resources='quarterly',
        type='number',
        description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'
    ),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type(
        'Rate',
        resources='annual',
        type='number',
        description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'
    ),
    update_resource('quarterly', **{
        'path': 'data/quarterly.csv',
        'dpp:streaming': True
    }),
    update_resource('annual', **{
        'path': 'data/annual.csv',
        'dpp:streaming': True
    }),
    validate(),
    dump_to_path())


def flow(parameters, datapackage, resources, stats):
    return bond_uk


if __name__ == '__main__':
    bond_uk.process()
Example #24
    fix_doc_id,
    fix_links('objective'),
    fix_links('objective__en'),
    fix_links('objective__ar'),
    DF.add_field('year', 'integer', default=cur_year),
    DF.set_type('org_name',        **{'es:title': True}),
    DF.set_type('org_name__ar',    **{'es:title': True}),
    DF.set_type('alt_names', **{'es:itemType': 'string', 'es:title': True}),
    *[
        DF.set_type(f, **{'es:index': False})
        for f in [
            'org_website', 'org_facebook', 'org_phone_number',
            'org_email_address', 'logo_url'
        ]
    ],
    DF.validate(),
)


def flow(*_):
    return DF.Flow(
        org_flow,
        es_dumper('orgs', REVISION, 'orgs_in_es')
    )


if __name__ == '__main__':
    DF.Flow(org_flow, DF.printer()).process()
Example #25
    update_resource('brent-annual', **{
        'path': 'data/brent-annual.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-daily', **{
        'path': 'data/wti-daily.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-weekly', **{
        'path': 'data/wti-weekly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-monthly', **{
        'path': 'data/wti-monthly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-annual', **{
        'path': 'data/wti-annual.csv',
        'dpp:streaming': True
    }),
    format_date,
    remove_empty_rows,
    set_type('Date', resources=None, type='date', format='any'),
    validate(),
    dump_to_path('data'))


def flow(parameters, datapackage, resources, stats):
    return oil_prices


if __name__ == '__main__':
    oil_prices.process()