Code Example #1
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id',
                 type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import',
             source_key=['migdar_id'],
             target_name='unique_records',
             target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
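
The examples in this listing depend on project files and loaders, so they won't run as-is. Here is a minimal, self-contained sketch of the same pattern with invented inline data ('res_1' and 'res_2' are the default names dataflows assigns to inline resources):

from dataflows import Flow, join, printer

# Hypothetical stand-ins for the two resources joined above.
search_import = [{'migdar_id': 'a', 'title': 'One'},
                 {'migdar_id': 'b', 'title': 'Two'}]
unique_records = [{'migdar_id': 'a'}, {'migdar_id': 'c'}]

Flow(
    search_import,
    unique_records,
    join('res_1', ['migdar_id'],                  # source resource and key
         'res_2', ['migdar_id'],                  # target resource and key
         fields={'gd_title': {'name': 'title'}},  # copy 'title' as 'gd_title'
         full=False),                             # drop unmatched target rows
    printer(),
).process()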
Code Example #2
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'],
             'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},  # empty spec: copy under the same name
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},  # 'name' renames
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'},
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
Code Example #3
File: join.py Project: slallum/datapackage-pipelines
def flow(parameters):
    source = parameters['source']
    target = parameters['target']
    return Flow(
        load_lazy_json(source['name']),
        join(source['name'], source['key'], target['name'], target['key'],
             parameters['fields'], parameters.get('full', True),
             source.get('delete', False)),
        update_resource(target['name'], **{PROP_STREAMING: True}))
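
This variant is a datapackage-pipelines processor, so the join arguments arrive through the pipeline spec. The parameters dict it reads might look like the following (key names taken from the code above; resource and field values are purely illustrative):

parameters = {
    'source': {'name': 'res_1', 'key': ['id'], 'delete': True},
    'target': {'name': 'res_2', 'key': ['id']},
    'fields': {'value': {'name': 'value', 'aggregate': 'first'}},
    'full': False,
}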
Code Example #4
def test_fullouter_join_dump_different_keys():
    from dataflows import Flow, join, dump_to_path
    from decimal import Decimal

    data1 = [
        {
            "col1": 1.531,
            "col2": "hello"
        },
        {
            "col1": 1.132,
            "col2": "goodbye"
        },
    ]
    data2 = [
        {
            "colA": 1.531,
            "colB": "123"
        },
        {
            "colA": 1.132,
            "colB": 1.132
        },
    ]
    f = Flow(
        data1,
        data2,
        join("res_1", ["col1"],
             "res_2", ["colA"],
             {"col2": {
                 "name": "col2",
                 "aggregate": "first"
             }},
             mode="full-outer"),
        dump_to_path(out_path='out/test_join_dump'),
    )
    results = f.results()[0][0]
    assert results == [
        {
            'colA': Decimal('1.531'),
            'col2': 'hello',
            'colB': '123'
        },
        {
            'colA': Decimal('1.132'),
            'col2': 'goodbye',
            'colB': 1.132
        },
    ]
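
Note the Decimal values in the assert: col1/colA hold only floats, so they are inferred as Table Schema number fields, which dataflows casts with decimal.Decimal. colB mixes a string ('123') with a float (1.132), so it is presumably inferred as any and its values pass through uncast.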
Code Example #5
 def conditional(self):
     target_field_names = [
         ct.replace(':', '-') for ct in self.TARGET_FIELD_COLUMNTYPES
     ]
     steps = [
         join(
             self.key, self.REF_KEY_FIELDS, RESOURCE_NAME,
             self.SOURCE_KEY_FIELDS,
             dict((target_field_name, dict(name=fetch_field))
                  for target_field_name, fetch_field in zip(
                      target_field_names, self.REF_FETCH_FIELDS))),
     ]
     steps.extend([
         set_type(target_field_name,
                  resources=RESOURCE_NAME,
                  columnType=target_field_columntype)
         for target_field_name, target_field_columntype in zip(
             target_field_names, self.TARGET_FIELD_COLUMNTYPES)
     ])
     f = Flow(*steps)
     return f
Code Example #6
def test_join_full_outer():
    from dataflows import Flow, load, set_type, join
    flow = Flow(
        load('data/population.csv'),
        load('data/cities.csv'),
        join(
            source_name='population',
            source_key=['id'],
            target_name='cities',
            target_key=['id'],
            fields={'population': {
                'name': 'population'
            }},
            mode='full-outer',
        ),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'id': 1,
            'city': 'london',
            'population': 8
        },
        {
            'id': 2,
            'city': 'paris',
            'population': 2
        },
        {
            'id': 3,
            'city': 'rome',
            'population': None
        },
        {
            'id': 4,
            'city': None,
            'population': 3
        },
    ]]
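
The CSV fixtures aren't reproduced in this listing, but an inline equivalent that should yield the same asserted rows looks roughly like this (inline resources fall back to the default names 'res_1'/'res_2' instead of the file-derived 'population'/'cities'). As far as the dataflows docs go, mode='full-outer' keeps unmatched rows from both sides, while the boolean full seen in other examples maps to the default 'half-outer' mode (True) or 'inner' (False):

from dataflows import Flow, join

population = [{'id': 1, 'population': 8},
              {'id': 2, 'population': 2},
              {'id': 4, 'population': 3}]
cities = [{'id': 1, 'city': 'london'},
          {'id': 2, 'city': 'paris'},
          {'id': 3, 'city': 'rome'}]

data = Flow(
    population,
    cities,
    join('res_1', ['id'], 'res_2', ['id'],
         fields={'population': {'name': 'population'}},
         mode='full-outer'),
).results()[0]
# data[0] should match the asserted rows above, with None where a side is missing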
Code Example #7
File: test_examples.py Project: vitaly-am/dataflows
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee'], ),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join(
            'emmies_filtered', ['emmy_nominee'],  # source resource and its key
            'oscars', ['Name'],                   # target resource and its key
            full=False  # don't add new fields; drop unmatched target rows
        ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = f.process()
Code Example #8
def join_search_app_records(*args):
    is_dpp = len(args) > 3

    def load_search_app_data():
        with Stream(
                'https://migdar-internal-search.odata.org.il/__data/search_app/index.csv',
                http_session=get_migdar_session()) as index_stream:
            for i, row in enumerate(index_stream.iter()):
                search_id = row[4]
                print(f"#{i}. {search_id} ({row[0]}/{row[1]})")
                url = f'https://migdar-internal-search.odata.org.il/__data/search_app/{search_id}/records.csv'
                with Stream(url, headers=1,
                            http_session=get_migdar_session()) as data_stream:
                    for rownum, row in enumerate(data_stream.iter(keyed=True)):
                        row['migdar_id'] = f'{search_id}-{rownum}'
                        yield row

    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load_search_app_data(),
        update_resource('res_2',
                        name='search_app_records',
                        path='search_app_records.csv'),
        join(source_name='search_import',
             source_key=['migdar_id'],
             target_name='search_app_records',
             target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/app_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
Code Example #9
def test_join():
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal

    characters = [
        {
            'first_name': 'Jaime',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Tyrion',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 27
        },
        {
            'first_name': 'Cersei',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Jon',
            'house': 'Stark',
            'last_name': 'Snow',
            'age': 17
        },
        {
            'first_name': 'Sansa',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 14
        },
        {
            'first_name': 'Rickon',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 5
        },
        {
            'first_name': 'Arya',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 11
        },
        {
            'first_name': 'Bran',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 10
        },
        {
            'first_name': 'Daenerys',
            'house': 'Targaryen',
            'last_name': 'Targaryen',
            'age': 16
        },
    ]

    houses = [
        {
            'house': 'House of Lannister'
        },
        {
            'house': 'House of Greyjoy'
        },
        {
            'house': 'House of Stark'
        },
        {
            'house': 'House of Targaryen'
        },
        {
            'house': 'House of Martell'
        },
        {
            'house': 'House of Tyrell'
        },
    ]

    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        houses,
        join('res_1',
             'House of {house}',
             'res_2',
             '{house}',
             dict(max_age={
                 'name': 'age',
                 'aggregate': 'max'
             },
                  avg_age={
                      'name': 'age',
                      'aggregate': 'avg'
                  },
                  representative={
                      'name': 'first_name',
                      'aggregate': 'last'
                  },
                  representative_age={'name': 'age'},
                  number_of_characters={'aggregate': 'count'},
                  last_names={
                      'name': 'last_name',
                      'aggregate': 'counters'
                  }),
             full=False,
             source_delete=True)).results()

    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]

    # Find youngest of each house
    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }),
        sort_rows('{the_house}'),
    ).results()

    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
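
Besides the max, avg, last, count and counters strategies used above, the dataflows processor docs list (as I recall) sum, min, first, any, set and array aggregations. A small join_with_self sketch using array, with made-up rows:

from dataflows import Flow, join_with_self

rows = [{'house': 'Stark', 'first_name': 'Arya'},
        {'house': 'Stark', 'first_name': 'Bran'},
        {'house': 'Lannister', 'first_name': 'Tyrion'}]

res = Flow(
    rows,
    join_with_self('res_1', ['house'], {
        'house': None,  # None keeps the field as-is, as in the example above
        'members': {'name': 'first_name', 'aggregate': 'array'},
    }),
).results()[0][0]
# [{'house': 'Stark', 'members': ['Arya', 'Bran']},
#  {'house': 'Lannister', 'members': ['Tyrion']}]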
Code Example #10
 load(f'{BASE_URL}{DEATH}'),
 unpivot(unpivoting_fields, extra_keys, extra_value),
 find_replace([{
     'name': 'Date',
     'patterns': [{
         'find': '/',
         'replace': '-'
     }]
 }]),
 to_normal_date,
 set_type('Date', type='date', format='%d-%m-%y', resources=None),
 set_type('Case', type='number', resources=None),
 join(source_name='time_series_19-covid-Confirmed',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_19-covid-Deaths',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Confirmed={
          'name': 'Case',
          'aggregate': 'first'
      })),
 join(source_name='time_series_19-covid-Recovered',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_19-covid-Deaths',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Recovered={
          'name': 'Case',
          'aggregate': 'first'
      })),
 add_computed_field(target={
     'name': 'Deaths',
Code Example #11
 find_replace([{
     "name": "Date",
     "patterns": [{
         "find": "/",
         "replace": "-"
     }]
 }]),
 to_normal_date,
 set_type("Date", type="date", format="%d-%m-%y", resources=None),
 set_type("Case", type="number", resources=None),
 join(
     source_name="time_series_covid19_confirmed_global",
     source_key=["Province/State", "Country/Region", "Date"],
     source_delete=True,
     target_name="time_series_covid19_deaths_global",
     target_key=["Province/State", "Country/Region", "Date"],
     fields=dict(Confirmed={
         "name": "Case",
         "aggregate": "first"
     }),
 ),
 join(
     source_name="time_series_covid19_recovered_global",
     source_key=["Province/State", "Country/Region", "Date"],
     source_delete=True,
     target_name="time_series_covid19_deaths_global",
     target_key=["Province/State", "Country/Region", "Date"],
     fields=dict(Recovered={
         "name": "Case",
         "aggregate": "first"
     }),
Code Example #12
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'  # the "entire sample" column

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):  # "total ..." rows start a category
                        cat = v[7:]
                    elif v.startswith('--- '):  # "--- ..." rows are subcategories
                        if not v.endswith('ללא פירוט'):  # skip "no breakdown" rows
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:        
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)
                    
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
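
The duplicate / join_with_self / join sequence in the middle of this flow is a group-max pattern: aggregate one copy of the resource down to a per-group maximum, then join that maximum back so every row can be scored against its group's max. Distilled with toy data (all names here are invented):

import dataflows as DF

rows = [{'grp': 'a', 'v': 1}, {'grp': 'a', 'v': 3}, {'grp': 'b', 'v': 2}]

res = DF.Flow(
    rows,
    DF.update_resource(-1, name='rows'),
    DF.duplicate('rows', 'scored'),              # keep an untouched copy
    DF.join_with_self('rows', ['grp'],           # reduce 'rows' to per-group max
                      dict(grp=None, max_v=dict(name='v', aggregate='max'))),
    DF.join('rows', ['grp'], 'scored', ['grp'],  # attach each group's max back
            dict(max_v=None)),
    DF.add_field('score', 'number', lambda r: r['v'] / r['max_v'],
                 resources='scored'),
).results()[0]
# each row of 'scored' now carries max_v and a score relative to its group's max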
Code Example #13
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:        
        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }
            
            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
Code Example #14
    neighborhood_fill_color_cases = ['case']
    neighborhood_fill_pattern_cases = ['case']

    r, _, _ = DF.Flow(
        DF.load(latest_file(),
                name='cities',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.load(latest_file(),
                name='out',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.add_field('city_area_id', 'string',
                     lambda r: r['area_id'].split('-')[0]),
        DF.join('cities', ['city_area_id'], 'out', ['city_area_id'],
                dict(num_city_reports=dict(name='num_reports_weighted'))),
        DF.add_field('desc', 'string', ''),
        DF.add_field('kind', 'string', ''),
        DF.add_field('property', 'string', ''),
        props(),
        DF.join_with_self(
            'out', ['is_city', 'kind', 'desc', 'property'],
            dict(is_city=None,
                 kind=None,
                 desc=None,
                 property=None,
                 id=dict(aggregate='array'))),
    ).results()

    for item in r[0]:
        print('bucket for {} {} {}: {}'.format(
Code Example #15
File: pull.py Project: TheCulliganMan/covid-19
def data_pull_csv():
    unpivoting_fields = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]

    extra_keys = [{"name": "Date", "type": "string"}]
    extra_value = {"name": "Case", "type": "number"}

    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(unpivoting_fields, extra_keys, extra_value),
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
Code Example #16
File: process.py Project: uttaravadina/covid-19
 unpivot(unpivoting_fields, extra_keys, extra_value),
 find_replace([{
     'name': 'Date',
     'patterns': [{
         'find': '/',
         'replace': '-'
     }]
 }]),
 to_normal_date,
 set_type('Date', type='date', format='%d-%m-%y', resources=None),
 set_type('Case', type='number', resources=None),
 join(source_name='time_series_covid19_confirmed_global',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_covid19_deaths_global',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Confirmed={
          'name': 'Case',
          'aggregate': 'first'
      })),
 join(source_name='time_series_covid19_recovered_global',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_covid19_deaths_global',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Recovered={
          'name': 'Case',
          'aggregate': 'first'
      }),
      mode='full-outer'),
 # Add missing columns, e.g., after 'full-outer' join, the rows structure
Code Example #17
import dataflows as DF
import glob

x = DF.Flow(
    ({'filename': x[:-4]} for x in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load('http://api.yodaat.org/data/orgs_in_es/data/orgs.csv', name='orgs'),
    DF.join(
        'files', '{filename}',
        'orgs', '{entity_id}',
        {
            'filename': {},
        }, full=True, source_delete=True
    ),
    DF.filter_rows(lambda row: row['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer()
).process()
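
Note that join accepts key templates as well as field-name lists: '{filename}' and '{entity_id}' here, like 'House of {house}' in Code Example #9, build the join key by formatting each row, which lets keys held in differently named fields (or composed from several) line up. With full=True the unmatched org rows are kept with filename None, and the filter_rows step then keeps exactly those, leaving the orgs that have no matching image file.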