def prepare_locations():
    """Build and cache the locations index: one row per street, each
    carrying an array of address items (coordinates, arnona zones and a
    display string). Returns the list of result rows."""
    prepare_addresses()

    def format_address(r):
        # "<street> <number><letter>" (letter may be empty).
        return '{} {}{}'.format(
            r['street_name'], r['house_number'], r['letter'] or '')

    def build_item(r):
        value = dict(lat=float(r['lat']),
                     lon=float(r['lon']),
                     arnona_zones=r['arnona_zones'],
                     שם=r['address'])
        return dict(value=value, display=r['address'])

    steps = [
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field('address', 'string', format_address),
        DF.add_field('item', 'object', build_item),
        # Order addresses by house number before aggregating per street.
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations'),
    ]
    return DF.Flow(*steps).results()[0][0]
def decp_processing():
    """Process the DECP dataset (French public procurement open data).

    Loads decp.csv, keeps only the current version of each record,
    builds a contracts-only table (without supplier columns) and a
    supplier-oriented table, then dumps everything to the "decp" path.
    """
    # NOTE(review): the bare print(...) calls below run once at flow
    # construction time and contribute None as a step -- presumably
    # tolerated by dataflows; confirm.
    flow = Flow(

        # Load the CSV produced by the JSON conversion
        load("decp.csv"),
        # Force id/code columns to string so leading zeros survive.
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without supplier data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous supplier-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
# Example #3
def test_sort_rows_decimal():
    """Rows cast to Decimal by the schema sort numerically, not lexically."""
    from decimal import Decimal
    from dataflows import sort_rows, load

    flow = Flow(
        load('data/numbers.csv', cast_strategy=load.CAST_WITH_SCHEMA),
        sort_rows(key='{a}'),
    )
    results, dp, _ = flow.results()
    expected_values = [
        '-1000', '-0.5', '-0.4', '0', '1.1', '2', '10', '1000'
    ]
    assert list(results[0]) == [{'a': Decimal(v)} for v in expected_values]
# Example #4
def test_sort_rows_datetime():
    """date values sort chronologically under sort_rows."""
    import datetime
    from dataflows import sort_rows

    dates = [
        datetime.date(2000, 1, 3),
        datetime.date(2010, 1, 2),
        datetime.date(2020, 1, 1),
    ]
    flow = Flow(
        [{'a': d} for d in dates],
        sort_rows(key='{a}'),
    )
    results, _, _ = flow.results()
    # Input is already chronological, so output order is unchanged.
    assert list(results[0]) == [{'a': d} for d in dates]
# Example #5
def test_sort_reverse_many_rows():
    """Reverse sort on a composite key works with unbatched (batch_size=0) input."""
    from dataflows import sort_rows

    source = ({'a': n, 'b': n % 5} for n in range(1000))
    flow = Flow(
        source,
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),
    )
    res, _, _ = flow.results()
    rows = res[0]
    # Highest (b, a) pairs first, lowest last.
    assert rows[:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    assert rows[-2:] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
# Example #6
def test_sort_rows():
    """Rows sort by the composite '{b}{a}' key: b first, then a."""
    from dataflows import sort_rows

    pairs = [(1, 3), (2, 3), (3, 1), (4, 1)]
    flow = Flow(
        [{'a': a, 'b': b} for a, b in pairs],
        sort_rows(key='{b}{a}'),
    )
    results, _, _ = flow.results()
    sorted_pairs = [(3, 1), (4, 1), (1, 3), (2, 3)]
    assert list(results[0]) == [{'a': a, 'b': b} for a, b in sorted_pairs]
# Example #7
    def postflow(self):
        """Deduplicate RESOURCE_NAME rows by the configured primary key.

        Sorts by self.ORDER_BY_KEY and keeps the last row per key group
        via a self-join. The resource's primaryKey schema entry is saved
        beforehand and restored afterwards, since the self-join rewrites
        the resource schema.
        """
        # Primary-key field names, with ':' normalized to '-'.
        key_field_names = [
            ct.replace(':', '-') for ct in self.config.get(CONFIG_PRIMARY_KEY)
        ]

        def save_pks(saved_pk):
            # Package processor: record the resource's current primaryKey
            # into the shared saved_pk dict (mutated in place).
            def func(package: PackageWrapper):
                for res in package.pkg.descriptor['resources']:
                    if res['name'] == RESOURCE_NAME:
                        saved_pk['pk'] = res['schema'].get('primaryKey', [])
                yield package.pkg
                yield from package

            return func

        def restore_pks(saved_pk):
            # Package processor: write the saved primaryKey back onto the
            # resource's schema.
            def func(package: PackageWrapper):
                for res in package.pkg.descriptor['resources']:
                    if res['name'] == RESOURCE_NAME:
                        res['schema']['primaryKey'] = saved_pk['pk']
                yield package.pkg
                yield from package

            return func

        saved_pk = dict(pk=[])
        steps = [
            save_pks(saved_pk),
            sort_rows(self.ORDER_BY_KEY, resources=RESOURCE_NAME),
            # Keep the key fields as-is; take the 'last' value of every
            # other field within each key group.
            join_with_self(
                RESOURCE_NAME, key_field_names, {
                    **dict((f, {}) for f in key_field_names), '*':
                    dict(aggregate='last')
                }),
            restore_pks(saved_pk)
        ]
        f = Flow(*steps)
        return f
def flow(*_):
    """Build the social-services flow: concatenate the loaded resources,
    fold yearly rows into a per-service 'history' array, and mark the
    resulting resource as streaming."""
    prepare()
    # Per-year fields collected into each row's 'history' object.
    yearly_fields = [
        'year', 'unit', 'subunit', 'subsubunit', 'allocated_budget',
        'num_beneficiaries'
    ]
    return DF.Flow(
        *[
            DF.load('tmp/' + resource_name + '/datapackage.json')
            for resource_name, _ in loads
        ],
        DF.concatenate(
            FIELD_MAPPING,
            dict(name='social_services', path='social_services.csv')),
        # Newest year first, so aggregated history arrays are newest-first.
        DF.sort_rows('{year}', reverse=True),
        DF.add_field(
            'history', 'object', lambda r: dict(
                (k, r[k]
                 if not isinstance(r[k], decimal.Decimal) else int(r[k]))
                for k in yearly_fields)),
        DF.printer(),
        # One row per (publisher, activity); histories become an array.
        DF.join_with_self(
            'social_services', ['publisher_name', 'activity_name'],
            dict(
                publisher_name=None,
                activity_name=None,
                activity_description=dict(aggregate='set'),
                min_year=dict(name='year', aggregate='min'),
                max_year=dict(name='year', aggregate='max'),
                history=dict(aggregate='array'),
            )),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
# Example #9
def test_join():
    """Exercise join() of two resources (characters into houses) and
    join_with_self() for picking the youngest member of each house."""
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal

    characters = [
        {
            'first_name': 'Jaime',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Tyrion',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 27
        },
        {
            'first_name': 'Cersei',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Jon',
            'house': 'Stark',
            'last_name': 'Snow',
            'age': 17
        },
        {
            'first_name': 'Sansa',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 14
        },
        {
            'first_name': 'Rickon',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 5
        },
        {
            'first_name': 'Arya',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 11
        },
        {
            'first_name': 'Bran',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 10
        },
        {
            'first_name': 'Daenerys',
            'house': 'Targaryen',
            'last_name': 'Targaryen',
            'age': 16
        },
    ]

    houses = [
        {
            'house': 'House of Lannister'
        },
        {
            'house': 'House of Greyjoy'
        },
        {
            'house': 'House of Stark'
        },
        {
            'house': 'House of Targaryen'
        },
        {
            'house': 'House of Martell'
        },
        {
            'house': 'House of Tyrell'
        },
    ]

    # Join characters (res_1) into houses (res_2) on the house name,
    # aggregating ages and names; full=False drops unmatched houses.
    res, _, _ = Flow(
        characters, set_type('age', type='number'), houses,
        join('res_1',
             'House of {house}',
             'res_2',
             '{house}',
             dict(max_age={
                 'name': 'age',
                 'aggregate': 'max'
             },
                  avg_age={
                      'name': 'age',
                      'aggregate': 'avg'
                  },
                  representative={
                      'name': 'first_name',
                      'aggregate': 'last'
                  },
                  representative_age={'name': 'age'},
                  number_of_characters={'aggregate': 'count'},
                  last_names={
                      'name': 'last_name',
                      'aggregate': 'counters'
                  }),
             full=False,
             source_delete=True)).results()

    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]

    # Find youngest of each house
    res, _, _ = Flow(
        characters, set_type('age', type='number'), sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }), sort_rows('{the_house}')).results()

    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
# Example #10
               }, {
                   "format": "default",
                   "groupChar": "",
                   "name": "Recovered",
                   "title": "Cumulative total recovered cases to date",
                   "type": "integer"
               }, {
                   "format": "default",
                   "groupChar": "",
                   "name": "Deaths",
                   "title": "Cumulative total deaths to date",
                   "type": "integer"
               }]),
 checkpoint('processed_data'),
 # Sort rows by date and country
 sort_rows('{Country/Region}{Province/State}{Date}',
           resources='time-series-19-covid-combined'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='data/worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
                                'aggregate': 'sum'
                            },
# Example #11
         "name": "Province/State",
         "type": "string"
     },
     operation="format",
     with_="{Province_State}",
     resources=["us_confirmed", "us_deaths"],
 ),
 delete_fields(
     ["Long_", "Country_Region", "Province_State"],
     resources=["us_confirmed", "us_deaths"],
 ),
 checkpoint("processed_data"),
 printer(),
 # Sort rows by date and country
 sort_rows(
     "{Country/Region}{Province/State}{Date}",
     resources="time-series-19-covid-combined",
 ),
 # Duplicate the stream to create aggregated data
 duplicate(
     source="time-series-19-covid-combined",
     target_name="worldwide-aggregated",
     target_path="data/worldwide-aggregated.csv",
 ),
 join_with_self(
     resource_name="worldwide-aggregated",
     join_key=["Date"],
     fields=dict(
         Date={"name": "Date"},
         Confirmed={
             "name": "Confirmed",
             "aggregate": "sum"
def process_stack_demand(stack):
    """Populate the given 'demand' stack with per-neighborhood score cards
    computed from demand.xlsx; results are cached under 'stack:demand'."""

    def collect_cats():
        # Column whose values carry the category/subcategory markers.
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    # A 'סך הכל <cat>' row opens a category; '--- <subcat>'
                    # rows belong to it (rows ending 'ללא פירוט' are skipped).
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Rows that missed the join come back with None demand_pct.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Unpivot per-neighborhood columns like 'Name (A)' into
            # (neighborhood, demand_pct) rows.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Max demand per (category, subcategory), joined back so each
            # row can be scored relative to its group's maximum.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            # Score scaled to 0..6 relative to the group maximum.
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # Aggregate scores into one card per (category, subcategory)...
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            # ...then one stack of cards per category.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    # Pick the cached stack matching this stack's name and merge its cards.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_institutions(stack):
    """Populate the given stack with institution cards (one per kind),
    each carrying a GeoJSON FeatureCollection of institution locations;
    results are cached under 'stack:institutions'."""
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        # Local projected CRS -> WGS84 lon/lat conversion (inverse proj).
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            # Replace projected X/Y columns with lon/lat fields.
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            # Normalize raw institution kinds to their display categories.
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }
            def func(row):
                # NOTE(review): raises KeyError on any kind missing from the
                # mapping -- presumably intentional (fail fast on new kinds).
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f)
                for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[], Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', 
                        lambda r: geojson.Feature(
                            properties=dict(title=r['title'], address=r['address']),
                            geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
                        )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            # One card per kind, features aggregated into an array.
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object', lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
        #     DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    # Carry over edited content from existing cards with matching titles;
    # report fresh cards that have no existing counterpart.
    current_cards = dict(
        (c['title'], c) for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    # Keep only unmatched old cards, then append the rebuilt ones.
    stack['cards'] = [
        c for c in stack['cards']
        if c['title'] in current_cards
    ] + institutions_cards
def process_demographics(stack):
    """Populate the given stack with demographic score cards (adults,
    kids, teenagers, elderly, immigrants) per neighborhood; results are
    cached under 'stack:demographics'."""
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            # Tag each row with the name of the resource it came from.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (source report, tuple of raw kinds) -> canonical card kind.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }

            def f(rows):
                # Keep only rows matching a mapping entry; rewrite 'kind'.
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        # stat-area id -> neighborhood title, from the geo layer.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        # kind -> (title, description, display order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Unpivot the per-group columns into (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            # Map stat areas to neighborhoods; drop unmapped rows.
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            # Per-neighborhood totals joined back onto 'maxes'.
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, for scaling scores to 0..6.
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # One card per kind, scores aggregated into an array.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
def flow(*_):
    """Pull COVID19-ISRAEL sources and run the preprocessing modules,
    skipping the run when nothing changed; record the run in the
    last_run and runs_history datapackages.

    Returns a flow that prints the 10 most recent runs.
    """
    run_row = None
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % OUTPUT_DIR, 'last_run',
                       [{}])).results()[0][0][0]
    last_run_sha1 = last_run_row.get('COVID19-ISRAEL_github_sha1')
    last_run_time = last_run_row.get('start_time')
    # Debounce: skip entirely if the last run started under 120s ago.
    if last_run_time and (datetime.datetime.now() -
                          last_run_time).total_seconds() < 120:
        logging.info('last run was less then 120 seconds ago, not running')
    else:
        new_sha1 = github_pull_covid19_israel.flow({
            'dump_to_path':
            '%s/last_github_pull' % OUTPUT_DIR
        }).results()[0][0][0]['sha1']
        # Within 24h, only re-run if the upstream commit sha changed.
        if last_run_time and (
                datetime.datetime.now() - last_run_time
        ).total_seconds() < 60 * 60 * 24 and last_run_sha1 == new_sha1:
            logging.info(
                "No change detected in COVID19-ISRAEL GitHub, not running")
        else:
            run_row = {
                'start_time': datetime.datetime.now(),
                'COVID19-ISRAEL_github_sha1': new_sha1
            }
            # Run every module; a failure is logged and recorded but does
            # not stop the remaining modules.
            for module in RUN_MODULES:
                try:
                    os.makedirs('data/preprocess_raw_data/log_files/%s' %
                                module['id'],
                                exist_ok=True)
                    run_covid19_israel.flow({
                        'module':
                        module['module'],
                        'resource_name':
                        '%s_last_updated_files' % module['id'],
                        'dump_to_path':
                        'data/preprocess_raw_data/last_updated_files/%s' %
                        module['id'],
                        'log_file':
                        'data/preprocess_raw_data/log_files/%s/%s.log' %
                        (module['id'],
                         datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
                    }).process()
                    run_row['%s_success' % module['id']] = 'yes'
                except Exception:
                    logging.exception('failed to run %s' % module['id'])
                    run_row['%s_success' % module['id']] = 'no'

    # Persist this run's row as the new last_run datapackage.
    if run_row is not None:
        Flow(
            iter([run_row]),
            update_resource(-1,
                            name='last_run',
                            path='last_run.csv',
                            **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % OUTPUT_DIR)).process()

    def _get_runs_history():
        # Yield all previously recorded runs, then the current one (if any).
        if os.path.exists('%s/runs_history/datapackage.json' % OUTPUT_DIR):
            for resource in Flow(
                    load('%s/runs_history/datapackage.json' %
                         OUTPUT_DIR), ).datastream().res_iter:
                yield from resource
        if run_row is not None:
            yield run_row

    Flow(
        _get_runs_history(),
        update_resource(-1,
                        name='runs_history',
                        path='runs_history',
                        **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % OUTPUT_DIR)).process()

    return Flow(load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
                sort_rows('{start_time}', reverse=True), printer(num_rows=10))
def keep_last_runs_history(output_dir, run_callback, *callback_args,
                           **callback_kwargs):
    """Run *run_callback* and maintain a persistent history of runs.

    Loads the previous run's row (if any) from ``<output_dir>/last_run``,
    then invokes ``run_callback(last_run_row, run_row, *callback_args,
    **callback_kwargs)``, which must return ``(run_row, raise_exception_msg)``.
    The resulting run row is persisted to ``<output_dir>/last_run`` and
    appended to ``<output_dir>/runs_history``.

    Returns a Flow that logs the 10 most recent runs; if the callback
    reported an error message, the flow is processed immediately and an
    Exception carrying that message is raised instead.
    """
    run_row = {'start_time': datetime.datetime.now()}
    # Previous run's row, or an empty dict on the very first run.
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % output_dir, 'last_run',
                       [{}])).results()[0][0][0]
    run_row, raise_exception_msg = run_callback(last_run_row, run_row,
                                                *callback_args,
                                                **callback_kwargs)
    if run_row:
        # Persist the latest run; start_time is excluded so last_run holds
        # only the callback-provided fields.
        Flow(
            iter([{k: v
                   for k, v in run_row.items() if k != 'start_time'}]),
            update_resource(-1,
                            name='last_run',
                            path='last_run.csv',
                            **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % output_dir)).process()

    # Union of all field names seen in the existing history schema, so every
    # emitted row carries a consistent set of keys.
    run_fields = set()
    if os.path.exists('%s/runs_history/datapackage.json' % output_dir):
        with open('%s/runs_history/datapackage.json' % output_dir) as fp:
            datapackage = json.load(fp)
        # Distinct loop name avoids shadowing the file handle above.
        for field in datapackage['resources'][0]['schema']['fields']:
            run_fields.add(field['name'])

    if run_row:
        run_row["end_time"] = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        run_fields.update(run_row)

    def _get_runs_history():
        # Yield historical rows (normalized to run_fields) followed by the
        # current run's row.
        if os.path.exists('%s/runs_history/datapackage.json' % output_dir):
            for resource in Flow(
                    load('%s/runs_history/datapackage.json' %
                         output_dir), ).datastream().res_iter:
                for row in resource:
                    yield {k: row.get(k, '') for k in run_fields}
        if run_row:
            yield {k: run_row.get(k, '') for k in run_fields}

    Flow(
        _get_runs_history(),
        update_resource(-1,
                        name='runs_history',
                        path='runs_history',
                        **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % output_dir)).process()

    def _printer(rows):
        # Log the 10 most recent runs while passing every row through.
        logging.info('--- last runs ---')
        for i, row in enumerate(rows):
            if i < 10:
                logging.info('%s:' % row['start_time'])
                for k in sorted(row.keys()):
                    if k == 'start_time': continue
                    if row[k] is None or row[k] == '': continue
                    logging.info('  %s: %s' % (k, row[k]))
            yield row

    flow = Flow(load('%s/runs_history/datapackage.json' % output_dir),
                sort_rows('{start_time}', reverse=True), _printer)
    if raise_exception_msg:
        # Surface the callback's error only after history has been updated.
        flow.process()
        raise Exception(raise_exception_msg)
    else:
        return flow
Exemple #17
0
def test_sort_rows_number():
    """sort_rows must order numeric values numerically, not lexically."""
    from dataflows import sort_rows

    values = [
        0.1, -3, -4, 10, 8, 0, -1000000, 1000000, -0.1, -0.2, 0.2,
        -1000001, 1000001, 6, -10, -0.001, 0.001, 1, -1,
    ]
    flow = Flow(
        [{'a': v} for v in values],
        sort_rows(key='{a}'),
    )
    results, _, _ = flow.results()
    # Numeric ascending order is exactly what sorted() produces.
    expected = [{'a': v} for v in sorted(values)]
    assert list(results[0]) == expected
Exemple #18
0
def flow(parameters):
    """Build a Flow that lazily loads JSON resources and sorts their rows.

    Sort key comes from ``parameters['sort-by']``; the optional
    ``resources`` and ``reverse`` parameters are passed through.
    """
    resources = parameters.get('resources')
    sorter = sort_rows(parameters['sort-by'],
                       resources=resources,
                       reverse=parameters.get('reverse'))
    return Flow(load_lazy_json(resources), sorter)
Exemple #19
0
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[], city_name=[], score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None, city_name=None, scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object', lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object', lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
        ranker(),
    ).results()
    rankings = r[0]

    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.filter_rows(lambda r: r['num_reports_weighted'] >= 200),
        DF.add_field('ws', 'number', lambda r: r['symptoms_ratio_weighted'] * r['num_reports_weighted']),
Exemple #20
0
 flow = Flow(
     # Load inputs
     {% if input == 'file' %}
     load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}),
     {% endif %}
     {% if input == 'remote' %}
     load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}),
     {% endif %}
     {% if input == 'sql' %}
     load('{{input_url}}', table='{{input_db_table}}'),
     {% endif %}
     {% if input == 'other' %}
     {% endif %}
     # Process them (if necessary)
     {% if 'sort' in processing %}
     sort_rows('{field_name}'),  # Key is a Python format string or a list of field names
     {% endif %}
     {% if 'filter' in processing %}
     filter_rows(),
     {% endif %}
     {% if 'find_replace' in processing %}
     find_replace([
         dict(name='field_name',
              patterns=[
                  dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'),                     
              ])
     ]),
     {% endif %}
     {% if 'delete_fields' in processing %}
     delete_fields(['field_name']),  # Pass a list of field names to delete from the data
     {% endif %}