Example #1
 load(f'{BASE_URL}{CONFIRMED}'),
 load(f'{BASE_URL}{RECOVERED}'),
 load(f'{BASE_URL}{DEATH}'),
 load(f'{BASE_URL}{CONFIRMED_US}'),
 load(f'{BASE_URL}{DEATH_US}'),
 checkpoint('load_data'),
 unpivot(unpivoting_fields, extra_keys, extra_value),
 find_replace([{
     'name': 'Date',
     'patterns': [{
         'find': '/',
         'replace': '-'
     }]
 }]),
 to_normal_date,
 set_type('Date', type='date', format='%d-%m-%y', resources=None),
 set_type('Case', type='number', resources=None),
 join(source_name='time_series_covid19_confirmed_global',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_covid19_deaths_global',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Confirmed={
          'name': 'Case',
          'aggregate': 'first'
      })),
 join(source_name='time_series_covid19_recovered_global',
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_covid19_deaths_global',
      target_key=['Province/State', 'Country/Region', 'Date'],
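
Example #1 is cut off mid-join, but the wide-to-long reshape it performs with unpivot is easy to show in isolation. A minimal, self-contained sketch (the input rows and column names below are invented, not taken from the COVID dataset):

from dataflows import Flow, unpivot, printer

wide_rows = [
    {'Country/Region': 'Placeholder', '1/22/20': 1, '1/23/20': 3},
]
unpivoting_fields = [{
    'name': r'([0-9]+\/[0-9]+\/[0-9]+)',  # regex matching the date-named columns
    'keys': {'Date': r'\1'},              # the captured date becomes the 'Date' key
}]
extra_keys = [{'name': 'Date', 'type': 'string'}]
extra_value = {'name': 'Case', 'type': 'number'}

Flow(
    wide_rows,
    unpivot(unpivoting_fields, extra_keys, extra_value),
    printer(),  # expect one row per (Country/Region, Date) pair
).process()
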
Example #2
                    "series": [
                        "Exchange Rate"
                    ]
                }
            }
        ],
        version="0.2.0",
        readme=readme()
    ),
    extract_exchange_rates('daily'),
    extract_exchange_rates('monthly'),
    extract_exchange_rates('annual'),
    update_resource('res_1', **{'name': 'daily', 'path':'data/daily.csv', 'dpp:streaming': True}),
    update_resource('res_2', **{'name': 'monthly', 'path':'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('res_3', **{'name': 'annual', 'path':'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='daily', type='date', description="Date in ISO format"),
    set_type('Country', resources='daily', type='string', description="Name of a country"),
    set_type('Exchange rate', resources='daily', type='number', description="Foreign Exchange Rate to USD. Only AUD, IEP, NZD, GBP and EUR to USD."),
    set_type('Date', resources='monthly', type='date', description="Date in ISO format"),
    set_type('Country', resources='monthly', type='string', description="Name of a country"),
    set_type('Exchange rate', resources='monthly', type='number', description="Foreign Exchange Rate to USD. Only AUD, IEP, NZD, GBP and EUR to USD."),
    set_type('Date', resources='annual', type='date', description="Date in ISO format"),
    set_type('Country', resources='annual', type='string', description="Name of a country"),
    set_type('Exchange rate', resources='annual', type='number', description="Foreign Exchange Rate to USD. Only AUD, IEP, NZD, GBP and EUR to USD."),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return exchange_rate_flow
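
The def flow(parameters, datapackage, resources, stats) wrapper that closes this example (and most of the ones below) is the entry point datapackage-pipelines looks for in a flow-based processor; it only needs to return a Flow. A stripped-down sketch of the convention:

from dataflows import Flow, printer

demo_flow = Flow(
    [{'value': 1}],  # placeholder resource
    printer(),
)

def flow(parameters, datapackage, resources, stats):
    # datapackage-pipelines calls this and runs the returned Flow
    return demo_flow
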
def test_join():
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal

    characters = [
        {
            'first_name': 'Jaime',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Tyrion',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 27
        },
        {
            'first_name': 'Cersei',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Jon',
            'house': 'Stark',
            'last_name': 'Snow',
            'age': 17
        },
        {
            'first_name': 'Sansa',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 14
        },
        {
            'first_name': 'Rickon',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 5
        },
        {
            'first_name': 'Arya',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 11
        },
        {
            'first_name': 'Bran',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 10
        },
        {
            'first_name': 'Daenerys',
            'house': 'Targaryen',
            'last_name': 'Targaryen',
            'age': 16
        },
    ]

    houses = [
        {
            'house': 'House of Lannister'
        },
        {
            'house': 'House of Greyjoy'
        },
        {
            'house': 'House of Stark'
        },
        {
            'house': 'House of Targaryen'
        },
        {
            'house': 'House of Martell'
        },
        {
            'house': 'House of Tyrell'
        },
    ]

    res, _, _ = Flow(
        characters, set_type('age', type='number'), houses,
        join(
            'res_1', 'House of {house}', 'res_2', '{house}',
            dict(max_age={
                'name': 'age',
                'aggregate': 'max'
            },
                 avg_age={
                     'name': 'age',
                     'aggregate': 'avg'
                 },
                 representative={
                     'name': 'first_name',
                     'aggregate': 'last'
                 },
                 representative_age={'name': 'age'},
                 number_of_characters={'aggregate': 'count'},
                 last_names={
                     'name': 'last_name',
                     'aggregate': 'counters'
                 }), False, True)).results()

    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]

    # Find youngest of each house
    res, _, _ = Flow(
        characters, set_type('age', type='number'), sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }), sort_rows('{the_house}')).results()

    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
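
For a smaller-scale view of the same join primitive, a standalone sketch with invented data and keyword arguments throughout:

from dataflows import Flow, join

orders = [{'cust_id': 1, 'total': 10}, {'cust_id': 1, 'total': 5}]
customers = [{'cust_id': 1, 'name': 'Ada'}]

res, _, _ = Flow(
    orders,     # becomes res_1
    customers,  # becomes res_2
    join('res_1', ['cust_id'],
         'res_2', ['cust_id'],
         fields=dict(total_spend={'name': 'total', 'aggregate': 'sum'}),
         source_delete=True),
).results()
# expect a single customer row carrying total_spend == 15
print(res[0])
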
    update_resource('brent-annual', **{
        'path': 'data/brent-annual.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-daily', **{
        'path': 'data/wti-daily.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-weekly', **{
        'path': 'data/wti-weekly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-monthly', **{
        'path': 'data/wti-monthly.csv',
        'dpp:streaming': True
    }),
    update_resource('wti-annual', **{
        'path': 'data/wti-annual.csv',
        'dpp:streaming': True
    }), format_date, remove_empty_rows,
    set_type('Date', resources=None, type='date', format='any'), validate(),
    dump_to_path('data'))


def flow(parameters, datapackage, resources, stats):
    return oil_prices


if __name__ == '__main__':
    oil_prices.process()


def flow(parameters, *_):

    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f, set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if field in row:
                row[field] = row[field].date()
        return Flow(
            f, set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string', default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
        # printer(),
        # lambda rows: (row for row in rows if row['tender_id'].endswith('73f3')),
    )
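
take_first and datetime_to_date above each return a Flow, which dataflows accepts as a single step inside a parent Flow, so a row fix and its set_type always travel together. A minimal sketch of that composition pattern (field name and data invented):

from dataflows import Flow, set_type, printer

def uppercase(field):
    def f(row):
        if row.get(field):
            row[field] = row[field].upper()
    # bundle the row processor with its schema change as one reusable step
    return Flow(f, set_type(field, type='string'))

Flow(
    [{'code': 'abc'}],
    uppercase('code'),
    printer(),
).process()
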
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)


def flow(parameters, datapackage, resources, stats):
    return gold_price_flow


if __name__ == '__main__':
    gold_price_flow.process()
Example #7
 verify_unused_fields(),
 DF.concatenate(FIELD_MAPPING, target=dict(name='out')),
 fix_urls(['source_url']),
 ensure_chart_title(),
 fix_languages(),
 DF.add_field('order_index', 'integer'),
 lambda rows: ({
     **row,
     **{
         'order_index': i
     }
 } for i, row in enumerate(rows)),
 set_defaults,
 extrapulate_years,
 fix_values,
 DF.set_type('value', groupChar=',', bareNumber=True),
 fix_units,
 DF.set_type('extrapulation_years',
             type='array',
             **{'es:itemType': 'string'}),
 DF.validate(),
 DF.add_computed_field([
     dict(target=dict(name='life_areas',
                      type='array',
                      **{
                          'es:itemType': 'string',
                          'es:keyword': True
                      }),
          operation=lambda row: [
              x for x in
              [row.get('life_area{}'.format(i)) for i in range(1, 4)]
Example #8
def data_pull_csv():
    unpivoting_fields = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]

    extra_keys = [{"name": "Date", "type": "string"}]
    extra_value = {"name": "Case", "type": "number"}

    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(unpivoting_fields, extra_keys, extra_value),
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
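
The operation='format' form of add_computed_field used above fills the new field from a Python format string evaluated against each row. A minimal sketch with invented data:

from dataflows import Flow, add_computed_field, printer

Flow(
    [{'first': 'Grace', 'last': 'Hopper'}],
    add_computed_field(
        target={'name': 'full_name', 'type': 'string'},
        operation='format',
        with_='{first} {last}',
    ),
    printer(),  # expect full_name == 'Grace Hopper'
).process()
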
    def flow(self):
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        fmt_str = [taxonomy.title + ' for:']
        fields = txn_config['key-fields']
        for f in fields:
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append(
                        '%s: "{%s}",' % (ct['title'], f.replace(':', '-'))
                    )
                    break
        fmt_str = ' '.join(fmt_str)
        fields = [
            ct.replace(':', '-')
            for ct in fields
        ]
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field(
                [
                    dict(
                        operation='format',
                        target='snippets',
                        with_=fmt_str
                    ),
                    dict(
                        operation='constant',
                        target='key_values',
                        with_=None
                    ),
                ],
                resources=TARGET
            ),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields', type='object', 
                      default=self.collate_values(fields), resources=TARGET),
            join_with_self(
                TARGET,
                ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )
            ),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            dump_to_sql(
                dict([
                    (TARGET, {
                        'resource-name': TARGET,
                        'mode': 'update'
                    })
                ]),
                engine=self.lazy_engine(),
            ),
        )
Example #10
        'name': 'serie-a',
        'path': 'italym.php',
        'key': 'I1',
        'links': [],
        'dataset-name': 'italian-serie-a',
        'dataset-title': 'Italian Serie A (football)'
    }, {
        'name': 'ligue-1',
        'path': 'francem.php',
        'key': 'F1',
        'links': [],
        'dataset-name': 'french-ligue-1',
        'dataset-title': 'French Ligue 1 (football)'
    }]
    for league in leagues:
        meta = get_league_meta(league)
        processors = get_processors(meta)
        processors.append(set_type('Date', type='date', format='%d/%m/%y'))
        processors.append(dump_to_path(out_path='datasets/' + league['name']))
        processors.append(printer())
        processors = [
            add_metadata(name=league['dataset-name'],
                         title=league['dataset-title'],
                         licenses=licenses,
                         sources=sources,
                         related=related_datasets,
                         readme=readme %
                         league['dataset-title'].replace(' (football)', ''))
        ] + processors
        Flow(*processors).process()
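
Since Flow takes its steps as plain positional arguments, a processor list can be assembled (and prepended to) as an ordinary Python list before being unpacked with Flow(*processors), which is what the league loop above does. A minimal sketch:

from dataflows import Flow, add_metadata, set_type, printer

processors = [
    set_type('Date', type='date', format='%d/%m/%y'),
    printer(),
]
# prepend shared metadata, mirroring the loop above
processors = [add_metadata(name='demo-dataset', title='Demo dataset')] + processors

Flow(
    [{'Date': '03/04/21'}],
    *processors,
).process()
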
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        },
                    target={
                        'name': 'fact',
                        'path': 'data/fact.csv'
                    }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ],
                      resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d'
                                                                           )),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m'
                                                                           )),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B'
                                                                           )),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y'
                                                                           )),
        ],
                           resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact',
                  target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
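
Each dimension table above follows the same recipe: duplicate the fact resource, keep only the key column, collapse duplicates, set a primary key. A minimal sketch of that recipe using stock dataflows primitives (join_with_self here, in place of this repo's join_self helper; data invented):

from dataflows import Flow, duplicate, select_fields, join_with_self, set_primary_key, printer

facts = [
    {'Area': 'DK1', 'Price_DKK': 200},
    {'Area': 'DK1', 'Price_DKK': 210},
    {'Area': 'DK2', 'Price_DKK': 190},
]

Flow(
    facts,  # becomes res_1
    duplicate(source='res_1', target_name='area', target_path='area.csv'),
    select_fields(['Area'], resources=['area']),
    join_with_self('area', ['Area'], {'Area': {}}),  # one row per distinct Area
    set_primary_key(['Area'], resources=['area']),
    printer(),
).process()
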
Example #12
def main_flow(prefix=''):
    source_url = '{}data/publications_for_es/datapackage.json'.format(prefix)
    package = Package(source_url)
    all_fields = set(field.name for resource in package.resources
                     for field in resource.schema.fields)
    all_fields = dict((field_name, []) for field_name in all_fields)
    return Flow(
        load(source_url),
        lambda row: dict(row, json='{}'),
        concatenate(all_fields,
                    target=dict(name='publications', path='publications.csv')),
        delete_fields(['json']),
        prefer_gd('title'),
        prefer_gd('notes'),
        prefer_gd('publisher'),
        prefer_gd('tags'),
        prefer_gd('language_code'),
        prefer_gd('pubyear'),
        split_keyword_list('item_kind', 'gd_Item Type'),
        split_keyword_list('life_areas', 'gd_Life Domains'),
        split_keyword_list('source_kind', 'gd_Resource Type'),
        split_keyword_list('languages', 'language_code', ' '),
        split_keyword_list('tags', 'tags'),
        load('data/zotero/datapackage.json'),
        concatenate(dict(
            title=[],
            pubyear=[],
            publisher=[],
            authors=[],
            life_areas=[],
            notes=[],
            languages=[],
            tags=[],
            url=[],
            migdar_id=[],
            item_kind=[],
            source_kind=[],
            isbn=[],
            physical_description=[],
            publication_distribution_details=[],
            doc_id=[],
        ),
                    target=dict(name='publications', path='publications.csv')),
        set_type('title', **{'es:title': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:keyword': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True),
        split_and_translate('life_areas', 'life_areas', keyword=True),
        split_and_translate('languages', 'languages', keyword=True),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        printer(),
        add_computed_field([
            {
                'operation': 'format',
                'target': 'doc_id',
                'with': KEY_PATTERN
            },
            {
                'operation': 'format',
                'target': 'page_title',
                'with': PAGE_TITLE_PATTERN
            },
        ]),
        add_computed_field([]),
    )

        'name': 'Year',
        'patterns': [{
            'find': r'(\s?\(\d+\))|(\.0)',
            'replace': ''
        }]
    }, {
        'name': 'Fourth',
        'patterns': [{
            'find': r'\+|',
            'replace': ''
        }]
    }],
                 resources=0),
    update_resource(
        0, **{
            'name': 'household-income-us-historical',
            'path': 'data/household-income-us-historical.csv',
            'dpp:streaming': True
        }),
    set_type('Year', type='year'),
    set_type('^(?!Y).+', type='number'),
    validate())


def flow(parameters, datapackage, resources, stats):
    return household_us


if __name__ == '__main__':
    household_us.process()
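
set_type treats the field name as a regular expression, which is how set_type('^(?!Y).+', type='number') above casts every column except 'Year' in one call. A minimal sketch with invented columns:

from dataflows import Flow, set_type, printer

Flow(
    [{'Year': '2001', 'Top 5 percent': '1234', 'Median': '567'}],
    set_type('Year', type='year'),
    set_type('^(?!Y).+', type='number'),  # every field not starting with 'Y'
    printer(),
).process()
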
Example #14
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCw.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCm.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    load(
        load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCa.xls",
        format="xls",
        sheet=2,
        skip_rows=[1, 2, 3],
        headers=["Date", "Price"],
    ),
    rename_resources,
    set_type("Date", resources=None, type="date", format="any"),
    validate(),
    printer(),
    filter_out_empty_rows,
    dump_to_path(),
)

if __name__ == "__main__":
    OIL_PRICES.process()

    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number', description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number', description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path':'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)

def flow(parameters, datapackage, resources, stats):
    return bond_uk

if __name__ == '__main__':
    bond_uk.process()
Example #16
def test_example_4():
    from dataflows import Flow, set_type

    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','))
    data, dp, _ = f.results()
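
country_population() is defined elsewhere in the dataflows documentation; the groupChar behaviour the test exercises also shows up with inline data (values invented):

from dataflows import Flow, set_type

data, dp, _ = Flow(
    [{'country': 'X', 'population': '1,380,000,000'}],
    set_type('population', type='number', groupChar=','),
).results()
print(data[0][0]['population'])  # expect Decimal('1380000000')
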
            }
        ],
        version="0.2.0",
        views=[
            {
              "name": "graph",
              "title": "VIX - CBOE Volatility Index",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path':'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return finance_vix


if __name__ == '__main__':
    finance_vix.process()


def base_flow():
    sources, *_ = Flow(
        list_gdrive(),
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],

                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],

            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        set_type('title',        **{'es:title': True}),
        set_type('authors',       **{'es:boost': True}),
        set_type('notes',        **{'es:hebrew': True}),
        set_type('publisher',    **{'es:boost': True}),
        add_field('year', 'integer',
                  default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'), 
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title',
             'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )

            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granuarlity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us


if __name__ == '__main__':
    bond_us.process()
Example #20
 {% if 'filter' in processing %}
 filter_rows(),
 {% endif %}
 {% if 'find_replace' in processing %}
 find_replace([
     dict(name='field_name',
          patterns=[
              dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'),                     
          ])
 ]),
 {% endif %}
 {% if 'delete_fields' in processing %}
 delete_fields(['field_name']),  # Pass a list of field names to delete from the data
 {% endif %}
 {% if 'set_type' in processing %}
 set_type('field_name', type='number', constraints=dict(minimum=3)),  # There are quite a few options you can use here
                                                                      # Take a look at https://frictionlessdata.io/specs/table-schema/
 # Or you can simply use validate() here instead                                                                             
 {% endif %}
 {% if 'unpivot' in processing %}
 unpivot(unpivot_fields, extra_keys, extra_value),  # See documentation on the meaning of each of these parameters
 {% endif %}
 {% if 'custom' in processing %}
 my_custom_processing,
 {% endif %}
 # Save the results
 add_metadata(name='{{slug}}', title='''{{title}}'''),
 {% if output in ('print', 'print_n_pkg')  %}
 printer(),
 {% endif %}
 {% if output == 'list' %}