def test_basic_flow_no_mapping_type():
    """End-to-end check: dump 1000 rows to Elasticsearch without an explicit
    mapping type, then read them back and compare."""
    rows = [dict(key='key%04d' % i, value=i) for i in range(1000)]
    conn_str = 'localhost:9200'
    DF.Flow(
        rows,
        DF.update_resource(-1, name='data'),
        DF.set_primary_key(['key']),
        dump_to_es(
            engine=conn_str,
            indexes=dict(
                test_basic_flow_no_mapping_type=[dict(resource_name='data')]
            )
        ),
    ).process()
    # Give Elasticsearch a moment to refresh the index before reading back.
    time.sleep(1)
    stored = list(
        Storage(Elasticsearch(hosts=[conn_str])).read('test_basic_flow_no_mapping_type')
    )
    assert rows == sorted(stored, key=lambda r: r['key'])
def flow(*_):
    """Download the companies CSV from data.gov.il via headless Chrome and
    return the Flow that cleans and types the 'company-details' resource."""
    driver = google_chrome_driver()
    # NOTE(review): the driver is never explicitly closed here — assumed to be
    # cleaned up elsewhere; confirm google_chrome_driver's lifecycle.
    downloaded = driver.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(downloaded, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def flow(*_):
    """Read the companies dataset via the data.gov.il resource helper and
    return the Flow that normalises the 'company-details' resource."""
    print('reading companies...')
    return Flow(
        data_gov_il_resource.flow(companies),
        fix_values(),
        concatenate(_get_columns_mapping_dict(), target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean', falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean', falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def flow(*_):
    """Collect all reports into the streaming 'criteria' resource, keyed by
    the derived publication_id."""
    return Flow(
        get_all_reports(),
        calculate_publication_id(1),
        set_type('start_date', type='date', format='%d-%m-%Y'),
        set_primary_key(['publication_id']),
        update_resource(-1, name='criteria', **{PROP_STREAMING: True}),
    )
def flow(*_):
    """Fetch jobiz results into the streaming 'jobiz' resource, keyed by the
    derived publication_id."""
    return Flow(
        fetch_results(),
        set_type('start_date', type='date', format='%d.%m.%Y'),
        process_kind,
        calculate_publication_id(2),
        set_primary_key(['publication_id']),
        update_resource(-1, name='jobiz', **{PROP_STREAMING: True}),
    )
def flow(self):
    """Build a Flow that derives the 'configurations' resource.

    Joins the source rows on the taxonomy's key fields, formats one snippet
    string per distinct key combination, attaches the saved (unpublished)
    config, and updates the 'configurations' table in SQL.
    """
    taxonomy = self.context.taxonomy
    txn_config = taxonomy.config
    # Human-readable snippet template, e.g. '<title> עבור: <field>: "{value}",'
    fmt_str = [taxonomy.title + ' עבור:']
    fields = txn_config['key-fields']
    for f in fields:
        # Find the matching column type to get its display title.
        for ct in taxonomy.column_types:
            if ct['name'] == f:
                # NOTE(review): ':' is replaced with '-' — presumably to
                # sanitize field names for downstream use; confirm.
                fmt_str.append('%s: "{%s}",' % (ct['title'], f.replace(':', '-')))
                break
    fmt_str = ' '.join(fmt_str)
    # Normalize the key-field names the same way for the joins below.
    fields = [ct.replace(':', '-') for ct in fields]
    all_fields = ['_source'] + fields
    TARGET = 'configurations'
    saved_config = self.config._unflatten()
    # Freshly derived configurations are never auto-published.
    saved_config.setdefault('publish', {})['allowed'] = False
    return Flow(
        duplicate(RESOURCE_NAME, TARGET),
        # One row per distinct (source, key fields) combination.
        join_with_self(
            TARGET,
            all_fields,
            dict((f, {}) for f in all_fields),
        ),
        add_computed_field([
            dict(operation='format', target='snippets', with_=fmt_str),
            dict(operation='constant', target='key_values', with_=None),
        ], resources=TARGET),
        add_field('config', 'object', saved_config, resources=TARGET),
        add_field('fields', type='object',
                  default=self.collate_values(fields), resources=TARGET),
        # Collapse to one row per source, aggregating snippets/key values.
        join_with_self(
            TARGET, ['_source'],
            dict(
                source=dict(name='_source'),
                config={},
                key_values=dict(aggregate='array'),
                snippets=dict(aggregate='array'),
            )),
        set_type('source', type='string'),
        set_type('config', type='object'),
        set_type('key_values', type='array'),
        set_type('snippets', type='array'),
        set_primary_key(['source']),
        dump_to_sql(
            dict([(TARGET, {
                'resource-name': TARGET,
                'mode': 'update'
            })]),
            engine=self.lazy_engine(),
        ),
    )
def decp_processing():
    """Run the DECP (French public procurement) post-processing pipeline.

    Loads the CSV converted from JSON, normalises id/code columns to strings,
    derives the current-data view, builds a contracts-only table without the
    titulaires columns, reloads the titulaires CSV, and dumps everything to
    the 'decp' datapackage on disk.
    """
    flow = Flow(
        # Load the CSV produced by the JSON conversion.
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        # Sort by rootId and seq to prepare building donneesActuelles.
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,
        # rootId and seq are no longer needed from here on.
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),
        # New table dedicated to the contracts, without titulaires data.
        # NOTE(review): print(...) runs at Flow-construction time and yields
        # None as a step — confirm the Flow implementation tolerates this.
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),
        # Loading of the previous tabular DECP (kept disabled for now):
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),
        # Load the previous data dedicated to titulaires.
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
def flow(*_):
    """Declare the Maya notification-list schema, scrape the list, and emit
    it as the streaming 'maya_notification_list' resource."""
    return Flow(
        add_field('date', 'date'),
        add_field('source', 'string'),
        add_field('s3_object_name', 'string'),
        add_field('url', 'string'),
        add_field('pdf', 'array'),
        add_field('other', 'array'),
        add_field('num_files', 'number'),
        # NOTE(review): this primary key is superseded by the
        # set_primary_key(['url']) below — confirm the first call is intended.
        set_primary_key(['s3_object_name']),
        scrape_maya_notification_list(),
        set_primary_key(['url']),
        update_resource(
            -1,
            name='maya_notification_list',
            path="data/maya_notification_list.csv",
            **{PROP_STREAMING: True},
        ),
    )
def test_set_primary_key():
    """set_primary_key should record the key on the resource schema."""
    from dataflows import set_primary_key
    rows = [
        {'a': 1, 'b': True, 'c': 'c1'},
        {'a': 2, 'b': True, 'c': 'c2'},
    ]
    _, datapackage, _ = Flow(
        rows,
        set_primary_key(['a', 'b']),
    ).results()
    assert datapackage.resources[0].schema.primary_key == ['a', 'b']
def flow(*_):
    """Load all social-service activities, annotate them for Elasticsearch
    indexing, and dump to a datapackage path and to SQL."""
    # Nested schema of the per-year history objects.
    history_schema = dict(fields=[
        dict(name='year', type='integer'),
        dict(name='unit', type='string'),
        dict(name='subunit', type='string'),
        dict(name='subsubunit', type='string'),
        dict(name='allocated_budget', type='integer'),
        dict(name='num_beneficiaries', type='string', **{'es:index': False}),
    ])
    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        DF.concatenate(
            dict(kind=[], kind_he=[], activity_name=[], activity_description=[],
                 publisher_name=[], history=[], max_year=[], min_year=[]),
            dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description',
                    **{'es:itemType': 'string', 'es:boost': True}),
        DF.set_type('kind', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('kind_he', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type('history',
                    **{'es:itemType': 'object', 'es:schema': history_schema}),
        # Score by the most recent allocated budget (fallback 1000 -> 1.0).
        DF.add_field(
            'score', 'number',
            lambda row: (row['history'][0].get('allocated_budget') or 1000) / 1000,
            **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
def flow(*_):
    """Normalise government social services into the 'activities' resource
    and dump them to SQL and to a datapackage path."""
    hidden_keyword = {'es:keyword': True, 'es:exclude': True}
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda row: row['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **hidden_keyword),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **hidden_keyword),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{'es:itemType': 'string', 'es:boost': True}),
        DF.add_field('score', 'number', get_score, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # Every row (including deleted ones) goes to 'all_activities'...
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        # ...then deleted rows are dropped before the public outputs.
        DF.filter_rows(lambda row: not row['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
def test_deduplicate():
    """deduplicate should keep only the first row per primary key (a, b)."""
    from dataflows import deduplicate, set_primary_key
    rows = [
        {'a': 1, 'b': 3, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': 'First'},
        {'a': 1, 'b': 3, 'c': '!First'},
        {'a': 1, 'b': 2, 'c': 'First'},
        {'a': 2, 'b': 3, 'c': '!First'},
    ]
    results, _, _ = Flow(
        rows,
        set_primary_key(['a', 'b']),
        deduplicate(),
    ).results()
    assert {row['c'] for row in results[0]} == {'First'}
def flow(*_):
    """Build the 'units' resource describing the office hierarchy of
    outsourced social services, keep only main offices, and dump it."""
    hidden_keyword = {'es:keyword': True, 'es:exclude': True}

    def path_level(level):
        # Pick the level-th path component when the path is deep enough.
        return lambda row: row['path'][level] if len(row['path']) > level else None

    return DF.Flow(
        all_units(),
        DF.add_field('office', 'string', path_level(0), **{'es:keyword': True}),
        DF.add_field('unit', 'string', path_level(1), **{'es:keyword': True}),
        DF.add_field('subunit', 'string', path_level(2), **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', path_level(3), **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda row: '/'.join(row['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string',
                     lambda row: '__'.join(row['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **hidden_keyword),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **hidden_keyword),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),
        # Ensure we only have the main offices
        DF.filter_rows(lambda row: row['unit'] is None),
        DF.filter_rows(lambda row: row['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
def Olap_Datapackage():
    """Build an OLAP-style datapackage: one 'fact' table plus time/area/product
    dimension tables, dumped to 'olap_datapackage'.
    """
    ts_format = '%Y-%m-%dT%H:%M:%S+00:00'

    def _ts_part(row, out_fmt):
        # All four derived time fields parse the same Timestamp string; share
        # the parse instead of duplicating it per computed field.
        return datetime.strptime(row['Timestamp'], ts_format).strftime(out_fmt)

    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        },
                    target={
                        'name': 'fact',
                        'path': 'data/fact.csv'
                    }),
        # Placeholder constant id, replaced by add_id below.
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: _ts_part(row, '%d')),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: _ts_part(row, '%m')),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: _ts_part(row, '%B')),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: _ts_part(row, '%Y')),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact', target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
def flow(parameters, *args):
    """Parameter-driven wrapper: apply set_primary_key using the pipeline's
    'primary-key' and 'resources' parameters."""
    return Flow(
        set_primary_key(
            primary_key=parameters["primary-key"],
            resources=parameters["resources"],
        )
    )
def flow(parameters, *_):
    """Fetch gov.il tender publications, normalise field names/types, filter,
    deduplicate by (publication_id, tender_type, tender_id) and validate.

    `parameters` supplies 'tender_type', an optional 'filter-out' phrase and
    a 'resource' dict of resource-level metadata.
    """
    def take_first(field):
        # Some fields arrive as lists; keep only the first element and
        # re-type the field as a plain string.
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        # Truncate a datetime field to its date component.
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        # Row filter: reject rows whose title/description contains the
        # configured 'filter-out' phrase (accept everything if unset).
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        # Map the source's field names onto the canonical schema.
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        # Dates must be truncated only after the datetime parsing above.
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        # pop: the resource metadata is consumed so it is not reused later.
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
prop['type'] = 'keyword' elif schema_type in ('number', 'integer'): prop['index'] = True return prop if __name__ == '__main__': DF.Flow( DF.load('new-york-city-current-job-postings.zip', filename='nyc-jobs.csv', name='jobs'), DF.add_field('doc_id', 'string', default=lambda row: 'job/{Job ID}'.format(**row)), DF.add_field('score', 'integer', default=1), DF.set_type('Salary Frequency', **{'es:keyword': True}), DF.set_primary_key(['doc_id']), dump_to_es(indexes={'jobs-job': [{ 'resource-name': 'jobs', }]}, mapper_cls=SampleMappingGenerator), DF.dump_to_path('data'), DF.add_field('value', 'object', default=lambda row: dict((k, v) for k, v in row.items() if k not in ('doc_id', 'score')), **{'es:index': False}), DF.select_fields(['doc_id', 'value']), dump_to_es(indexes={'jobs-document': [{ 'resource-name': 'jobs', }]}), DF.printer(fields=['doc_id'])).process()
def flow(self):
    """Assemble the processing Flow from the loaded mapping/taxonomy config.

    Returns the Flow only when the config loaded without errors; otherwise
    returns None implicitly.
    """
    if len(self.errors) == 0:
        # Primary key: configured column types translated to field names.
        primaryKey = [
            self.ct_to_fn(f) for f in self.config.get(CONFIG_PRIMARY_KEY)
        ]
        fieldOptions = {}
        # Per-columnType options (including dataType) from the taxonomy.
        dataTypes = dict(
            (ct['name'], dict(ct.get('options', {}), type=ct['dataType']))
            for ct in self.config.get(CONFIG_TAXONOMY_CT)
            if 'dataType' in ct)
        # Merge taxonomy options with per-field mapping options; the
        # mapping's own options take precedence over the taxonomy's.
        for mf in self.config.get(CONFIG_MODEL_MAPPING):
            ct = mf.get('columnType')
            name = mf['name']
            fieldOptions[name] = {}
            if ct is not None:
                fieldOptions[name].update(dataTypes.get(ct, {}))
            fieldOptions[name].update(mf.get('options', {}))
            fieldOptions[name]['columnType'] = ct
        extraFieldDefs = self.join_mapping_taxonomy('extra', fieldOptions)
        normalizeFieldDef = self.join_mapping_taxonomy(
            'normalize', fieldOptions)
        # Fields to unpivot: mapping entries that declare 'normalize' keys.
        unpivotFields = [
            dict(
                name=f['name'],
                keys=f['normalize'],
            )
            for f in self.config.get(CONFIG_MODEL_MAPPING)
            if 'normalize' in f
        ]
        # Only a single normalize target is supported; take the first.
        if len(normalizeFieldDef) > 0:
            normalizeFieldDef = normalizeFieldDef[0]
        else:
            normalizeFieldDef = None
        steps = [
            self.create_fdp(),
            self.datetime_handler(),
            self.set_consts(fieldOptions),
            validate(on_error=ignore),
        ] + ([
            # Unpivot only when a normalize target field was configured.
            unpivot(unpivotFields, extraFieldDefs, normalizeFieldDef,
                    regex=False,
                    resources=RESOURCE_NAME),
        ] if normalizeFieldDef else []) + [
            self.copy_names_to_titles(),
            # Rename columnType-derived field names back to mapped names.
            self.rename([(self.ct_to_fn(f['columnType']), f['name'])
                         for f in self.config.get(CONFIG_MODEL_MAPPING)
                         if f.get('columnType') is not None]),
            update_resource(RESOURCE_NAME, path='out.csv'),
            # *[
            #     set_type(
            #         self.ct_to_fn(f['columnType']),
            #         columnType=f['columnType'],
            #         **fieldOptions.get(f['columnType'], {}),
            #         resources=RESOURCE_NAME,
            #         on_error=ignore
            #     )
            #     for f in self.config.get(CONFIG_MODEL_MAPPING)
            #     if f.get('columnType') is not None
            # ],
            # NOTE(review): when primaryKey is empty this appends None to
            # steps — confirm Flow tolerates a None step.
            set_primary_key(primaryKey, resources=RESOURCE_NAME)
            if len(primaryKey) else None
            # printer()
        ]
        f = Flow(*steps)
        return f