def get_cache_graph(**options):
    """Build the graph that caches badge records pulled from ccure.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Pipe-delimited export; every other column is an unused filler field.
    badge_reader = bonobo.CsvReader(
        '/etl/ccure/uploads/BadgeID/ccure_BadgeID_AllButVendor.txt',
        fields=('badge_id', 'empty1', 'last_name', 'empty2', 'first_name',
                'empty3', 'issued_on', 'empty4', 'disabled', 'empty5',
                'valid_until', 'empty6', 'flag2', 'empty7', 'flag3',
                'empty8', 'flag4'),
        delimiter='|',
        fs='brickftp')

    graph.add_chain(badge_reader, badge_active, cache)
    return graph
def get_graph(**options):
    """Build the graph that loads workday users into each target database.

    :param options: expects ``engine`` (iterable of engine service names,
        duplicates allowed), ``table_name`` and ``table_suffix``.
    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Fan-out point: normalized rows are broadcast to one
    # InsertOrUpdate chain per distinct engine below.
    split_dbs = bonobo.noop
    graph.add_chain(
        bonobo.CsvReader('/etl/metrics-insights/workday-users.csv',
                         fs='brickftp'),
        employee_active,
        find_badge_id,
        bonobo.UnpackItems(0),
        split_dbs)

    # set() already removes duplicates; the list() wrapper was redundant.
    for engine in set(options['engine']):
        graph.add_chain(
            bonobo_sqlalchemy.InsertOrUpdate(
                table_name=options['table_name'] + options['table_suffix'],
                discriminant=('badgeid', ),
                buffer_size=10,
                engine=engine),
            _input=split_dbs)
    return graph
def get_inventory_graph(**options):
    """Build the graph that processes the Deckbox inventory export.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Keep only rows whose last column differs from 'English'.
    non_english = bonobo.Filter(lambda *args: args[-1] != 'English')

    graph.add_chain(
        bonobo.CsvReader('Deckbox-inventory.csv'),
        non_english,
        inventory,
        bonobo.Rename(
            Card_Number='Card Number',
            Tradelist_Count='Tradelist Count'),
        # bonobo_sqlalchemy.InsertOrUpdate(
        #     'cards',
        #     discriminant=(
        #         'Name',
        #         'Edition',
        #         'Card_Number',
        #         'Foil',
        #     ),
        #     engine='cards'),
        _name='main',
    )
    return graph
def get_debug_graph(job, graph=None, *, _limit=(), _print=()):
    """Assemble a minimal read-and-log graph for debugging a job's input."""
    graph = graph or bonobo.Graph()
    graph.add_chain(
        bonobo.CsvReader(job.input_file, fs=FS_IN_SERVICE_ID),
        *_limit,
        # bonobo.Filter(lambda *row: len(row) == 5),
        # bonobo.JsonWriter('output.json', fs='fs.out'),
        # bonobo.CsvWriter(job.output_file, fs=FS_OUT_SERVICE_ID,
        #     fields=['integration_id', 'site_name', 'address', 'borough', 'status']),
        log_raw,
        *_print,
    )
    return graph
def get_graph(**options):
    """Build the graph loading serial titles into the scopus database.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    steps = (
        bonobo.CsvReader('data/2018_ref_pub.csv'),
        FilterSerialTitle(),
        lambda title, count: title,  # drop the count column, keep the title
        get_serial_by_title,
        FilterDuplicate(collection='serial', field='_id', database='scopus'),
        MongoWriter(collection='serial', database='scopus'),
    )
    graph.add_chain(*steps)
    return graph
def get_graph(job, graph=None, *, _limit=(), _print=()):
    """Builds the execution graph: read sites, search, write results."""
    graph = graph or bonobo.Graph()

    site_reader = bonobo.CsvReader(
        job.input_file,
        fs=FS_IN_SERVICE_ID,
        fields=[
            'integration_id', 'site_name', 'address', 'borough', 'status'
        ],
        skip=1)
    result_writer = bonobo.CsvWriter(job.output_file, fs=FS_OUT_SERVICE_ID)

    graph.add_chain(
        site_reader,
        *_limit,
        search,
        bonobo.UnpackItems(0),
        result_writer,
        *_print,
    )
    return graph
def get_graph(**options):
    """Build the Scopus document and serial harvesting graph."""
    graph = bonobo.Graph()

    # # Import authors
    # graph.add_chain(
    #     bonobo.CsvReader('data/ff-faculty.csv', skip=1),
    #     bonobo.Limit(limit),
    #     create_author_document,
    #     FilterDuplicate(collection="jhu-authors", field='hopkins_id', target='hopkins_id', database=database),
    #     MongoWriter(collection='jhu-authors', database=database),
    # )
    #
    # # Retreive authors from scopus
    # graph.add_chain(
    #     extract_author_scopus_ids,
    #     bonobo.Limit(limit),
    #     FilterDuplicate(collection='scopus-authors', database=database),
    #     get_author,
    #     MongoWriter(collection='scopus-authors', database=database),
    #     _input=create_author_document,
    # )

    # Retrieve documents from scopus
    graph.add_chain(
        bonobo.CsvReader('data/ff-article-ids-17.csv'),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-documents', database=database),
        get_document,
        # Keep errata data. Leave it to downstream analysis. Otherwise it'll
        # be repeatedly downloaded and discarded.
        # remove_errata,
        MongoWriter(collection='scopus-documents', database=database),
    )

    # Extract serials data from Scopus and load into MongoDB
    graph.add_chain(
        lambda args: args['coredata'].get('source-id', None),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-serials', database=database),
        get_serial,
        MongoWriter(collection='scopus-serials', database=database),
        _input=get_document)
    return graph
def get_graph(**options):
    """Build the graph harvesting biophysics faculty data from Scopus.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Faculty names -> local author documents.
    graph.add_chain(
        bonobo.CsvReader('data/biophysics/faculty.csv'),
        get_author_by_name,
        create_author_document,
        MongoWriter(collection='jhu-authors', database='assessments'),
    )

    # Author profiles from Scopus, branched off the name lookup.
    graph.add_chain(
        extract_authors,
        extract_id,
        FilterDuplicate(collection='scopus-authors', database='assessments'),
        get_author,
        MongoWriter(collection='scopus-authors', database='assessments'),
        _input=get_author_by_name)

    # Documents written by each extracted author.
    graph.add_chain(
        extract_id,
        get_docs_by_author,
        extract_id,
        FilterDuplicate(collection='scopus-documents', field='_id',
                        database='assessments'),
        get_document,
        MongoWriter(collection='scopus-documents', database='assessments'),
        _input=extract_authors)

    # Serial (journal) records for each retrieved document.
    graph.add_chain(
        lambda args: args['coredata'].get('source-id', None),
        FilterDuplicate(collection='scopus-serials', database='assessments'),
        get_serial,
        MongoWriter(collection='scopus-serials', database='assessments'),
        _input=get_document)
    return graph
def get_decks(**options):
    """Build the graph that feeds every deck CSV into in_use_cards.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Shared entry node: each deck reader below is wired into it.
    csv_in = bonobo.noop
    graph.add_chain(csv_in, in_use_cards, _input=None)

    for deck in listdir('decks'):
        if deck == '.gitignore':
            continue
        deck_path = join('decks', deck)
        if isfile(deck_path):
            graph.add_chain(bonobo.CsvReader(deck_path), _output=csv_in)
    return graph
def get_graph(**options):
    """Build the graph loading badge-swipe activity into each database.

    :param options: expects ``input_file``, ``engine`` (iterable of engine
        names, duplicates allowed), ``table_name`` and ``table_suffix``.
    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Fan-out point: each engine gets its own InsertOrUpdate chain below.
    split_dbs = bonobo.noop
    graph.add_chain(
        bonobo.CsvReader(options['input_file'],
                         delimiter='|',
                         fields=('Admitted', 'blank1', 'Timestamp', 'blank2',
                                 'Name', 'card_id', 'Location'),
                         fs='brickftp'),
        timestamp,
        card_id,
        map_fields,
        bonobo.UnpackItems(0),
        split_dbs,
        _name="main")

    # set() already de-duplicates the engines; the list() wrapper was redundant.
    for engine in set(options['engine']):
        graph.add_chain(
            bonobo_sqlalchemy.InsertOrUpdate(
                table_name=options['table_name'] + options['table_suffix'],
                discriminant=(
                    'activitydate',
                    'badgeid',
                    'username',
                    'location',
                ),
                engine=engine),
            _input=split_dbs)
    return graph
def get_graph(**options):
    """Build the biophysics author/document/serial harvesting graph.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Read data from the CSV file and load into MongoDB
    graph.add_chain(
        bonobo.CsvReader('data/biophysics-author-names.csv'),
        bonobo.Limit(limit),
        get_author_by_name,
        create_author_document,
        MongoWriter(collection='jhu-authors', database=database),
    )

    # Extract authors from Scopus and load into MongoDB
    graph.add_chain(
        extract_authors,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-authors', database=database),
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        _input=get_author_by_name)

    # Extract documents from Scopus and load into MongoDB
    graph.add_chain(
        extract_id,
        get_docs_by_author,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-documents', field='_id',
                        database=database),
        get_document,
        remove_errata,
        MongoWriter(collection='scopus-documents', database=database),
        _input=extract_authors)

    # Extract serials data from Scopus and load into MongoDB
    graph.add_chain(
        lambda args: args['coredata'].get('source-id', None),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-serials', database=database),
        get_serial,
        MongoWriter(collection='scopus-serials', database=database),
        _input=remove_errata)

    # Extract co-authors data from Scopus and load into MongoDB
    graph.add_chain(
        get_authors_from_doc,
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-authors', field='@auid',
                        database=database),
        lambda args: args['@auid'],
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        # bonobo.JsonWriter('results/authors.json'),
        _input=remove_errata)
    return graph
def get_graph(*, _limit=None, _print=False):
    """Build a read -> (limit) -> (print) -> write graph for coffeeshops."""
    steps = [bonobo.CsvReader("coffeeshops.csv")]
    if _limit:
        steps.append(bonobo.Limit(_limit))
    if _print:
        steps.append(bonobo.PrettyPrinter())
    steps.append(bonobo.CsvWriter("coffeeshops.csv", fs="fs.output"))
    return bonobo.Graph(*steps)
def get_graph(*, _limit=None, _print=False):
    """Build a read -> (limit) -> (print) -> write graph for coffeeshops."""
    steps = [bonobo.CsvReader('datasets/coffeeshops.txt')]
    if _limit:
        steps.append(bonobo.Limit(_limit))
    if _print:
        steps.append(bonobo.PrettyPrinter())
    steps.append(bonobo.CsvWriter('coffeeshops.csv', fs='fs.output'))
    return bonobo.Graph(*steps)
import bonobo

# Convert the Facebook statuses CSV export into a JSON file.
reader = bonobo.CsvReader('Google_facebook_statuses.csv')
writer = bonobo.JsonWriter('output.json')
graph = bonobo.Graph(reader, writer)

if __name__ == '__main__':
    bonobo.run(graph)
import bonobo
from bonobo.commands.run import get_default_services

# Stream the coffeeshops dataset to stdout via the builtin print.
reader = bonobo.CsvReader('datasets/coffeeshops.txt')
graph = bonobo.Graph(reader, print)

if __name__ == '__main__':
    bonobo.run(graph, services=get_default_services(__file__))
import bonobo
from bonobo.commands.run import get_default_services

# Pretty-print the coffeeshops dataset, one 'item' column per row.
reader = bonobo.CsvReader('datasets/coffeeshops.txt', headers=('item', ))
graph = bonobo.Graph(reader, bonobo.PrettyPrinter())

if __name__ == '__main__':
    bonobo.run(graph, services=get_default_services(__file__))
def get_graph(**options):
    """Build the card-collection export graph.

    Per-language CSVs feed a shared 'main' chain that writes
    DeckedBuilder.csv; optional branches export to EchoMTG, MTG Studio
    and Deckbox depending on the module-level feature flags.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Fan-out point for the optional export branches below.
    split = bonobo.noop
    graph.add_chain(
        bonobo.CsvWriter('DeckedBuilder.csv'),
        # bonobo.Limit(10),
        metadata,
        # bonobo.UnpackItems(0),
        split,
        _input=None,
        _name='main',
    )

    # Each source file is tagged with its language and merged into 'main'.
    for source_file, language in (
            ('main-en.csv', 'English'),
            ('main-de.csv', 'German'),
            ('main-ru.csv', 'Russian'),
            ('main-it.csv', 'Italian'),
            ('main-jp.csv', 'Japanese'),
            ('main-fr.csv', 'French'),
            ('main-kr.csv', 'Korean'),
            ('main-cs.csv', 'Chinese'),
            ('Deckbox-extras.csv', 'English'),
    ):
        graph.add_chain(
            bonobo.CsvReader(source_file),
            bonobo.Format(Language=language),
            _output='main',
        )

    if ECHO_MTG:
        # Reg Qty,Foil Qty,Name,Set,Acquired,Language
        echomtg = {'Acquired For': '0.004', 'Language': 'en'}
        graph.add_chain(
            # echomtg specific fiddling
            remove_metadata,
            bonobo.UnpackItems(0),
            # bonobo.PrettyPrinter(),
            bonobo.Rename(Name='Card'),
            bonobo.Format(**echomtg),
            bonobo.CsvWriter('EchoMTG.csv'),
            _input=split,
        )

    # MTG Studio
    if MTG_STUDIO:
        graph.add_chain(
            mtg_studio,
            remove_metadata,
            bonobo.UnpackItems(0),
            # bonobo.Format(Edition='{Set}'),
            bonobo.Rename(Edition='Set'),
            # bonobo.Rename(Name='Card'),
            # bonobo.Rename(Qty='Reg Qty'),
            # bonobo.Rename(Foil='Foil Qty'),
            # bonobo.PrettyPrinter(),
            bonobo.CsvWriter('MTG-Studio.csv'),
            _input=split,
        )

    # graph.add_chain(
    #     tradeable,
    #     bonobo.UnpackItems(0),
    #     # bonobo.PrettyPrinter(),
    #     # bonobo.Limit(3000),
    #     bonobo.CsvWriter("DeckedBuilder-tradelist.csv"),
    #     bonobo.OrderFields([
    #         'Card',
    #         'Set',
    #         'Foil',
    #         'Quantity',
    #     ]),
    #     bonobo.CsvWriter("CardKingdom-buylist.csv"),
    #     bonobo.OrderFields([
    #         'Quantity',
    #         'Card',
    #         'Set',
    #     ]),
    #     bonobo.CsvWriter(
    #         "mtgprice-buylist.csv",
    #         delimiter="\t",
    #     ),
    #     _input=split,
    # )

    if DECKBOX:
        csv_out = bonobo.CsvWriter('Deckbox-inventory.csv')
        graph.add_chain(
            # metadata,
            # bonobo.UnpackItems(0),
            deckbox,
            bonobo.UnpackItems(0),
            csv_out,
            _input=split,
        )
        graph.add_chain(
            bonobo.CsvReader('Deckbox-specials.csv'),
            _output=csv_out)

    return graph
import bonobo


def guess_email(**row):
    """Return the row extended with an email guessed from name and domain."""
    return {
        **row,
        'email': row['name'] + '@' + row['domain'],
    }


# Filter out the CEO row, then write the remaining employees back out.
# NOTE: the filter callable receives the row as keyword arguments, exactly
# like guess_email above.  The previous `lambda *row:` packed the row into
# a tuple, so `row['position']` raised a TypeError at runtime.
graph = bonobo.Graph(
    bonobo.CsvReader('employees.csv'),
    bonobo.Filter(lambda **row: row['position'] != 'CEO'),
    # guess_email,
    bonobo.CsvWriter('employees.output.csv'),
)

if __name__ == "__main__":
    parser = bonobo.get_argument_parser()
    with bonobo.parse_args(parser):
        bonobo.run(graph)