def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the external link checker.

    If a list of spreadsheet ids is provided, run the external link checker
    against that list instead of the spreadsheet ids gathered from the
    master spreadsheet.

    Parameters
    ----------
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str]
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str]
        The list of spreadsheet ids, delimited by commas.
    """
    log.info("Finished external link checker set up, start checking external links.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")

    # Setup workflow
    with Flow("Check external links") as flow:
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        # Extract links from list of SheetData
        # Get back list of list of URLData
        links_data = _extract_external_links.map(flatten(spreadsheets_data))
        # Unique the url data
        unique_links_data = _unique_external_links(flatten(links_data))
        # Check external links
        _check_external_link.map(unique_links_data)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))

    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)

    # Get error links
    error_links = [link for link in checked_links if link.has_error]
    gs_cells = []
    for error_link in error_links:
        for cell in error_link.url_data.cells:
            gs_cells.append(
                GoogleSheetCell(
                    spreadsheet_title=cell.spreadsheet_title,
                    sheet_title=cell.sheet_title,
                    row_index=cell.row_index,
                    col_index=cell.col_index,
                    url=error_link.url_data.url,
                    msg=error_link.msg,
                )
            )

    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )

    # Write error links to a csv file
    with open("external_links.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
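# A minimal sketch of what a link-checking task like _check_external_link could look like;
# the real task is not shown in this module. The CheckedURL constructor used here is an
# assumption, though its fields (url_data, has_error, msg) match how checked_links is
# consumed above.
import requests
from prefect import task


@task
def _check_external_link_sketch(url_data):
    """Hypothetical: issue a HEAD request and flag 4xx/5xx responses or connection errors."""
    try:
        response = requests.head(url_data.url, allow_redirects=True, timeout=10)
        has_error = response.status_code >= 400
        msg = f"HTTP {response.status_code}" if has_error else ""
    except requests.RequestException as exc:
        has_error, msg = True, str(exc)
    # Assumed result shape, mirroring the attributes read from checked_links above
    return CheckedURL(url_data=url_data, has_error=has_error, msg=msg)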
def run_sigla_pipeline(
    master_spreadsheet_id: str,
    google_api_credentials_path: str,
    db_connection_url: str,
):
    """
    Run the SIGLA ETL pipeline.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")

    # Setup workflow
    with Flow("SIGLA Data Pipeline") as flow:
        # Delete all documents from db
        clean_up_task = _clean_up(db_connection_url)
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path
        )
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(clean_up_task)],
        )
        # Transform list of SheetData into FormattedSheetData
        formatted_spreadsheets_data = _transform.map(flatten(spreadsheets_data))
        # Create institution filter
        gs_institution_filter = _create_filter_task(
            [
                gs_format.standard_institution,
                gs_format.multiple_sigla_answer_variable,
            ]
        )
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task(
            [
                gs_format.composite_variable,
                gs_format.institution_and_composite_variable,
            ]
        )
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)
        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url)
        )
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data, upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    try:
        next_button = driver.find_element_by_class_name('next')
    except NoSuchElementException:
        driver.save_screenshot('screenshot.png')
    continued = True
    while continued:
        next_button.click()
        next_button = driver.find_element_by_class_name('next')
        if pages[-1] == driver.current_url:
            break
        pages.append(driver.current_url)
    return pages


@task
def write(data: list):
    import json
    import boto3
    from datetime import date

    s3 = boto3.resource('s3')
    s3object = s3.Object(
        'testmydask',
        f'gulp-{date.today().strftime("%Y-%m-%d")}.json',
    )
    s3object.put(Body=(bytes(json.dumps(data).encode('UTF-8'))))


from prefect import Flow, flatten

with Flow("gulp-prefect-dask") as flow:
    pages = get_pages('https://www.gulp.de/gulp2/g/projekte?order=DATE_DESC')
    jobs = get_jobs_links.map(pages)
    data = read_job.map(flatten(jobs))
    write(data)
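# Illustrative only: the snippet above defines the flow but never runs it. A local run on
# a Dask cluster, mirroring the other pipelines in this collection, could look like this.
if __name__ == "__main__":
    from distributed import LocalCluster
    from prefect.executors import DaskExecutor

    # Spawn a local Dask cluster and execute the flow on it
    cluster = LocalCluster()
    flow.run(executor=DaskExecutor(cluster.scheduler_address))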
def run_qa_test(
    db_connection_url: str,
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the QA test.

    Parameters
    ----------
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str]
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str]
        The list of spreadsheet ids.
    """
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")

    # Setup workflow
    with Flow("QA Test") as flow:
        # Get a list of spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )
        # List of list of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url)
        )
        # Db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url)
        )
        # Group db institutions
        db_institutions_group = _group_db_institutions(db_institutions)
        # Extract list of list of sheet data
        spreadsheets_data = _extract.map(
            spreadsheet_ids, unmapped(google_api_credentials_path)
        )
        # Transform to list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(flatten(spreadsheets_data))
        # Create institutional filter
        gs_institution_filter = _create_filter_task(
            [
                GoogleSheetsFormat.standard_institution,
                GoogleSheetsFormat.multiple_sigla_answer_variable,
                GoogleSheetsFormat.institution_and_composite_variable,
            ]
        )
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(formatted_spreadsheets_data)
        # Get list of list of gs institutions
        gs_institutions = _gather_gs_institutions.map(gs_institutions_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task(
            [
                GoogleSheetsFormat.composite_variable,
                GoogleSheetsFormat.institution_and_composite_variable,
            ]
        )
        # Filter to list of composite formatted sheet data
        gs_composites = gs_composite_filter(formatted_spreadsheets_data)
        # Group gs institutions
        gs_institutions_group = _group_gs_institutions(flatten(gs_institutions))
        # Compare gs institutions against db
        # Get list of comparisons
        gs_institution_comparisons = _compare_gs_institution.map(
            flatten(gs_institutions), unmapped(db_institutions_group)
        )
        # Compare gs composite variables against db institutions
        # Get list of list of comparisons
        gs_composite_comparisons = _compare_gs_composite_variable.map(
            gs_composites, unmapped(db_connection_url)
        )
        # Write gs institution comparisons
        _write_comparison.map(gs_institution_comparisons)
        # Write gs composite comparisons
        _write_comparison.map(flatten(gs_composite_comparisons))
        # Write extra db institutions
        _write_extra_db_institutions(db_institutions, gs_institutions_group)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))

    # Get the write comparison tasks
    _write_comparison_tasks = flow.get_tasks(name="_write_comparison")
    # Get the comparisons
    comparisons = [
        *state.result[_write_comparison_tasks[0]].result,
        *state.result[_write_comparison_tasks[1]].result,
    ]
    # Filter to error comparisons
    gs_error_comparisons = [
        comparison for comparison in comparisons if comparison.has_error()
    ]
    # Get the extra db institutions filename
    extra_db_institutions_filename = state.result[
        flow.get_tasks(name="_write_extra_db_institutions")[0]
    ].result

    # Write zip file
    with ZipFile("qa-test.zip", "w") as zip_file:
        for comp in gs_error_comparisons:
            zip_file.write(
                comp.get_filename(),
                f"{comp.spreadsheet_title}/{comp.sheet_title},{comp.name}",
            )
        if extra_db_institutions_filename:
            zip_file.write(extra_db_institutions_filename, "extra-institutions.csv")
def load_spreadsheets(
    spreadsheet_ids: List[str],
    db_connection_url: str,
    google_api_credentials_path: str,
):
    """
    Load spreadsheets to the database.

    Parameters
    ----------
    spreadsheet_ids: List[str]
        The list of spreadsheet ids.
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    """
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")

    # Setup workflow
    with Flow("Load spreadsheets") as flow:
        # List of list of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url)
        )
        # Db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url)
        )
        # Use db_institutions to remove data
        delete_db_institutions_task = _delete_db_institutions(
            db_institutions, db_connection_url
        )
        # Extract list of list of sheet data
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(delete_db_institutions_task)],
        )
        # Transform to list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(flatten(spreadsheets_data))
        # Create institutional filter
        gs_institution_filter = _create_filter_task(
            [
                gs_format.standard_institution,
                gs_format.multiple_sigla_answer_variable,
            ]
        )
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task(
            [
                gs_format.composite_variable,
                gs_format.institution_and_composite_variable,
            ]
        )
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)
        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url)
        )
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data, upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    # Check the flow's final state
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
Note: this module is not meant to be an efficient solution to the word counting
problem. It is only meant to demonstrate distributed workflows in Prefect.
"""
from prefect import Parameter
from prefect import Flow
from prefect import flatten

from server.src.tasks.mock import download_message
from server.src.tasks.mock import split_message
from server.src.tasks.mock import mapper
from server.src.tasks.mock import shuffler
from server.src.tasks.mock import reducer

with Flow(name='mapreduce-wordcount') as mapreduce_wordcount:
    url = Parameter('url', required=True)
    message = download_message(url)
    lines = split_message(message)
    token_tuples = mapper.map(lines)
    partitions = shuffler(flatten(token_tuples))
    token_counts = reducer.map(partitions)

if __name__ == "__main__":
    pass
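# Usage sketch, not part of the original module; the URL below is a placeholder. flow.run()
# accepts Parameter values via `parameters`, and the mapped reducer results come back as a
# list keyed by the task in the final state.
example_state = mapreduce_wordcount.run(
    parameters={"url": "https://example.com/message.txt"}  # placeholder URL (assumption)
)
print(example_state.result[token_counts].result)  # list of per-partition word counts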
def get_next_uv_dates(
    master_spreadsheet_id: str,
    google_api_credentials_path: str,
    start_date: date,
    end_date: date,
):
    """
    Get the next update and verify dates (uv dates) that fall within the date range.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    start_date: date
        The start date.
    end_date: date
        The end date.
    """
    log.info("Finished setup, start finding next uv dates.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")

    # Setup workflow
    with Flow("Get next update and verify dates") as flow:
        # Get the list of spreadsheet ids from the master spreadsheet
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path
        )
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        log.info("Finished extracting the spreadsheet data.")
        # Extract next uv dates
        next_uv_dates_data = _extract_next_uv_dates.map(flatten(spreadsheets_data))
        log.info("Finished extracting the next uv dates.")
        # Check next uv dates
        _check_next_uv_date.map(
            flatten(next_uv_dates_data), unmapped(start_date), unmapped(end_date)
        )
        log.info("Finished checking next uv dates.")

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    # Check the flow's final state
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))

    # Get the list of CheckedNextUVDates
    checked_next_uv_dates = state.result[
        flow.get_tasks(name="_check_next_uv_date")[0]
    ].result
    log.info("=" * 80)

    # Get next uv dates
    next_uv_dates = [
        next_uv_date
        for next_uv_date in checked_next_uv_dates
        if next_uv_date.status != NextUVDateStatus.irrelevant
    ]
    sorted_next_uv_dates = sorted(
        next_uv_dates,
        key=lambda x: (
            x.next_uv_date_data.spreadsheet_title,
            x.next_uv_date_data.sheet_title,
            x.next_uv_date_data.row_index,
        ),
    )

    # Write next uv dates to a csv file
    with open("next_uv_dates.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "status"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for next_uv_date in sorted_next_uv_dates:
            next_uv_date_data = next_uv_date.next_uv_date_data
            writer.writerow(
                {
                    "spreadsheet_title": next_uv_date_data.spreadsheet_title,
                    "sheet_title": next_uv_date_data.sheet_title,
                    "cell": f"{next_uv_date_data.column_name}{next_uv_date_data.row_index}",
                    "status": next_uv_date.status,
                }
            )
    log.info("Finished writing next uv dates csv file")
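# Hypothetical sketch of the range check performed by _check_next_uv_date (the real task is
# not shown here). The next_uv_date attribute, the CheckedNextUVDate constructor, and the
# `relevant` status value are assumptions; NextUVDateStatus.irrelevant is used above.
from prefect import task


@task
def _check_next_uv_date_sketch(next_uv_date_data, start_date, end_date):
    # Only dates inside [start_date, end_date] are reported; everything else is irrelevant
    if start_date <= next_uv_date_data.next_uv_date <= end_date:
        status = NextUVDateStatus.relevant  # assumed counterpart of `irrelevant`
    else:
        status = NextUVDateStatus.irrelevant
    return CheckedNextUVDate(next_uv_date_data=next_uv_date_data, status=status)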
# print(state)
# print(state.result[a].result)
# print(state.result[b].result)
# print(state.result[c].result)


@task
def values():
    return [[None]]


@task
def add(v):
    # if v:
    return v + 100


@task
def print_vals(a, b):
    print("here")
    print(a)
    print(b)


with Flow('fm') as f:
    vals = values()
    # a = add.map(flat(vals))
    print_vals.map(a=flatten([1]), b=flatten([[1]]))

state = f.run()
    # Load config
    config = load_config()
    # Load the stocks + the initial value
    # CSV SCHEMA: stock,initial_value
    stocks = load_stocks()
    # Split the stocks into even groups of 5 as the Vantage API
    # only allows 5 api calls per minute
    split_stocks = split_stocks(stocks["stock"], 5)
    # Get the adjusted closing price of each group of stocks
    stock_prices = get_price.map(split_stocks)
    # Concat the stocks back into a single list
    stocks_price_mapping = concat_stocks(flatten(stock_prices))
    # Add closing price to dataframe
    stocks = add_price(stocks, stocks_price_mapping)
    # Calculate gain or loss based off initial price
    stocks = get_difference(stocks)
    # Create the email message
    message = create_message(stocks)
    # Send email
    email_task(msg=message, email_to=config["general"]["email_list"])

flow.register(project_name="Portolio Updater")
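# Hypothetical sketch of the chunking task used above: split_stocks is assumed to batch the
# ticker list into groups of at most 5 so the flow stays under the API's 5-calls-per-minute
# limit. The real task is not shown in this snippet.
from prefect import task


@task
def split_stocks(stocks, group_size=5):
    # Return consecutive groups of at most `group_size` tickers
    tickers = list(stocks)
    return [tickers[i:i + group_size] for i in range(0, len(tickers), group_size)]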