Example #1
def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the the external link checker.
    If a list of spreadsheet ids are provided, run the external link checker
    against the list of spreadsheet ids, instead of the spreadsheet ids gathered
    from the master spreadsheet.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    spreadsheet_ids_str: Optional[str]
        The list spreadsheet ids, delimited by comma.
    """
    log.info("Finished external link checker set up, start checking external link.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Check external links") as flow:
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )

        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        # Extract links from list of SheetData
        # Get back list of list of URLData
        links_data = _extract_external_links.map(flatten(spreadsheets_data))
        # Unique the url data
        unique_links_data = _unique_external_links(flatten(links_data))
        # Check external links
        _check_external_link.map(unique_links_data)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)
    # Get error links
    error_links = [link for link in checked_links if link.has_error]
    gs_cells = []
    for error_link in error_links:
        for cell in error_link.url_data.cells:
            gs_cells.append(
                GoogleSheetCell(
                    spreadsheet_title=cell.spreadsheet_title,
                    sheet_title=cell.sheet_title,
                    row_index=cell.row_index,
                    col_index=cell.col_index,
                    url=error_link.url_data.url,
                    msg=error_link.msg,
                )
            )

    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )
    # Write error links to a csv file
    with open("external_links.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
Example #2
def run_sigla_pipeline(master_spreadsheet_id: str,
                       google_api_credentials_path: str,
                       db_connection_url: str):
    """
    Run the SIGLA ETL pipeline.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("SIGLA Data Pipeline") as flow:
        # Delete all documents from db
        clean_up_task = _clean_up(db_connection_url)
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path)
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(clean_up_task)],
        )

        # Transform list of SheetData into FormattedSheetData
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create institution filter
        gs_institution_filter = _create_filter_task([
            gs_format.standard_institution,
            gs_format.multiple_sigla_answer_variable,
        ])
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task([
            gs_format.composite_variable,
            gs_format.institution_and_composite_variable,
        ])
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)

        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url))
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data,
                          upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
Example #3
        next_button = driver.find_element_by_class_name('next')
    except NoSuchElementException:
        driver.save_screenshot('screenshot.png')
    # Keep clicking "next" and collecting page URLs until the URL stops changing
    continued = True
    while continued:
        next_button.click()
        next_button = driver.find_element_by_class_name('next')
        if pages[-1] == driver.current_url:
            break
        pages.append(driver.current_url)
    return pages


@task
def write(data: list):
    # Upload the collected job data to S3 as a date-stamped JSON file
    import json
    import boto3
    s3 = boto3.resource('s3')
    s3object = s3.Object('testmydask',
                         f'gulp-{date.today().strftime("%Y-%m-%d")}.json')
    s3object.put(Body=json.dumps(data).encode('UTF-8'))


from prefect import Flow, flatten

with Flow("gulp-prefect-dask") as flow:
    pages = get_pages('https://www.gulp.de/gulp2/g/projekte?order=DATE_DESC')
    jobs = get_jobs_links.map(pages)
    data = read_job.map(flatten(jobs))
    write(data)
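# A possible way to execute the flow above (a sketch, assuming Prefect 1.x,
# where DaskExecutor can be imported from prefect.executors and spins up a
# temporary local Dask cluster when given no arguments):
from prefect.executors import DaskExecutor

state = flow.run(executor=DaskExecutor())
print(state.is_successful())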
Example #4
def run_qa_test(
    db_connection_url: str,
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run QA test

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    spreadsheet_ids_str: Optional[str] = None
        The list of spreadsheet ids.
    """

    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("QA Test") as flow:
        # get a list of spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path,
                                               spreadsheet_ids_str)
        # list of list of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url))
        # db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url))
        # group db institutions
        db_institutions_group = _group_db_institutions(db_institutions)

        # extract list of list of sheet data
        spreadsheets_data = _extract.map(spreadsheet_ids,
                                         unmapped(google_api_credentials_path))
        # transform to list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # create institutional filter
        gs_institution_filter = _create_filter_task([
            GoogleSheetsFormat.standard_institution,
            GoogleSheetsFormat.multiple_sigla_answer_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # get list of list of gs institution
        gs_institutions = _gather_gs_institutions.map(gs_institutions_data)
        # create composite filter
        gs_composite_filter = _create_filter_task([
            GoogleSheetsFormat.composite_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # filter to list of composite formatted sheet data
        gs_composites = gs_composite_filter(formatted_spreadsheets_data)

        # group gs institutions
        gs_institutions_group = _group_gs_institutions(
            flatten(gs_institutions))

        # compare gs institutions against db
        # get list of comparisons
        gs_institution_comparisons = _compare_gs_institution.map(
            flatten(gs_institutions), unmapped(db_institutions_group))
        # compare gs composite variables against db institutions
        # get list of list of comparisons
        gs_composite_comparisons = _compare_gs_composite_variable.map(
            gs_composites, unmapped(db_connection_url))

        # write gs institution comparisons
        _write_comparison.map(gs_institution_comparisons)
        # write gs composite comparisons
        _write_comparison.map(flatten(gs_composite_comparisons))
        # write extra db institution
        _write_extra_db_institutions(db_institutions, gs_institutions_group)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # get write comparison tasks
    _write_comparison_tasks = flow.get_tasks(name="_write_comparison")
    # get the comparisons
    comparisons = [
        *state.result[_write_comparison_tasks[0]].result,
        *state.result[_write_comparison_tasks[1]].result,
    ]
    # filter to error comparisons
    gs_error_comparisons = [
        comparison for comparison in comparisons if comparison.has_error()
    ]
    # get extra db institution filename
    extra_db_institutions_filename = state.result[flow.get_tasks(
        name="_write_extra_db_institutions")[0]].result
    # write zip file
    with ZipFile("qa-test.zip", "w") as zip_file:
        for comp in gs_error_comparisons:
            zip_file.write(
                comp.get_filename(),
                f"{comp.spreadsheet_title}/{comp.sheet_title},{comp.name}",
            )
        if extra_db_institutions_filename:
            zip_file.write(extra_db_institutions_filename,
                           "extra-institutions.csv")
Example #5
def load_spreadsheets(
    spreadsheet_ids: List[str],
    db_connection_url: str,
    google_api_credentials_path: str,
):
    """
    Load spreadsheets to the database.

    Parameters
    ----------
    spreadsheet_ids: List[str]
        The list of spreadsheet ids.
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    """

    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Load spreadsheets") as flow:
        # list of list of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url)
        )
        # db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url)
        )

        # use db_institutions to remove data
        delete_db_institutions_task = _delete_db_institutions(
            db_institutions, db_connection_url
        )

        # extract list of list of sheet data
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(delete_db_institutions_task)],
        )
        # transform to list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(flatten(spreadsheets_data))
        # create institutional filter
        gs_institution_filter = _create_filter_task(
            [
                gs_format.standard_institution,
                gs_format.multiple_sigla_answer_variable,
            ]
        )
        # filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task(
            [
                gs_format.composite_variable,
                gs_format.institution_and_composite_variable,
            ]
        )
        # filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)

        # load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url)
        )
        # load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data, upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    # Check the flow's final state
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
Example #6
Note: this module is not meant to be an efficient solution to the word
counting problem. It is only meant to demonstrate distributed workflows
in Prefect.
"""

from prefect import Parameter
from prefect import Flow
from prefect import flatten

from server.src.tasks.mock import download_message
from server.src.tasks.mock import split_message
from server.src.tasks.mock import mapper
from server.src.tasks.mock import shuffler
from server.src.tasks.mock import reducer


with Flow(name='mapreduce-wordcount') as mapreduce_wordcount:

    url = Parameter('url', required=True)

    message = download_message(url)
    lines = split_message(message)
    token_tuples = mapper.map(lines)
    partitions = shuffler(flatten(token_tuples))
    token_counts = reducer.map(partitions)


if __name__ == "__main__":
    pass
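
# A sketch of how the flow above could be run; the URL value is an assumption
# used only for illustration (the tasks in server.src.tasks.mock are mocks):
#
#     state = mapreduce_wordcount.run(parameters={"url": "https://example.com/message.txt"})
#     reducer_task = mapreduce_wordcount.get_tasks(name="reducer")[0]
#     print(state.result[reducer_task].result)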
Example #7
def get_next_uv_dates(
    master_spreadsheet_id: str,
    google_api_credentials_path: str,
    start_date: date,
    end_date: date,
):
    """
    Get the next update and verify (UV) dates that fall within the date range.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    start_date: date
        The start date.
    end_date: date
        The end date.
    """
    log.info("Finished setup, start finding next uv dates.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Get next update and verify dates") as flow:
        # Get the list of spreadsheet ids from the master spreadsheet
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path
        )
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        log.info("Finished extracting the spreadsheet data.")
        # Extract next uv dates
        next_uv_dates_data = _extract_next_uv_dates.map(flatten(spreadsheets_data))
        log.info("Finished extracting the next uv dates.")
        # Check next uv dates
        _check_next_uv_date.map(
            flatten(next_uv_dates_data), unmapped(start_date), unmapped(end_date)
        )
        log.info("Finished checking next uv dates.")

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    # Check the flow's final state
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))

    # Get the list of CheckedNextUVDates
    checked_next_uv_dates = state.result[
        flow.get_tasks(name="_check_next_uv_date")[0]
    ].result
    log.info("=" * 80)
    # Get next uv dates
    next_uv_dates = [
        next_uv_date
        for next_uv_date in checked_next_uv_dates
        if next_uv_date.status != NextUVDateStatus.irrelevant
    ]
    sorted_next_uv_dates = sorted(
        next_uv_dates,
        key=lambda x: (
            x.next_uv_date_data.spreadsheet_title,
            x.next_uv_date_data.sheet_title,
            x.next_uv_date_data.row_index,
        ),
    )
    # Write next uv dates to a csv file
    with open("next_uv_dates.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "status"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for next_uv_date in sorted_next_uv_dates:
            next_uv_date_data = next_uv_date.next_uv_date_data
            writer.writerow(
                {
                    "spreadsheet_title": next_uv_date_data.spreadsheet_title,
                    "sheet_title": next_uv_date_data.sheet_title,
                    "cell": f"{next_uv_date_data.column_name}{next_uv_date_data.row_index}",
                    "status": next_uv_date.status,
                }
            )
    log.info("Finished writing next uv dates csv file")
Example #8
# print(state)
# print(state.result[a].result)
# print(state.result[b].result)
# print(state.result[c].result)


@task
def values():
    return [[None]]


@task
def add(v):
    # if v:
    return v + 100


@task
def print_vals(a, b):
    print("here")
    print(a)
    print(b)


with Flow('fm') as f:
    vals = values()
    # a = add.map(flat(vals))
    print_vals.map(a=flatten([1]), b=flatten([[1]]))

state = f.run()
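
# To inspect the mapped results after the run (mirroring the commented-out
# prints above), the final state can be queried per task; a sketch, assuming
# Prefect 1.x and that the task keeps its default name "print_vals":
print_vals_task = f.get_tasks(name="print_vals")[0]
print(state.result[print_vals_task].result)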
Example #9
    # Load config
    config = load_config()

    # Load the stocks + the initial value
    # CSV SCHEMA: stock,initial_value
    stocks = load_stocks()

    # Split the stocks into even groups of 5 as the Vantage API
    # only allows 5 api calls per minute
    split_stocks = split_stocks(stocks["stock"], 5)

    # Get the adjusted closing price of each group of stocks
    stock_prices = get_price.map(split_stocks)

    # Concat the stocks back into a single list
    stocks_price_mapping = concat_stocks(flatten(stock_prices))

    # Add closing price to dataframe
    stocks = add_price(stocks, stocks_price_mapping)

    # Calculate gain or loss based off initial price
    stocks = get_difference(stocks)

    # Create the email message
    message = create_message(stocks)

    # Send email
    email_task(msg=message, email_to=config["general"]["email_list"])

flow.register(project_name="Portolio Updater")
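
# The registration call above assumes a Prefect backend (Server or Cloud) with
# a project named "Portolio Updater" already set up; for a quick local check,
# the flow could instead be run directly, e.g.:
#
#     state = flow.run()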