Example #1
def _start_workflows_and_record_start_time(
        workflow_name: str,
        workflow_range: Tuple[int, int],
        workers: int = WORKERS_DEFAULT_COUNT) -> pd.DataFrame:
    logger.info(
        f"Starting workflows {workflow_range[0]} to {workflow_range[1]}...")
    df = create_empty_dataframe_for_started_results()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=workers) as executor:
        futures = [
            executor.submit(_start_single_workflow,
                            build_extended_workflow_name(workflow_name, i))
            for i in range(workflow_range[0], workflow_range[1] + 1)
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                # new name so the workflow_name parameter is not shadowed
                started_name, asked_to_start_datetime = future.result()
                row = pd.DataFrame([{
                    "name": started_name,
                    "asked_to_start_date": asked_to_start_datetime,
                }])
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                df = pd.concat([df, row], ignore_index=True)
            except Exception as e:
                logger.error(e)
    return df
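The submit/as_completed pattern is the core of this helper. A minimal, self-contained sketch of the same pattern with a dummy task (the names below are illustrative, not from the module):

import concurrent.futures

def _dummy_start(i: int) -> str:
    # stand-in for _start_single_workflow
    return f"workflow-{i}"

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(_dummy_start, i) for i in range(1, 6)]
    for future in concurrent.futures.as_completed(futures):
        # results arrive in completion order, not submission order
        print(future.result())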
Example #2
def monitor(workflow: str, sleep: int) -> None:
    """Start periodically collect defined metrics and save them to JSON file.

    This function is blocking.
    """
    _print_metrics()
    logger.info("Starting monitoring...")

    all_metrics = {}
    metrics_parameters = {
        "workflow": workflow,
    }

    try:
        while True:
            # if collecting the metrics takes, say, a couple of seconds, monitored_date will be less accurate
            monitored_date = get_utc_now_timestamp()
            collected_metrics = _collect_metrics(metrics_parameters)
            all_metrics[monitored_date] = collected_metrics
            _save_metrics(workflow, all_metrics)

            time.sleep(sleep)
    except KeyboardInterrupt:
        logger.info("Stopping monitoring...")
    finally:
        _save_metrics(workflow, all_metrics)
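A self-contained sketch of the same collect/sleep/save loop, blocking like the original; the metric value and file name are placeholders standing in for _collect_metrics and _save_metrics:

import json
import time
from datetime import datetime, timezone

samples = {}
try:
    while True:
        now = datetime.now(timezone.utc).isoformat()
        samples[now] = {"cpu": 0.0}  # placeholder metric
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    # the finally block guarantees a save even on Ctrl+C
    with open("metrics.json", "w") as f:
        json.dump(samples, f)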
Example #3
def _save_plots(plots: List[Tuple[str, Figure]], workflow: str,
                workflow_range: Tuple[int, int]) -> None:
    logger.info("Saving plots...")
    for base_name, figure in plots:
        path = Path(
            f"{workflow}_{base_name}_{workflow_range[0]}_{workflow_range[1]}.png"
        )
        figure.savefig(path)
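For reference, figures can be rendered to disk without any pyplot state; a minimal sketch (the file name is arbitrary):

from pathlib import Path

from matplotlib.figure import Figure

fig = Figure()
ax = fig.subplots()
ax.plot([1, 2, 3], [1, 4, 9])
fig.savefig(Path("demo_plot_1_3.png"))  # format inferred from the suffix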
Example #4
def _merge_workflows_and_started_results(
        workflows: pd.DataFrame,
        started_results: pd.DataFrame) -> pd.DataFrame:
    """Merge workflows status results and recorded started results.

    Required columns: name (workflow_name)
    """
    logger.info("Merging workflows and started results...")
    return workflows.merge(started_results, on=["name"], how="left")
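The left join keeps every workflow row and fills NaN where no started result was recorded. A small self-contained illustration with made-up data:

import pandas as pd

workflows = pd.DataFrame({"name": ["wf-1", "wf-2"],
                          "status": ["finished", "running"]})
started = pd.DataFrame({"name": ["wf-1"],
                        "asked_to_start_date": ["2021-01-01T00:00:00"]})
print(workflows.merge(started, on=["name"], how="left"))
# wf-2 gets NaN for asked_to_start_date because it was never started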
Example #5
def start(workflow_name: str, workflow_range: Tuple[int, int],
          workers: int) -> None:
    """Start already submitted workflows."""
    started_results = _start_workflows_and_record_start_time(
        workflow_name, workflow_range, workers)

    started_results = _append_to_existing_started_results(
        workflow_name, started_results)

    _save_started_results(workflow_name, started_results)
    logger.info("Finished starting workflows.")
Example #6
def _append_to_existing_started_results(
        workflow_name: str, new_results: pd.DataFrame) -> pd.DataFrame:
    """Append new started results to existing started results and return them."""
    results_path = build_started_results_path(workflow_name)

    existing_results = pd.DataFrame()

    if results_path.exists():
        logger.info("Loading existing started results. Appending...")
        existing_results = pd.read_csv(results_path)

    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    return pd.concat([existing_results, new_results], ignore_index=True)
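A quick illustration of the concat-based append (the sample data is made up):

import pandas as pd

existing = pd.DataFrame({"name": ["wf-1"],
                         "asked_to_start_date": ["2021-01-01T00:00:00"]})
new = pd.DataFrame({"name": ["wf-2"],
                    "asked_to_start_date": ["2021-01-02T00:00:00"]})
# ignore_index=True renumbers the combined index 0..n-1
combined = pd.concat([existing, new], ignore_index=True)
print(combined)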
Example #7
def _build_plots(df: pd.DataFrame,
                 plot_parameters: Dict) -> List[Tuple[str, Figure]]:
    logger.info("Building plots...")

    plots = []
    for build_plot in [
            _build_execution_progress_plot,
            _build_execution_status_plot,
            _build_total_time_histogram,
            _build_runtime_histogram,
            _build_pending_time_histogram,
    ]:
        plot_base_name, figure = build_plot(df, plot_parameters)
        plots.append((plot_base_name, figure))

    return plots
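Treating the plot builders as first-class values keeps the loop free of special cases. A stripped-down sketch with stub builders (all names here are illustrative):

from typing import Callable, Dict, List, Tuple

def _build_status_stub(df: List[int], params: Dict) -> Tuple[str, str]:
    return "status", f"figure over {len(df)} rows"

def _build_runtime_stub(df: List[int], params: Dict) -> Tuple[str, str]:
    return "runtime", f"figure over {len(df)} rows"

builders: List[Callable[[List[int], Dict], Tuple[str, str]]] = [
    _build_status_stub,
    _build_runtime_stub,
]
plots = [build([1, 2, 3], {}) for build in builders]
print(plots)  # [('status', 'figure over 3 rows'), ('runtime', ...)]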
Example #8
def _create_and_upload_workflows(
    workflow: str,
    workflow_range: Tuple[int, int],
    file: Optional[str] = None,
    workers: int = WORKERS_DEFAULT_COUNT,
) -> None:
    logger.info(f"Creating and uploading {workflow_range} workflows...")
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=workers) as executor:
        futures = [
            executor.submit(
                _create_and_upload_single_workflow,
                build_extended_workflow_name(workflow, i),
                file,
            ) for i in range(workflow_range[0], workflow_range[1] + 1)
        ]
        for future in concurrent.futures.as_completed(futures):
            # collect results; if a task raised an exception, it is re-raised here
            future.result()
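Unlike Example #1, failures here are not caught per future: calling future.result() re-raises the worker's exception in the main thread. A self-contained demonstration:

import concurrent.futures

def _task(i: int) -> int:
    if i == 2:
        raise ValueError(f"task {i} failed")
    return i

try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(_task, i) for i in range(4)]
        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raises the ValueError from task 2
except ValueError as err:
    print(err)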
Example #9
def _clean_results(df: pd.DataFrame) -> pd.DataFrame:
    logger.info("Cleaning results...")

    # fix "-" values for the created status
    df.loc[df["status"] == "created", ["started", "ended"]] = None

    # normalise missing asked_to_start_date values to None
    df["asked_to_start_date"] = df["asked_to_start_date"].where(
        df["asked_to_start_date"].notna(), None)

    # fix "-" values for the running, pending and queued statuses
    df.loc[df["status"] == "running", "ended"] = None
    df.loc[df["status"].isin(["pending", "queued"]),
           ["started", "ended"]] = None
    return df
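How the masked assignments behave on a toy frame (the "-" placeholders mimic the raw status output):

import pandas as pd

df = pd.DataFrame({
    "status": ["created", "running", "finished"],
    "started": ["-", "2021-01-01T00:00:00", "2021-01-01T00:00:00"],
    "ended": ["-", "-", "2021-01-01T01:00:00"],
})
df.loc[df["status"] == "created", ["started", "ended"]] = None
df.loc[df["status"] == "running", "ended"] = None
print(df)  # only the finished row keeps both timestamps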
Example #10
def collect(workflow_prefix: str, force: bool) -> None:  # noqa: D103
    results_path = build_started_results_path(workflow_prefix)

    if results_path.exists():
        started_results = pd.read_csv(results_path)
    else:
        logger.warning("Started results are not found.")
        started_results = create_empty_dataframe_for_started_results()

    workflows = _get_workflows(workflow_prefix)
    if _workflows_finished(workflows) or force:
        results = _merge_workflows_and_started_results(workflows,
                                                       started_results)
        results = _clean_results(results)

        collect_datetime = get_utc_now_timestamp()
        results["collected_date"] = [collect_datetime] * len(results)

        _save_collected_results(workflow_prefix, results)
        logger.info(f"Collected {len(results)} workflows. Finished.")
    else:
        logger.info(
            "Not collecting: workflows are still running. Use the -f option to force collection."
        )
Example #11
def _derive_metrics(df: pd.DataFrame) -> pd.DataFrame:
    logger.info("Deriving metrics...")

    df["workflow_number"] = df.apply(
        lambda row: _get_workflow_number_from_name(row["name"]), axis=1)

    def _calculate_difference(row: pd.Series, start_column: str,
                              end_column: str) -> Optional[int]:
        """Calculate difference between two date times in string format."""
        start_date = row[start_column]
        end_date = row[end_column]

        start_date_exists = not pd.isna(start_date)
        end_date_exists = not pd.isna(end_date)

        if start_date_exists and end_date_exists:
            return _convert_str_date_to_epoch(
                end_date) - _convert_str_date_to_epoch(start_date)
        return None

    df["pending_time"] = df.apply(
        partial(
            _calculate_difference,
            start_column="asked_to_start_date",
            end_column="started",
        ),
        axis=1,
    )

    df["runtime"] = df.apply(
        partial(_calculate_difference,
                start_column="started",
                end_column="ended"),
        axis=1,
    )
    return df
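A self-contained sketch of the row-wise difference on a toy frame; _to_epoch below assumes ISO-8601 strings and stands in for _convert_str_date_to_epoch, whose exact format is not shown here:

from datetime import datetime
from functools import partial
from typing import Optional

import pandas as pd

def _to_epoch(date_str: str) -> int:
    # assumption: dates are ISO-8601; the real parser may differ
    return int(datetime.fromisoformat(date_str).timestamp())

def _diff(row: pd.Series, start_column: str, end_column: str) -> Optional[int]:
    if pd.isna(row[start_column]) or pd.isna(row[end_column]):
        return None
    return _to_epoch(row[end_column]) - _to_epoch(row[start_column])

df = pd.DataFrame({
    "started": ["2021-01-01T00:00:00", None],
    "ended": ["2021-01-01T00:05:00", None],
})
df["runtime"] = df.apply(
    partial(_diff, start_column="started", end_column="ended"), axis=1)
print(df["runtime"])  # 300 seconds for the first row, None for the second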
Example #12
def submit(workflow_prefix: str, workflow_range: Tuple[int, int], file: str,
           workers: int) -> None:
    """Submit multiple workflows without starting them."""
    _create_and_upload_workflows(workflow_prefix, workflow_range, file,
                                 workers)
    logger.info("Finished creating and uploading workflows.")
Example #13
def _save_started_results(workflow_name: str, df: pd.DataFrame) -> None:
    logger.info("Saving started results...")
    results_path = build_started_results_path(workflow_name)
    df.to_csv(results_path, index=False)
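Saving with index=False keeps the positional index out of the file, so a later read_csv restores the frame unchanged; a round-trip sketch with a throwaway path and made-up data:

import pandas as pd

df = pd.DataFrame({"name": ["wf-1"], "status": ["finished"]})
df.to_csv("demo_results.csv", index=False)  # header row only, no index column
restored = pd.read_csv("demo_results.csv")
print(restored.equals(df))  # True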
Example #14
def _print_metrics() -> None:
    logger.info("Following metrics will be collected:")
    for m in METRICS:
        logger.info(f"- {m.name}")
Example #15
def _save_collected_results(workflow: str, df: pd.DataFrame) -> None:
    logger.info("Saving collected results...")
    results_path = build_collected_results_path(workflow)
    df.to_csv(results_path, index=False)