Example #1
0
def run(rundate, pipeline, *args, **kwargs):
    """Run a Where pipeline for a given date and session

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """

    if not setup.has_config(rundate, pipeline, *args, **kwargs):
        log.fatal(
            f"No configuration found for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
        )

    # Set up config
    config.init(rundate, pipeline, **kwargs)

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        config.files.profiles = filekey_suffix

    # Validate input arguments
    try:
        prefix = plugins.call(package_name=__name__,
                              plugin_name=pipeline,
                              part="validate_args",
                              rundate=rundate,
                              **kwargs)
    except mg_exceptions.UnknownPluginError:
        log.warn(
            f"Pipeline {pipeline} has not defined function 'validate_args'")
    except exceptions.InvalidArgsError as err:

        from where.tools import delete

        # Clean up {placeholder} directories created by config
        delete.delete_analysis(rundate, pipeline, **kwargs)
        log.fatal(err)

    # Set up console logger and start file logger
    try:
        prefix = plugins.call(package_name=__name__,
                              plugin_name=pipeline,
                              part="log_prefix",
                              rundate=rundate,
                              **kwargs)
    except mg_exceptions.UnknownPluginError:
        log.warn(f"Pipeline {pipeline} has not defined function 'log_prefix'")
        prefix = ""

    log_cfg = config.where.log
    log.init(log_level=log_cfg.default_level.str, prefix=prefix)
    if log_cfg.log_to_file.bool:
        log.file_init(
            file_path=config.files.path("log"),
            log_level=log_cfg.default_level.str,
            prefix=prefix,
            rotation=log_cfg.number_of_log_backups.int,
        )

    # Update analysis config and file variables
    config.set_analysis(rundate, pipeline=pipeline, **kwargs)
    config.set_file_vars(file_vars())

    log.blank()  # Empty line for visual clarity

    # Read which stages that should be executed once for each iterable
    skip_stages = config.tech.skip_stages.list
    stage_iterate = config.tech.stage_iterate.list
    dset_list = []
    dset = None

    if stage_iterate:
        # Read which list should be iterated over and the placeholder name of each entry
        iterate_over, _, var_name = config.tech.stage_iterate_over.str.partition(
            ":")
        var_name = var_name.strip()

        # Iterate
        for item in config.tech[iterate_over].list:
            kwargs[var_name] = item
            log.blank()
            log.info(f"***** Running {item} *****")

            for prev_stage, stage in zip([None] + stage_iterate,
                                         stage_iterate):
                if stage not in skip_stages:
                    dset = run_stage(rundate, pipeline, dset, stage,
                                     prev_stage, **kwargs)

            if dset is not None:
                dset_list.append(dset)
                dset = None
        kwargs[var_name] = "combined"

    if dset_list:
        dset_list[0].merge_with(*dset_list[1:], sort_by="time")
        dset = dset_list[0]
        if len(dset_list) > 1:
            log.info(f"Combining dataset for {len(dset_list)} {iterate_over}")
            dset.write_as(stage=stage_iterate[-1], label=2, **kwargs)

    # Read which stages that should be executed once
    stage_once = config.tech.stage_once.list
    # Find which stages we will run analysis for
    if not stage_once and not stage_iterate:
        stage_list = [s for s in stages(pipeline)]
        prev_stage_start = None
    else:
        stage_list = [s for s in stage_once]
        prev_stage_start = stage_iterate[-1] if stage_iterate else None

    for prev_stage, stage in zip([prev_stage_start] + stage_list, stage_list):
        if stage not in skip_stages:
            dset = run_stage(rundate, pipeline, dset, stage, prev_stage,
                             **kwargs)
            log.blank()

        if dset is not None and dset.num_obs == 0:
            log.warn(f"No observations in dataset after {stage} stage.")
            break

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, **kwargs)

    # Write requirements to file for reproducibility
    util.write_requirements()
Example #2
0
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up session config
    config.init(rundate=rundate, tech_name=pipeline, session=session)

    # Set up prefix for console logger and start file logger
    log_cfg = config.where.log
    prefix = f"{pipeline.upper()} {session} {rundate:%Y-%m-%d}"
    log.init(log_level=log_cfg.default_level.str, prefix=prefix)
    if log_cfg.log_to_file.bool:
        log.file_init(
            file_path=files.path("log"),
            log_level=log_cfg.default_level.str,
            prefix=prefix,
            rotation=log_cfg.number_of_log_backups.int,
        )

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        config.files.profiles = filekey_suffix

    # Find which stages we will run analysis for
    # TODO: Specify stage_list in config
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate,
                        tech=pipeline,
                        analysis=pipeline,
                        session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage
    dset = None
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed
        dep_path = files.path("depends", file_vars=dict(stage=stage))
        if not (dependencies.changed(dep_path, fast_check=dep_fast)
                or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue
        elif dset is None:
            # Create or read dataset
            empty = stage == stage_list[0]
            dset = dataset.Dataset(rundate,
                                   tech=pipeline,
                                   stage=prev_stage,
                                   dataset_name=session,
                                   dataset_id="last",
                                   empty=empty)

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(dep_path, fast_check=dep_fast)
        dependencies.add(files.path("depends",
                                    file_vars=dict(stage=prev_stage)),
                         label="depends")
        dependencies.add(*config.tech.sources, label="config")

        # Delete old datasets for this stage
        dset.delete_from_file(stage=stage, dataset_id="all")

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        plugins.call(package_name=__name__,
                     plugin_name=pipeline,
                     part=stage,
                     stage=stage,
                     dset=dset,
                     plugin_logger=log.info)
        dependencies.write()
        if dset.num_obs == 0:
            log.warn(
                f"No observations in dataset after {stage} stage. Exiting pipeline"
            )
            break
    else:  # Only done if loop does not break (all stages finish normally)
        # Publish files for session
        files.publish_files()

    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()
Example #3
0
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up tech config and file logging
    config.init(rundate=rundate, tech_name=pipeline, session=session)
    log.file_init(log_path=files.path("log"))

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        files.use_filelist_profiles(*filekey_suffix)

    # Find which stages we will run analysis for
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate,
                        tech=pipeline,
                        analysis=pipeline,
                        session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed
        if not (dependencies.changed(fast_check=dep_fast,
                                     rundate=rundate,
                                     tech=pipeline,
                                     session=session,
                                     stage=stage)
                or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(fast_check=dep_fast, session=session, stage=stage)
        dependencies.add(
            files.path("model_run_depends",
                       file_vars=dict(session=session, stage=prev_stage)))
        dependencies.add(*config.tech.sources)

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        do_next_stage = call(pipeline,
                             stage,
                             rundate=rundate,
                             session=session,
                             prev_stage=prev_stage,
                             stage=stage,
                             logger=log.info)
        dependencies.write()
        if do_next_stage is False:
            break  # TODO, this does not work together with dependencies changed ...

    # Publish files for session
    files.publish_files()
    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()