def parse(self):
    """Read and parse the data file, then derive secondary data

    Default implementation of the full parsing pipeline: resolve the file
    path if necessary, read the data file, and run the calculators on the
    parsed data. The source file is registered as a dependency afterwards.

    Returns:
        Parser: The parsed data
    """
    if self.file_path is None:
        self.file_path = files.path(self.file_key, file_vars=self.vars, download_missing=True)

    package_name, module_name = self.__module__.rsplit(".", maxsplit=1)
    with timer("Finish {} ({}) - {} in".format(module_name, package_name, self.file_key)):
        if self.data_available:
            self.read_data()

        if self.data_available:
            self.calculate_data()
            dependencies.add(*self.dependencies, label=self.file_key)
        else:
            # read_data() may have flagged the data as unavailable
            log.warn(
                f"No data found by {self.__class__.__name__} for {self.rundate.strftime(config.FMT_date)} "
                f"(was looking for {self.file_path})")
            return self

    return self
def main():
    """Parse command line options and run the Where analysis

    Reads the command line, sets up configuration files and kicks off the
    analysis. See the help docstring at the top of the file for more
    information about the workflow.
    """
    # Logging must be up before anything else reports progress
    log.init()

    # Figure out which date, pipeline and session to analyze
    date_key = "doy" if util.check_options("--doy") else "date"
    rundate = util.parse_args(date_key, doc_module=__name__)
    pipeline = pipelines.get_from_options()
    session = pipelines.get_session(rundate, pipeline)

    # Pretend to empty mailbox
    pretend_to_empty_mailbox()

    # Hand over to an interactive session when requested
    if util.check_options("-I", "--interactive"):
        from where.tools import interactive

        interactive.interactive(rundate, pipeline, session)
        return

    # Create or update the analysis configuration, then run the analysis
    setup.setup_config(rundate, pipeline, session)
    setup.add_timestamp(rundate, pipeline, session, "last run")
    with timer(f"Finish pipeline {pipeline.upper()} in"):
        pipelines.run(rundate, pipeline, session)
def calculate_data(self):
    """Do simple manipulations on the data after they are read

    Calculators are meant for simple manipulations of the data after they are
    read, so that the parser stays as true a representation of the data file
    as possible. Advanced calculations belong in apriori classes or similar.

    To add a calculator, define it in its own method, and override the
    `setup_calculators`-method to return a list of all calculators.
    """
    # Run each registered calculator in order, timing each one at debug level
    for calculator in self.setup_calculators():
        log.debug(
            f"Start calculator {calculator.__name__} in {self.__module__}")
        with timer(
                f"Finish calculator {calculator.__name__} ({self.__module__}) in",
                logger=log.debug):
            calculator()
def calculate_data(self):
    """Run the calculators that post-process freshly read data

    Calculators perform simple manipulations of the data after they are read.
    They should be kept simple so that a parser returns as true a
    representation of the data file as possible. Advanced calculations may be
    done inside apriori classes or similar.

    To add a calculator, define it in its own method, and override the
    `setup_calculators`-method to return a list of all calculators.
    """
    for calc_func in self.setup_calculators():
        name, module = calc_func.__name__, self.__module__
        log.debug("Start calculator {} in {}", name, module)
        timer_text = "Finish calculator {} ({}) in".format(name, module)
        with timer(timer_text, logger=log.debug):
            calc_func()
def main():
    """Parse command line options and run the Where analysis

    Reads the command line, sets up configuration files and kicks off the
    analysis. See the help docstring at the top of the file for more
    information about the workflow.
    """
    util.check_help_and_version(doc_module=__name__)

    # Logging must be up before anything else reports progress
    log.init(config.where.log.default_level.str)
    log.debug(
        f"Use {util.get_python_version()} on process {util.get_pid_and_server()}"
    )

    # Figure out which pipeline, date and session to analyze
    pipeline = pipelines.get_from_options()
    config.read_pipeline(pipeline)
    date_key = "doy" if util.check_options("--doy") else "date"
    rundate = util.parse_args(date_key, doc_module=__name__)
    session = pipelines.get_session(rundate, pipeline)

    # Pretend to empty mailbox
    pretend_to_empty_mailbox()

    # Hand over to an interactive session when requested
    if util.check_options("-I", "--interactive"):
        # Local import because interactive imports many external packages
        from where.tools import interactive

        interactive.interactive(rundate, pipeline, session)
        return

    # Create or update the analysis configuration, then run the analysis
    setup.setup_config(rundate, pipeline, session)
    setup.add_timestamp(rundate, pipeline, session, "last run")
    with timer(f"Finish pipeline {pipeline.upper()} in"):
        pipelines.run(rundate, pipeline, session)
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Sets up configuration, logging and reporting for the session, then runs
    each stage of the pipeline in turn, skipping stages whose dependencies
    are unchanged. Reports and requirements are written at the end.

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        # NOTE(review): log.fatal is presumably expected to abort here — confirm it does not fall through
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up session config
    config.init(rundate=rundate, tech_name=pipeline, session=session)

    # Set up prefix for console logger and start file logger
    log_cfg = config.where.log
    prefix = f"{pipeline.upper()} {session} {rundate:%Y-%m-%d}"
    log.init(log_level=log_cfg.default_level.str, prefix=prefix)
    if log_cfg.log_to_file.bool:
        log.file_init(
            file_path=files.path("log"),
            log_level=log_cfg.default_level.str,
            prefix=prefix,
            rotation=log_cfg.number_of_log_backups.int,
        )

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        config.files.profiles = filekey_suffix

    # Find which stages we will run analysis for
    # TODO: Specify stage_list in config
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate, tech=pipeline, analysis=pipeline, session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage.
    # zip pairs each stage with its predecessor; the first stage gets prev_stage=None
    dset = None
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed (unless forced with -F/--force)
        dep_path = files.path("depends", file_vars=dict(stage=stage))
        if not (dependencies.changed(dep_path, fast_check=dep_fast) or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue
        elif dset is None:
            # Create or read dataset lazily, at the first stage that actually runs.
            # Only start from an empty dataset when that stage is also the very first stage
            empty = stage == stage_list[0]
            dset = dataset.Dataset(rundate, tech=pipeline, stage=prev_stage, dataset_name=session, dataset_id="last", empty=empty)

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(dep_path, fast_check=dep_fast)
        dependencies.add(files.path("depends", file_vars=dict(stage=prev_stage)), label="depends")
        dependencies.add(*config.tech.sources, label="config")

        # Delete old datasets for this stage
        dset.delete_from_file(stage=stage, dataset_id="all")

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        # NOTE(review): the return value of plugins.call is not checked here — the loop instead
        # stops below when the dataset has no observations left
        plugins.call(package_name=__name__, plugin_name=pipeline, part=stage, stage=stage, dset=dset, plugin_logger=log.info)
        dependencies.write()

        if dset.num_obs == 0:
            log.warn(
                f"No observations in dataset after {stage} stage. Exiting pipeline"
            )
            break
    else:  # Only done if loop does not break (all stages finish normally)
        # Publish files for session
        files.publish_files()

    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()
def call_one(package_name, plugin_name, part=None, prefix=None, logger=log.time, use_timer=True, do_report=True, **kwargs):
    """Call one plug-in

    If the plug-in is not part of the package an UnknownPluginError is raised.

    If there are several functions registered in a plug-in and `part` is not
    specified, then the first function registered in the plug-in will be
    called.

    The file containing the source code of the plug-in is added to the list
    of dependencies.

    Args:
        package_name (String): Name of package containing plug-ins.
        plugin_name (String):  Name of the plug-in, i.e. the module containing the plug-in.
        part (String):         Name of function to call within the plug-in (optional).
        prefix (String):       Prefix of the plug-in name, used if the plug-in name is unknown (optional).
        logger (Function):     Logger from the lib.log package specifying the level of logging to be used (optional).
        use_timer (Boolean):   Whether to time and log the call to the plug-in (optional).
        do_report (Boolean):   Whether to add the call to the plug-in to the report (optional).
        kwargs:                Named arguments passed on to the plug-in.

    Returns:
        Return value of the plug-in.
    """
    # Look up the Plugin-object, normalizing the plug-in name and part first
    plugin_name = load_one(package_name, plugin_name, prefix=prefix)
    if part is None:
        part = "__default__"
    try:
        plugin = _PLUGINS[package_name][plugin_name][part]
    except KeyError:
        raise exceptions.UnknownPluginError(
            "Plugin '{}' not found for '{}' in '{}'".format(part, plugin_name, package_name)
        ) from None

    # Register the call in the report
    if do_report:
        from where.reports import report

        # Use a short repr for datasets so the report stays readable
        reported_kwargs = dict(kwargs)
        if "dset" in reported_kwargs:
            reported_kwargs["dset"] = reported_kwargs["dset"].repr
        code_text = (
            "kwargs = {}\n{} = plugins.call_one('{}', '{}', part='{}', **kwargs)"
            "".format(reported_kwargs, plugin_name, package_name, plugin_name, part)
        )
        report.add(
            package_name,
            __plugin__=plugin.name,
            __doc__=plugin.function.__doc__,
            __text__="TODO",
            __code__=code_text,
            **kwargs,
        )

    # Track the plug-in source file as a dependency, then call the plug-in
    dependencies.add(plugin.file_path, label="plugin")
    time_logger = log.time if (logger and use_timer) else None
    if logger:
        logger(f"Start {plugin.name} in {package_name}")
    with timer(f"Finish {plugin.name} ({package_name}) in", logger=time_logger):
        return plugin.function(**kwargs)
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Sets up configuration, logging and reporting for the session, then runs
    each stage of the pipeline in turn, skipping stages whose dependencies
    are unchanged. Reports and requirements are written at the end.

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        # NOTE(review): log.fatal is presumably expected to abort here — confirm it does not fall through
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up tech config and file logging
    config.init(rundate=rundate, tech_name=pipeline, session=session)
    log.file_init(log_path=files.path("log"))

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        files.use_filelist_profiles(*filekey_suffix)

    # Find which stages we will run analysis for
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate, tech=pipeline, analysis=pipeline, session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage.
    # zip pairs each stage with its predecessor; the first stage gets prev_stage=None
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed (unless forced with -F/--force)
        if not (dependencies.changed(fast_check=dep_fast, rundate=rundate, tech=pipeline, session=session, stage=stage) or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(fast_check=dep_fast, session=session, stage=stage)
        dependencies.add(
            files.path("model_run_depends", file_vars=dict(session=session, stage=prev_stage)))
        dependencies.add(*config.tech.sources)

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        do_next_stage = call(pipeline, stage, rundate=rundate, session=session, prev_stage=prev_stage, stage=stage, logger=log.info)
        dependencies.write()
        if do_next_stage is False:
            break  # TODO, this does not work together with dependencies changed ...

    # Publish files for session.
    # Note: this runs even when the stage loop breaks early (no for/else here)
    files.publish_files()
    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()