def __init__(
    self,
    db_connector: Optional[DatabaseConnector] = None,
    project_path: Optional[Union[Path, str]] = None,
) -> None:
    """
    Set connector, reader and a project path.

    Notes
    -----
    The StatusChecker instance only checks the runs belonging to the database
    schema accessed through the `db_connector`

    Parameters
    ----------
    db_connector : DatabaseConnector
        Connection to the database
    project_path : Path
        Path to the project (the root directory which usually contains the
        makefile and the executable)
    """
    self.__db_connector = (
        db_connector if db_connector is not None else DatabaseConnector()
    )
    self.__db_reader = DatabaseReader(self.__db_connector)
    self.__project_path = (
        Path(project_path) if project_path is not None else Path()
    )
def _get_number_of_rows_for_all_tables(db_reader: DatabaseReader) -> Dict[str, int]:
    """
    Return the number of rows for all tables in a schema.

    Parameters
    ----------
    db_reader : DatabaseReader
        The object used to read from the database

    Returns
    -------
    number_of_rows_dict : dict
        Dict of the form

        >>> {'table_name_1': int, 'table_name_2': int, ...}
    """
    number_of_rows_dict = dict()
    query_str = (
        "SELECT name FROM sqlite_master\n"
        "    WHERE type ='table'\n"
        "    AND name NOT LIKE 'sqlite_%'"
    )
    table_of_tables = db_reader.query(query_str)

    for _, table_name_as_series in table_of_tables.iterrows():
        table_name = table_name_as_series["name"]
        # NOTE: SQL injection possible through bad table name, however the table
        #       names are hard-coded in this example
        query_str = f"SELECT COUNT(*) AS rows FROM {table_name}"  # nosec
        table = db_reader.query(query_str)
        number_of_rows_dict[table_name] = table.loc[0, "rows"]

    return number_of_rows_dict
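# A minimal sketch of how the helper above could be used in a test. The fixture name
# `make_test_schema` is taken from the other snippets in this file; the example
# function itself and its assertion are assumptions, not part of the original code,
# and the imports of DatabaseReader and _get_number_of_rows_for_all_tables are
# assumed to come from the surrounding module.
def _example_assert_all_tables_empty(make_test_schema) -> None:
    """Assert that a freshly created schema contains only empty tables (sketch)."""
    db_connector, _ = make_test_schema("row_count_example")
    db_reader = DatabaseReader(db_connector)
    number_of_rows = _get_number_of_rows_for_all_tables(db_reader)
    # The schema has been created, but nothing has been written yet
    assert all(rows == 0 for rows in number_of_rows.values())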
def test_status_checker_until_complete_infinite(
    get_test_data_path: Path,
    get_test_db_copy: Callable[[str], DatabaseConnector],
    copy_test_case_log_file: Callable[[str], None],
) -> None:
    """
    Test the infinite loop of StatusChecker.

    Parameters
    ----------
    get_test_data_path : Path
        Path to the test data
    get_test_db_copy : function
        Function which returns a DatabaseConnector connected to a copy of test.db
    copy_test_case_log_file : function
        Function which copies the test case log files
    """
    test_case = "infinite_log_file_pid_started_ended_no_mock_pid_complete"

    project_path = get_test_data_path
    db_connector = get_test_db_copy(test_case)
    copy_test_case_log_file(test_case)

    # Remove the row which has status 'running' (as it will always have this status)
    db_connector.execute_statement("DELETE FROM run WHERE name = 'testdata_5'")

    db_reader = DatabaseReader(db_connector)

    status_checker = StatusChecker(db_connector, project_path)
    status_checker.check_and_update_until_complete()

    query = status_checker.get_query_string_for_non_errored_runs()
    assert len(db_reader.query(query).index) == 0
def test_db_writer(
    make_test_schema: Callable[[str], Tuple[DatabaseConnector, str]]
) -> None:
    """
    Test that we can write to the database schema.

    Specifically this tests that:
    1. We can write to the `split` table
    2. Only one record is made
    3. The types are correct
    4. The values are correct
    5. It is possible to update the values

    Parameters
    ----------
    make_test_schema : function
        Function returning the database connection with the schema created
    """
    db_connector, _ = make_test_schema("write_test")
    db_reader = DatabaseReader(db_connector)
    db_writer = DatabaseWriter(db_connector)

    table_name = "split"
    dummy_split_dict = {
        "number_of_processors": 41,
        "number_of_nodes": 42,
        "processors_per_node": 43,
    }
    db_writer.create_entry(table_name, dummy_split_dict)

    # NOTE: Protected against SQL injection as table_name is hard-coded above
    table = db_reader.query(f"SELECT * FROM {table_name}")  # nosec

    # Check that the shape is expected (note that one column is assigned to the id)
    assert table.shape == (1, 4)

    # Check that all the columns have dtype int64
    # https://www.quora.com/How-do-you-check-if-all-elements-in-a-NumPy-array-are-the-same-in-Python-pandas
    values = table.dtypes.values
    assert (values == np.dtype("int64")).all()

    for key, value in dummy_split_dict.items():
        assert table.loc[0, key] == value  # pylint: disable=no-member

    update_fields = ("number_of_processors", "number_of_nodes")
    search_condition = (
        f'processors_per_node = {dummy_split_dict["processors_per_node"]}'
    )
    values = tuple(dummy_split_dict[field] - 10 for field in update_fields)
    db_writer.update(
        db_writer.create_update_string(update_fields, table_name, search_condition),
        values,
    )

    # NOTE: Protected against SQL injection as table_name is hard-coded above
    table = db_reader.query(f"SELECT * FROM {table_name}")  # nosec
    for index, field in enumerate(update_fields):
        # pylint: disable=no-member
        assert table.loc[:, field].values[0] == values[index]
def test_db_creator(
    make_test_database: Callable[[str], DatabaseConnector],
    make_test_schema: Callable[
        [str], Tuple[DatabaseConnector, Dict[str, Dict[str, str]]]
    ],
) -> None:
    """
    Test that we can create the database schema.

    Specifically this tests that:
    1. The database is empty on creation
    2. The tables are created
    3. It is not possible to create the schema more than once
    4. All expected tables have been created

    Parameters
    ----------
    make_test_database : function
        Function returning the database connection
    make_test_schema : function
        Function returning the database connection and the final parameters as
        SQL types
    """
    db_connector_no_schema = make_test_database("test_creation_without_schema")
    db_reader_no_schema = DatabaseReader(db_connector_no_schema)

    # There should be no tables before creating them
    assert not db_reader_no_schema.check_tables_created()

    db_connector_schema, final_parameters_as_sql_types = make_test_schema(
        "test_creation_with_schema"
    )
    db_reader_schema = DatabaseReader(db_connector_schema)
    db_creator = DatabaseCreator(db_connector_schema)

    # The tables should now have been created
    assert db_reader_schema.check_tables_created()

    with pytest.raises(sqlite3.OperationalError):
        db_creator.create_all_schema_tables(final_parameters_as_sql_types)

    # Check that all tables have been created
    non_parameter_tables = {
        "system_info",
        "split",
        "file_modification",
        "parameters",
        "run",
    }
    parameter_tables = set(
        el.replace(":", "_") for el in final_parameters_as_sql_types.keys()
    )
    query_str = 'SELECT name FROM sqlite_master WHERE type="table"'
    table = db_reader_schema.query(query_str)
    actual = table.loc[:, "name"].values  # pylint: disable=no-member
    assert non_parameter_tables.union(parameter_tables) == set(actual)
def assert_first_run(
    bout_paths: BoutPaths, db_connector: DatabaseConnector
) -> DatabaseReader:
    """
    Assert that the first run went well.

    Parameters
    ----------
    bout_paths : BoutPaths
        The object containing the paths
    db_connector : DatabaseConnector
        The database connection

    Returns
    -------
    db_reader : DatabaseReader
        The database reader object
    """
    db_reader = DatabaseReader(db_connector)
    assert_dump_files_exist(bout_paths.bout_inp_dst_dir)
    assert db_reader.check_tables_created()
    return db_reader
def __init__( self, db_connector: Optional[DatabaseConnector] = None, drop_id: Optional[str] = "keep_run_id", ) -> None: """ Set the database to use. Parameters ---------- db_connector : DatabaseConnector or None The connection to the database If None: Default database connector will be used drop_id : None or str Specifies what id columns should be dropped when obtaining the metadata - None : No columns will be dropped - 'parameters' : All columns containing parameters ids will be dropped - 'keep_run_id' : Only the run.id of the id columns will be kept - 'all_id' : All id columns will be removed """ self.drop_id = drop_id db_connector = db_connector if db_connector is not None else DatabaseConnector() self.__db_reader = DatabaseReader(db_connector) self.__table_names = self.__get_all_table_names() self.__table_column_dict = self.__get_table_column_dict() self.__table_connections = self.__get_table_connections() self.__sorted_columns = self.__get_sorted_columns() parameters_connections = {"parameters": self.__table_connections["parameters"]} parameters_tables = ("parameters", *parameters_connections["parameters"]) self.__parameters_columns = tuple( str(col) for col in self.__sorted_columns if col.split(".")[0] in parameters_tables )
def __init__( self, db_connector: DatabaseConnector, bout_paths: BoutPaths, final_parameters: FinalParameters, ) -> None: """ Set the database to use. Parameters ---------- db_connector : DatabaseConnector The database connector bout_paths : BoutPaths Object containing the paths final_parameters : FinalParameters Object containing the final parameters """ self.__db_writer = DatabaseWriter(db_connector) self.__db_reader = DatabaseReader(db_connector) self.__bout_paths = bout_paths self.__final_parameters = final_parameters self.__make = Make(self.__bout_paths.project_path)
def _get_metadata_updater_and_db_reader(
    name: str,
) -> Tuple[MetadataUpdater, DatabaseReader]:
    """
    Return a MetadataUpdater and a DatabaseReader connected to the same database.

    Parameters
    ----------
    name : str
        Name of the temporary database

    Returns
    -------
    metadata_updater : MetadataUpdater
        Object to update the database with
    db_reader : DatabaseReader
        The corresponding database reader
    """
    db_connector = get_test_db_copy(name)
    db_reader = DatabaseReader(db_connector)
    metadata_updater = MetadataUpdater(db_connector, 1)
    return metadata_updater, db_reader
class MetadataReader: r""" Class for reading the metadata from the database. Attributes ---------- __db_reader : DatabaseConnector The connection to the database __table_names : tuple Getter variable for table_names __table_column_dict : dict of tuple Getter variable for table_column_dict __table_connections : dict of tuple Getter variable for table_connections __sorted_columns : tuple Getter variable for sorted_columns table_names : tuple A tuple containing all names of the tables table_column_dict : dict of tuple A dict where the keys are table names, and the values are corresponding column names table_connections : dict of tuple A dict where the keys are tables, and the values are tuples of tables connected to the key table sorted_columns : tuple A tuple of the column names as they will be sorted in the all_metadata DataFrame date_columns : tuple Columns containing dates drop_id : None or str Specifies what id columns should be dropped when obtaining the metadata Methods ------- get_all_metadata() Return all of the run metadata get_parameters_metadata() Return only the parameter part of the run metadata get_join_query(from_statement, columns, alias_columns, table_connections) Return the query string of a `SELECT` query with `INNER JOIN` __get_parameters_query() Return the parameters query string __get_sorted_columns() Return all columns sorted __get_table_connections() Return a dict containing the table connections __get_all_table_names() Return all the table names in the schema __get_table_column_dict() Return all the column names of the specified tables Examples -------- >>> from pathlib import Path >>> from bout_runners.database.database_connector import DatabaseConnector >>> db_connector = DatabaseConnector('test', Path()) >>> metadata_reader = MetadataReader(db_connector) >>> metadata_reader.get_parameters_metadata() bar.id bar.foo ... parameters.baz_id parameters.foo_id 0 1 1 ... 1 1 1 2 10 ... 1 2 2 2 10 ... 1 1 [3 rows x 16 columns] >>> metadata_reader.get_all_metadata() run.id ... system_info.version 0 1 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 1 2 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 2 3 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 3 4 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 4 5 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 5 6 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 6 7 ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 [7 rows x 43 columns] >>> metadata_reader.drop_id = 'all_id' >>> metadata_reader.get_all_metadata() run.latest_status ... system_info.version 0 complete ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 1 complete ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 2 complete ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 3 complete ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 4 error ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 5 running ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 6 submitted ... #1 SMP Thu Oct 17 19:31:58 UTC 2019 [7 rows x 28 columns] """ date_columns = ( "run.start_time", "run.stop_time", "run.submitted_time", "file_modification.bout_lib_modified", "file_modification.project_executable_modified", "file_modification.project_makefile_modified", ) def __init__( self, db_connector: DatabaseConnector, drop_id: Optional[str] = "keep_run_id", ) -> None: """ Set the database to use. 
Parameters ---------- db_connector : DatabaseConnector The connection to the database drop_id : None or str Specifies what id columns should be dropped when obtaining the metadata - None : No columns will be dropped - 'parameters' : All columns containing parameters ids will be dropped - 'keep_run_id' : Only the run.id of the id columns will be kept - 'all_id' : All id columns will be removed """ self.drop_id = drop_id self.__db_reader = DatabaseReader(db_connector) self.__table_names = self.__get_all_table_names() self.__table_column_dict = self.__get_table_column_dict() self.__table_connections = self.__get_table_connections() self.__sorted_columns = self.__get_sorted_columns() parameters_connections = {"parameters": self.__table_connections["parameters"]} parameters_tables = ("parameters", *parameters_connections["parameters"]) self.__parameters_columns = tuple( str(col) for col in self.__sorted_columns if col.split(".")[0] in parameters_tables ) @property def table_names(self) -> Tuple[str, ...]: """ Set the properties of self.table_names. Returns ------- self.__table_names : tuple A tuple containing all names of the tables """ return self.__table_names @property def table_column_dict( self, ) -> Dict[str, Tuple[str, ...]]: """ Set the properties of self.table_column_dict. Returns ------- self.__table_column_dict : dict of tuple A dict where the keys are table names, and the values are corresponding column names """ return self.__table_column_dict @property def table_connection( self, ) -> Dict[str, Tuple[str, ...]]: """ Set the properties of self.table_connections. Returns ------- self.__table_connections : dict of tuple A dict where the keys are tables, and the values are tuples of tables connected to the key table """ return self.__table_connections @property def sorted_columns(self) -> Tuple[str, ...]: """ Set the properties of self.sorted_columns. Returns ------- self.__sorted_columns : tuple A tuple of the column names as they will be sorted in the all_metadata DataFrame """ return self.__sorted_columns @drop_ids def get_all_metadata(self): """ Return all of the run metadata. Returns ------- DataFrame The DataFrame of the run metadata """ parameters_query = self.__get_parameters_query() # Adding spaces and parenthesis parameter_sub_query = "\n".join( [f'{" " * 6}{line}' for line in parameters_query.split("\n")] ) parameter_sub_query = ( f"{parameter_sub_query[:5]}({parameter_sub_query[6:-1]}) " f"AS subquery" ) # NOTE: The subquery names are the names of the columns after # the query. We would like to rename them to # sorted_columns. Hence the `columns` field and # `alias_columns` field appears swapped subquery_columns = [ f'subquery."{col}"' if col in self.__parameters_columns else col for col in self.sorted_columns ] # Remove the parameters from the table_connection to avoid # double joining table_connections = self.__table_connections.copy() table_connections.pop("parameters") unfinished_all_metadata_query = self.get_join_query( "run", subquery_columns, self.sorted_columns, table_connections ) # Update the parameters columns all_metadata_query = unfinished_all_metadata_query.replace( " parameters ", f"\n{parameter_sub_query}\n" ).replace("= parameters.id", '= subquery."parameters.id"') return self.__db_reader.query(all_metadata_query, parse_dates=self.date_columns) @drop_ids def get_parameters_metadata(self): """ Return only the parameter part of the run metadata. 
Returns ------- DataFrame The DataFrame of the parameter metadata """ parameters_query = self.__get_parameters_query() return self.__db_reader.query(parameters_query) @staticmethod def get_join_query( from_statement: str, columns: Sequence[str], alias_columns: Sequence[str], table_connections: Dict[str, Tuple[str, ...]], ) -> str: """ Return the query string of a `SELECT` query with `INNER JOIN`. Notes ----- The tables in `table_connection` is assumed to be joined by `id`s. I.e. `table_a` is connected to `table_b` by `table_b` having a column named `table_a_id` which corresponds to the `id` column of `table_a` Parameters ---------- from_statement : str The statement after the `FROM` keyword in the query I.e. >>> f'SELECT * FROM {from_statement}' columns : array_like The columns to select from the tables I.e. >>> f'SELECT {columns} FROM *' alias_columns : array_like The name of the columns in the resulting table I.e. >>> f'SELECT {columns[0]} AS {alias_columns[0]} FROM *' table_connections : dict A dict where the keys are the table names, and the values are tuples containing table names connected to the key table as described in the note above Returns ------- query : str The SQL-string which can be used to query where table in databases are joined through `INNER JOIN` operations """ query = "SELECT\n" for column, alias in zip(columns, alias_columns): query += f'{" " * 7}{column} AS "{alias}",\n' # Remove last comma query = f"{query[:-2]}\n" query += f"FROM {from_statement}\n" for left_table in table_connections.keys(): for right_table in table_connections[left_table]: query += ( f'{" " * 4}INNER JOIN {right_table} ON ' f"{left_table}." f"{right_table}_id = {right_table}.id\n" ) return query def __get_parameters_query(self) -> str: """ Return the parameters query string. Returns ------- parameters_query : str The SQL-string which can be used to query where table in databases are joined through `INNER JOIN` operations """ parameter_connections = {"parameters": self.__table_connections["parameters"]} parameters_query = self.get_join_query( "parameters", self.__parameters_columns, self.__parameters_columns, parameter_connections, ) return parameters_query def __get_sorted_columns(self) -> Tuple[str, ...]: """ Return all columns sorted. The columns will be sorted alphabetically first by table name, then alphabetically by column name, with the following exceptions: 1. The columns from the run table is presented first 2. The id column is the first column in the table Returns ------- tuple Dict containing the column names On the form >>> ('run.id', ... 'run.column_name_1', ... 'run.column_name_2', ... ... ... 'table_name_1.column_name_1', ... 'table_name_1.column_name_2', ...) """ sorted_columns: List[str] = list() table_names = sorted(self.table_column_dict.keys()) table_names.pop(table_names.index("run")) table_names.insert(0, "run") for table_name in table_names: table_columns = list() for column_name in sorted(self.table_column_dict[table_name]): table_columns.append(f"{table_name}.{column_name}") table_columns.pop(table_columns.index(f"{table_name}.id")) table_columns.insert(0, f"{table_name}.id") sorted_columns = [*sorted_columns, *table_columns] return tuple(sorted_columns) def __get_table_connections(self) -> Dict[str, Tuple[str, ...]]: """ Return a dict containing the table connections. 
Returns ------- table_connection_dict : dict A dict telling which tables are connected to each other, where the key is the table under consideration and the value is a tuple containing the tables which have a key connection to the table under consideration On the form >>> {'table_1': ('table_2', 'table_3'), ... 'table_4': ('table_5',), ...} Raises ------ RuntimeError If match is None """ table_connection_dict = dict() pattern = re.compile("(.*)_id") for table, columns in self.table_column_dict.items(): ids: List[str] = list() for column in columns: if "_id" in column: match = pattern.match(column) if match is None: msg = f"match is None for '(.*)_id' for input '{column}'" logging.critical(msg) raise RuntimeError(msg) ids.append(match[1]) if len(ids) > 0: table_connection_dict[table] = tuple(ids) return table_connection_dict def __get_all_table_names(self) -> Tuple[str, ...]: """ Return all the table names in the schema. Returns ------- tuple A tuple containing all names of the tables """ query = ( "SELECT name FROM sqlite_master\n" "WHERE\n" " type ='table' AND\n" " name NOT LIKE 'sqlite_%'" ) # pylint: disable=no-member return tuple(self.__db_reader.query(query).loc[:, "name"]) def __get_table_column_dict(self) -> Dict[str, Tuple[str, ...]]: """ Return all the column names of the specified tables. Returns ------- table_column_dict : dict of tuple Dict containing the column names On the form >>> {'table_1': ('table_1_column_1', ...), ... 'table_2': ('table_2_column_1', ...), ...} """ table_column_dict = dict() query = "SELECT name FROM pragma_table_info('{}')" for table_name in self.table_names: # pylint: disable=no-member table_column_dict[table_name] = tuple( self.__db_reader.query(query.format(table_name)).loc[:, "name"] ) return table_column_dict
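# A minimal sketch of the SQL produced by MetadataReader.get_join_query. The table
# and column names below ('run' and 'split') follow the schema used in the other
# snippets, but this particular call is illustrative only; the exact whitespace of
# the generated string is determined by the implementation above.
join_query = MetadataReader.get_join_query(
    from_statement="run",
    columns=("run.id", "split.number_of_processors"),
    alias_columns=("run.id", "split.number_of_processors"),
    table_connections={"run": ("split",)},
)
# The resulting string is roughly:
#   SELECT
#          run.id AS "run.id",
#          split.number_of_processors AS "split.number_of_processors"
#   FROM run
#       INNER JOIN split ON run.split_id = split.id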
def large_graph_tester( submitter_type: Type[AbstractSubmitter], make_project: Path, yield_number_of_rows_for_all_tables: Callable[[DatabaseReader], Dict[str, int]], file_state_restorer: FileStateRestorer, ) -> None: """ Test that the graph with 10 nodes work as expected. The node setup can be found in node_functions.py Parameters ---------- submitter_type : type Used to assert that the correct submitter is used make_project : Path The path to the conduction example yield_number_of_rows_for_all_tables : function Function which returns the number of rows for all tables in a schema file_state_restorer : FileStateRestorer Object for restoring files to original state """ name = f"test_large_graph_{submitter_type.__name__}" node_adder = LargeGraphNodeAdder( name, make_project, submitter_type, file_state_restorer ) # RunGroup belonging to node 2 node_adder.add_and_assert_node_group_2() # RunGroup belonging to node 3 and 4 node_adder.add_and_assert_node_group_3_and_4() # RunGroup belonging to node 6 node_8 = node_adder.add_and_assert_node_group_6() # RunGroup belonging to node 9 node_adder.add_and_assert_node_node_9(node_8) # Run the project runner = BoutRunner(node_adder.run_graph) runner.run() runner.wait_until_completed() # Check that all the nodes have changed status with pytest.raises(RuntimeError): runner.run() runner.wait_until_completed() # Check that all files are present # Check that the pre and post files are present for node in (0, 1, 5, 7, 8, 10): assert ( node_adder.paths["pre_and_post_directory"].joinpath(f"{node}.txt").is_file() ) # Check that all the dump files are present for restart_str in ("", "_restart_0", "_restart_1", "_restart_2"): assert ( node_adder.paths["project_path"] .joinpath(f"{name}{restart_str}", "BOUT.dmp.0.nc") .is_file() or node_adder.paths["project_path"] .joinpath(f"{name}{restart_str}", "BOUT.dmp.0.h5") .is_file() ) # NOTE: We will only have 4 runs as node 4 is a duplicate of node 2 and will # therefore be skipped number_of_runs = 4 assert_tables_have_expected_len( DatabaseReader(node_adder.run_groups["run_group_2"].db_connector), yield_number_of_rows_for_all_tables, expected_run_number=number_of_runs, restarted=True, ) simulation_steps = LogReader( node_adder.paths["project_path"].joinpath(f"{name}_restart_2", "BOUT.log.0") ).get_simulation_steps() # NOTE: nout=0 set in the function tests.utils.run.make_run_group assert np.isclose(simulation_steps.loc[simulation_steps.index[-1], "Sim_time"], 0.0)
class MetadataRecorder: r""" Class for recording the metadata of the runs. Attributes ---------- __db_writer : DatabaseWriter Getter variable for db_writer __db_reader : DatabaseReader Getter variable for db_reader db_writer : DatabaseWriter Object which writes to the database db_reader : DatabaseReader Object which reads from the database Methods ------- capture_new_data_from_run(runner, processor_split) Capture new data from a run _create_parameter_tables_entry(parameters_dict) Insert the parameters into a the parameter tables Examples -------- Import dependencies >>> from pathlib import Path >>> from bout_runners.executor.bout_paths import BoutPaths >>> from bout_runners.parameters.default_parameters import DefaultParameters >>> from bout_runners.parameters.final_parameters import FinalParameters >>> from bout_runners.database.database_connector import DatabaseConnector >>> from bout_runners.submitter.processor_split import ProcessorSplit Create the `bout_paths` object >>> project_path = Path().joinpath('path', 'to', 'project') >>> bout_inp_src_dir = Path().joinpath('path', 'to', 'source', 'BOUT.inp') >>> bout_inp_dst_dir = Path().joinpath('path', 'to', 'destination','BOUT.inp') >>> bout_paths = BoutPaths(project_path=project_path, ... bout_inp_src_dir=bout_inp_src_dir, ... bout_inp_dst_dir=bout_inp_dst_dir) Obtain the parameters >>> default_parameters = DefaultParameters(bout_paths) >>> final_parameters = FinalParameters(default_parameters) >>> final_parameters_dict = final_parameters.get_final_parameters() >>> final_parameters_as_sql_types = \ ... final_parameters.cast_to_sql_type( ... final_parameters_dict) Create the metadata recorder object >>> db_connector = DatabaseConnector('name') >>> metadata_recorder = MetadataRecorder(db_connector, ... bout_paths, ... final_parameters) Capture the data to the database >>> metadata_recorder.capture_new_data_from_run(ProcessorSplit()) None """ def __init__( self, db_connector: DatabaseConnector, bout_paths: BoutPaths, final_parameters: FinalParameters, ) -> None: """ Set the database to use. Parameters ---------- db_connector : DatabaseConnector The database connector bout_paths : BoutPaths Object containing the paths final_parameters : FinalParameters Object containing the final parameters """ self.__db_writer = DatabaseWriter(db_connector) self.__db_reader = DatabaseReader(db_connector) self.__bout_paths = bout_paths self.__final_parameters = final_parameters self.__make = Make(self.__bout_paths.project_path) @property def db_reader(self) -> DatabaseReader: """ Set the properties of self.db_reader. Returns ------- self.__db_reader : DatabaseReader The database reader object Notes ----- The db_reader is read only """ return self.__db_reader @property def db_writer(self): """ Set the properties of self.db_writer. Returns ------- self.__db_writer : DatabaseWriter The database writer object Notes ----- The db_writer is read only """ return self.__db_writer def capture_new_data_from_run( self, processor_split: ProcessorSplit, restart: bool = False, force: bool = False, ) -> Optional[int]: """ Capture new data from a run. This function will capture all uncaptured data from a run. If all data has been captured previously, it means that the run has already been executed, and new_entry = False will be returned. 
Parameters ---------- processor_split : ProcessorSplit The processor split object restart : bool If True, the data will be captured (even if it has been executed before) force : bool Store entry to the run table even if a entry with the same parameter exists This will typically be used if the bout_runners is forcefully executing a run Returns ------- run_id : None or int If no previous run with the same configuration has been executed, this will return None, else the run_id is returned """ # Initiate the run_dict (will be filled with the ids) run_dict: Dict[str, Union[str, int, float, None]] = { "name": self.__bout_paths.bout_inp_dst_dir.name } # Update the parameters parameters_dict = self.__final_parameters.get_final_parameters() if restart: parameters_dict["global"]["restart"] = 1 run_dict["parameters_id"] = self._create_parameter_tables_entry( parameters_dict) # Update the file_modification file_modification_dict = get_file_modification( self.__bout_paths.project_path, self.__make.makefile_path, self.__make.exec_name, ) run_dict["file_modification_id"] = self.__db_reader.get_entry_id( "file_modification", file_modification_dict) if run_dict["file_modification_id"] is None: run_dict["file_modification_id"] = self.create_entry( "file_modification", file_modification_dict) # Update the split split_dict = { "number_of_processors": processor_split.number_of_processors, "number_of_nodes": processor_split.number_of_nodes, "processors_per_node": processor_split.processors_per_node, } run_dict["split_id"] = self.__db_reader.get_entry_id( "split", split_dict) if run_dict["split_id"] is None: run_dict["split_id"] = self.create_entry("split", split_dict) # Update the system info system_info_dict = get_system_info() run_dict["system_info_id"] = self.__db_reader.get_entry_id( "system_info", system_info_dict) if run_dict["system_info_id"] is None: run_dict["system_info_id"] = self.create_entry( "system_info", system_info_dict) # Update the run # NOTE: If restart is True, a new run_id will be given as the run_dict["name"] # will be unique run_id = self.__db_reader.get_entry_id("run", run_dict) if force or run_id is None: run_dict["latest_status"] = "submitted" run_dict["submitted_time"] = datetime.now().isoformat() _ = self.create_entry("run", run_dict) return run_id def create_entry( self, table_name: str, entries_dict: Mapping[str, Union[int, str, float, None]]) -> int: """ Create a database entry and return the entry id. Parameters ---------- table_name : str Name of the table entries_dict : dict Dictionary containing the entries as key value pairs Returns ------- entry_id : int The id of the newly created entry Raises ------ RuntimeError If the newly created id could not be fetched """ self.__db_writer.create_entry(table_name, entries_dict) entry_id = self.__db_reader.get_entry_id(table_name, entries_dict) if entry_id is None: raise RuntimeError("Could not fetch the newly created id") return entry_id def _create_parameter_tables_entry( self, parameters_dict: Dict[str, Dict[str, Union[int, str, float]]]) -> int: """ Insert the parameters into a the parameter tables. 
Parameters ---------- parameters_dict : dict The dictionary on the form >>> {'section': {'parameter': 'value'}} Returns ------- parameters_id : int The id key from the `parameters` table Notes ----- All `:` will be replaced by `_` in the section names """ parameters_foreign_keys = dict() parameter_sections = list(parameters_dict.keys()) for section in parameter_sections: # Replace bad characters for SQL section_name = section.replace(":", "_") section_parameters = parameters_dict[section] section_id = self.__db_reader.get_entry_id(section_name, section_parameters) if section_id is None: section_id = self.create_entry(section_name, section_parameters) parameters_foreign_keys[f"{section_name}_id"] = section_id # Update the parameters table parameters_id = self.__db_reader.get_entry_id("parameters", parameters_foreign_keys) if parameters_id is None: parameters_id = self.create_entry("parameters", parameters_foreign_keys) return parameters_id
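# A hedged sketch of the dictionary shape _create_parameter_tables_entry expects and
# of the section-name translation it performs. The section and parameter names below
# are made up for illustration; only the {'section': {'parameter': value}} structure
# and the ':' -> '_' replacement come from the docstring and code above.
example_parameters_dict = {
    "global": {"nout": 0, "restart": 0},
    "conduction:probes": {"n_probes": 2},
}
# The section 'conduction:probes' would be looked up in (and written to) a table
# named 'conduction_probes', and the collected foreign keys would take the form
# {'global_id': ..., 'conduction_probes_id': ...} before the 'parameters' table
# entry is created.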
def test_large_graph( make_project: Path, yield_number_of_rows_for_all_tables: Callable[[DatabaseReader], Dict[str, int]], clean_default_db_dir: Path, tear_down_restart_directories: Callable[[Path], None], ) -> None: """ Test that the graph with 10 nodes work as expected. The node setup can be found in node_functions.py Parameters ---------- make_project : Path The path to the conduction example yield_number_of_rows_for_all_tables : function Function which returns the number of rows for all tables in a schema clean_default_db_dir : Path Path to the default database directory tear_down_restart_directories : function Function used for removal of restart directories """ _ = clean_default_db_dir name = "test_large_graph" paths = dict() paths["project_path"] = make_project paths["pre_and_post_directory"] = paths["project_path"].joinpath( f"pre_and_post_{name}") paths["pre_and_post_directory"].mkdir() run_groups = dict() # RunGroup belonging to node 2 run_groups["run_group_2"] = make_run_group(name, make_project) run_graph = run_groups["run_group_2"].run_graph paths["bout_run_directory_node_2"] = run_groups[ "run_group_2"].bout_paths.bout_inp_dst_dir run_groups["run_group_2"].add_pre_processor({ "function": node_zero, "args": ( paths["bout_run_directory_node_2"], paths["pre_and_post_directory"], ), "kwargs": None, }) run_groups["run_group_2"].add_pre_processor({ "function": node_one, "args": ( paths["bout_run_directory_node_2"], paths["pre_and_post_directory"], ), "kwargs": None, }) run_groups["run_group_2"].add_post_processor({ "function": node_five, "args": ( paths["bout_run_directory_node_2"], paths["pre_and_post_directory"], ), "kwargs": None, }) tear_down_restart_directories(paths["bout_run_directory_node_2"]) # RunGroup belonging to node 3 run_groups["run_group_3"] = make_run_group( name, make_project, run_graph, restart_from=run_groups["run_group_2"].bout_paths.bout_inp_dst_dir, waiting_for=run_groups["run_group_2"].bout_run_node_name, ) # RunGroup belonging to node 4 run_groups["run_group_4"] = make_run_group(name, make_project, run_graph) paths["bout_run_directory_node_4"] = run_groups[ "run_group_4"].bout_paths.bout_inp_dst_dir # RunGroup belonging to node 6 run_groups["run_group_6"] = make_run_group( name, make_project, run_graph, restart_from=run_groups["run_group_2"].bout_paths.bout_inp_dst_dir, waiting_for=run_groups["run_group_2"].bout_run_node_name, ) paths["bout_run_directory_node_6"] = run_groups[ "run_group_6"].bout_paths.bout_inp_dst_dir node_8 = run_groups["run_group_6"].add_post_processor( { "function": node_eight, "args": ( paths["bout_run_directory_node_4"], paths["bout_run_directory_node_6"], paths["pre_and_post_directory"], ), "kwargs": None, }, waiting_for=run_groups["run_group_4"].bout_run_node_name, ) # RunGroup belonging to node 9 # NOTE: We need the paths['bout_run_directory_node_9'] as an input in node 7 # As node 9 is waiting for node 7 we hard-code the name # (as we will know what it will be) paths["bout_run_directory_node_9"] = paths["project_path"].joinpath( f"{name}_restart_2") # The function of node_seven belongs to RunGroup2, but takes # paths['bout_run_directory_node_9'] as an input node_7_name = run_groups["run_group_2"].add_post_processor({ "function": node_seven, "args": ( paths["bout_run_directory_node_2"], paths["bout_run_directory_node_9"], paths["pre_and_post_directory"], ), "kwargs": None, }) run_groups["run_group_9"] = make_run_group( name, make_project, run_graph, restart_from=run_groups["run_group_6"].bout_paths.bout_inp_dst_dir, waiting_for=( 
run_groups["run_group_4"].bout_run_node_name, run_groups["run_group_6"].bout_run_node_name, node_7_name, ), ) run_groups["run_group_9"].add_post_processor( { "function": node_ten, "args": ( paths["bout_run_directory_node_9"], paths["pre_and_post_directory"], ), "kwargs": None, }, waiting_for=node_8, ) # Run the project runner = BoutRunner(run_graph) runner.run() # Check that all the nodes have changed status with pytest.raises(RuntimeError): runner.run() # Check that all files are present # Check that the pre and post files are present for node in (0, 1, 5, 7, 8, 10): assert paths["pre_and_post_directory"].joinpath( f"{node}.txt").is_file() # Check that all the dump files are present for restart_str in ("", "_restart_0", "_restart_1", "_restart_2"): assert (paths["project_path"].joinpath( f"{name}{restart_str}").joinpath("BOUT.dmp.0.nc").is_file() or paths["project_path"].joinpath(f"{name}{restart_str}"). joinpath("BOUT.dmp.0.h5").is_file()) # NOTE: We will only have 4 runs as node 4 is a duplicate of node 2 and will # therefore be skipped number_of_runs = 4 assert_tables_have_expected_len( DatabaseReader(run_groups["run_group_2"].db_connector), yield_number_of_rows_for_all_tables, expected_run_number=number_of_runs, restarted=True, )
class StatusChecker:
    r"""
    Class to check and update the status of runs.

    Attributes
    ----------
    __db_connector : DatabaseConnector
        Connection to the database under consideration
    __db_reader : DatabaseReader
        Object to read the database with
    project_path : Path
        Path to the project

    Methods
    -------
    check_and_update_status()
        Check and update the status for the schema
    check_and_update_until_complete()
        Check and update the status until all runs are stopped
    __check_submitted(metadata_updater, submitted_to_check)
        Check the status of all runs which have status `submitted`
    __check_running(metadata_updater, running_to_check)
        Check the status of all runs which have status `running`
    __check_if_stopped(log_reader, metadata_updater)
        Check if a run has stopped
    check_if_running_or_errored(log_reader)
        Check if a run is still running or has errored

    Examples
    --------
    >>> from pathlib import Path
    >>> from bout_runners.database.database_connector import \
    ...     DatabaseConnector
    >>> db_connector = DatabaseConnector('name_of_db')
    >>> project_path = Path('path').joinpath('to', 'project')
    >>> status_checker = StatusChecker(db_connector, project_path)
    >>> status_checker.check_and_update_status()

    Any updates to the runs will be written to the database.
    Alternatively, one can run the program until all jobs have stopped by calling

    >>> status_checker.check_and_update_until_complete()
    """

    def __init__(
        self,
        db_connector: Optional[DatabaseConnector] = None,
        project_path: Optional[Union[Path, str]] = None,
    ) -> None:
        """
        Set connector, reader and a project path.

        Notes
        -----
        The StatusChecker instance only checks the runs belonging to the database
        schema accessed through the `db_connector`

        Parameters
        ----------
        db_connector : DatabaseConnector
            Connection to the database
        project_path : Path
            Path to the project (the root directory which usually contains the
            makefile and the executable)
        """
        self.__db_connector = (
            db_connector if db_connector is not None else DatabaseConnector()
        )
        self.__db_reader = DatabaseReader(self.__db_connector)
        self.__project_path = (
            Path(project_path) if project_path is not None else Path()
        )

    def check_and_update_status(self) -> None:
        """
        Check and update the status for the schema.

        Raises
        ------
        RuntimeError
            If the schema does not exist
        """
        # Check that the run table exists
        if not self.__db_reader.check_tables_created():
            logging.error(
                "No tables found in %s",
                self.__db_reader.db_connector.db_path,
            )
            message = "Can not check the status of schemas that does not exist"
            raise RuntimeError(message)

        # Create placeholder metadata_updater
        metadata_updater = MetadataUpdater(self.__db_connector, run_id=-1)

        # Check runs with status 'submitted'
        query = (
            "SELECT name, id AS run_id FROM run WHERE\n"
            "latest_status = 'submitted' OR\n"
            "latest_status = 'created'"
        )
        submitted_to_check = self.__db_reader.query(query)
        self.__check_submitted(metadata_updater, submitted_to_check)

        # Check runs with status 'running'
        query = 'SELECT name, id FROM run WHERE latest_status = "running"'
        running_to_check = self.__db_reader.query(query)
        self.__check_running(metadata_updater, running_to_check)

    @staticmethod
    def get_query_string_for_non_errored_runs() -> str:
        """
        Return the query string for non errored results.
Returns ------- str Query string for non errored results """ return ("SELECT name, id AS run_id FROM run WHERE\n" "latest_status = 'submitted' OR\n" "latest_status = 'created' OR\n" "latest_status = 'running'") def check_and_update_until_complete(self, seconds_between_update: int = 5 ) -> None: """ Check and update the status until all runs are stopped. Parameters ---------- seconds_between_update : int Number of seconds before a new status check is performed """ query = self.get_query_string_for_non_errored_runs() while len(self.__db_reader.query(query).index) != 0: self.check_and_update_status() time.sleep(seconds_between_update) def __check_submitted(self, metadata_updater: MetadataUpdater, submitted_to_check: DataFrame) -> None: """ Check the status of all runs which has status `submitted`. Parameters ---------- metadata_updater : MetadataUpdater Object which updates the database submitted_to_check : DataFrame DataFrame containing the `id` and `name` of the runs with status `submitted` Raises ------ RuntimeError In case log_reader.started() is True and log_reader.start_time is None """ for name, run_id in submitted_to_check.itertuples(index=False): metadata_updater.run_id = run_id log_path = self.__project_path.joinpath(name, "BOUT.log.0") if log_path.is_file(): log_reader = LogReader(log_path) if log_reader.started(): start_time = log_reader.start_time # Assert to prevent "Incompatible types in assignment" with Optional if start_time is None: raise RuntimeError( "log_reader.start_time is None although " "log_reader.started is True") metadata_updater.update_start_time(start_time) latest_status = self.__check_if_stopped( log_reader, metadata_updater) else: # No started time is found in the log latest_status = self.check_if_running_or_errored( log_reader) else: # No log file exists # NOTE: This means that the execution is either in a # queue or has failed the submission. # For now, we still consider this as submitted # This can maybe be decided by checking either the # pid or the status from the submitter latest_status = "submitted" metadata_updater.update_latest_status(latest_status) def __check_running(self, metadata_updater: MetadataUpdater, running_to_check: DataFrame) -> None: """ Check the status of all runs which has status `running`. Parameters ---------- metadata_updater : MetadataUpdater Object which updates the database running_to_check : DataFrame DataFrame containing the `id` and `name` of the runs with status `running` """ for name, run_id in running_to_check.itertuples(index=False): metadata_updater.run_id = run_id log_path = self.__project_path.joinpath(name, "BOUT.log.0") log_reader = LogReader(log_path) latest_status = self.check_if_running_or_errored(log_reader) metadata_updater.update_latest_status(latest_status) def __check_if_stopped(self, log_reader: LogReader, metadata_updater: MetadataUpdater) -> str: """ Check if a run has stopped. 
Parameters ---------- log_reader : LogReader The object which reads log files metadata_updater : MetadataUpdater Object which updates the database Returns ------- latest_status : str The latest status Raises ------ RuntimeError In case log_reader.ended() is True and log_reader.end_time is None """ if log_reader.ended(): end_time = log_reader.end_time # Assert to prevent "Incompatible types in assignment" with Optional if end_time is None: raise RuntimeError("log_reader.end_time is None although " "log_reader.ended() is True") metadata_updater.update_stop_time(end_time) latest_status = "complete" else: latest_status = self.check_if_running_or_errored(log_reader) return latest_status @staticmethod def check_if_running_or_errored(log_reader: LogReader) -> str: """ Check if a run is still running or has errored. Parameters ---------- log_reader : LogReader The object which reads log files Returns ------- latest_status : str The latest status """ pid = log_reader.pid if pid is None: latest_status = "created" elif psutil.pid_exists(pid): latest_status = "running" else: latest_status = "error" return latest_status
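# A minimal sketch of how check_if_running_or_errored maps the state of a log file
# to a status string. The log path below is made up for illustration; LogReader and
# the 'created', 'running' and 'error' statuses are taken from the code above.
log_reader = LogReader(Path("path", "to", "run_dir", "BOUT.log.0"))
status = StatusChecker.check_if_running_or_errored(log_reader)
# status is 'created' if no PID has been written to the log yet,
# 'running' if the PID exists on the system, and 'error' otherwise.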
def test_db_reader(
    make_test_database: Callable[[str], DatabaseConnector],
    write_to_split: Callable[[str], DatabaseConnector],
) -> None:
    """
    Test that we can read from the database.

    Specifically this tests that:
    1. We can make a query
    2. An empty database has not been populated
    3. A populated database has table entries
    4. We can extract the id for a given set of values which exist
    5. No id is returned if a given set of values is not found in the database
    6. We can get the latest row id which has been written to

    Parameters
    ----------
    make_test_database : function
        Function which returns the database connection
    write_to_split : function
        Function returning the database connection where `split` has been populated
    """
    empty_db_connector = make_test_database("empty_read_test")
    empty_db_reader = DatabaseReader(empty_db_connector)

    # Check that we can make a query
    table = empty_db_reader.query("SELECT 1+1 AS col")
    assert table.loc[0, "col"] == 2  # pylint: disable=no-member

    # Check that the tables have not been created in an empty db
    assert not empty_db_reader.check_tables_created()

    db_connector = write_to_split("read_test")
    db_reader = DatabaseReader(db_connector)

    # Check that tables exist
    assert db_reader.check_tables_created()

    # As write_to_split writes to the split table, we can get the
    # written values with the following query
    table = db_reader.query("SELECT * FROM split")
    entries_dict = table.to_dict(orient="records")[0]

    # Remove the 'id'
    entries_dict.pop("id")

    row_id = db_reader.get_entry_id("split", entries_dict)
    assert row_id == 1

    # Modify entries_dict so that get_entry_id returns None
    entries_dict[list(entries_dict.keys())[0]] += 1
    new_row_id = db_reader.get_entry_id("split", entries_dict)
    assert new_row_id is None

    # Assert that get_latest_row_id is working
    assert db_reader.get_latest_row_id() == 1
def test_status_checker(
    test_case: str,
    get_test_data_path: Path,
    get_test_db_copy: Callable[[str], DatabaseConnector],
    mock_pid_exists: Callable[[str], None],
    copy_test_case_log_file: Callable[[str], None],
) -> None:
    """
    Test the StatusChecker exhaustively (excluding raises and loop).

    Parameters
    ----------
    test_case : str
        Description of the test of the form

        >>> ('<log_file_present>_<pid_present_in_log>_'
        ...  '<started_time_present_in_log>_<ended_time_present_in_log>'
        ...  '_<whether_pid_exists>_<new_status>')

    get_test_data_path : Path
        Path to the test data
    get_test_db_copy : function
        Function which returns a database connector to the copy of the test database
    mock_pid_exists : function
        Function which sets up a monkeypatch for psutil.pid_exists
    copy_test_case_log_file : function
        Function which copies log files according to the test_case
    """
    project_path = get_test_data_path
    db_connector = get_test_db_copy(test_case)
    mock_pid_exists(test_case)
    copy_test_case_log_file(test_case)

    db_reader = DatabaseReader(db_connector)

    status_checker = StatusChecker(db_connector, project_path)
    status_checker.check_and_update_status()

    # Check that the correct status has been assigned to "running"
    # pylint: disable=no-member
    result = db_reader.query(
        "SELECT latest_status FROM run WHERE name = 'testdata_5'"
    ).loc[0, "latest_status"]
    assert result == "running"

    # Check that the correct status has been assigned to "submitted"
    expected = test_case.split("_")[-1]
    # pylint: disable=no-member
    result = db_reader.query(
        "SELECT latest_status FROM run WHERE name = 'testdata_6'"
    ).loc[0, "latest_status"]
    assert result == expected

    # Check that the correct start_time has been set
    if "not_started" not in test_case:
        expected = str(datetime(2020, 5, 1, 17, 7, 10))
        # pylint: disable=no-member
        result = db_reader.query(
            "SELECT start_time FROM run WHERE name = 'testdata_6'"
        ).loc[0, "start_time"]
        assert expected == result

    # Check that the correct end_time has been set
    if "not_ended" not in test_case and "complete" in test_case:
        expected = str(datetime(2020, 5, 1, 17, 7, 14))
        # pylint: disable=no-member
        result = db_reader.query(
            "SELECT stop_time FROM run WHERE name = 'testdata_6'"
        ).loc[0, "stop_time"]
        assert expected == result