def persistManifest(self, parent_trace, manifest_dict):
    '''
    Persists `manifest_dict` as a YAML file under `self.output_manifests_dir` and returns a
    ManifestHandle that uniquely identifies it.

    The filename is "<test_case_name>.<kind>[.<version>].yaml" — the version suffix is appended
    only when the manifest carries a non-blank version.

    @param parent_trace A FunctionalTrace for error/tracing context.
    @param manifest_dict A dictionary representing an Apodeixi manifest. Must contain a 'kind' key.
    @returns A ManifestHandle inferred from `manifest_dict`.
    '''
    kind = manifest_dict['kind']
    suffix = ''
    version = ManifestUtils().get_manifest_version(parent_trace, manifest_dict)
    # Only append a version suffix when the manifest has a meaningful (non-blank) version.
    # Fixed: use `is not None` instead of `!= None` per PEP 8.
    if version is not None and len(str(version).strip()) > 0:
        suffix = '.' + str(version)
    manifest_file = self.test_case_name + "." + kind + suffix + ".yaml"

    my_trace = parent_trace.doing("Persisting manifest",
                                  data={'manifests_dir': self.output_manifests_dir,
                                        'manifest_file': manifest_file},
                                  origination={'concrete class': str(self.__class__.__name__),
                                               'signaled_from': __file__})
    # Fixed: removed a dead `if True:` wrapper that served no purpose.
    YAML_Utils().save(my_trace,
                      data_dict=manifest_dict,
                      path=self.output_manifests_dir + '/' + manifest_file)
    handle = ManifestUtils().inferHandle(my_trace, manifest_dict)
    return handle
def _prepare_yaml_comparison(self, parent_trace, output_dict, test_output_name, output_data_dir,
                             expected_data_dir, save_output_dict=False):
    '''
    Helper method that does most of the heavy lifting when we seek to compare yaml-based expected
    output. On success, this returns a dictionary corresponding to the expected output, that the
    caller can then compare with the parameter `output_dict`.

    The motivation for this method's existence is that there are different ways of comparing yaml
    files. For example:

    * A "pure" comparison, done by method `_compare_to_expected_yaml`
    * A "tolerance-based" comparison, done by method `_compare_yaml_within_tolerance`. This is used,
      for example, to validate expected output where the contents of file systems is displayed.
      In such cases, generated Excel files may display a size that differs by 1 or 2 bytes because
      of the non-determinism involved in creating Excel files, since "xlsx" files are really zip
      files with XML contents, and it is well known that zip files are created non-deterministically
      (for example, see
      https://medium.com/@pat_wilson/building-deterministic-zip-files-with-built-in-commands-741275116a19).
      Sometimes simply running Apodeixi in a different deployment or machine will cause generated
      Excel files to change in size by 19 bytes or more. The tolerance level can be configured in
      test_config.yaml, a file located under the root folder of the testing database configured in
      ApodeixiConfig (i.e., the folder above the knowledge base's root folder configured in
      ApodeixiConfig).

    @param output_dict The actual output produced by the test, as a dictionary. Must not be None.
    @param test_output_name Stem used to name both the saved output and the expected-output file.
    @param output_data_dir Directory to which to save output (created if needed).
    @param expected_data_dir Directory from which to load `<test_output_name>_EXPECTED.yaml`.
    @param save_output_dict If True, persists `output_dict` as `<test_output_name>_OUTPUT.yaml`.
    '''
    # Guard against None: the downstream steps would "gracefully do nothing" and give the false
    # impression that the test passes (at least it would erroneously pass when the expected output
    # is set to an empty file)
    self.assertIsNotNone(output_dict)

    PathUtils().create_path_if_needed(parent_trace=parent_trace, path=output_data_dir)

    # Optionally persist the actual output, which can be copied to become the expected output
    if save_output_dict:
        output_path = output_data_dir + '/' + test_output_name + '_OUTPUT.yaml'
        YAML_Utils().save(parent_trace, data_dict=output_dict, path=output_path)

    # Load and return the previously saved expected output
    expected_path = expected_data_dir + '/' + test_output_name + '_EXPECTED.yaml'
    return YAML_Utils().load(parent_trace, path=expected_path)
def _compare_to_expected_yaml(self, parent_trace, output_dict, test_output_name, output_data_dir,
                              expected_data_dir, save_output_dict=False):
    '''
    Utility method for derived classes that create YAML files and need to check they match an
    expected output previously saved as a YAML file as well.

    It also (optionally) saves the output as a yaml file, which can be copied to be the expected
    output when a test case is created.

    @param output_data_dir Directory to which to save any output.
    @param expected_data_dir Directory from which to retrieve any previously saved expected output.
    '''
    expected_dict = self._prepare_yaml_comparison(parent_trace=parent_trace,
                                                  output_dict=output_dict,
                                                  test_output_name=test_output_name,
                                                  output_data_dir=output_data_dir,
                                                  expected_data_dir=expected_data_dir,
                                                  save_output_dict=save_output_dict)

    # Compare as normalized YAML strings rather than raw dicts, so that differences surface as
    # readable textual diffs in the assertion failure message
    actual_as_yaml = YAML_Utils().dict_to_yaml_string(parent_trace, data_dict=output_dict)
    expected_as_yaml = YAML_Utils().dict_to_yaml_string(parent_trace, data_dict=expected_dict)
    self.assertEqual(actual_as_yaml, expected_as_yaml)
def _getMatchingManifests(self, parent_trace, folder, manifest_handle):
    '''
    Returns two lists of the same length:

    * A list of dictionaries, one per manifest that matches the given manifest handle
    * A list of filenames, which is where each of those manifests was retrieved from

    The search is done over the space of objects in the store that lie "at or below the folder",
    where the notion of "folder" depends on the concrete store class. For filesystem-based stores,
    "folder" would literally be a directory of some filesystem mount.

    @param folder A string scoping a subset of the store
    @param manifest_handle A ManifestHandle instance that (should) uniquely identify a single
                    manifest in the store
    '''
    matching_manifests = [] # List of dictionaries, one per manifest
    matching_filenames = [ ] # List of filename strings. Will be 1-1 lined up with matching_manifests

    # Load every candidate file in the folder and keep those whose inferred handle equals the
    # handle we are searching for
    for filename in self._getFilenames(parent_trace, folder):
        my_trace = parent_trace.doing("Loading manifest from file",
                                      data={'filename': filename,
                                            'folder': folder},
                                      origination={'concrete class': str(self.__class__.__name__),
                                                   'signaled_from': __file__})
        manifest_dict = YAML_Utils().load(my_trace, path=folder + '/' + filename)
        # Matching is by handle equality, not by filename convention
        inferred_handle = ManifestUtils().inferHandle(my_trace, manifest_dict)
        if inferred_handle == manifest_handle:
            matching_filenames.append(filename)
            matching_manifests.append(manifest_dict)

    return matching_manifests, matching_filenames
def save_environment_metadata(self, parent_trace):
    '''
    Creates and saves a YAML file called "METADATA.yaml" in the root folder for self.

    It is sufficient information from which to re-create the environment (for example, if it was
    created in a different Python process, so this Python process wouldn't have an in-memory object
    for it unless it loads it, leveraging the "METADATA.yaml" file).

    This can happen when the CLI creates a sandbox that will later be used by subsequent commands.
    Since each CLI invocation is its own Python process, different invocations can only share the
    same sandbox environment if there is a way to persist and then load the state of an environment.
    '''
    ME = File_KBEnv_Impl
    METADATA_FILENAME = "METADATA.yaml"
    # Capture everything needed to reconstruct this environment in another process
    metadata_dict = {}
    metadata_dict['name'] = self.name(parent_trace)
    metadata_dict['parent'] = self.parent(parent_trace).name(parent_trace)
    metadata_dict['postingsURL'] = self.postingsURL(parent_trace)
    metadata_dict['manifestsURL'] = self.manifestsURL(parent_trace)
    metadata_dict['clientURL'] = self.clientURL(parent_trace)

    # The environment's config is persisted as a nested dictionary.
    # NOTE(review): path_mask is not persisted here — loaders must supply it separately.
    config = self.config(parent_trace)
    config_dict = {}
    config_dict['read_misses_policy'] = config.read_misses_policy
    config_dict['use_timestamps'] = config.use_timestamps
    metadata_dict['config'] = config_dict

    # The base environment lives directly above the manifests URL; any other environment lives
    # under the shared ENVS_FOLDER, in a subfolder named after the environment
    if self == self._store.base_environment(parent_trace):
        environment_dir = _os.path.dirname(self._store.base_environment(parent_trace). \
                                           manifestsURL(parent_trace))
    else:
        root_dir = _os.path.dirname(self._store.base_environment(parent_trace). \
                                    manifestsURL(parent_trace))
        envs_dir = root_dir + "/" + ME.ENVS_FOLDER
        environment_dir = envs_dir + "/" + self.name(parent_trace)

    PathUtils().create_path_if_needed(parent_trace, environment_dir)

    YAML_Utils().save(parent_trace,
                      data_dict=metadata_dict,
                      path=environment_dir + "/" + METADATA_FILENAME)
def yaml_2_df(self, parent_trace, manifests_folder, manifests_file, contents_path, sparse,
              abbreviate_uids):
    '''
    Loads a YAML file for an Apodeixi manifest, and returns a Pandas DataFrame for the data
    contents and a dictionary for all other fields. The return value is the tuple `(df, subtree)`.

    @param contents_path A string using 'dot notation' to convey a path in a dictionary.
                    For example, for a dictionary like

                    {a: {b: 5, c: 6,
                         streams: {W1: {UID: S1.W1, cost: 4, name: 'requirements gathering'},
                                   W2: {UID: S1.W2, cost: 5, name: 'design'}},
                         g: 23}}

                    a contents_path of `a.streams` denotes the sub-tree

                    {W1: {UID: S1.W1, cost: 4, name: 'requirements gathering'},
                     W2: {UID: S1.W2, cost: 5, name: 'design'}}

                    which is turned into a DataFrame like

                    df =    | UID   | streams                | cost |
                            -----------------------------------------
                            | S1.W1 | requirements gathering |  4   |
                            | S1.W2 | design                 |  5   |

                    The remaining subtree is also computed, in this example:

                    {a: {b: 5, c: 6, g: 23}}

    @param sparse A boolean. If True, returns a "sparse" representation suitable for Excel
                    rendering, with exactly 1 UID per row (helpful when making joins). If False, a
                    "full" representation is returned, more suitable for data analysis in Pandas.
                    For examples and details, refer to the documentation for `self.dict_2_df`.
    @param abbreviate_uids A boolean. If True, UIDs will only keep the top acronym. For example,
                    a UID like "BR2.MR2.SM4" in the manifest would be transformed to "BR2.2.4" in
                    the DataFrame returned by this method.
    '''
    full_path = manifests_folder + '/' + manifests_file

    loading_trace = parent_trace.doing('Loading YAML Manifest', data={'path': full_path})
    loaded_dict = YAML_Utils().load(loading_trace, path=full_path)

    # Separate the contents sub-tree (addressed by the dot-notation path) from everything else
    tokens = contents_path.split('.')
    splitting_trace = parent_trace.doing('Splitting manifest', data={'path_tokens': tokens})
    content_dict, non_content_dict = self._split_out(splitting_trace, loaded_dict, tokens)

    # Only the DataFrame is returned to the caller; the UID info list is discarded here
    df, _uid_info_list = self.dict_2_df(parent_trace, content_dict, contents_path, sparse,
                                        abbreviate_uids)
    return df, non_content_dict
def find_child_environment_from_metadata(self, parent_trace, child_env_name):
    '''
    Attempts to instantiate an immediate child environment based on its metadata. It must be an
    immediate child of self. This method deliberately only looks for an immediate child.

    However, before attempting to instantiate such an environment it checks if an environment with
    that name already exists in memory. If so, it defers to the in-memory object and does not load
    the metadata, returning the in-memory object instead.

    If no metadata for such an environment exists, or if it does but is not for an immediate child
    of self, then this method will return None.

    NOTE: It is important to return None (as opposed to raising an ApodeixiError) because this
    method will likely be called in a recursive search by self.findSubEnvironment, and it is
    "normal" in such a recursion to attempt calling this method from different "self" in the
    environment hierarchy. So for most but one of them, the child_env_name is not for a real child,
    so it is OK to return None. It would be erroneous to raise an ApodeixiError since that would
    cause the recursion of self.findSubEnvironment to abort before it has a chance to get to the
    real parent of `child_env_name`.
    '''
    # First, determine if the environment exists in memory.
    # Fixed: use `is not None` instead of `!= None` per PEP 8.
    child_env = self.findSubEnvironmentInMemory(parent_trace, child_env_name)
    if child_env is not None:
        return child_env

    ME = File_KBEnv_Impl
    my_trace = parent_trace.doing("Retrieving metadata for child environment",
                                  data={'child_env_name': child_env_name})
    METADATA_FILENAME = "METADATA.yaml"
    # Child environments live under <root>/<ENVS_FOLDER>/<child_env_name>
    root_dir = _os.path.dirname(self._store.base_environment(parent_trace).manifestsURL(
        parent_trace))
    envs_dir = root_dir + "/" + ME.ENVS_FOLDER
    environment_dir = envs_dir + "/" + child_env_name
    metadata_path = environment_dir + "/" + METADATA_FILENAME
    if not _os.path.exists(metadata_path):
        return None

    metadata_dict = YAML_Utils().load(my_trace, path=metadata_path)
    # Only instantiate environments that are immediate children of self
    if self.name(my_trace) != metadata_dict['parent']:
        return None

    my_trace = parent_trace.doing("Instantiating child environment from metadata",
                                  data={'child_env_name': child_env_name,
                                        'metadata': metadata_dict})
    config_dict = metadata_dict["config"]
    child_env_config = KB_Environment_Config(
        parent_trace=my_trace,
        read_misses_policy=config_dict["read_misses_policy"],
        use_timestamps=config_dict["use_timestamps"],
        path_mask=None,  # This was not persisted
    )
    # GOTCHA: When constructing the child_env_impl, we must give a parent_environment that is
    # of type KB_Environment.
    #
    # However, here we are an implementation class, so not derived from KB_Environment, so will need
    # first to find our wrapping KB_Environment object for which self is the impl
    base_environment = self._store.base_environment(my_trace)
    our_name = self.name(my_trace)
    if our_name == base_environment.name(my_trace):
        our_env = base_environment
    else:
        our_env = base_environment.findSubEnvironment(parent_trace=my_trace, name=our_name)

    # NOTE(review): 'manifests_roodir' appears misspelled, but it must match the parameter name
    # declared by the File_KBEnv_Impl constructor — confirm there before renaming.
    child_env_impl = File_KBEnv_Impl(parent_trace=my_trace,
                                     name=child_env_name,
                                     store=self._store,
                                     parent_environment=our_env,
                                     config=child_env_config,
                                     postings_rootdir=metadata_dict["postingsURL"],
                                     manifests_roodir=metadata_dict["manifestsURL"],
                                     clientURL=metadata_dict["clientURL"])
    child_env = KB_Environment(parent_trace=my_trace, impl=child_env_impl)
    # Cache the re-created child so subsequent lookups hit the in-memory fast path above
    self._children[child_env_name] = child_env
    return child_env
def overwrite_test_context(self, parent_trace):
    '''
    This is a "trick" method needed so that CLI invocations run in the environment isolated for
    this test case (or its children), as opposed to on the base environment.

    It accomplishes this by "fooling" the CLI into thinking that "base environment" is actually the
    environment isolated for this test case. It does so by overwriting the value of the
    self.CONFIG_DIRECTORY() environment variable, but what is tricky is:

    * By the time this method is called, this class no longer needs the self.CONFIG_DIRECTORY()
      environment variable, since it was used in super().setUp() to initialize self.a6i_config and
      other properties, and that is as it should be.
    * Therefore, the modification in this method to self.CONFIG_DIRECTORY() is not going to impact
      this test object. Instead, it will impact other objects that use it. There is no such object
      in Apodeixi itself, but there is one in the CLI: the KB_Session class.
    * The intent is then for the KB_Session class to initialize its notion of self.a6i_config
      differently, so that it is "fooled" into thinking that the "base environment" is this test
      case's isolated environment.
    * Each time the CLI is invoked, it constructs a KB_Session to initialize the
      KnowledgeBaseStore. Thus the CLI will be using a store pointing to this test case's isolated
      environment. This is different than for non-CLI tests, for whom the store points to the test
      knowledge base common to the Apodeixi test suite.

    NOTE: statement order matters throughout this method — see the inline comments.
    '''
    # Before changing context, create the environment for this test, which will later become the
    # "fake base environment" when we switch context. But this uses the "original" store, so must
    # be done before we switch context, so we must select the stack here (and later we re-select
    # it when switching context)
    self.selectStack(parent_trace)
    self.provisionIsolatedEnvironment(parent_trace)

    # Remember original config before it is overwritten when we change context
    original_a6i_config = self.a6i_config

    # In case it is ever needed, remember this test suite's value for the environment variable
    self.config_directory_for_this_test_object = _os.environ.get(self.CONFIG_DIRECTORY())

    # OK, we start the context switch here.
    # For this test case, we want the CLI to use a config file that is in the input folder
    _os.environ[self.CONFIG_DIRECTORY()] = self.input_data + "/" + self.scenario()

    # Each CLI test has a dedicated folder containing the test environment for it, i.e., an entire
    # test database (knowledge base folder, collaboration area folder) just for that test.
    # These folders are referenced in a test-specific apodeixi_config.toml.
    # In order to make the folders thus referenced not depend on the installation folder for the
    # Apodeixi test database, we introduce the environment variable ${TEST_DB_DIR} that should be
    # used in all these CLI-test-specific apodeixi_config.toml.
    #
    # Example: for the CLI test for subproducts with id #1011, in the folder
    # ...../test_db/input_data/1011/cli.subproducts, the apodeixi_config.toml should have a line like
    #
    #   knowledge-base-root-folder = "${TEST_DB_DIR}/knowledge-base/envs/1011_ENV/kb"
    #
    # instead of
    #
    #   knowledge-base-root-folder = "C:/Users/aleja/Documents/Code/chateauclaudia-labs/apodeixi/test_db/knowledge-base/envs/1011_ENV/kb"
    #
    # Such practice ensures that the test harness continues to work no matter where it is installed
    # (for example, in a Docker container).
    #
    # To make this approach work, we hereby set that environment variable, whose value will be
    # consulted by the ApodeixiConfig constructor when we do the context switch a few lines
    # further below
    _os.environ[self.TEST_DB_DIR] = self.test_db_dir

    # Now overwrite parent's notion of self.a6i_config and of the self.test_config_dict.
    # This must happen AFTER the environment variables above are set, since ApodeixiConfig reads them.
    self.a6i_config = ApodeixiConfig(parent_trace)

    self.selectStack(
        parent_trace
    )  # Re-creates the store for this test with the "fake" base environment

    # Set again the location of the test directory as per the original a6i config. We need it to
    # mask non-deterministic paths
    self.a6i_config.test_db_dir = original_a6i_config.test_db_dir

    # Next time an environment is provisioned for this test, use this overwritten config for the
    # name of the folder
    self.test_config_dict = YAML_Utils().load(
        parent_trace,
        path=self.input_data + "/" + self.scenario() + '/test_config.yaml')
def setUp(self):
    super().setUp()

    # Both of these will be set later by method selectTestDataLocation.
    # Integration test cases must call self.selectTestDataLocation(-) to set self.results_data
    # and self.input_data. This is required for 2 reasons:
    #   1. Ease of management - all integration tests are registered in test_config.yaml and
    #      their output is externalized from the Apodeixi code base
    #   2. Reduce the size of folder structure by placing output in a less nested directory
    #      structure (i.e., not under the code of the tests themselves). This is needed in Windows
    #      to avoid issues with long paths that impede file persistence and/or impede committing
    #      to GIT. As an added benefit, I noticed it improves test performance by 50% to use
    #      shorter paths.
    self.input_data = None
    self.results_data = None

    # To support selectTestDataLocation(-) later on, work out where the test database lives and
    # load its test_config.yaml into self.test_config_dict
    trace = FunctionalTrace(parent_trace=None, path_mask=self._path_mask).doing(
        "Checking where results should be saved to",
        origination={'signaled_from': __file__})
    self.test_db_dir = _os.path.dirname(self.a6i_config.get_KB_RootFolder(trace))
    self.test_config_dict = YAML_Utils().load(trace,
                                              path=self.test_db_dir + '/test_config.yaml')

    # Remember location of test_db in ApodeixiConfig. This is consulted by the masking function
    # that hides parts of paths from regression output, to avoid non-deterministic test output.
    # Outside the regression suite this flag plays no role in Apodeixi.
    self.a6i_config.test_db_dir = self.test_db_dir

    trace = FunctionalTrace(parent_trace=None, path_mask=self._path_mask).doing(
        "Provisioning stack for integration test",
        origination={'signaled_from': __file__})

    # Set by each individual test case (i.e., each method in a derived class named "test_*")
    self._current_test_name = None

    # For ease of maintenance of tests, each output for a test is named using standard numbering
    # enforced by the "next_*" functions
    self._output_nb = 0

    # As a general pattern, we only enforce referential integrity tests in "flow" tests, which is
    # the more "realistic" flavor of integration tests
    self.a6i_config.enforce_referential_integrity = False

    # Log output files like "POST_EVENT_LOG.txt" are normally masked in test output, so we want
    # them to match expected output to the byte when showing environment contents.
    # *HOWEVER*, CLI tests don't mask their contents (to be "more realistic", and because CLI test
    # output doesn't show the contents of such log files). So to ensure CLI tests don't frivolously
    # fail when the test_db is relocated, derived classes (such as CLI tests) may set this to True,
    # making the test case accept whatever byte size is displayed for log files when displaying
    # environment contents.
    self.ignore_log_files_byte_size = False