def parse_dataset_section(self): """ Parse the dataset section of the toml file and instantiate the dataset Returns ------- Dict[str, Any] The keys are ``[train, valid, test]`` with values being the instantiations of the dataset mentioned in the toml filename """ dataset_section = self.doc.get("dataset") all_datasets = {} dataset_classname = dataset_section.get("class") if dataset_classname is None: raise TOMLConfigurationError( f"Dataset section needs to have a name and class section" ) args = dataset_section.get("args") for dataset_type in ["train", "valid", "test"]: try: dataset_cls = create_class( classname=dataset_classname, module_name=ClassNursery.class_nursery[dataset_classname], ) args["dataset_type"] = dataset_type args["filename"] = dataset_section[f"{dataset_type}_filename"] dataset = dataset_cls(**args) all_datasets[dataset_type] = dataset except ModuleNotFoundError: print( f"Module {ClassNursery.class_nursery[dataset_classname]} is not found" ) except AttributeError: print(f"Class {dataset_classname} is not found ") return all_datasets
def parse_dataset_section(self): """ Parse the dataset section of the toml file and instantiate the dataset Returns ------- DatasetManager The dataset manager for the experiment """ dataset_section = self.doc.get("dataset") if dataset_section is None: raise TOMLConfigurationError( f"{self.toml_filename} does not have a datasets section. Please " f"Provide a dataset section in your toml file" ) self.dataset_section = dataset_section dataset_classname = dataset_section.get("class") if dataset_classname is None: raise TOMLConfigurationError( f"Dataset section needs to have a name and class section" ) train_filename = dataset_section.get("train_filename") dev_filename = dataset_section.get("dev_filename") test_filename = dataset_section.get("test_filename") train_filename = self.data_dir.joinpath(train_filename) dev_filename = self.data_dir.joinpath(dev_filename) test_filename = self.data_dir.joinpath(test_filename) train_filename = str(train_filename) dev_filename = str(dev_filename) test_filename = str(test_filename) tokenizers = dataset_section.get("tokenizers") namespace_vocab_options = dataset_section.get("namespace_vocab_options") namespace_numericalizer_map = dataset_section.get("namespace_numericalizer_map") args = { "train_filename": train_filename, "dev_filename": dev_filename, "test_filename": test_filename, "tokenizers": tokenizers, "namespace_vocab_options": namespace_vocab_options, "namespace_numericalizer_map": namespace_numericalizer_map, } try: dataset_cls = create_class( classname=dataset_classname, module_name=ClassNursery.class_nursery[dataset_classname], ) dataset_manager = dataset_cls(**args) self.msg_printer.good(title=f"Finished Creating {dataset_classname}") return dataset_manager except ModuleNotFoundError: print( f"Module {ClassNursery.class_nursery[dataset_classname]} is not found" ) except AttributeError: print(f"Class {dataset_classname} is not found ")
def test_create_class_raises_class_not_found_error(self):
    with pytest.raises(AttributeError):
        create_class(classname="dummy", module_name=Engine.__module__)
def test_create_class_raises_module_not_found_error(self):
    with pytest.raises(ModuleNotFoundError):
        create_class("dummy", "dummy")
def test_create_class(self, classname, modulename):
    cls = create_class(classname, modulename)
    assert cls.__name__ == classname
    assert cls.__module__ == modulename
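# The three tests above pin down the contract of create_class: import the
# module, look up the class by name, and let ModuleNotFoundError /
# AttributeError propagate. A minimal sketch consistent with that contract
# (not necessarily the project's exact implementation):
import importlib


def create_class(classname: str, module_name: str) -> type:
    """Import ``module_name`` and return its attribute named ``classname``.

    Raises ``ModuleNotFoundError`` when the module cannot be imported and
    ``AttributeError`` when the class does not exist in the module, which is
    exactly what the tests above expect.
    """
    module = importlib.import_module(module_name)
    return getattr(module, classname)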
def parse_engine_section(self): """ Parses the engine section of the TOML file Returns ------- Engine Object of the Engine class """ engine_section = self.doc.get("engine") if engine_section is None: raise TOMLConfigurationError( f"{self.toml_filename} does not have an engine section" ) self.engine_section = engine_section engine_args = {} for key, value in engine_section.items(): if not isinstance(value, dict): engine_args[key] = value optimizer_section = engine_section.get("optimizer") if optimizer_section is None: optimizer_classname = "SGD" optimizer_module = ClassNursery.class_nursery[optimizer_classname] optimizer_args = {"lr": 1e-2} else: optimizer_classname = optimizer_section.get("class") optimizer_module = ClassNursery.class_nursery[optimizer_classname] optimizer_args = {} for arg_, value in optimizer_section.items(): if arg_ != "class": optimizer_args[arg_] = value optimizer_cls = create_class( module_name=optimizer_module, classname=optimizer_classname ) optimizer = optimizer_cls(params=self.model.parameters(), **optimizer_args) # patching optimizer engine_args["optimizer"] = optimizer metric_section = engine_section.get("metric") metric_classname = metric_section.get("class") # parse any other arguments for metric metric_args = {"datasets_manager": self.datasets_manager} for key, value in metric_section.items(): if key == "class": pass else: metric_args[key] = value metric_cls = create_class( module_name=ClassNursery.class_nursery[metric_classname], classname=metric_classname, ) train_metric = metric_cls(**metric_args) dev_metric = metric_cls(**metric_args) test_metric = metric_cls(**metric_args) engine_args["train_metric"] = train_metric engine_args["validation_metric"] = dev_metric engine_args["test_metric"] = test_metric engine_args["datasets_manager"] = self.datasets_manager engine_args["model"] = self.model engine_args["experiment_name"] = self.experiment_name engine_args["experiment_hyperparams"] = self.doc engine_module = ClassNursery.class_nursery["Engine"] engine_classname = "Engine" engine_cls = create_class(classname=engine_classname, module_name=engine_module) engine = engine_cls(**engine_args) return engine
def _instantiate_model_using_dag(self):
    """ This is a key method that instantiates the DAG in topological order.

    The DAG from the TOML model section has to be instantiated with the
    submodules of a module built before the parent module can be
    instantiated. This method does it using a topological sort.
    Topological sort is the sorting of the nodes of a DAG where, if there
    is an edge between two nodes u -> v, then u appears before v in the
    ordering. We do exactly this for SciWING: we instantiate the children
    nodes that are used by parent nodes before we instantiate the root
    node of the DAG, which represents the entire module.

    Returns
    -------
    nn.Module
        The instantiation of the root node
    """
    topo_order = nx.algorithms.topological_sort(self.model_dag)
    topo_order = list(reversed(list(topo_order)))
    root_nodename = topo_order[-1]

    for node_id in topo_order:
        node_data = self.model_dag.nodes[node_id]
        tag = node_data.get("tag", None)
        classname = node_data.pop("class", None)
        if classname is None:
            raise TOMLConfigurationError(
                "The class is missing for one of the components of your model. "
                "Have you added the class into the ClassNursery?"
            )

        class_args = copy.deepcopy(self.model_dag.nodes[node_id])

        # models accept a datasets manager by default
        # TODO: make this true for all the modules and models including embedders?
        if tag == "model" or tag == "embedder":
            class_args["datasets_manager"] = self.datasets_manager

        # "instantiated_class" holds the object that is instantiated for the node
        class_args.pop("instantiated_class", None)
        current_node_tag = class_args.pop("tag", None)

        # leaf node
        # we always assume that embedders are used at the lowest level
        # This is a reasonable assumption to make
        if not list(self.model_dag.successors(node_id)):
            cls_obj = create_class(
                classname=classname,
                module_name=ClassNursery.class_nursery[classname],
            )
            cls_obj = cls_obj(**class_args)
            self.model_dag.nodes[node_id]["instantiated_class"] = {
                "key": tag,
                "object": cls_obj,
            }

        # must have children that would have been instantiated
        else:
            successors = list(self.model_dag.successors(node_id))
            num_successors = len(successors)
            different_tags = set(
                self.model_dag.nodes[child]["tag"] for child in successors
            )
            num_different_tags = len(different_tags)

            # They are a list of embedders: they all have the same section
            # name but different classes
            if num_successors > 1 and num_different_tags == 1:
                # pass them through ConcatEmbedders
                embedders_ = []
                unique_tag = different_tags.pop()
                for successor in successors:
                    node_data = self.model_dag.nodes[successor]
                    embedder = node_data["instantiated_class"]["object"]
                    embedders_.append(embedder)
                embedder = ConcatEmbedders(embedders=embedders_)

                # instantiate the current node here
                class_args[unique_tag] = embedder
                cls_obj = create_class(
                    classname=classname,
                    module_name=ClassNursery.class_nursery[classname],
                )
                cls_obj = cls_obj(**class_args)
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": current_node_tag,
                    "object": cls_obj,
                }

            # otherwise use their tags separately as attributes of the parent node
            else:
                for successor in successors:
                    successor_node_data = self.model_dag.nodes[successor]
                    instantiated_class_data = successor_node_data[
                        "instantiated_class"
                    ]
                    successor_key = instantiated_class_data["key"]
                    successor_obj = instantiated_class_data["object"]
                    class_args[successor_key] = successor_obj

                cls_obj = create_class(
                    classname=classname,
                    module_name=ClassNursery.class_nursery[classname],
                )
                cls_obj = cls_obj(**class_args)
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": current_node_tag,
                    "object": cls_obj,
                }

    return self.model_dag.nodes[root_nodename]["instantiated_class"]["object"]
def parse_engine_section(self): """ Parses the engine section of the TOML file Returns ------- Engine Object of the Engine class """ engine_section = self.doc.get("engine") engine_args = {} for key, value in engine_section.items(): if not isinstance(value, dict): engine_args[key] = value optimizer_section = engine_section.get("optimizer") if optimizer_section is None: optimizer_classname = "SGD" optimizer_module = ClassNursery.class_nursery[optimizer_classname] optimizer_args = {"lr": 1e-2} else: optimizer_classname = optimizer_section.get("class") optimizer_module = ClassNursery.class_nursery[optimizer_classname] optimizer_args = {} for arg_, value in optimizer_section.items(): if arg_ != "class": optimizer_args[arg_] = value optimizer_cls = create_class( module_name=optimizer_module, classname=optimizer_classname ) optimizer = optimizer_cls(params=self.model.parameters(), **optimizer_args) # patching optimizer engine_args["optimizer"] = optimizer metric_section = engine_section.get("metric") metric_classname = metric_section.get("class") metric_args = {} for key, value in metric_section.items(): if key == "class": pass else: metric_args[key] = value metric_cls = create_class( module_name=ClassNursery.class_nursery[metric_classname], classname=metric_classname, ) metric = metric_cls(**metric_args) engine_args["metric"] = metric train_dataset = self.all_datasets["train"] valid_dataset = self.all_datasets["valid"] test_dataset = self.all_datasets["test"] engine_args["train_dataset"] = train_dataset engine_args["validation_dataset"] = valid_dataset engine_args["test_dataset"] = test_dataset engine_args["model"] = self.model engine_args["experiment_name"] = self.experiment_name engine_args["experiment_hyperparams"] = self.doc engine_module = ClassNursery.class_nursery["Engine"] engine_classname = "Engine" engine_cls = create_class(classname=engine_classname, module_name=engine_module) engine = engine_cls(**engine_args) return engine
def _instantiate_model_using_dag(self):
    """ This is a key method that instantiates the DAG in topological order.

    The DAG from the TOML model section has to be instantiated with the
    submodules of a module built before the parent module can be
    instantiated. This method does it using a topological sort.
    Topological sort is the sorting of the nodes of a DAG where, if there
    is an edge between two nodes u -> v, then u appears before v in the
    ordering. We do exactly this for SciWING: we instantiate the children
    nodes that are used by parent nodes before we instantiate the root
    node of the DAG, which represents the entire module.

    Returns
    -------
    nn.Module
        The instantiation of the root node
    """
    topo_order = nx.algorithms.topological_sort(self.model_dag)
    topo_order = list(reversed(list(topo_order)))
    root_nodename = topo_order[-1]

    for node_id in topo_order:
        node_data = self.model_dag.nodes[node_id]
        tag = node_data.get("tag", None)
        classname = node_data.pop("class", None)
        if classname is None:
            raise TOMLConfigurationError(
                "The class is missing for one of the components of your model"
            )

        class_args = copy.deepcopy(self.model_dag.nodes[node_id])
        class_args.pop("instantiated_class", None)
        current_node_tag = class_args.pop("tag", None)

        # leaf node
        # we always assume that a vanilla embedder is used at the lowest level
        # This is a reasonable assumption to make
        if not list(self.model_dag.successors(node_id)):
            if node_data.get("embed") == "word_vocab":
                embedding = self.all_datasets["train"].word_vocab.load_embedding()
                embedding_dim = self.all_datasets[
                    "train"
                ].word_vocab.embedding_dimension
                freeze = node_data.get("freeze", False)
                embedding = nn.Embedding.from_pretrained(embedding, freeze=freeze)
                embedder = VanillaEmbedder(
                    embedding_dim=embedding_dim, embedding=embedding
                )
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": tag,
                    "object": embedder,
                }
            elif node_data.get("embed") == "char_vocab":
                embedding = self.all_datasets["train"].char_vocab.load_embedding()
                embedding_dim = self.all_datasets[
                    "train"
                ].char_vocab.embedding_dimension
                freeze = node_data.get("freeze", False)
                embedding = nn.Embedding.from_pretrained(embedding, freeze=freeze)
                embedder = VanillaEmbedder(
                    embedding_dim=embedding_dim, embedding=embedding
                )
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": tag,
                    "object": embedder,
                }
            else:
                cls_obj = create_class(
                    classname=classname,
                    module_name=ClassNursery.class_nursery[classname],
                )
                cls_obj = cls_obj(**class_args)
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": tag,
                    "object": cls_obj,
                }

        # must have children that would have been instantiated
        else:
            successors = list(self.model_dag.successors(node_id))
            num_successors = len(successors)
            different_tags = set(
                self.model_dag.nodes[child]["tag"] for child in successors
            )
            num_different_tags = len(different_tags)

            if num_successors > 1 and num_different_tags == 1:
                # pass them through ConcatEmbedders
                embedders = []
                unique_tag = different_tags.pop()
                for successor in successors:
                    node_data = self.model_dag.nodes[successor]
                    embedder = node_data["instantiated_class"]["object"]
                    embedders.append(embedder)
                embedder = ConcatEmbedders(embedders=embedders)

                # instantiate the current node here
                class_args[unique_tag] = embedder
                cls_obj = create_class(
                    classname=classname,
                    module_name=ClassNursery.class_nursery[classname],
                )
                cls_obj = cls_obj(**class_args)
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": current_node_tag,
                    "object": cls_obj,
                }

            # otherwise use their tags separately as attributes of the parent node
            else:
                for successor in successors:
                    successor_node_data = self.model_dag.nodes[successor]
                    instantiated_class_data = successor_node_data[
                        "instantiated_class"
                    ]
                    successor_key = instantiated_class_data["key"]
                    successor_obj = instantiated_class_data["object"]
                    class_args[successor_key] = successor_obj

                cls_obj = create_class(
                    classname=classname,
                    module_name=ClassNursery.class_nursery[classname],
                )
                cls_obj = cls_obj(**class_args)
                self.model_dag.nodes[node_id]["instantiated_class"] = {
                    "key": current_node_tag,
                    "object": cls_obj,
                }

    return self.model_dag.nodes[root_nodename]["instantiated_class"]["object"]
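# The word_vocab / char_vocab branches above rely on nn.Embedding.from_pretrained.
# A tiny standalone sketch of that call with a made-up embedding matrix; the
# vocabulary size and dimensionality are arbitrary and only illustrate the
# freeze behaviour read from the TOML model section.
import torch
import torch.nn as nn

pretrained = torch.randn(4, 3)  # stands in for word_vocab.load_embedding()

# freeze=True keeps the pretrained vectors fixed during training
embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

token_indices = torch.tensor([0, 2, 3])
vectors = embedding(token_indices)
assert vectors.shape == (3, 3)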