def __init__(self, task_graph=None, input_module=None, seed=None, **kwargs):
    defaults = recursive_merge_dicts(
        em_default_config, mt_em_default_config, misses='insert'
    )
    self.config = recursive_merge_dicts(defaults, kwargs)

    # If no task_graph is specified, default to a single binary task
    if task_graph is None:
        task_graph = TaskHierarchy(edges=[], cardinalities=[2])
    self.task_graph = task_graph
    self.K_t = self.task_graph.K_t  # Cardinalities by task
    self.T = self.task_graph.T  # Total number of tasks

    MTClassifier.__init__(self, cardinalities=self.K_t, seed=seed)

    if input_module is None:
        input_module = IdentityModule(self.config['layer_output_dims'][0])

    self._build(input_module)

    # Show network
    if self.config['verbose']:
        print("\nNetwork architecture:")
        self._print()
        print()

def __init__(
    self,
    layer_out_dims,
    input_modules=None,
    middle_modules=None,
    head_modules=None,
    K=[],
    task_graph=None,
    **kwargs,
):
    kwargs["layer_out_dims"] = layer_out_dims
    config = recursive_merge_dicts(
        em_default_config, mt_em_default_config, misses="insert"
    )
    config = recursive_merge_dicts(config, kwargs)
    MTClassifier.__init__(self, K, config)

    if task_graph is None:
        if K is None:
            raise ValueError(
                "You must supply either a list of cardinalities (K) "
                "or a TaskGraph."
            )
        task_graph = TaskGraph(K)
    self.task_graph = task_graph
    self.K = self.task_graph.K  # Cardinalities by task
    self.t = self.task_graph.t  # Total number of tasks
    assert len(self.K) == self.t

    self._build(input_modules, middle_modules, head_modules)

    # Show network
    if self.config["verbose"]:
        print("\nNetwork architecture:")
        self._print()
        print()

def test_recursive_merge_dicts(self):
    x = {"foo": {"Foo": {"FOO": 1}}, "bar": 2, "baz": 3}
    y = {"FOO": 4, "bar": 5}
    z = {"foo": 6}
    w = recursive_merge_dicts(x, y, verbose=False)
    self.assertEqual(w["bar"], 5)
    self.assertEqual(w["foo"]["Foo"]["FOO"], 4)
    with self.assertRaises(ValueError):
        recursive_merge_dicts(x, z, verbose=False)

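# The assertions above pin down the merge semantics of recursive_merge_dicts:
# a key in the second dict may update a matching key at any depth of the first,
# and attempting to overwrite a nested dict with a non-dict raises ValueError.
# The sketch below is illustration only -- the name `merge_sketch` and its
# `misses` handling are assumptions, not the library's actual implementation.
import copy


def merge_sketch(x, y, misses="ignore"):
    """Merge y into a deep copy of x, recursing into nested dicts."""

    def contains(d, key):
        # True if `key` appears at any depth of dict `d`
        if key in d:
            return True
        return any(isinstance(v, dict) and contains(v, key) for v in d.values())

    def recurse(dst, src):
        for k, v in src.items():
            if k in dst:
                if isinstance(dst[k], dict):
                    if not isinstance(v, dict):
                        raise ValueError(f"Cannot overwrite dict '{k}' with non-dict {v!r}")
                    recurse(dst[k], v)
                else:
                    dst[k] = v
            else:
                # Key not at this level: look for it inside nested dicts
                nested = next(
                    (d for d in dst.values() if isinstance(d, dict) and contains(d, k)),
                    None,
                )
                if nested is not None:
                    recurse(nested, {k: v})
                elif misses == "insert":
                    dst[k] = v

    out = copy.deepcopy(x)
    recurse(out, y)
    return out


# Reproduces the behavior the test checks: "bar" is overwritten at the top level,
# and "FOO" is found and updated two levels deep.
w = merge_sketch({"foo": {"Foo": {"FOO": 1}}, "bar": 2, "baz": 3}, {"FOO": 4, "bar": 5})
assert w == {"foo": {"Foo": {"FOO": 4}}, "bar": 5, "baz": 3}
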
def __init__(self, **kwargs):
    self.config = recursive_merge_dicts(trainer_defaults, kwargs, misses="insert")

    # Set random seeds
    if self.config["seed"] is None:
        self.config["seed"] = np.random.randint(1e6)
    set_seed(self.config["seed"])

def __init__(
    self,
    layer_out_dims,
    input_module=None,
    middle_modules=None,
    head_module=None,
    **kwargs,
):
    # Use .get() so the check does not raise KeyError when skip_head is not passed
    if len(layer_out_dims) < 2 and not kwargs.get("skip_head", False):
        raise ValueError(
            "Arg layer_out_dims must have at least two elements "
            "corresponding to the output dim of the input module "
            "and the cardinality of the task. If the input module is the "
            "IdentityModule, then the output dim of the input module will "
            "be equal to the dimensionality of your input data points"
        )

    # Add layer_out_dims to kwargs so it will be merged into the config dict
    kwargs["layer_out_dims"] = layer_out_dims
    config = recursive_merge_dicts(em_default_config, kwargs, misses="insert")
    super().__init__(k=layer_out_dims[-1], config=config)
    self._build(input_module, middle_modules, head_module)

    # Show network
    if self.config["verbose"]:
        print("\nNetwork architecture:")
        self._print()
        print()

def __init__(self, input_dim, output_dim=2, **kwargs):
    layer_out_dims = [input_dim, output_dim]
    overrides = {"input_batchnorm": False, "input_dropout": 0.0}
    kwargs = recursive_merge_dicts(kwargs, overrides, misses="insert", verbose=False)
    super().__init__(layer_out_dims, **kwargs)

def train_model(self, train_data, valid_data=None, log_writer=None, **kwargs):
    self.config = recursive_merge_dicts(self.config, kwargs)

    # If train_data is provided as a tuple (X, Y), we can make sure Y is in
    # the correct format
    # NOTE: Better handling for if train_data is Dataset or DataLoader...?
    if isinstance(train_data, (tuple, list)):
        X, Y = train_data
        Y = self._preprocess_Y(self._to_torch(Y, dtype=torch.FloatTensor), self.k)
        train_data = (X, Y)

    # Convert input data to data loaders
    train_loader = self._create_data_loader(train_data, shuffle=True)

    # Create loss function
    loss_fn = self._get_loss_fn()

    # Execute training procedure
    self._train_model(
        train_loader, loss_fn, valid_data=valid_data, log_writer=log_writer
    )

def generate_configs_and_commands(args, launch_args, search_space, n=None):
    # Create directory with all configurations saved
    configspace_path = "%s/configspace" % args.outputpath
    if not os.path.exists(configspace_path):
        os.makedirs(configspace_path)

    # Save search space
    with open("%s/search_space" % configspace_path, "w") as f:
        f.write(json.dumps(search_space))

    tuner = RandomSearchTuner(None, seed=time.time())
    configs = tuner.config_generator(search_space, n, tuner.rng, True)

    command_dicts = []
    for i, random_config in enumerate(configs):
        # Recursively merge launch_args with the sampled parameters
        config_to_use = recursive_merge_dicts(
            launch_args, random_config, misses="insert"
        )

        # Add commit hash to config
        config_to_use["commit_hash"] = args.commit_hash
        config_to_use["ami"] = args.ami

        # Write to directory
        config_path = "%s/config_%d.json" % (configspace_path, i)
        with open(config_path, "w") as f:
            json.dump(config_to_use, f)

        # Create command dict
        command_dicts.append(create_command_dict(args, config_path, config_to_use))

    return command_dicts

def __init__(self, input_dim, output_dim=2, padding_idx=0, **kwargs):
    layer_out_dims = [input_dim, output_dim]
    sparse_linear = SparseLinearModule(
        vocab_size=input_dim, embed_size=output_dim, padding_idx=padding_idx
    )
    overrides = {"input_batchnorm": False, "input_dropout": 0.0}
    kwargs = recursive_merge_dicts(kwargs, overrides, misses="insert", verbose=False)
    super().__init__(layer_out_dims, head_module=sparse_linear, **kwargs)

def __init__(self, cardinality=2, input_module=None, **kwargs):
    self.config = recursive_merge_dicts(em_default_config, kwargs)
    super().__init__(cardinality, seed=self.config['seed'])

    if input_module is None:
        input_module = IdentityModule(self.config['layer_output_dims'][0])

    self._build(input_module)

    # Show network
    if self.config['verbose']:
        print("\nNetwork architecture:")
        self._print()
        print()

def train(self, L, **kwargs):
    r"""Train the model (i.e. estimate mu) in one of two ways, depending on
    whether source dependencies are provided or not:

    (1) No dependencies (conditionally independent sources): Estimate mu
    subject to constraints:
        (1a) O_{B(i,j)} - (mu P mu.T)_{B(i,j)} = 0, for i != j, where B(i,j)
            is the block of entries corresponding to sources i,j
        (1b) np.sum( mu P, 1 ) = diag(O)

    (2) Source dependencies:
        - First, estimate Z subject to the inverse form constraint:
            (2a) O_\Omega + (ZZ.T)_\Omega = 0, \Omega is the deps mask
        - Then, compute Q = mu P mu.T
        - Finally, estimate mu subject to mu P mu.T = Q and (1b)
    """
    self.config = recursive_merge_dicts(self.config, kwargs, misses='ignore')

    if self.inv_form:
        # Compute O, O^{-1}, and initialize params
        if self.config['verbose']:
            print("Computing O^{-1}...")
        self._generate_O_inv(L)
        self._init_params()

        # Estimate Z, compute Q = \mu P \mu^T
        if self.config['verbose']:
            print("Estimating Z...")
        self._train(self.loss_inv_Z)
        self.Q = torch.from_numpy(self.get_Q()).float()

        # Estimate \mu
        if self.config['verbose']:
            print(r"Estimating \mu...")
        self._train(self.loss_inv_mu)
    else:
        # Compute O and initialize params
        if self.config['verbose']:
            print("Computing O...")
        self._generate_O(L)
        self._init_params()

        # Estimate \mu
        if self.config['verbose']:
            print(r"Estimating \mu...")
        self._train(self.loss_mu)

def __init__(self, tasks, **kwargs):
    self.config = recursive_merge_dicts(model_defaults, kwargs, misses="insert")

    # Set random seed before initializing module weights
    if self.config["seed"] is None:
        self.config["seed"] = np.random.randint(1e6)
    set_seed(self.config["seed"])

    super().__init__()

    # Build network
    self._build(tasks)
    self.task_map = {task.name: task for task in tasks}

    # Load weights
    if self.config["model_weights"]:
        self.load_weights(self.config["model_weights"])

    # Half precision
    if self.config["fp16"]:
        print("metal_model.py: Using fp16")
        self.half()

    # Move model to device now, then move data to device in forward() or
    # calculate_loss()
    if self.config["device"] >= 0:
        if torch.cuda.is_available():
            if self.config["verbose"]:
                print("Using GPU...")
            self.to(torch.device(f"cuda:{self.config['device']}"))
        else:
            if self.config["verbose"]:
                print("No cuda device available. Using cpu instead.")

    # Show network
    if self.config["verbose"]:
        print("\nNetwork architecture:")
        print(self)
        print()
        num_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(f"Total number of parameters: {num_params}")

def train(self, X_train, Y_train, X_dev=None, Y_dev=None, **kwargs):
    self.config = recursive_merge_dicts(self.config, kwargs)
    train_config = self.config["train_config"]

    Y_train = self._to_torch(Y_train, dtype=torch.FloatTensor)
    Y_dev = self._to_torch(Y_dev)

    # Make data loaders
    loader_config = train_config["data_loader_config"]
    train_loader = self._make_data_loader(X_train, Y_train, loader_config)

    # Initialize the model
    self.reset()

    # Create loss function
    loss_fn = self._get_loss_fn()

    # Execute training procedure
    self._train(train_loader, loss_fn, X_dev=X_dev, Y_dev=Y_dev)

def __init__(self, K=None, task_graph=None, **kwargs):
    """
    Args:
        K: A t-length list of task cardinalities (overridden by task_graph
            if task_graph is not None)
        task_graph: TaskGraph: A TaskGraph which defines a feasible set of
            task label vectors; overrides K if provided
    """
    config = recursive_merge_dicts(lm_default_config, kwargs)
    MTClassifier.__init__(self, K, config)

    if task_graph is None:
        task_graph = TaskGraph(K)
    self.task_graph = task_graph

    # Note: While K is a list of the cardinalities of the tasks, k is the
    # cardinality of the feasible set. These are always the same for a
    # single-task model, but rarely the same for a multi-task model.
    self.k = self.task_graph.k

def __init__(self, m, k=2, task_graph=None, p=None, deps=[], **kwargs):
    """
    Args:
        m: int: Number of sources
        k: int: Number of true classes
        task_graph: TaskGraph: A TaskGraph which defines a feasible set of
            task label vectors; note this overrides k
        p: np.array: Class balance
        deps: list: A list of source dependencies as tuples of indices
        kwargs:
            - seed: int: Random state seed
    """
    self.config = recursive_merge_dicts(lm_model_defaults, kwargs)
    super().__init__()

    self.k = k
    self.m = m

    # TaskGraph; note overrides k if present
    self.task_graph = task_graph
    if self.task_graph is not None:
        self.k = len(self.task_graph)
    self.multi_task = (self.task_graph is not None)

    # Class balance- assume uniform if not provided
    if p is None:
        self.p = (1 / self.k) * np.ones(self.k)
    else:
        self.p = p
    self.P = torch.diag(torch.from_numpy(self.p)).float()

    # Dependencies
    self.deps = deps
    self.c_tree = get_clique_tree(range(self.m), self.deps)

    # Whether to take the simple conditionally independent approach, or the
    # "inverse form" approach for handling dependencies
    # This flag allows us to eg test the latter even with no deps present
    self.inv_form = (len(self.deps) > 0)

def __init__(self, input_dim, **kwargs):
    overrides = {
        'batchnorm': False,
        'dropout': 0.0,
        'layer_output_dims': [input_dim],
    }
    kwargs = recursive_merge_dicts(kwargs, overrides, misses='insert', verbose=False)
    super().__init__(cardinality=2, **kwargs)


# class SoftmaxRegression(EndModel):
#     """A softmax regression classifier for a multi-class single-task problem"""
#     def __init__(self, input_dim, output_dim, **kwargs):
#         raise NotImplementedError
#         overrides = {
#             'batchnorm': False,
#             'layer_output_dims': [input_dim],
#         }
#         kwargs = recursive_merge_dicts(kwargs, overrides, verbose=False)
#         label_map = [range(output_dim)]
#         super().__init__(label_map, **kwargs)

def update_config(self, update_dict):
    """Updates self.config with the values in a given update dictionary"""
    self.config = recursive_merge_dicts(self.config, update_dict)

def create_glue_tasks_payloads(task_names, skip_payloads=False, **kwargs):
    assert len(task_names) > 0

    config = recursive_merge_dicts(task_defaults, kwargs)

    if config["seed"] is None:
        config["seed"] = np.random.randint(1e6)
        print(f"Using random seed: {config['seed']}")
    set_seed(config["seed"])

    # Share the BERT encoder for all tasks
    if config["encoder_type"] == "bert":
        bert_kwargs = config["bert_kwargs"]
        bert_model = BertRaw(config["bert_model"], **bert_kwargs)
        if "base" in config["bert_model"]:
            neck_dim = 768
        elif "large" in config["bert_model"]:
            neck_dim = 1024
        input_module = bert_model
        pooler = bert_model.pooler if bert_kwargs["pooler"] else None
        cls_middle_module = BertExtractCls(pooler=pooler, dropout=config["dropout"])
    else:
        raise NotImplementedError

    # Create dict of dl_kwargs overrides for specific tasks,
    # e.g. {"STSB": {"batch_size": 2}}
    task_dl_kwargs = {}
    if config["task_dl_kwargs"]:
        task_configs_str = [
            tuple(config.split(".")) for config in config["task_dl_kwargs"].split(",")
        ]
        for (task_name, kwarg_key, kwarg_val) in task_configs_str:
            if kwarg_key == "batch_size":
                kwarg_val = int(kwarg_val)
            task_dl_kwargs[task_name] = {kwarg_key: kwarg_val}

    tasks = []
    payloads = []
    for task_name in task_names:
        # If a flag is specified for attention, use it; otherwise use identity module
        if config["attention"]:
            print("Using soft attention head")
            attention_module = SoftAttentionModule(neck_dim)
        else:
            attention_module = IdentityModule()

        # Pull out names of auxiliary tasks to be dealt with in a second step
        # TODO: fix this logic for cases where auxiliary task for task_name has
        # its own payload
        has_payload = task_name not in config["auxiliary_task_dict"]

        # Note whether this task has auxiliary tasks that apply to it and require spacy
        run_spacy = False
        for aux_task, target_payloads in config["auxiliary_task_dict"].items():
            run_spacy = run_spacy or (
                task_name in target_payloads
                and aux_task in SPACY_TASKS
                and aux_task in task_names
            )

        # Override general dl kwargs with task-specific kwargs
        dl_kwargs = copy.deepcopy(config["dl_kwargs"])
        if task_name in task_dl_kwargs:
            dl_kwargs.update(task_dl_kwargs[task_name])

        # Each primary task has data_loaders to load
        if has_payload and not skip_payloads:
            if config["preprocessed"]:
                datasets = load_glue_datasets(
                    dataset_name=task_name,
                    splits=config["splits"],
                    bert_vocab=config["bert_model"],
                    max_len=config["max_len"],
                    max_datapoints=config["max_datapoints"],
                    run_spacy=run_spacy,
                    verbose=True,
                )
            else:
                datasets = create_glue_datasets(
                    dataset_name=task_name,
                    splits=config["splits"],
                    bert_vocab=config["bert_model"],
                    max_len=config["max_len"],
                    max_datapoints=config["max_datapoints"],
                    generate_uids=kwargs.get("generate_uids", False),
                    run_spacy=run_spacy,
                    verbose=True,
                )
            # Wrap datasets with DataLoader objects
            data_loaders = create_glue_dataloaders(
                datasets,
                dl_kwargs=dl_kwargs,
                split_prop=config["split_prop"],
                splits=config["splits"],
                seed=config["seed"],
            )

        if task_name == "COLA":
            scorer = Scorer(
                standard_metrics=["accuracy"],
                custom_metric_funcs={matthews_corr: ["matthews_corr"]},
            )
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=scorer,
            )
        elif task_name == "SST2":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
            )
        elif task_name == "MNLI":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=MulticlassHead(neck_dim, 3),
                scorer=Scorer(standard_metrics=["accuracy"]),
            )
        elif task_name == "SNLI":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=MulticlassHead(neck_dim, 3),
                scorer=Scorer(standard_metrics=["accuracy"]),
            )
        elif task_name == "RTE":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=Scorer(standard_metrics=["accuracy"]),
            )
        elif task_name == "WNLI":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=Scorer(standard_metrics=["accuracy"]),
            )
        elif task_name == "QQP":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=Scorer(
                    custom_metric_funcs={acc_f1: ["accuracy", "f1", "acc_f1"]}
                ),
            )
        elif task_name == "MRPC":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=Scorer(
                    custom_metric_funcs={acc_f1: ["accuracy", "f1", "acc_f1"]}
                ),
            )
        elif task_name == "STSB":
            scorer = Scorer(
                standard_metrics=[],
                custom_metric_funcs={
                    pearson_spearman: [
                        "pearson_corr",
                        "spearman_corr",
                        "pearson_spearman",
                    ]
                },
            )
            task = RegressionTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=RegressionHead(neck_dim),
                scorer=scorer,
            )
        elif task_name == "QNLI":
            task = ClassificationTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=BinaryHead(neck_dim),
                scorer=Scorer(standard_metrics=["accuracy"]),
            )

        # AUXILIARY TASKS
        elif task_name == "THIRD":
            # A toy task that predicts which third of the sentence each token is in
            OUT_DIM = 3
            task = TokenClassificationTask(
                name="THIRD",
                input_module=input_module,
                attention_module=attention_module,
                head_module=BertTokenClassificationHead(neck_dim, OUT_DIM),
                loss_multiplier=config["auxiliary_loss_multiplier"],
            )
        elif task_name == "BLEU":
            task = RegressionTask(
                name=task_name,
                input_module=input_module,
                middle_module=cls_middle_module,
                attention_module=attention_module,
                head_module=RegressionHead(neck_dim),
                output_hat_func=torch.sigmoid,
                loss_hat_func=(
                    lambda out, Y_gold: F.mse_loss(torch.sigmoid(out), Y_gold)
                ),
                scorer=Scorer(custom_metric_funcs={mse: ["mse"]}),
                loss_multiplier=config["auxiliary_loss_multiplier"],
            )
        elif task_name == "SPACY_NER":
            OUT_DIM = len(SPACY_TAGS["SPACY_NER"])
            task = TokenClassificationTask(
                name=task_name,
                input_module=input_module,
                attention_module=attention_module,
                head_module=BertTokenClassificationHead(neck_dim, OUT_DIM),
                loss_multiplier=config["auxiliary_loss_multiplier"],
            )
        elif task_name == "SPACY_POS":
            OUT_DIM = len(SPACY_TAGS["SPACY_POS"])
            task = TokenClassificationTask(
                name=task_name,
                input_module=input_module,
                attention_module=attention_module,
                head_module=BertTokenClassificationHead(neck_dim, OUT_DIM),
                loss_multiplier=config["auxiliary_loss_multiplier"],
            )
        else:
            msg = (
                f"Task name {task_name} was not recognized as a primary or "
                f"auxiliary task."
            )
            raise Exception(msg)

        tasks.append(task)

        # Gather slice names
        slice_names = (
            config["slice_dict"].get(task_name, []) if config["slice_dict"] else []
        )

        # Add a task for each slice
        for slice_name in slice_names:
            slice_task_name = f"{task_name}_slice:{slice_name}"
            slice_task = create_slice_task(task, slice_task_name)
            tasks.append(slice_task)

        if has_payload and not skip_payloads:
            # Create payloads (and add slices/auxiliary tasks as applicable)
            for split, data_loader in data_loaders.items():
                payload_name = f"{task_name}_{split}"
                labels_to_tasks = {f"{task_name}_gold": task_name}
                payload = Payload(payload_name, data_loader, labels_to_tasks, split)

                # Add auxiliary label sets if applicable
                auxiliary_task_dict = config["auxiliary_task_dict"]
                for aux_task_name, target_payloads in auxiliary_task_dict.items():
                    if aux_task_name in task_names and task_name in target_payloads:
                        aux_task_func = auxiliary_task_functions[aux_task_name]
                        payload = aux_task_func(payload)

                # Add a labelset slice to each split
                dataset = payload.data_loader.dataset
                for slice_name in slice_names:
                    slice_task_name = f"{task_name}_slice:{slice_name}"
                    slice_labels = create_slice_labels(
                        dataset, base_task_name=task_name, slice_name=slice_name
                    )
                    labelset_slice_name = f"{task_name}_slice:{slice_name}"
                    payload.add_label_set(
                        slice_task_name, labelset_slice_name, slice_labels
                    )

                payloads.append(payload)

    return tasks, payloads

def __init__(self, k=2, **kwargs):
    config = recursive_merge_dicts(lm_default_config, kwargs)
    super().__init__(k, config)

def _test_model_config(
    self,
    idx,
    config,
    dev_data,
    init_args=[],
    train_args=[],
    init_kwargs={},
    train_kwargs={},
    module_args={},
    module_kwargs={},
    verbose=False,
    **score_kwargs,
):
    # Integrating generated config into init kwargs and train kwargs
    init_kwargs = recursive_merge_dicts(init_kwargs, config, misses="insert")
    train_kwargs = recursive_merge_dicts(train_kwargs, config, misses="insert")

    # Also make sure train kwargs includes validation metric
    train_kwargs["validation_metric"] = self.validation_metric

    # Initialize modules if provided
    for module_name, module_class in self.module_classes.items():
        # Also integrate generated config into module kwargs so that module
        # hyperparameters can be searched over as well
        module_kwargs[module_name] = recursive_merge_dicts(
            module_kwargs[module_name], config, misses="insert"
        )
        # Initialize module
        init_kwargs[module_name] = module_class(
            *module_args[module_name], **module_kwargs[module_name]
        )

    # Init model
    model = self.model_class(*init_args, **init_kwargs)

    # Search params
    # Select any params in search space that have list or dict
    search_params = {}
    for k, v in config.items():
        if k in self.search_space.keys():
            if isinstance(self.search_space[k], (list, dict)):
                search_params[k] = v
    if verbose:
        print("=" * 60)
        print(f"[{idx}] Testing {search_params}")
        print("=" * 60)

    # Initialize a new LogWriter and train the model, returning the score
    log_writer = None
    if self.log_writer_class is not None:
        log_writer = self.log_writer_class(
            log_dir=self.log_subdir,
            run_dir=".",
            run_name=f"model_search_{idx}",
        )
    model.train_model(
        *train_args,
        **train_kwargs,
        dev_data=dev_data,
        verbose=verbose,
        log_writer=log_writer,
    )
    score = model.score(
        dev_data,
        metric=self.validation_metric,
        verbose=False,  # Score is already printed in train_model above
        **score_kwargs,
    )

    # If score better than best_score, save
    if score > self.best_score:
        self.best_score = score
        self.best_index = idx
        self.best_config = config
        self._save_best_model(model)

    # Save high-level run stats (in addition to per-model log)
    time_elapsed = time() - self.start_time
    self.run_stats.append(
        {
            "idx": idx,
            "time_elapsed": time_elapsed,
            "search_params": search_params,
            "score": score,
        }
    )

    return score, model

def train(self, X_train, Y_train, X_dev=None, Y_dev=None, **kwargs):
    self.config = recursive_merge_dicts(self.config, kwargs)
    train_config = self.config['train_config']

    Y_train = self._to_torch(Y_train)
    Y_dev = self._to_torch(Y_dev)

    if train_config['use_cuda']:
        raise NotImplementedError
        # TODO: fix this
        # X = X.cuda(self.gpu_id)
        # Y = Y.cuda(self.gpu_id)
        # TODO: put model on gpu

    # Make data loaders
    loader_config = train_config['data_loader_config']
    train_loader = self._make_data_loader(X_train, Y_train, loader_config)
    evaluate_dev = (X_dev is not None and Y_dev is not None)

    # Set the optimizer
    optimizer_config = train_config['optimizer_config']
    optimizer = self._set_optimizer(optimizer_config)

    # Set the lr scheduler
    scheduler_config = train_config['scheduler_config']
    lr_scheduler = self._set_scheduler(scheduler_config, optimizer)

    # Initialize the model
    self.reset()

    # Train the model
    for epoch in range(train_config['n_epochs']):
        epoch_loss = 0.0
        for i, data in enumerate(train_loader):
            X, Y = data

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass to calculate outputs
            output = self.forward(X)
            loss = self._get_loss(output, Y)

            # Backward pass to calculate gradients
            loss.backward()

            # Clip gradients
            # if grad_clip:
            #     torch.nn.utils.clip_grad_norm(self.net.parameters(), grad_clip)

            # Perform optimizer step
            optimizer.step()

            # Keep running sum of losses
            epoch_loss += loss.detach() * X.shape[0]

        # Calculate average loss per training example
        # Saving division until this stage protects against the potential
        # mistake of averaging batch losses when the last batch is an orphan
        train_loss = epoch_loss / len(train_loader.dataset)

        if evaluate_dev:
            val_metric = train_config['validation_metric']
            dev_score = self.score(X_dev, Y_dev, metric=val_metric, verbose=False)

        # Apply learning rate scheduler
        if (lr_scheduler is not None
                and epoch + 1 >= scheduler_config['lr_freeze']):
            if scheduler_config['scheduler'] == 'reduce_on_plateau':
                if evaluate_dev:
                    lr_scheduler.step(dev_score)
            else:
                lr_scheduler.step()

        # Report progress
        if (self.config['verbose']
                and (epoch % train_config['print_every'] == 0
                     or epoch == train_config['n_epochs'] - 1)):
            msg = f'[E:{epoch+1}]\tTrain Loss: {train_loss:.3f}'
            if evaluate_dev:
                msg += f'\tDev score: {dev_score:.3f}'
            print(msg)

    if self.config['verbose']:
        print('Finished Training')

    if self.config['show_plots']:
        if self.k == 2:
            Y_p_train = self.predict_proba(X_train)
            plot_probabilities_histogram(
                Y_p_train[:, 0], title="Training Set Predictions"
            )

    if X_dev is not None and Y_dev is not None:
        Y_ph_dev = self.predict(X_dev)
        print("Confusion Matrix (Dev)")
        mat = confusion_matrix(Y_ph_dev, Y_dev, pretty_print=True)

def train_model(self, model, payloads, results_path=None, **kwargs):
    # NOTE: misses="insert" so we can log extra metadata (e.g. num_parameters)
    # and eventually write to disk.
    self.config = recursive_merge_dicts(self.config, kwargs, misses="insert")
    self.task_names = [task_name for task_name in model.task_map]
    self.payload_names = [payload.name for payload in payloads]
    train_payloads = [p for p in payloads if p.split == "train"]
    if not train_payloads:
        msg = "At least one payload must have property payload.split=='train'"
        raise Exception(msg)

    # Calculate epoch statistics
    # NOTE: We calculate approximate count size using batch_size * num_batches
    self.batches_per_epoch = sum([len(p.data_loader) for p in train_payloads])
    self.examples_per_epoch = sum(
        [len(p.data_loader) * p.data_loader.batch_size for p in train_payloads]
    )
    if self.config["verbose"]:
        print(f"Beginning train loop.")
        print(
            f"Expecting approximately {self.examples_per_epoch} examples total "
            f"and {self.batches_per_epoch} batches per epoch from "
            f"{len(train_payloads)} payload(s) in the train split."
        )

    # Check inputs
    self._check_metrics()

    # Set training components
    self._set_writer()
    self._set_logger()
    self._set_checkpointer(model)
    self._set_optimizer(model)
    self._set_lr_scheduler(model)  # TODO: Support more detailed training schedules
    self._set_task_scheduler(model, payloads)

    # Record config
    if self.writer:
        self.writer.write_config(self.config)

    # Train the model
    # TODO: Allow other ways to train besides 1 epoch of all datasets
    model.train()

    # Dict metrics_hist contains the most recently recorded value of all metrics
    self.metrics_hist = {}
    self._reset_losses()
    for epoch in range(self.config["n_epochs"]):
        progress_bar = self.config["progress_bar"] and self.config["verbose"]
        t = tqdm(
            enumerate(self.task_scheduler.get_batches(payloads, "train")),
            total=self.batches_per_epoch,
            disable=(not progress_bar),
        )
        for batch_num, (batch, payload_name, labels_to_tasks) in t:
            # NOTE: actual batch_size may not equal config's target batch_size,
            # for example due to orphan batches. We base batch size off of Y
            # instead of X because we know Y will contain tensors, whereas X
            # can be of any format the input_module accepts, including tuples
            # of tensors, etc.
            _, Ys = batch
            batch_size = len(next(iter(Ys.values())))
            batch_id = epoch * self.batches_per_epoch + batch_num

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Forward pass to calculate the average loss per example by task
            # Counts stores the number of examples in each batch with labels by task
            loss_dict, count_dict = model.calculate_loss(
                *batch, payload_name, labels_to_tasks
            )

            # NOTE: If there were no "active" examples, loss_dict is empty
            # Skip additional loss-based computation at this point
            if not loss_dict:
                continue
            loss = sum(loss_dict.values())
            if torch.isnan(loss):
                msg = "Loss is NaN. Consider reducing learning rate."
                raise Exception(msg)

            # Backward pass to calculate gradients
            # Loss is an average loss per example
            if model.config["fp16"]:
                self.optimizer.backward(loss)
            else:
                loss.backward()

            # Clip gradient norm (not individual gradient magnitudes)
            # max_grad_value = max([p.grad.abs().max().item() for p in model.parameters()])
            if self.config["grad_clip"]:
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.config["grad_clip"]
                )

            # Perform optimizer step
            self.optimizer.step()

            # Update loss
            for loss_name in loss_dict:
                if count_dict[loss_name]:
                    self.running_losses[loss_name] += (
                        loss_dict[loss_name].item() * count_dict[loss_name]
                    )
                    self.running_examples[loss_name] += count_dict[loss_name]

            # Calculate metrics, log, and checkpoint as necessary
            metrics_dict = self._execute_logging(model, payloads, batch_size)

            # Confirm metrics being produced are in proper format
            if epoch == 0 and batch_num == 0:
                self._validate_metrics_dict(metrics_dict)

            # Apply learning rate scheduler
            self._update_lr_scheduler(model, batch_id)

            # tqdm output
            if len(model.task_map) == 1:
                t.set_postfix(loss=metrics_dict["model/train/all/loss"])
            else:
                losses = {}
                for key, val in metrics_dict.items():
                    if "loss" in key:
                        losses[key] = val
                t.set_postfix(losses)

        if results_path:
            if not os.path.exists(results_path):
                os.makedirs(results_path)
            train_metrics_dict = self.calculate_metrics(model, payloads, split="train")
            valid_metrics_dict = self.calculate_metrics(model, payloads, split="valid")
            output_eval_file = os.path.join(results_path, "training_metrics.txt")
            if epoch > 0:
                append_write = 'a'  # append if already started run
            else:
                append_write = 'w'  # create/overwrite file at the start of training
            with open(output_eval_file, append_write) as writer:
                writer.write("Epoch {0}:\n".format(epoch))
                for key in sorted(train_metrics_dict.keys()):
                    writer.write(
                        "Training: %s = %s\n" % (key, str(train_metrics_dict[key]))
                    )
                for key in sorted(valid_metrics_dict.keys()):
                    writer.write(
                        "Validation: %s = %s\n" % (key, str(valid_metrics_dict[key]))
                    )

    model.eval()

    # Restore best model if applicable
    if self.checkpointer and self.checkpointer.checkpoint_best:
        # First do a final checkpoint at the end of training
        metrics_dict = self._execute_logging(
            model, payloads, batch_size, force_log=True
        )
        self.checkpointer.load_best_model(model=model)
        # Copy best model to log directory
        if self.writer:
            path_to_best = os.path.join(
                self.checkpointer.checkpoint_dir, "best_model.pth"
            )
            path_to_logs = self.writer.log_subdir
            if os.path.isfile(path_to_best):
                copy2(path_to_best, path_to_logs)

    # Print final performance values
    if self.config["verbose"]:
        print("Finished training")

    # Calculate metrics for all splits if test_split=None
    test_split = self.config["metrics_config"]["test_split"]
    metrics_dict = self.calculate_metrics(model, payloads, split=test_split)
    if self.config["verbose"]:
        pprint(metrics_dict)

    # Clean up checkpoints
    if self.checkpointer and self.config["checkpoint_cleanup"]:
        print("Cleaning checkpoints")
        self.checkpointer.clean_up()

    # Write log if applicable
    if self.writer:
        # Convert from numpy to python float
        metrics_dict = recursive_transform(
            metrics_dict, lambda x: type(x).__module__ == np.__name__, float
        )
        self.writer.write_metrics(metrics_dict)
        self.writer.write_log()
        self.writer.close()

        # Pickle and save the full model
        full_model_path = os.path.join(self.writer.log_subdir, "model_state_dict.pkl")
        torch.save(model.state_dict(), full_model_path)
        print(f"Full model saved at {full_model_path}")

    return metrics_dict

    type=int,
    default=np.random.randint(1e6),
    help="A single seed to use for trainer, model, and task configs",
)
parser.add_argument(
    "--model_type", type=str, default="metal", help="Baseline model type"
)
parser = add_flags_from_config(parser, trainer_defaults)
parser = add_flags_from_config(parser, model_defaults)
parser = add_flags_from_config(parser, task_defaults)
args = parser.parse_args()

# Extract flags into their respective config files
trainer_config = recursive_merge_dicts(trainer_defaults, vars(args), misses="ignore")
model_config = recursive_merge_dicts(model_defaults, vars(args), misses="ignore")
task_config = recursive_merge_dicts(task_defaults, vars(args), misses="ignore")

args = parser.parse_args()
task_names = args.tasks.split(",")
assert len(task_names) == 1
task_name = task_names[0]

# Create tasks and payloads
task_config["slice_dict"] = None

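# Illustration only: the three misses="ignore" merges above route one flat flag
# namespace into separate config dicts. Each merge keeps a flag only if its key
# already exists in that defaults dict and silently drops the rest. The stand-in
# sketch below (hypothetical helper, flat dicts only) shows the idea; the real
# recursive_merge_dicts also recurses into nested dicts.
def split_flags_sketch(defaults, flags):
    # Keep a flag only if its key already appears in this defaults dict
    out = dict(defaults)
    out.update({k: v for k, v in flags.items() if k in defaults})
    return out


flags = {"lr": 0.01, "n_epochs": 3, "dropout": 0.1}  # example parsed CLI flags
trainer_like = split_flags_sketch({"lr": 0.001, "n_epochs": 1}, flags)  # {'lr': 0.01, 'n_epochs': 3}
model_like = split_flags_sketch({"dropout": 0.0}, flags)  # {'dropout': 0.1}
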
def train_model(
    self,
    L_train,
    Y_dev=None,
    deps=[],
    class_balance=None,
    log_writer=None,
    **kwargs,
):
    r"""Train the model (i.e. estimate mu) in one of two ways, depending on
    whether source dependencies are provided or not:

    Args:
        L_train: An [n,m] scipy.sparse matrix with values in {0,1,...,k}
            corresponding to labels from supervision sources on the
            training set
        Y_dev: Target labels for the dev set, for estimating class_balance
        deps: (list of tuples) known dependencies between supervision
            sources. If not provided, sources are assumed to be independent.
            TODO: add automatic dependency-learning code
        class_balance: (np.array) each class's percentage of the population

    (1) No dependencies (conditionally independent sources): Estimate mu
    subject to constraints:
        (1a) O_{B(i,j)} - (mu P mu.T)_{B(i,j)} = 0, for i != j, where B(i,j)
            is the block of entries corresponding to sources i,j
        (1b) np.sum( mu P, 1 ) = diag(O)

    (2) Source dependencies:
        - First, estimate Z subject to the inverse form constraint:
            (2a) O_\Omega + (ZZ.T)_\Omega = 0, \Omega is the deps mask
        - Then, compute Q = mu P mu.T
        - Finally, estimate mu subject to mu P mu.T = Q and (1b)
    """
    self.config = recursive_merge_dicts(self.config, kwargs, misses="ignore")
    train_config = self.config["train_config"]

    # TODO: Implement logging for label model?
    if log_writer is not None:
        raise NotImplementedError("Logging for LabelModel.")

    # Note that the LabelModel class implements its own (centered) L2 reg.
    l2 = train_config.get("l2", 0)
    self._set_class_balance(class_balance, Y_dev)
    self._set_constants(L_train)
    self._set_dependencies(deps)
    self._check_L(L_train)

    # Whether to take the simple conditionally independent approach, or the
    # "inverse form" approach for handling dependencies
    # This flag allows us to e.g. test the latter even with no deps present
    self.inv_form = len(self.deps) > 0

    # Creating this faux dataset is necessary for now because the LabelModel
    # loss functions do not accept inputs, but Classifier._train_model()
    # expects training data to feed to the loss functions.
    dataset = MetalDataset([0], [0])
    train_loader = DataLoader(dataset)

    if self.inv_form:
        # Compute O, O^{-1}, and initialize params
        if self.config["verbose"]:
            print("Computing O^{-1}...")
        self._generate_O_inv(L_train)
        self._init_params()

        # Estimate Z, compute Q = \mu P \mu^T
        if self.config["verbose"]:
            print("Estimating Z...")
        self._train_model(train_loader, self.loss_inv_Z)
        self.Q = torch.from_numpy(self.get_Q()).float()

        # Estimate \mu
        if self.config["verbose"]:
            print(r"Estimating \mu...")
        self._train_model(train_loader, partial(self.loss_inv_mu, l2=l2))
    else:
        # Compute O and initialize params
        if self.config["verbose"]:
            print("Computing O...")
        self._generate_O(L_train)
        self._init_params()

        # Estimate \mu
        if self.config["verbose"]:
            print(r"Estimating \mu...")
        self._train_model(train_loader, partial(self.loss_mu, l2=l2))

def search(
    self,
    search_space,
    dev_data,
    init_args=[],
    train_args=[],
    init_kwargs={},
    train_kwargs={},
    max_search=None,
    shuffle=True,
    verbose=True,
    **score_kwargs,
):
    """
    Args:
        search_space: see config_generator() documentation
        dev_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
            X (data) and Y (labels) for the dev split
        init_args: (list) positional args for initializing the model
        train_args: (list) positional args for training the model
        init_kwargs: (dict) keyword args for initializing the model
        train_kwargs: (dict) keyword args for training the model
        max_search: see config_generator() documentation
        shuffle: see config_generator() documentation

    Returns:
        best_model: the highest performing trained model

    Note: Initialization is performed by ModelTuner instead of passing a
    pre-initialized model so that tuning may be performed over all model
    parameters, including the network architecture (which is defined before
    the train loop).
    """
    self._clear_state()

    # Generate configs
    configs = self.config_generator(search_space, max_search, shuffle)

    # Commence search
    for i, config in enumerate(configs):
        # Unless seeds are given explicitly, give each config a unique one
        if config.get("seed", None) is None:
            config["seed"] = self.seed + i

        # Integrating generated config into init kwargs and train kwargs
        init_kwargs = recursive_merge_dicts(init_kwargs, config)
        train_kwargs = recursive_merge_dicts(train_kwargs, config)

        score, model = self._test_model_config(
            i,
            config,
            dev_data,
            init_args=init_args,
            train_args=train_args,
            init_kwargs=init_kwargs,
            train_kwargs=train_kwargs,
            verbose=verbose,
            **score_kwargs,
        )

    print("=" * 60)
    print(f"[SUMMARY]")
    print(f"Best model: [{self.best_index}]")
    print(f"Best config: {self.best_config}")
    print(f"Best score: {self.best_score}")
    print("=" * 60)

    # Return best model
    return self._load_best_model(clean_up=True)

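# Illustration only: a minimal sampler for the kind of search_space dict that
# search()/config_generator() consume. The value formats accepted here (fixed
# value, list of discrete choices, {"range": [lo, hi]} with optional
# "scale": "log") are assumptions made for this sketch, not a statement of the
# tuner's actual API.
import math
import random


def sample_config_sketch(search_space, rng=None):
    """Draw one random config from a search space of the assumed format."""
    rng = rng or random.Random(0)
    config = {}
    for name, spec in search_space.items():
        if isinstance(spec, dict) and "range" in spec:
            lo, hi = spec["range"]
            if spec.get("scale") == "log":
                # Sample uniformly in log space, e.g. for learning rates
                config[name] = math.exp(rng.uniform(math.log(lo), math.log(hi)))
            else:
                config[name] = rng.uniform(lo, hi)
        elif isinstance(spec, (list, tuple)):
            config[name] = rng.choice(list(spec))  # discrete choice
        else:
            config[name] = spec  # fixed value
    return config


# Example draw: one fixed, one discrete, and one log-sampled hyperparameter
print(sample_config_sketch(
    {"n_epochs": 5, "l2": [0.0, 1e-4], "lr": {"range": [1e-5, 1e-2], "scale": "log"}}
))
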