def run(self, configuration: Configuration) -> None:
    seed_all(configuration.get("seed"))

    metadata = load_metadata(configuration.metadata)

    inputs = torch.from_numpy(np.load(configuration.inputs))
    missing_mask = generate_mask_for(inputs, configuration.missing_probability, metadata)

    np.save(configuration.outputs, missing_mask.numpy())
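# Hedged sketch (not the repo's actual generate_mask_for): one plausible way to build a
# missing mask is to draw a single Bernoulli value per (row, independent variable) and
# repeat it across the one-hot columns of that variable, so a categorical value is either
# fully observed or fully missing. The variable_sizes argument is assumed to be derivable
# from the metadata; the real helper may work differently.
def bernoulli_variable_mask(inputs, missing_probability, variable_sizes):
    # one Bernoulli draw per row and independent variable
    per_variable = torch.bernoulli(
        torch.full((inputs.shape[0], len(variable_sizes)), missing_probability))
    # repeat each draw across the columns (e.g. one-hot entries) of its variable
    columns = [per_variable[:, i:i + 1].expand(-1, size)
               for i, size in enumerate(variable_sizes)]
    return torch.cat(columns, dim=1)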
def run(self, configuration: Configuration) -> None:
    seed_all(configuration.get("seed"))

    metadata = load_metadata(configuration.metadata)

    architecture_configuration = load_configuration(configuration.architecture)
    self.validate_architecture_configuration(architecture_configuration)
    architecture = create_architecture(metadata, architecture_configuration)
    architecture.to_gpu_if_available()

    checkpoints = Checkpoints()
    checkpoint = checkpoints.load(configuration.checkpoint)
    if "best_architecture" in checkpoint:
        checkpoints.load_states(checkpoint["best_architecture"], architecture)
    else:
        checkpoints.load_states(checkpoint["architecture"], architecture)

    # pre-processing
    imputation = create_component(architecture, metadata, configuration.imputation)
    pre_processing = PreProcessing(imputation)

    # post-processing
    if "scale_transform" in configuration:
        scale_transform = load_scale_transform(configuration.scale_transform)
    else:
        scale_transform = None
    post_processing = PostProcessing(metadata, scale_transform)

    # load the features
    features = to_gpu_if_available(torch.from_numpy(np.load(configuration.features)).float())
    missing_mask = to_gpu_if_available(torch.from_numpy(np.load(configuration.missing_mask)).float())

    # initial imputation
    batch = pre_processing.transform({"features": features, "missing_mask": missing_mask})

    # generate the model outputs
    output = self.impute(configuration, metadata, architecture, batch)

    # keep the model output only where values were missing and the original features elsewhere
    output = compose_with_mask(mask=missing_mask, differentiable=False, where_one=output, where_zero=features)

    # post-process
    output = post_processing.transform(output)

    # save the imputation
    output = to_cpu_if_was_in_gpu(output)
    output = output.numpy()
    np.save(configuration.output, output)
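# Hedged sketch of the masked merge that compose_with_mask appears to perform, judging by
# its keyword arguments (where_one, where_zero): keep the model output where the mask is
# one (missing) and the original feature where it is zero (observed). The real helper may
# differ in details such as differentiability handling.
def masked_merge(mask, where_one, where_zero):
    return torch.where(mask.bool(), where_one, where_zero)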
def run(self, configuration: Configuration) -> None:
    seed_all(configuration.get("seed"))

    metadata = load_metadata(configuration.metadata)

    architecture_configuration = load_configuration(configuration.architecture)
    self.validate_architecture_configuration(architecture_configuration)
    architecture = create_architecture(metadata, architecture_configuration)
    architecture.to_gpu_if_available()

    checkpoints = Checkpoints()
    checkpoint = checkpoints.load(configuration.checkpoint)
    if "best_architecture" in checkpoint:
        checkpoints.load_states(checkpoint["best_architecture"], architecture)
    else:
        checkpoints.load_states(checkpoint["architecture"], architecture)

    # load the features
    features = to_gpu_if_available(torch.from_numpy(np.load(configuration.features)).float())

    # conditional
    if "labels" in configuration:
        condition = to_gpu_if_available(torch.from_numpy(np.load(configuration.labels)).float())
    else:
        condition = None

    # encode
    with torch.no_grad():
        code = architecture.autoencoder.encode(features, condition=condition)["code"]

    # save the code
    code = to_cpu_if_was_in_gpu(code)
    code = code.numpy()
    np.save(configuration.output, code)
def process(self, inputs: Any) -> None:
    # prepare the outputs
    outputs = dict(inputs)

    # load the scale transform
    # remove the path from the outputs
    scale_transform = load_scale_transform(outputs.pop("scale_transform"))

    # load the imputation task configuration
    impute_task = load_configuration(outputs.pop("impute_task"))

    # the imputation task output exists
    if os.path.exists(impute_task.arguments.output):
        # losses
        mse_loss_function = MSELoss()
        rmse_loss_function = RMSE()
        mr_loss_function = MultiReconstructionLoss(load_metadata(impute_task.arguments.metadata))

        # load the scaled data
        scaled_inputs = torch.from_numpy(np.load(impute_task.arguments.features))
        scaled_imputed = torch.from_numpy(np.load(impute_task.arguments.output))

        # compute the scaled metrics
        outputs["scaled_mse"] = mse_loss_function(scaled_imputed, scaled_inputs).item()
        outputs["scaled_rmse"] = rmse_loss_function(scaled_imputed, scaled_inputs).item()
        outputs["scaled_mr"] = mr_loss_function(scaled_imputed, scaled_inputs).item()

        # apply the inverse scale transform to recover the original unscaled data
        inputs = torch.from_numpy(scale_transform.inverse_transform(scaled_inputs.numpy()))
        imputed = torch.from_numpy(scale_transform.inverse_transform(scaled_imputed.numpy()))

        # compute the unscaled metrics
        outputs["mse"] = mse_loss_function(imputed, inputs).item()
        outputs["rmse"] = rmse_loss_function(imputed, inputs).item()
        outputs["mr"] = mr_loss_function(imputed, inputs).item()

    # if the task was not run
    else:
        self.logger.info("{} does not exist.".format(impute_task.arguments.output))

    # send the outputs
    self.send_output(outputs)
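# Hedged sketch of the RMSE loss used above; the repo's RMSE class is assumed to be
# equivalent to the square root of the mean squared error (the name below is changed to
# avoid shadowing the actual implementation).
class RootMeanSquaredError(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.mse = MSELoss()

    def forward(self, predicted, target):
        # square root of the MSE between prediction and target
        return torch.sqrt(self.mse(predicted, target))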
def run(self, configuration: Configuration) -> None:
    metadata = load_metadata(configuration.metadata)

    inputs = torch.from_numpy(np.load(configuration.inputs))
    missing_mask = torch.from_numpy(np.load(configuration.missing_mask))
    non_missing_mask = inverse_mask(missing_mask)

    assert inputs.shape == missing_mask.shape

    filling_values = torch.zeros(metadata.get_num_features(), dtype=inputs.dtype)

    for variable_metadata in metadata.get_by_independent_variable():
        index = variable_metadata.get_feature_index()
        size = variable_metadata.get_size()

        # binary
        if variable_metadata.is_binary():
            # count the ones in the variable where the non-missing mask is one
            one_count = inputs[non_missing_mask[:, index] == 1, index].sum()
            # count the non-missing values of the variable and subtract the number of ones
            zero_count = non_missing_mask[:, index].sum() - one_count
            # fill with a one if ones are at least as common as zeros, otherwise fill with a zero
            filling_value = (1 if one_count >= zero_count else 0)

        # categorical
        elif variable_metadata.is_categorical():
            # count the ones per column (per categorical variable value)
            column_count = torch.zeros(size)
            for offset in range(size):
                column_count[offset] = inputs[non_missing_mask[:, index + offset] == 1, index + offset].sum()
            # keep the most common value as a one-hot vector
            filling_value = one_hot(column_count.argmax(), num_classes=size)

        # numerical
        else:
            # take the mean of the values where the non-missing mask is one
            filling_value = inputs[non_missing_mask[:, index] == 1, index].mean()

        # fill the variable
        filling_values[index:index + size] = filling_value

    # save the filling values
    np.save(configuration.outputs, filling_values.numpy())
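# Hedged sketch of how the saved filling values could be applied later: replace each
# missing entry by the per-feature filling value (mode for binary/categorical variables,
# mean for numerical ones). The function name and its usage are assumptions, not part of
# this repo.
def apply_filling_values(inputs, missing_mask, filling_values):
    # broadcast the (num_features,) filling values over all rows
    return torch.where(missing_mask.bool(), filling_values.expand_as(inputs), inputs)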
def run(self, configuration: Configuration) -> None:
    metadata = load_metadata(configuration.metadata)

    # the inputs are expected to be scaled
    scaled_inputs = torch.from_numpy(np.load(configuration.inputs))
    missing_mask = torch.from_numpy(np.load(configuration.missing_mask))

    # the imputation will be scaled too
    scaled_imputed = self.impute(configuration, metadata, scaled_inputs, missing_mask)

    # post-process (without scaling back)
    scaled_imputed = PostProcessing(metadata).transform(scaled_imputed)

    # scale back if requested
    if "scaler" in configuration:
        post_processing = PostProcessing(metadata, load_scale_transform(configuration.scaler))
        inputs = post_processing.transform(scaled_inputs)
        imputed = post_processing.transform(scaled_imputed)
        outputs = imputed
    # do not scale back
    else:
        inputs = None
        imputed = None
        outputs = scaled_imputed

    # if the imputation should be saved
    if "outputs" in configuration:
        np.save(configuration.outputs, outputs)

    # if the reconstruction loss should be logged
    if "logs" in configuration:
        # each run appends one row to a CSV file
        file_mode = "a" if os.path.exists(configuration.logs.path) else "w"
        with open(configuration.logs.path, file_mode) as reconstruction_loss_file:
            file_writer = DictWriter(reconstruction_loss_file, [
                "inputs",
                "missing_mask",
                "scaled_mse",
                "scaled_rmse",
                "scaled_mr",
                "mse",
                "rmse",
                "mr",
            ])

            # write the csv header if it is the first time
            if file_mode == "w":
                file_writer.writeheader()

            row = {
                "inputs": configuration.inputs,
                "missing_mask": configuration.missing_mask,
            }

            # loss functions
            mse_loss_function = MSELoss()
            rmse_loss_function = RMSE()
            mr_loss_function = MultiReconstructionLoss(metadata)

            # unscaled metrics (only available when scaling back)
            if imputed is not None and inputs is not None:
                row["mse"] = mse_loss_function(imputed, inputs).item()
                row["rmse"] = rmse_loss_function(imputed, inputs).item()
                row["mr"] = mr_loss_function(imputed, inputs).item()

            # scaled metrics
            row["scaled_mse"] = mse_loss_function(scaled_imputed, scaled_inputs).item()
            row["scaled_rmse"] = rmse_loss_function(scaled_imputed, scaled_inputs).item()
            row["scaled_mr"] = mr_loss_function(scaled_imputed, scaled_inputs).item()

            self.logger.info(row)
            file_writer.writerow(row)
def run(self, configuration: Configuration) -> None:
    seed_all(configuration.get("seed"))

    metadata = load_metadata(configuration.metadata)

    if "scale_transform" in configuration:
        scale_transform = load_scale_transform(configuration.scale_transform)
    else:
        scale_transform = None
    post_processing = PostProcessing(metadata, scale_transform)

    architecture_configuration = load_configuration(configuration.architecture)
    self.validate_architecture_configuration(architecture_configuration)
    architecture = create_architecture(metadata, architecture_configuration)
    architecture.to_gpu_if_available()

    checkpoints = Checkpoints()
    checkpoint = checkpoints.load(configuration.checkpoint)
    if "best_architecture" in checkpoint:
        checkpoints.load_states(checkpoint["best_architecture"], architecture)
    else:
        checkpoints.load_states(checkpoint["architecture"], architecture)

    samples = []

    # create the strategy if defined
    if "strategy" in configuration:
        # validate that the factory name is present
        if "factory" not in configuration.strategy:
            raise Exception("Missing factory name while creating sample strategy.")
        # validate the factory name
        strategy_name = configuration.strategy.factory
        if strategy_name not in strategy_class_by_name:
            raise Exception("Invalid factory name '{}' while creating sample strategy.".format(strategy_name))
        # create the strategy
        strategy_class = strategy_class_by_name[strategy_name]
        strategy = strategy_class(**configuration.strategy.get("arguments", default={}, transform_default=False))
    # use the default strategy
    else:
        strategy = DefaultSampleStrategy()

    # this is only to pass fewer parameters back and forth
    sampler = Sampler(self, configuration, metadata, architecture, post_processing)

    # while more samples are needed
    start = 0
    while start < configuration.sample_size:
        # do not calculate gradients
        with torch.no_grad():
            # sample:
            # the task delegates to the strategy and passes the sampler object to avoid passing even more parameters
            # the strategy may prepare additional sampling arguments (e.g. a condition)
            # the strategy delegates to the sampler object
            # the sampler object delegates back to the task, adding parameters that it was keeping
            # the task child class does the actual sampling depending on the model
            # the sampler object applies post-processing
            # the strategy may apply filtering to the samples (e.g. rejection)
            # the task finally gets the sample
            batch_samples = strategy.generate_sample(sampler, configuration, metadata)

        # transform back the samples
        batch_samples = to_cpu_if_was_in_gpu(batch_samples)
        batch_samples = batch_samples.numpy()

        # if the batch is not empty
        if len(batch_samples) > 0:
            # do not go further than the desired number of samples
            end = min(start + len(batch_samples), configuration.sample_size)
            # limit the samples taken from the batch based on what is missing
            batch_samples = batch_samples[:min(len(batch_samples), end - start), :]
            # if it is the first batch
            if len(samples) == 0:
                samples = batch_samples
            # if it's not the first batch, concatenate with the previous samples
            else:
                samples = np.concatenate((samples, batch_samples), axis=0)
            # move to the next batch
            start = end

    # save the samples
    np.save(configuration.output, samples)
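# Hedged sketch of the pass-through behaviour described in the comments above: the default
# strategy adds no condition and applies no filtering, simply delegating to the sampler.
# The sampler.sample call and configuration.batch_size are assumed interfaces, not
# necessarily the ones used by the repo's DefaultSampleStrategy.
class PassThroughSampleStrategy:
    def generate_sample(self, sampler, configuration, metadata):
        # delegate straight to the sampler, which in turn delegates back to the task
        return sampler.sample(configuration.batch_size)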
def create(self, metadata_path: str, ratio: float) -> Any:
    return WrappedSMOTENC(load_metadata(metadata_path), ratio)
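# Hedged sketch of what a wrapper like WrappedSMOTENC might look like, assuming it builds
# on imblearn's SMOTENC and that the metadata can report which feature indices are
# categorical (the accessor name get_categorical_feature_indices is hypothetical). The
# class is renamed here to avoid shadowing the repo's actual implementation.
class SketchedSMOTENCWrapper:
    def __init__(self, metadata: Any, ratio: float) -> None:
        from imblearn.over_sampling import SMOTENC
        self.smote = SMOTENC(categorical_features=metadata.get_categorical_feature_indices(),
                             sampling_strategy=ratio)

    def fit_resample(self, features: np.ndarray, labels: np.ndarray) -> Any:
        # returns the over-sampled (features, labels) pair
        return self.smote.fit_resample(features, labels)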
def run(self, configuration: Configuration) -> None:
    metadata = load_metadata(configuration.metadata)
    architecture = create_architecture(metadata, load_configuration(configuration.architecture))
    size = compute_parameter_size(architecture)
    self.logger.info("{}: {:d}".format(configuration.name, size))
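# Hedged sketch of a parameter count in the spirit of compute_parameter_size, assuming the
# architecture exposes its parameters like a torch.nn.Module; the repo's helper is assumed
# to be equivalent to summing the element counts of all parameters.
def count_parameters(module):
    return sum(parameter.numel() for parameter in module.parameters())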
def run(self, configuration: Configuration) -> None:
    seed_all(configuration.get("seed"))

    datasets = Datasets()
    for dataset_name, dataset_path in configuration.data.items():
        datasets[dataset_name] = to_gpu_if_available(torch.from_numpy(np.load(dataset_path)).float())

    metadata = load_metadata(configuration.metadata)

    architecture_configuration = load_configuration(configuration.architecture)
    self.validate_architecture_configuration(architecture_configuration)
    architecture = create_architecture(metadata, architecture_configuration)
    architecture.to_gpu_if_available()

    create_parent_directories_if_needed(configuration.checkpoints.output)
    checkpoints = Checkpoints()

    # no input checkpoint by default
    checkpoint = None

    # continue from an output checkpoint (has priority over input checkpoint)
    if configuration.checkpoints.get("continue_from_output", default=False) \
            and checkpoints.exists(configuration.checkpoints.output):
        checkpoint = checkpoints.load(configuration.checkpoints.output)
    # continue from an input checkpoint
    elif "input" in configuration.checkpoints:
        checkpoint = checkpoints.load(configuration.checkpoints.input)
        if configuration.checkpoints.get("ignore_input_epochs", default=False):
            checkpoint["epoch"] = 0
        if configuration.checkpoints.get("use_best_input", default=False):
            checkpoint["architecture"] = checkpoint.pop("best_architecture")
            checkpoint.pop("best_epoch")
            checkpoint.pop("best_metric")

    # if there is no starting checkpoint then initialize
    if checkpoint is None:
        architecture.initialize()
        checkpoint = {
            "architecture": checkpoints.extract_states(architecture),
            "epoch": 0
        }
    # if there is a starting checkpoint then load it
    else:
        checkpoints.load_states(checkpoint["architecture"], architecture)

    log_path = create_parent_directories_if_needed(configuration.logs)
    logger = TrainLogger(self.logger, log_path, checkpoint["epoch"] > 0)

    # pre-processing
    if "imputation" in configuration:
        imputation = create_component(architecture, metadata, configuration.imputation)
    else:
        imputation = None
    pre_processing = PreProcessing(imputation)

    # post-processing
    if "scale_transform" in configuration:
        scale_transform = load_scale_transform(configuration.scale_transform)
    else:
        scale_transform = None
    post_processing = PostProcessing(metadata, scale_transform)

    for epoch in range(checkpoint["epoch"] + 1, configuration.epochs + 1):
        # train discriminator and generator
        logger.start_timer()
        metrics = self.train_epoch(configuration, metadata, architecture, datasets, pre_processing, post_processing)
        for metric_name, metric_value in metrics.items():
            logger.log(epoch, configuration.epochs, metric_name, metric_value)

        # update the checkpoint
        checkpoint["architecture"] = checkpoints.extract_states(architecture)
        checkpoint["epoch"] = epoch

        # if the best architecture parameters should be kept
        if "keep_checkpoint_by_metric" in configuration:
            # get the metric used to compare checkpoints
            checkpoint_metric = metrics[configuration.keep_checkpoint_by_metric]
            # check if this is the best checkpoint (or the first)
            if "best_metric" not in checkpoint or checkpoint_metric < checkpoint["best_metric"]:
                checkpoint["best_architecture"] = checkpoint["architecture"]
                checkpoint["best_epoch"] = epoch
                checkpoint["best_metric"] = checkpoint_metric

        # save checkpoint
        checkpoints.delayed_save(checkpoint, configuration.checkpoints.output, configuration.checkpoints.max_delay)

    # force save of last checkpoint
    checkpoints.save(checkpoint, configuration.checkpoints.output)

    # finish
    logger.close()
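# Hedged sketch of time-throttled checkpointing in the spirit of Checkpoints.delayed_save
# and Checkpoints.save as used above: write at most once every max_delay seconds during
# training, then force a final write. The actual Checkpoints class may serialize and
# throttle differently.
import time

class ThrottledCheckpointSaver:
    def __init__(self) -> None:
        self.last_save_time = 0.0

    def delayed_save(self, checkpoint: dict, path: str, max_delay: float) -> None:
        # skip the write if the last one happened less than max_delay seconds ago
        if time.time() - self.last_save_time >= max_delay:
            self.save(checkpoint, path)

    def save(self, checkpoint: dict, path: str) -> None:
        torch.save(checkpoint, path)
        self.last_save_time = time.time()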