def forward(self, architecture: Architecture, real_features: Tensor, fake_features: Tensor,
            **additional_inputs: Tensor) -> Tensor:
    # real loss
    real_predictions = architecture.discriminator(real_features, **additional_inputs)
    real_loss = -critic_loss_function(real_predictions)

    # fake loss
    fake_predictions = architecture.discriminator(fake_features, **additional_inputs)
    fake_loss = critic_loss_function(fake_predictions)

    # total loss
    return real_loss + fake_loss
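# Hedged note: critic_loss_function is not defined in this snippet; a common choice for a
# Wasserstein critic is the mean of the critic outputs, in which case the loss above reduces
# to E[D(fake)] - E[D(real)]. The sketch below is an assumption, not the repository's definition.
from torch import Tensor

def mean_critic_score(predictions: Tensor) -> Tensor:
    # average critic score over the batch
    return predictions.mean()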
def forward(self, architecture: Architecture, fake_features: Tensor, **additional_inputs: Tensor) -> Tensor:
    fake_predictions = architecture.discriminator(fake_features, **additional_inputs)
    positive_labels = generate_positive_labels(len(fake_predictions), self.smooth_positive_labels)
    return self.bce_loss(fake_predictions, positive_labels)
def train_generator_step(self, configuration: Configuration, metadata: Metadata, architecture: Architecture) -> float:
    # clean previous gradients
    architecture.generator_optimizer.zero_grad()

    # conditional
    if "conditional" in architecture.arguments:
        # for now a uniform distribution is used, but the condition could be controlled in a different way
        # this works for both binary and categorical dependent variables
        number_of_conditions = metadata.get_dependent_variable().get_size()
        condition = to_gpu_if_available(FloatTensor(configuration.batch_size).uniform_(0, number_of_conditions))
    # non-conditional
    else:
        condition = None

    # generate a full batch of fake features
    fake_features = self.sample_fake(architecture, configuration.batch_size, condition=condition)

    # calculate loss
    loss = architecture.generator_loss(architecture, fake_features, condition=condition)

    # calculate gradients
    loss.backward()

    # update the generator weights
    architecture.generator_optimizer.step()

    # return the loss
    return to_cpu_if_was_in_gpu(loss).item()
def forward(self, architecture: Architecture, real_features: Tensor, fake_features: Tensor,
            **additional_inputs: Tensor) -> Tensor:
    loss = super(WGANCriticLossWithGradientPenalty, self).forward(
        architecture, real_features, fake_features, **additional_inputs)

    # calculate gradient penalty
    alpha = rand(len(real_features), 1)
    alpha = alpha.expand(real_features.size())
    alpha = to_gpu_if_available(alpha)

    interpolates = alpha * real_features + ((1 - alpha) * fake_features)
    interpolates.requires_grad_()

    # we do not interpolate the conditions because they are the same for fake and real features
    discriminator_interpolates = architecture.discriminator(interpolates, **additional_inputs)

    gradients = grad(outputs=discriminator_interpolates,
                     inputs=interpolates,
                     grad_outputs=to_gpu_if_available(ones_like(discriminator_interpolates)),
                     create_graph=True,
                     retain_graph=True,
                     only_inputs=True)[0]

    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * self.weight

    # return total loss
    return loss + gradient_penalty
def compute_parameter_size(architecture: Architecture) -> int:
    size = 0
    for component in architecture.values():
        if isinstance(component, Module):  # skip optimizers
            for parameter in component.parameters():
                if parameter.requires_grad:
                    size += parameter.numel()
    return size
def forward(self, architecture: Architecture, real_features: Tensor, fake_features: Tensor,
            **additional_inputs: Tensor) -> Tensor:
    # real loss
    real_predictions = architecture.discriminator(real_features, **additional_inputs)
    positive_labels = generate_positive_labels(len(real_predictions), self.smooth_positive_labels)
    real_loss = self.bce_loss(real_predictions, positive_labels)

    # fake loss
    fake_predictions = architecture.discriminator(fake_features, **additional_inputs)
    negative_labels = to_gpu_if_available(zeros(len(fake_predictions)))
    fake_loss = self.bce_loss(fake_predictions, negative_labels)

    # total loss
    return real_loss + fake_loss
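# Hedged sketch: generate_positive_labels is not shown here. One-sided label smoothing usually
# replaces the hard 1.0 targets with values close to 1 so the discriminator does not become
# over-confident. The helper below is a hypothetical illustration, not the repository's code.
from torch import Tensor, FloatTensor, ones

def generate_positive_labels_sketch(size: int, smooth: bool) -> Tensor:
    if smooth:
        return FloatTensor(size).uniform_(0.9, 1.0)  # soft targets drawn near 1
    return ones(size)  # hard targets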
def generate_sample(self, configuration: Configuration, metadata: Metadata, architecture: Architecture,
                    **additional_inputs: Tensor) -> Tensor:
    noise = to_gpu_if_available(FloatTensor(configuration.batch_size, architecture.arguments.noise_size).normal_())
    architecture.autoencoder.eval()
    architecture.generator.eval()
    code = architecture.generator(noise, **additional_inputs)
    return architecture.autoencoder.decode(code, **additional_inputs)
def train_generator_step(configuration: Configuration, metadata: Metadata, architecture: Architecture,
                         batch: Batch) -> float:
    # clean previous gradients
    architecture.generator_optimizer.zero_grad()

    # generate a batch of fake features with the same size as the real feature batch
    generated = architecture.generator(batch["features"], missing_mask=batch["missing_mask"])

    # replace the missing features with the generated ones
    imputed = compose_with_mask(mask=batch["missing_mask"],
                                differentiable=True,  # now there are no NaNs and this should be used
                                where_one=generated,
                                where_zero=batch["raw_features"])

    # generate hint
    hint = generate_hint(batch["missing_mask"], configuration.hint_probability, metadata)

    # calculate loss
    loss = architecture.generator_loss(architecture=architecture,
                                       features=batch["raw_features"],
                                       generated=generated,
                                       imputed=imputed,
                                       hint=hint,
                                       non_missing_mask=inverse_mask(batch["missing_mask"]))

    # calculate gradients
    loss.backward()

    # update the generator weights
    architecture.generator_optimizer.step()

    # return the loss
    return to_cpu_if_was_in_gpu(loss).item()
def forward(self, architecture: Architecture, features: Tensor, generated: Tensor, imputed: Tensor,
            hint: Tensor, non_missing_mask: Tensor) -> Tensor:
    # the discriminator should predict the missing mask,
    # which means that it detects which positions were imputed and which ones were real
    predictions = architecture.discriminator(imputed, missing_mask=hint)

    # but the generator wants to fool the discriminator,
    # so we optimize for the inverse mask
    adversarial_loss = self.bce_loss(predictions, non_missing_mask)

    # reconstruction of the non-missing values
    reconstruction_loss = self.reconstruction_loss(generated, features, non_missing_mask)

    # return the complete loss
    return adversarial_loss + self.reconstruction_loss_weight * reconstruction_loss
def train_discriminator_step(self, configuration: Configuration, metadata: Metadata, architecture: Architecture,
                             batch: Batch) -> float:
    # clean previous gradients
    architecture.discriminator_optimizer.zero_grad()

    # generate a batch of fake features with the same size as the real feature batch
    fake_features = self.sample_fake(architecture, len(batch["features"]), condition=batch.get("labels"))
    fake_features = fake_features.detach()  # do not propagate gradients to the generator

    # calculate loss
    loss = architecture.discriminator_loss(architecture,
                                           batch["features"],
                                           fake_features,
                                           condition=batch.get("labels"))

    # calculate gradients
    loss.backward()

    # update the discriminator weights
    architecture.discriminator_optimizer.step()

    # return the loss
    return to_cpu_if_was_in_gpu(loss).item()
def extract_states(sources: Architecture) -> Checkpoint:
    targets = {}
    for name, source in sources.items():
        targets[name] = source.state_dict()
    return targets
def val_batch(architecture: Architecture, batch: Batch, post_processing: PostProcessing) -> float:
    generated = architecture.generator(batch["features"], missing_mask=batch["missing_mask"])
    loss = architecture.val_loss(post_processing, generated, batch)
    return to_cpu_if_was_in_gpu(loss).item()
def load_states(sources: Checkpoint, targets: Architecture) -> None:
    for name, target in targets.items():
        target.load_state_dict(sources[name])
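# Hedged usage sketch: extract_states and load_states produce and consume a plain dictionary of
# state_dicts, so a checkpoint can be persisted with the standard torch.save / torch.load pair.
# The helper names and file path below are illustrative assumptions, not part of the repository.
import torch

def save_checkpoint(architecture: Architecture, path: str) -> None:
    torch.save(extract_states(architecture), path)

def restore_checkpoint(architecture: Architecture, path: str) -> None:
    load_states(torch.load(path), architecture)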
def impute(self, configuration: Configuration, metadata: Metadata, architecture: Architecture,
           batch: Dict[str, Tensor]) -> Tensor:
    # loss function
    loss_function = create_component(architecture, metadata, configuration.reconstruction_loss)
    masked_loss_function = MaskedReconstructionLoss(loss_function)
    batch_size = batch["features"].shape[0] * batch["features"].shape[1]

    # we need the non-missing mask for the loss
    non_missing_mask = inverse_mask(batch["missing_mask"])

    # initial noise
    noise = to_gpu_if_available(FloatTensor(len(batch["features"]), architecture.arguments.noise_size).normal_())
    noise.requires_grad_()

    # we are not updating the generator, only the noise
    optimizer = Adam([noise], weight_decay=0, lr=configuration.noise_learning_rate)

    architecture.generator.eval()

    # logger
    log_path = create_parent_directories_if_needed(configuration.logs)
    logger = TrainLogger(self.logger, log_path, False)

    # initial generation
    logger.start_timer()
    generated = architecture.generator(noise, condition=batch.get("labels"))

    # iterate until we reach the maximum number of iterations or until the non-missing loss is small enough
    max_iterations = configuration.max_iterations
    for iteration in range(1, max_iterations + 1):
        # compute the loss on the non-missing values
        non_missing_loss = masked_loss_function(generated, batch["features"], non_missing_mask)
        logger.log(iteration, max_iterations, "non_missing_loss", to_cpu_if_was_in_gpu(non_missing_loss).item())

        # this loss only makes sense if the ground truth is present
        # only used for debugging
        if configuration.get("log_missing_loss", False):
            # this part should not affect the gradient calculation
            with torch.no_grad():
                missing_loss = masked_loss_function(generated, batch["raw_features"], batch["missing_mask"])
                logger.log(iteration, max_iterations, "missing_loss", to_cpu_if_was_in_gpu(missing_loss).item())

                loss = loss_function(generated, batch["raw_features"]) / batch_size
                logger.log(iteration, max_iterations, "loss", to_cpu_if_was_in_gpu(loss).item())

        # if the generation is good enough we stop
        if to_cpu_if_was_in_gpu(non_missing_loss).item() < configuration.get("tolerance", 1e-5):
            break

        # clear previous gradients
        optimizer.zero_grad()

        # compute the gradients
        non_missing_loss.backward()

        # update the noise
        optimizer.step()

        # generate next
        logger.start_timer()
        generated = architecture.generator(noise, condition=batch.get("labels"))

    return generated
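# Hedged sketch: MaskedReconstructionLoss is not defined in this snippet. A masked reconstruction
# loss restricts the base loss to the positions selected by the mask and normalizes by the number
# of selected values; the mean-squared-error version below is an illustrative assumption, not the
# repository's implementation.
import torch

def masked_mse(generated: torch.Tensor, features: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # compare only the positions where mask == 1 and avoid dividing by zero for empty masks
    return ((generated - features) ** 2 * mask).sum() / mask.sum().clamp(min=1)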
def forward(self, architecture: Architecture, imputed: Tensor, hint: Tensor, missing_mask: Tensor) -> Tensor:
    # the discriminator should predict the missing mask,
    # which means that it detects which positions were imputed and which ones were real
    predictions = architecture.discriminator(imputed, missing_mask=hint)
    return self.bce_loss(predictions, missing_mask)
def impute(self, configuration: Configuration, metadata: Metadata, architecture: Architecture,
           batch: Dict[str, Tensor]) -> Tensor:
    return architecture.generator(batch["features"], missing_mask=batch["missing_mask"])
def impute(self, configuration: Configuration, metadata: Metadata, architecture: Architecture,
           batch: Dict[str, Tensor]) -> Tensor:
    return architecture.autoencoder(batch["features"], condition=batch.get("labels"))["reconstructed"]
def sample_fake(self, architecture: Architecture, size: int, **additional_inputs: Tensor) -> Tensor:
    # for now the noise comes from a normal distribution, but it could come from a different distribution
    noise = to_gpu_if_available(FloatTensor(size, architecture.arguments.noise_size).normal_())
    return architecture.generator(noise, **additional_inputs)
def forward(self, architecture: Architecture, fake_features: Tensor, **additional_inputs: Tensor) -> Tensor:
    fake_predictions = architecture.discriminator(fake_features, **additional_inputs)
    return -critic_loss_function(fake_predictions)
def create_architecture(metadata: Metadata, configuration: Configuration) -> Architecture:
    architecture = Architecture(configuration.arguments)

    # create the dependency graph
    # nodes are component names and edges are dependencies between components
    nodes = set()
    in_edges = dict()
    out_edges = dict()
    for node in configuration.components.keys():
        nodes.add(node)
        in_edges[node] = set()
        out_edges[node] = set()

    # create the dependency edges
    nodes_without_out_edges = set()
    for node, component_configuration in configuration.components.items():
        factory = factory_by_name[component_configuration.factory]
        dependencies = factory.dependencies(component_configuration.get("arguments", {}))
        if len(dependencies) == 0:
            nodes_without_out_edges.add(node)
        else:
            for other_node in dependencies:
                out_edges[node].add(other_node)  # the node needs the other node
                in_edges[other_node].add(node)  # the other node is needed by the node

    # create components until the graph is empty (topological sort)
    while len(nodes) > 0:
        # if there are no nodes without out edges there must be a loop
        if len(nodes_without_out_edges) == 0:
            raise Exception("Dependencies cannot be met for components: {}.".format(", ".join(nodes)))

        # get any node without out edges
        node = nodes_without_out_edges.pop()
        assert len(out_edges[node]) == 0

        # create the component
        architecture[node] = create_component(architecture, metadata, configuration.components[node])

        # while the node has other nodes pointing at it
        while len(in_edges[node]) > 0:
            # remove any incoming edge for the node
            other_node = in_edges[node].pop()
            # remove the outgoing edge for the other node
            out_edges[other_node].remove(node)
            # if the other node has no more dependencies
            if len(out_edges[other_node]) == 0:
                nodes_without_out_edges.add(other_node)

        # remove the node
        nodes.remove(node)
        in_edges.pop(node)
        out_edges.pop(node)

    return architecture
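# Hedged illustration: the same topological-sort idea on a plain dependency dictionary, detached
# from the Architecture / Configuration classes. The component names are hypothetical; components
# with no remaining dependencies are created first, and a cycle raises the same kind of error.
def creation_order(dependencies: dict) -> list:
    remaining = {name: set(needed) for name, needed in dependencies.items()}
    order = []
    while remaining:
        ready = [name for name, needed in remaining.items() if not needed]
        if not ready:
            raise Exception("Dependencies cannot be met for components: {}.".format(", ".join(remaining)))
        for name in ready:
            order.append(name)
            del remaining[name]
        for needed in remaining.values():
            needed.difference_update(ready)
    return order

# example: optimizers depend on the modules they optimize, so the modules are created first
print(creation_order({"generator": set(),
                      "discriminator": set(),
                      "generator_optimizer": {"generator"},
                      "discriminator_optimizer": {"discriminator"}}))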