def _process_one_batch(
    self, model: MultiRelationEmbedder, batch_edges: EdgeList
) -> Stats:
    """Run one forward/backward pass on a batch and step every optimizer.

    Returns per-batch Stats: the loss, the regularizer value (0.0 when the
    model produced none), the number of margin violators on each side, and
    the edge count.
    """
    model.zero_grad()
    scores, reg = model(batch_edges)
    loss = self.calc_loss(scores, batch_edges)

    # A "violator" is a negative sample that scored above its positive.
    lhs_violations = (scores.lhs_neg > scores.lhs_pos.unsqueeze(1)).sum()
    rhs_violations = (scores.rhs_neg > scores.rhs_pos.unsqueeze(1)).sum()
    batch_stats = Stats(
        loss=float(loss),
        reg=0.0 if reg is None else float(reg),
        violators_lhs=int(lhs_violations),
        violators_rhs=int(rhs_violations),
        count=len(batch_edges),
    )

    # Backprop through the regularized objective when a regularizer exists.
    objective = loss if reg is None else loss + reg
    objective.backward()

    self.model_optimizer.step(closure=None)
    for opt in self.unpartitioned_optimizers.values():
        opt.step(closure=None)
    for opt in self.partitioned_optimizers.values():
        opt.step(closure=None)

    return batch_stats
def process_one_batch(
    self,
    model: MultiRelationEmbedder,
    batch_edges: EdgeList,
) -> Stats:
    """Train the model on one batch of edges.

    Combines the left- and right-hand-side losses, weighted by the batch's
    relation, backpropagates, then steps the global optimizer followed by
    each per-entity optimizer.
    """
    model.zero_grad()
    scores = model(batch_edges)

    # When the batch carries a single scalar relation type, weight the loss
    # by that relation; otherwise fall back to relation 0.
    if batch_edges.has_scalar_relation_type():
        rel_index = batch_edges.get_relation_type_as_scalar()
    else:
        rel_index = 0
    relation = self.relations[rel_index]

    lhs_loss = self.loss_fn(scores.lhs_pos, scores.lhs_neg)
    rhs_loss = self.loss_fn(scores.rhs_pos, scores.rhs_neg)
    loss = relation.weight * (lhs_loss + rhs_loss)

    # A "violator" is a negative sample that scored above its positive.
    lhs_violators = (scores.lhs_neg > scores.lhs_pos.unsqueeze(1)).sum()
    rhs_violators = (scores.rhs_neg > scores.rhs_pos.unsqueeze(1)).sum()
    batch_stats = Stats(
        loss=float(loss),
        violators_lhs=int(lhs_violators),
        violators_rhs=int(rhs_violators),
        count=len(batch_edges),
    )

    loss.backward()
    self.global_optimizer.step(closure=None)
    for entity_optimizer in self.entity_optimizers.values():
        entity_optimizer.step(closure=None)

    return batch_stats
def _process_one_batch(self, model: MultiRelationEmbedder,
                       batch_edges: EdgeList) -> Stats:
    """Run one forward/backward pass on a batch and step every optimizer.

    Differs from a plain train step in two ways: gradients are cleared by
    setting them to None (see comment below), and an occasional global L2
    penalty is mixed into the loss on a random schedule.
    """
    # Tricky: this is basically like calling `model.zero_grad()` except
    # that `zero_grad` calls `p.grad.zero_()`. When we perform infrequent
    # global L2 regularization, it converts the embedding gradients to dense,
    # and then they can never convert back to sparse gradients unless we set
    # them to `None` again here.
    for p in model.parameters():
        p.grad = None
    scores, reg = model(batch_edges)
    loss = self.calc_loss(scores, batch_edges)
    stats = Stats(
        loss=float(loss),
        # Report 0.0 when the model produced no regularizer this batch.
        reg=float(reg) if reg is not None else 0.0,
        # "Violators": negative samples that scored above their positive.
        violators_lhs=int(
            (scores.lhs_neg > scores.lhs_pos.unsqueeze(1)).sum()),
        violators_rhs=int(
            (scores.rhs_neg > scores.rhs_pos.unsqueeze(1)).sum()),
        count=len(batch_edges),
    )
    if reg is not None:
        loss = loss + reg
    # Apply global weight decay only on ~1/wd_interval of batches, scaled up
    # by wd_interval so the expected penalty per batch is unchanged. This
    # amortizes the cost of the full l2_norm() computation.
    if model.wd > 0 and random.random() < 1. / model.wd_interval:
        loss = loss + model.wd * model.wd_interval * model.l2_norm()
    loss.backward()
    self.model_optimizer.step(closure=None)
    for optimizer in self.unpartitioned_optimizers.values():
        optimizer.step(closure=None)
    for optimizer in self.partitioned_optimizers.values():
        optimizer.step(closure=None)
    return stats
def do_one_job(  # noqa
    self,
    lhs_types: Set[str],
    rhs_types: Set[str],
    lhs_part: Partition,
    rhs_part: Partition,
    lhs_subpart: SubPartition,
    rhs_subpart: SubPartition,
    next_lhs_subpart: Optional[SubPartition],
    next_rhs_subpart: Optional[SubPartition],
    model: MultiRelationEmbedder,
    trainer: Trainer,
    all_embs: Dict[Tuple[EntityName, Partition], FloatTensorType],
    subpart_slices: Dict[Tuple[EntityName, Partition, SubPartition], slice],
    subbuckets: Dict[Tuple[int, int],
                     Tuple[LongTensorType, LongTensorType, LongTensorType]],
    batch_size: int,
    lr: float,
) -> Stats:
    """Train one (lhs_subpart, rhs_subpart) sub-bucket on this GPU.

    Copies the needed embedding sub-partitions (and their Adagrad state) to
    the GPU, trains on the shuffled edges of the sub-bucket, then copies back
    to host memory every sub-partition that the *next* job will not need.
    `next_lhs_subpart`/`next_rhs_subpart` may be None when there is no next
    job, in which case everything is flushed back.
    """
    tk = TimeKeeper()

    # Pinned (page-locked) host memory is required for the non_blocking
    # host<->device copies below to actually be asynchronous.
    for embeddings in all_embs.values():
        assert embeddings.is_pinned()

    # Which (entity, part, subpart) triples this job touches, and on which
    # side(s) of the edges they appear.
    occurrences: Dict[
        Tuple[EntityName, Partition, SubPartition], Set[Side]
    ] = defaultdict(set)
    for entity_name in lhs_types:
        occurrences[entity_name, lhs_part, lhs_subpart].add(Side.LHS)
    for entity_name in rhs_types:
        occurrences[entity_name, rhs_part, rhs_subpart].add(Side.RHS)

    if lhs_part != rhs_part:  # Bipartite
        assert all(len(v) == 1 for v in occurrences.values())

    tk.start("copy_to_device")
    for entity_name, part, subpart in occurrences.keys():
        # self.sub_holder caches sub-partitions already resident on this GPU
        # from a previous job; skip re-uploading those.
        if (entity_name, part, subpart) in self.sub_holder:
            continue
        embeddings = all_embs[entity_name, part]
        optimizer = trainer.partitioned_optimizers[entity_name, part]
        subpart_slice = subpart_slices[entity_name, part, subpart]

        # TODO have two permanent storages on GPU and move stuff in and out
        # from them
        # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * embeddings.shape[1] * 4:,} bytes")
        gpu_embeddings = torch.empty(
            (subpart_slice.stop - subpart_slice.start, embeddings.shape[1]),
            dtype=torch.float32,
            device=self.my_device,
        )
        gpu_embeddings.copy_(embeddings[subpart_slice], non_blocking=True)
        gpu_embeddings = torch.nn.Parameter(gpu_embeddings)
        gpu_optimizer = RowAdagrad([gpu_embeddings], lr=lr)
        # Each optimizer tracks exactly one parameter, hence one state entry.
        (cpu_state, ) = optimizer.state.values()
        (gpu_state, ) = gpu_optimizer.state.values()
        # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * 4:,} bytes")
        # Carry the Adagrad accumulator over so step sizes stay consistent.
        gpu_state["sum"].copy_(cpu_state["sum"][subpart_slice],
                               non_blocking=True)

        self.sub_holder[entity_name, part, subpart] = (
            gpu_embeddings,
            gpu_optimizer,
        )
    logger.debug(
        f"Time spent copying subparts to GPU: {tk.stop('copy_to_device'):.4f} s"
    )

    # Point the model (and the trainer's optimizer map) at the GPU-resident
    # copies for every sub-partition used by this job.
    for (
        (entity_name, part, subpart),
        (gpu_embeddings, gpu_optimizer),
    ) in self.sub_holder.items():
        for side in occurrences[entity_name, part, subpart]:
            model.set_embeddings(entity_name, side, gpu_embeddings)
            # NOTE(review): keyed by the 3-tuple, while the CPU optimizer
            # lives under the 2-tuple (entity_name, part) — presumably
            # intentional so both coexist; confirm against Trainer usage.
            trainer.partitioned_optimizers[
                entity_name, part, subpart
            ] = gpu_optimizer

    tk.start("translate_edges")
    num_edges = subbuckets[lhs_subpart, rhs_subpart][0].shape[0]
    edge_perm = torch.randperm(num_edges)
    edges_lhs, edges_rhs, edges_rel = subbuckets[lhs_subpart, rhs_subpart]
    # In-place parallel shuffle (same permutation for all three columns).
    _C.shuffle(edges_lhs, edge_perm, os.cpu_count())
    _C.shuffle(edges_rhs, edge_perm, os.cpu_count())
    _C.shuffle(edges_rel, edge_perm, os.cpu_count())
    assert edges_lhs.is_pinned()
    assert edges_rhs.is_pinned()
    assert edges_rel.is_pinned()
    gpu_edges = EdgeList(
        EntityList.from_tensor(edges_lhs),
        EntityList.from_tensor(edges_rhs),
        edges_rel,
    ).to(self.my_device, non_blocking=True)
    logger.debug(f"GPU #{self.gpu_idx} got {num_edges} edges")
    logger.debug(
        f"Time spent copying edges to GPU: {tk.stop('translate_edges'):.4f} s"
    )

    tk.start("processing")
    stats = process_in_batches(
        batch_size=batch_size, model=model, batch_processor=trainer,
        edges=gpu_edges
    )
    logger.debug(f"Time spent processing: {tk.stop('processing'):.4f} s")

    # Sub-partitions the *next* job will need; those stay on the GPU.
    next_occurrences: Dict[
        Tuple[EntityName, Partition, SubPartition], Set[Side]
    ] = defaultdict(set)
    if next_lhs_subpart is not None:
        for entity_name in lhs_types:
            next_occurrences[entity_name, lhs_part,
                             next_lhs_subpart].add(Side.LHS)
    if next_rhs_subpart is not None:
        for entity_name in rhs_types:
            next_occurrences[entity_name, rhs_part,
                             next_rhs_subpart].add(Side.RHS)

    tk.start("copy_from_device")
    # Iterate over a list() copy: entries are deleted from sub_holder below.
    for (entity_name, part, subpart), (gpu_embeddings,
                                       gpu_optimizer) in list(
                                           self.sub_holder.items()):
        if (entity_name, part, subpart) in next_occurrences:
            continue
        embeddings = all_embs[entity_name, part]
        optimizer = trainer.partitioned_optimizers[entity_name, part]
        subpart_slice = subpart_slices[entity_name, part, subpart]

        embeddings[subpart_slice].data.copy_(gpu_embeddings.detach(),
                                             non_blocking=True)
        del gpu_embeddings
        (cpu_state, ) = optimizer.state.values()
        (gpu_state, ) = gpu_optimizer.state.values()
        cpu_state["sum"][subpart_slice].copy_(gpu_state["sum"],
                                              non_blocking=True)
        # Drop references so the GPU tensors can be freed promptly.
        del gpu_state["sum"]
        del self.sub_holder[entity_name, part, subpart]
    logger.debug(
        f"Time spent copying subparts from GPU: {tk.stop('copy_from_device'):.4f} s"
    )

    logger.debug(
        f"do_one_job: Time unaccounted for: {tk.unaccounted():.4f} s")

    return stats