def __init__(
    self,
    config: ConfigSchema,
    model: Optional[MultiRelationEmbedder] = None,
    trainer: Optional[AbstractBatchProcessor] = None,
    evaluator: Optional[AbstractBatchProcessor] = None,
    rank: Rank = SINGLE_TRAINER,
    subprocess_init: Optional[Callable[[], None]] = None,
    stats_handler: StatsHandler = NOOP_STATS_HANDLER,
):
    """Set up GPU training: validate the config, size the shared edge
    buffers from the largest per-chunk bucket, and fork the GPU workers.

    Raises:
        RuntimeError: if the C++ extension needed for GPU support is absent.
    """
    super().__init__(
        config, model, trainer, evaluator, rank, subprocess_init, stats_handler
    )

    assert config.num_gpus > 0
    if not CPP_INSTALLED:
        raise RuntimeError(
            "GPU support requires C++ installation: "
            "install with C++ support by running "
            "`PBG_INSTALL_CPP=1 pip install .`"
        )

    if config.half_precision:
        for entity in config.entities:
            # need this for tensor cores to work
            assert config.entity_dimension(entity) % 8 == 0
        assert config.batch_size % 8 == 0
        assert config.num_batch_negs % 8 == 0
        assert config.num_uniform_negs % 8 == 0

    # This trainer only handles fully partitioned entity types.
    assert len(self.holder.lhs_unpartitioned_types) == 0
    assert len(self.holder.rhs_unpartitioned_types) == 0

    # Find the largest number of edges any single chunk of any bucket can
    # hold, across every edge path; the shared buffers are sized to that.
    num_edge_chunks = self.iteration_manager.num_edge_chunks
    max_edges = 0
    for path in config.edge_paths:
        storage = EDGE_STORAGES.make_instance(path)
        bucket_sizes = (
            storage.get_number_of_edges(lhs, rhs)
            for lhs in range(self.holder.nparts_lhs)
            for rhs in range(self.holder.nparts_rhs)
        )
        for bucket_edges in bucket_sizes:
            max_edges = max(max_edges, div_roundup(bucket_edges, num_edge_chunks))

    self.shared_lhs = allocate_shared_tensor((max_edges,), dtype=torch.long)
    self.shared_rhs = allocate_shared_tensor((max_edges,), dtype=torch.long)
    self.shared_rel = allocate_shared_tensor((max_edges,), dtype=torch.long)

    # fork early for HOGWILD threads
    logger.info("Creating GPU workers...")
    torch.set_num_threads(1)
    embedding_storages = {
        storage
        for storages in self.embedding_storage_freelist.values()
        for storage in storages
    }
    shared_storages = {
        self.shared_lhs.storage(),
        self.shared_rhs.storage(),
        self.shared_rel.storage(),
    }
    self.gpu_pool = GPUProcessPool(
        config.num_gpus,
        subprocess_init,
        embedding_storages | shared_storages,
    )
def load_chunk_of_edges(
    self,
    lhs_p: Partition,
    rhs_p: Partition,
    chunk_idx: int = 0,
    num_chunks: int = 1,
    shared: bool = False,
) -> EdgeList:
    """Read one chunk of one bucket's edges from its HDF5 file.

    The bucket's edges are split into ``num_chunks`` equal-sized chunks
    (last one possibly shorter) and chunk ``chunk_idx`` is returned. With
    ``shared=True`` the tensors are backed by shared memory.

    Raises:
        RuntimeError: on a format-version mismatch in the edge file.
        CouldNotLoadData: if the edge file does not exist.
    """
    file_path = self.get_edges_file(lhs_p, rhs_p)
    try:
        with h5py.File(file_path, "r") as hf:
            if hf.attrs.get(FORMAT_VERSION_ATTR, None) != FORMAT_VERSION:
                raise RuntimeError(
                    f"Version mismatch in edge file {file_path}")

            total = hf["rel"].len()
            chunk_size = div_roundup(total, num_chunks)
            begin = chunk_idx * chunk_size
            end = min(begin + chunk_size, total)
            length = end - begin

            allocator = allocate_shared_tensor if shared else torch.empty

            def read_column(name: str) -> torch.Tensor:
                # NOTE(review): all columns, including "weight", are read
                # as torch.long — confirm integer weights are intended.
                col = allocator((length,), dtype=torch.long)
                # Zero-length read_direct fails; see
                # https://github.com/h5py/h5py/issues/870.
                if length > 0:
                    hf[name].read_direct(
                        col.numpy(), source_sel=np.s_[begin:end]
                    )
                return col

            lhs = read_column("lhs")
            rhs = read_column("rhs")
            rel = read_column("rel")
            lhsd = self.read_dynamic(hf, "lhsd", begin, end, shared=shared)
            rhsd = self.read_dynamic(hf, "rhsd", begin, end, shared=shared)
            weight = read_column("weight") if "weight" in hf else None

            return EdgeList(
                EntityList(lhs, lhsd), EntityList(rhs, rhsd), rel, weight
            )
    except OSError as err:
        # h5py refuses to make it easy to figure out what went wrong. The errno
        # attribute is set to None. See https://github.com/h5py/h5py/issues/493.
        if f"errno = {errno.ENOENT}" in str(err):
            raise CouldNotLoadData() from err
        raise err