Example 1
  def __init__(self, url: str, is_pre_train: bool = False, must_exist: bool = False, is_replica: bool = False):

    self.is_pre_train = is_pre_train
    if environment.WORLD_RANK == 0 or is_replica:
      encoded_path = pathlib.Path(url.replace("sqlite:///", "")).parent
      self.length_monitor   = monitors.CumulativeHistMonitor(encoded_path, "encoded_kernel_length")
      if not self.is_pre_train:
        self.token_monitor    = monitors.NormalizedFrequencyMonitor(encoded_path, "token_distribution")
        self.feature_monitors = {ftype: monitors.CategoricalDistribMonitor(encoded_path, "{}_distribution".format(ftype)) for ftype in extractor.extractors.keys()}
      super(EncodedContentFiles, self).__init__(url, Base, must_exist=must_exist)
    if environment.WORLD_SIZE > 1 and not is_replica:
      # Set up engine connections to the replicated encoded chunks.
      self.base_path = pathlib.Path(url.replace("sqlite:///", "")).resolve().parent
      hash_id = self.base_path.name
      try:
        tdir = pathlib.Path(FLAGS.local_filesystem).resolve() / hash_id / "node_encoded"
      except Exception:
        tdir = pathlib.Path("/tmp").resolve() / hash_id / "node_encoded"
      distrib.lock()
      tdir.mkdir(parents = True, exist_ok = True)
      distrib.unlock()
      self.replicated_path = tdir / "encoded_{}.db".format(environment.WORLD_RANK)
      self.replicated = EncodedContentFiles(
        url = "sqlite:///{}".format(str(self.replicated_path)),
        is_pre_train = is_pre_train,
        must_exist = must_exist,
        is_replica = True
      )
      self.length_monitor = self.replicated.length_monitor
      if not self.is_pre_train:
        self.token_monitor    = self.replicated.token_monitor
        self.feature_monitors = self.replicated.feature_monitors
      distrib.barrier()
    return
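The constructor above derives a per-rank replica database under a node-local scratch directory whenever several ranks share one master DB. Below is a minimal standalone sketch of that path derivation, using only the standard library; WORLD_RANK and WORLD_SIZE are read from environment variables as stand-ins for the project's environment module, the function name is illustrative, and the distributed lock/barrier steps are omitted.

import os
import pathlib
import sqlite3

WORLD_RANK = int(os.environ.get("WORLD_RANK", 0))
WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))

def open_encoded_replica(master_url: str, scratch_root: str = "/tmp") -> sqlite3.Connection:
  # Derive the per-node scratch directory from the master DB's parent folder name.
  base_path = pathlib.Path(master_url.replace("sqlite:///", "")).resolve().parent
  tdir = pathlib.Path(scratch_root).resolve() / base_path.name / "node_encoded"
  tdir.mkdir(parents=True, exist_ok=True)
  # Each rank opens its own SQLite chunk, e.g. encoded_3.db for rank 3.
  replica_path = tdir / "encoded_{}.db".format(WORLD_RANK)
  return sqlite3.connect(str(replica_path))

if WORLD_SIZE > 1:
  connection = open_encoded_replica("sqlite:////data/corpus/abc123/encoded.db")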
Example 2
  def _ConfigTrainParams(self, 
                         data_generator: torchLMDataGenerator,
                         pre_train: bool,
                        ) -> None:
    """
    Model parameter initialization for training and validation.
    """
    self._ConfigModelParams(is_sampling = False)

    self.train_batch_size                 = self.config.training.batch_size
    self.eval_batch_size                  = self.config.training.batch_size
    self.learning_rate                    = self.config.training.adam_optimizer.initial_learning_rate_micros / 1e6
    self.num_warmup_steps                 = self.config.training.num_warmup_steps if not pre_train else self.config.training.num_prewarmup_steps
    self.max_grad_norm                    = 1.0

    self.steps_per_epoch                  = data_generator.steps_per_epoch
    self.current_step                     = None
    self.num_epochs                       = data_generator.num_epochs
    self.num_train_steps                  = self.steps_per_epoch * self.num_epochs
    self.max_eval_steps                   = FLAGS.max_eval_steps

    self.validation_results_file          = "val_results.txt"
    self.validation_results_path          = os.path.join(str(self.logfile_path if not pre_train else self.pre_logfile_path), self.validation_results_file)

    m = model.BertForPreTraining(
          self.bert_config,
          tokenizer = self.tokenizer,
          target_lm = "hole" if self.config.training.data_generator.HasField("hole") else "mask"
          ).to(self.pytorch.offset_device)

    if self.pytorch.num_nodes > 1:
      distrib.barrier()
      m = self.torch.nn.parallel.DistributedDataParallel(
        m,
        device_ids    = [self.pytorch.offset_device],
        output_device = self.pytorch.offset_device,
      )
    elif self.pytorch.num_gpus > 1:
      m = self.torch.nn.DataParallel(m)

    opt, lr_scheduler = optimizer.create_optimizer_and_scheduler(
      model           = m,
      num_train_steps = self.num_train_steps,
      warmup_steps    = self.num_warmup_steps,
      learning_rate   = self.learning_rate,
    )

    self.train = torchBert.BertEstimator(
                  m, data_generator, opt, lr_scheduler
                )
    l.logger().info(self.GetShortSummary())
    return
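The multi-node / multi-GPU wrapping above follows standard PyTorch practice: DistributedDataParallel when each node runs its own process, DataParallel when a single process drives several GPUs. A condensed sketch of just that decision in plain torch; it assumes the process group has already been initialized and that a CUDA device index is available.

import torch

def wrap_for_parallelism(model: torch.nn.Module,
                         num_nodes: int,
                         num_gpus: int,
                         device: int = 0) -> torch.nn.Module:
  # Move parameters to the target device first (assumes a CUDA device index).
  model = model.to(device)
  if num_nodes > 1:
    # One process per device: gradients are all-reduced across processes.
    model = torch.nn.parallel.DistributedDataParallel(
      model,
      device_ids    = [device],
      output_device = device,
    )
  elif num_gpus > 1:
    # Single process, several GPUs: the module is replicated per batch.
    model = torch.nn.DataParallel(model)
  return model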
Example 3
 def tokenizer(self) -> tokenizers.TokenizerBase:
   """Must call Create() first."""
   if not self._created:
     raise ValueError("Must call Create() before accessing tokenizer property.")
   if self._tokenizer is None:
     if self.tokenizer_path.is_file():
       self._tokenizer = tokenizers.TokenizerBase.FromFile(self.tokenizer_path)
     else:
       if environment.WORLD_RANK == 0:
         self._tokenizer = self._CreateTokenizer()
         l.logger().warn("Created tokenizer.")
       distrib.barrier()
       if environment.WORLD_RANK != 0:
         self._tokenizer = tokenizers.TokenizerBase.FromFile(self.tokenizer_path)
   return self._tokenizer
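The property above follows a create-once / load-everywhere pattern: rank 0 builds the tokenizer and writes it to disk, every other rank waits at the barrier and then reads the same file. A generic, standard-library-only sketch of that pattern; pickle and a caller-supplied barrier callable stand in for the project's tokenizer serialization and distrib.barrier.

import pathlib
import pickle
import typing

def get_or_create(path: pathlib.Path,
                  create_fn: typing.Callable[[], typing.Any],
                  rank: int = 0,
                  barrier: typing.Callable[[], None] = lambda: None) -> typing.Any:
  # Fast path: the artifact already exists on shared storage.
  if path.is_file():
    with open(path, "rb") as f:
      return pickle.load(f)
  obj = None
  if rank == 0:
    # Only rank 0 pays the construction cost and persists the result.
    obj = create_fn()
    with open(path, "wb") as f:
      pickle.dump(obj, f)
  barrier()  # ensure rank 0 finished writing before anyone reads.
  if rank != 0:
    with open(path, "rb") as f:
      obj = pickle.load(f)
  return obj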
Example 4
  def GetContentFileRoot(self, config: corpus_pb2.Corpus) -> typing.Iterator[pathlib.Path]:
    """Get the path of the directory containing content files.

    If the corpus is a local directory, this simply returns the path. Otherwise,
    this method creates a temporary copy of the files which can be used within
    the scope of this context.

    Args:
      config: The corpus config proto.

    Yields:
      The path of a directory containing content files.
    """
    if config.HasField("local_directory"):
      yield pathlib.Path(ExpandConfigPath(config.local_directory))
    elif config.HasField("local_tar_archive"):
      with tempfile.TemporaryDirectory(prefix="clgen_corpus_", dir = FLAGS.local_filesystem) as d:
        l.logger().info("Unpacking {}...".format(ExpandConfigPath(config.local_tar_archive).name))
        start_time = time.time()
        if environment.WORLD_RANK == 0:
          cmd = [
            "tar",
            "-xf",
            str(ExpandConfigPath(config.local_tar_archive)),
            "-C",
            d,
          ]
          subprocess.check_call(cmd)
        distrib.barrier()
        l.logger().info(
          "Unpacked {} in {} ms".format(
                  ExpandConfigPath(config.local_tar_archive).name,
                  humanize.intcomma(int((time.time() - start_time) * 1000)),
              )
        )
        yield pathlib.Path(d)
    elif config.HasField("bq_database"):
      input_bq = pathlib.Path(ExpandConfigPath(config.bq_database))
      if environment.WORLD_SIZE > 1:
        target_bq = self.replicated_path.parent / "bq_database_replica_{}.db".format(environment.WORLD_RANK)
        if not target_bq.exists():
          shutil.copy(input_bq, target_bq)
        yield target_bq
      else:
        yield input_bq
    else:
      raise NotImplementedError
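Because the method yields, it is meant to be consumed as a context manager (the docstring's "scope of this context"): a temporary unpack directory lives only inside the with block. A standalone sketch of that shape, reduced to the local-directory and tar-archive branches and using only the standard library; parameter names mirror the proto fields but the function itself is illustrative.

import contextlib
import pathlib
import subprocess
import tempfile
import typing

@contextlib.contextmanager
def content_file_root(local_directory: typing.Optional[str] = None,
                      local_tar_archive: typing.Optional[str] = None,
                      ) -> typing.Iterator[pathlib.Path]:
  if local_directory is not None:
    # Local directories are used in place; nothing to clean up.
    yield pathlib.Path(local_directory)
  elif local_tar_archive is not None:
    # Unpack into a temporary directory that is deleted when the block exits.
    with tempfile.TemporaryDirectory(prefix="clgen_corpus_") as d:
      subprocess.check_call(["tar", "-xf", local_tar_archive, "-C", d])
      yield pathlib.Path(d)
  else:
    raise NotImplementedError

# Example use:
# with content_file_root(local_tar_archive="corpus.tar.bz2") as root:
#   files = list(root.rglob("*"))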
Example 5
 def MergeReplicas(self) -> None:
   """
   When distributed nodes work on the same preprocessed DB, this function
   moves the finalized preprocessed chunks back into the AFS and the master
   node merges them into the final preprocessed.db.
   """
   shutil.copy(
     self.replicated.url.replace("sqlite:///", ""), self.base_path / "preprocessed_{}.db".format(environment.WORLD_RANK)
   )
   distrib.barrier()
   if environment.WORLD_RANK == 0:
     db_chunks = glob.glob(str(self.base_path / "preprocessed_*.db"))
     dbs = [PreprocessedContentFiles(url = "sqlite:///{}".format(p), must_exist = True, is_replica = True) for p in db_chunks]
     merge_db(dbs, self)
     for p in db_chunks:
       os.remove(p)
   distrib.barrier()
   return
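merge_db itself is project code, but the overall move-then-merge step can be illustrated with a generic SQLite sketch: each per-rank chunk is attached to the master database and its rows copied across. The table name below is a placeholder, not the project's schema, and the function is a stand-in rather than the project's merge_db.

import glob
import sqlite3

def merge_sqlite_chunks(master_db: str, chunk_glob: str,
                        table: str = "preprocessed_contentfiles") -> None:
  conn = sqlite3.connect(master_db)
  for chunk in sorted(glob.glob(chunk_glob)):
    # Attach the chunk under a temporary schema name and bulk-copy its rows.
    conn.execute("ATTACH DATABASE ? AS chunk", (chunk,))
    conn.execute("INSERT OR IGNORE INTO {t} SELECT * FROM chunk.{t}".format(t=table))
    conn.commit()
    conn.execute("DETACH DATABASE chunk")
  conn.close()

# merge_sqlite_chunks("preprocessed.db", "preprocessed_*.db")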
Example 6
def GetHashOfArchiveContents(archive: pathlib.Path) -> str:
  """Compute the checksum of the contents of a directory.

  Args:
    archive: Path of the archive.

  Returns:
    Checksum of the archive.

  Raises:
    UserError: If the requested archive does not exist, or cannot be unpacked.
  """
  if not (archive.parent / "corpus_registry.json").exists():
    raise FileNotFoundError("corpus_registry.json file not found.")

  with open(archive.parent / "corpus_registry.json", 'r') as js:
    reg = json.load(js)

  if archive.name not in reg:
    raise FileNotFoundError("Corpus {} is not registered in corpus_registry".format(archive.name))

  if not archive.is_file():
    l.logger().info("Corpus found in registry. Downloading from Google Drive...")
    if environment.WORLD_RANK == 0:
      gdown.download("https://drive.google.com/uc?id={}".format(reg[archive.name]['url']), str(archive))
    distrib.barrier()

  if 'hash' in reg[archive.name]:
    return reg[archive.name]['hash']
  else:
    with tempfile.TemporaryDirectory(prefix="clgen_corpus_", dir = FLAGS.local_filesystem) as d:
      pv  = ["pv", str(archive)]
      tar = ["tar", "xfj", "-", "-C", d]
      try:
        pv_proc = subprocess.Popen(pv, stdout = subprocess.PIPE)
        subprocess.check_call(tar, stdin = pv_proc.stdout)
      except subprocess.CalledProcessError:
        raise ValueError(f"Archive unpack failed: '{archive}'")
      return checksumdir.dirhash(d, "sha1")
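The fallback branch unpacks the archive and hashes the resulting directory. The same effect can be had with the standard-library tarfile module instead of the pv | tar pipeline, shown below as a portable sketch; checksumdir.dirhash is the same third-party call the code above already uses.

import pathlib
import tarfile
import tempfile

import checksumdir  # third-party: pip install checksumdir

def hash_archive_contents(archive: pathlib.Path) -> str:
  with tempfile.TemporaryDirectory(prefix="clgen_corpus_") as d:
    # "r:*" lets tarfile auto-detect the compression (bz2 in the pipeline above).
    with tarfile.open(archive, "r:*") as tf:
      tf.extractall(d)
    return checksumdir.dirhash(d, "sha1")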
Example 7
 def __init__(self, url: str, must_exist: bool = False, is_replica: bool = False):
   if environment.WORLD_RANK == 0 or is_replica:
     super(PreprocessedContentFiles, self).__init__(
       url, Base, must_exist=must_exist
     )
   if environment.WORLD_SIZE > 1 and not is_replica:
     # Set up engine connections to the replicated preprocessed chunks.
     self.base_path = pathlib.Path(url.replace("sqlite:///", "")).resolve().parent
     hash_id = self.base_path.name
     try:
       tdir = pathlib.Path(FLAGS.local_filesystem).resolve() / hash_id / "node_preprocessed"
     except Exception:
       tdir = pathlib.Path("/tmp").resolve() / hash_id / "node_preprocessed"
     distrib.lock()
     tdir.mkdir(parents = True, exist_ok = True)
     distrib.unlock()
     self.replicated_path = tdir / "preprocessed_{}.db".format(environment.WORLD_RANK)
     self.replicated = PreprocessedContentFiles(
       url = "sqlite:///{}".format(str(self.replicated_path)),
       must_exist = must_exist,
       is_replica = True
     )
     distrib.barrier()
   return
Example 8
  def createCorpus(self, path: pathlib.Path) -> np.array:
    """
    Constructs the training corpus in text format and stores it in
    shaped_corpus.

    Each corpus datapoint is either a single kernel or a random
    sequence of size sequence_length (legacy).

    If the corpus has been previously pickled and stored, it is loaded.
    """
    start_time = time.time()

    # Set corpus dimension parameters
    sequence_length   = self.training_opts.sequence_length
    effect_seq_length = sequence_length - (2 if self.config.use_start_end else 0)
    batch_size        = self.training_opts.batch_size
    dupe_factor       = self.training_opts.dupe_factor
    shuffle           = self.training_opts.shuffle_corpus_contentfiles_between_epochs
    pad               = [self.tokenizer.padToken   ]
    start             = [self.tokenizer.startToken ]
    end               = [self.tokenizer.endToken   ]
    shaped_corpus     = None

    corpus_file = "{}corpus.pkl".format("pre_" if self.pre_train else "")
    # Monitor that records the actual length distribution of kernel instances.
    kernel_length_monitor = monitors.FrequencyMonitor(path, "{}kernel_length".format("pre_" if self.pre_train else ""))
    # Per-feature distribution monitors.
    if not self.pre_train:
      feature_monitors = {
        ftype: monitors.CategoricalDistribMonitor(
                          path,
                          "{}{}_distribution".format("pre_" if self.pre_train else "", ftype)
                        )
        for ftype in extractor.extractors.keys()
      }

    if (path / corpus_file).exists():
      with open(path / corpus_file, 'rb') as infile:
        shaped_corpus = pickle.load(infile)
        if self.num_train_steps:
          self.num_epochs      = self.num_train_steps // self.config.steps_per_epoch
        self.steps_per_epoch = self.config.steps_per_epoch
        l.logger().info(
          "Loaded from file corpus of {} examples in {} ms.".format(
                    humanize.intcomma(len(shaped_corpus)),
                    humanize.intcomma(int((time.time() - start_time) * 1000)),
                )
        )
      return shaped_corpus

    # generate a kernel corpus
    if (path / "text_corpus.pkl").exists():
      # Only the sampler writes a text_corpus.pkl, for online or active sampling.
      # The corpus is saved in text format so it can be picked up with the right
      # tokenizer, namely the model's tokenizer.
      with open(path / "text_corpus.pkl", 'rb') as infile:
        encoded_corpus = [self.tokenizer.TokenizeString(x) for x in pickle.load(infile)]
    else:
      if self.pre_train:
        if self.num_train_steps:
          self.num_epochs      = self.num_train_steps // self.config.steps_per_epoch
        self.steps_per_epoch = self.config.steps_per_epoch
        if environment.WORLD_RANK == 0:
          if len(glob.glob(str(path / "pre_corpus_*.pkl"))) > 0:
            return []
          encoded_corpus = []
          cache_lengths  = {}
          chunk_size = 250000
          i, ch_idx = 0, 0
          bar  = tqdm.tqdm(total = self.corpus.encoded.size, desc = "Chunk pre-train corpus")
          pool = multiprocessing.Pool()
          l.logger().info("Processing pre-train corpus chunks...")
          for dp in pool.imap_unordered(
                              functools.partial(
                                _addStartEndPadToken,
                                tokenizer = pickle.dumps(self.tokenizer),
                                trunc     = effect_seq_length,
                                seq_len   = sequence_length),
                              self.corpus.GetTrainingDataGenerator()):
            if dp:
              rlen, enc_kernel = dp
              kernel_length_monitor.register(rlen)
              encoded_corpus.append(enc_kernel)
            i += 1
            if i % chunk_size == 0:
              encoded_corpus = np.array(encoded_corpus)
              corpus_file = "pre_corpus_{}.pkl".format(ch_idx)
              cache_lengths[corpus_file] = len(encoded_corpus)
              l.logger().info("Storing chunk {}, len: {}".format(ch_idx, encoded_corpus.shape))
              with open(path / corpus_file, 'wb') as outf:
                pickle.dump(encoded_corpus, outf, protocol = 4)
              with open(path / "pre_lengths_cache.json", 'w') as outf:
                json.dump(cache_lengths, outf)
              ch_idx += 1
              encoded_corpus = []
            bar.update(1)
          if encoded_corpus:
            encoded_corpus = np.array(encoded_corpus)
            l.logger().info("Storing chunk {}, len: {}".format(ch_idx, encoded_corpus.shape))
            corpus_file = "pre_corpus_{}.pkl".format(ch_idx)
            cache_lengths[corpus_file] = len(encoded_corpus)
            with open(path / corpus_file, 'wb') as outf:
              pickle.dump(encoded_corpus, outf, protocol = 4)
            with open(path / "pre_lengths_cache.json", 'w') as outf:
              json.dump(cache_lengths, outf)
          kernel_length_monitor.plot()
          pool.close()
          distrib.barrier()
        else:
          distrib.barrier()
          if len(glob.glob(str(path / "pre_corpus_*.pkl"))) > 0:
            return []
          else:
            raise FileNotFoundError("No pre_corpus_*.pkl files found in {}".format(path))
        return encoded_corpus
      else:
        if environment.WORLD_RANK == 0:
          if not self.feature_encoder:
            encoded_corpus = self.corpus.GetTrainingData(sequence_length = effect_seq_length if not self.config.truncate_large_kernels else None)
          else:
            encoded_corpus = self.corpus.GetTrainingDataWFeatures(sequence_length = effect_seq_length if not self.config.truncate_large_kernels else None)

    if self.config.datapoint_type == "kernel":
      if environment.WORLD_RANK == 0:
        # Reject larger than sequence length
        initial_length = len(encoded_corpus)

        if not self.pre_train:
          # Get features of fitting dataset within sequence length
          for feature in self.corpus.GetTrainingFeatures(effect_seq_length):
            for ftype, fvector in feature.items():
              feature_monitors[ftype].register(fvector)

        if self.feature_encoder:
          training_features = [x for _, x in encoded_corpus]
          encoded_corpus    = [x for x, _ in encoded_corpus]

        idx, t = set(), []
        if self.config.truncate_large_kernels:
          for i, x in enumerate(encoded_corpus):
            if len(x[:effect_seq_length]) <= effect_seq_length:
              t.append(list(x[:effect_seq_length]))
            else:
              idx.add(i)
        else:
          for i, x in enumerate(encoded_corpus):
            if len(x) <= effect_seq_length:
              t.append(list(x))
            else:
              idx.add(i)
        encoded_corpus = t
        if self.feature_encoder:
          training_features = [x for i, x in enumerate(training_features) if i not in idx]

        reduced_length       = len(encoded_corpus)
        # Add start/end tokens
        if self.config.use_start_end:
          encoded_corpus     = [self._addStartEndToken(kf) for kf in encoded_corpus]
        # Register the actual lengths before padding.
        kernel_length_monitor.register([len(x) for x in encoded_corpus])
        # pad sequences to sequence length
        encoded_corpus       = np.array([x + pad * (sequence_length - len(x)) for x in encoded_corpus])

        if self.feature_encoder:
          expanded_corpus  = []
          encoded_features = []
          for dp, fvec in zip(encoded_corpus, training_features):
            for fspace in extractor.extractors.keys():
              if fspace in fvec:
                expanded_corpus.append(dp)
                encoded_features.append(self.feature_tokenizer.TokenizeFeatureVector(fvec[fspace], fspace, self.feature_sequence_length))
          shaped_corpus = [[src, feats] for src, feats in zip(expanded_corpus, encoded_features)]
        else:
          shaped_corpus     = encoded_corpus
        # Shuffle
        if shuffle:
          self.rngen.shuffle(shaped_corpus)
        assert len(shaped_corpus) != 0, "Not enough data. All kernels have been rejected."

        # Set corpus epoch parameters
        if self.num_train_steps:
          self.num_epochs      = self.num_train_steps // self.config.steps_per_epoch
        self.steps_per_epoch = self.config.steps_per_epoch

        l.logger().info("{} kernels were rejected (larger than sequence_length)".format(initial_length - reduced_length))
        l.logger().info(
          "Loaded corpus of shape {}x{} multiplied by dupe factor: {} in {} ms.".format(
                    len(shaped_corpus),
                    sequence_length,
                    dupe_factor,
                    humanize.intcomma(int((time.time() - start_time) * 1000)),
                )
        )
      else:
        # Set corpus epoch parameters
        if self.num_train_steps:
          self.num_epochs      = self.num_train_steps // self.config.steps_per_epoch
        self.steps_per_epoch = self.config.steps_per_epoch
    elif self.config.datapoint_type == "statement":
      # This branch is legacy data processing and does not support DDP.

      if shuffle:
        self.rngen.shuffle(encoded_corpus)
      encoded_corpus = np.concatenate(encoded_corpus)
      # encoded_corpus = np.tile(encoded_corpus, dupe_factor)

      # Set corpus dimension parameters
      self.steps_per_epoch        = len(encoded_corpus) // (batch_size * sequence_length * dupe_factor)
      assert self.steps_per_epoch != 0, "Not enough data. Use a smaller sequence_length and/or batch_size."
      if self.num_train_steps:
        self.num_epochs            = self.num_train_steps // self.steps_per_epoch

      # clipped_corpus_length       = dupe_factor * self.steps_per_epoch * batch_size * sequence_length
      clipped_corpus_length = self.steps_per_epoch * batch_size * sequence_length
      clipped_corpus        = encoded_corpus[:clipped_corpus_length]

      # shaped_corpus = np.split(clipped_corpus, batch_size * self.steps_per_epoch * dupe_factor, 0)
      shaped_corpus = np.split(clipped_corpus, batch_size * self.steps_per_epoch, 0)

      # Register the actual lengths before padding.
      kernel_length_monitor.register([len(x) for x in shaped_corpus])

      np_corpus = np.asarray(shaped_corpus)
      assert np_corpus.ndim == 2, "Wrong dimensions for shaped_corpus: {}".format(np_corpus.shape)
      assert np_corpus.shape[1] == sequence_length, "Second dimension is not equal to sequence length: {}".format(np_corpus.shape[1])

      l.logger().info(
        "Loaded corpus of {} tokens (clipped last {} tokens) in {} ms.".format(
                  humanize.intcomma(clipped_corpus_length),
                  humanize.intcomma(len(encoded_corpus) - clipped_corpus_length),
                  humanize.intcomma(int((time.time() - start_time) * 1000)),
              )
      )
    else:
      raise ValueError("Unrecognized datapoint_type: {}".format(self.config.datapoint_type))

    if environment.WORLD_RANK == 0:
      kernel_length_monitor.plot()
      if not self.pre_train:
        for fm in feature_monitors.values():
          fm.plot()
      with open(path / corpus_file, 'wb') as outf:
        pickle.dump(shaped_corpus, outf)
    distrib.barrier()
    return shaped_corpus
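The kernel branch boils down to: truncate or reject sequences longer than the effective length, wrap each one in start/end tokens, and right-pad to sequence_length. A minimal numeric sketch of that shaping step; the token IDs are illustrative, not the project's vocabulary.

import numpy as np

def shape_kernel(tokens: list,
                 sequence_length: int,
                 start: int = 1, end: int = 2, pad: int = 0) -> np.ndarray:
  # Keep room for the start/end tokens, then right-pad up to sequence_length.
  body = list(tokens[:sequence_length - 2])
  seq  = [start] + body + [end]
  return np.array(seq + [pad] * (sequence_length - len(seq)))

# shape_kernel([5, 6, 7], sequence_length=8) -> array([1, 5, 6, 7, 2, 0, 0, 0])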
Example 9
  def __init__(self, config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]):
    """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
    if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
      raise TypeError(f"Config must be a Corpus or PreTrainCorpus proto. Received: '{type(config).__name__}'")

    # Make a local copy of the configuration.
    if isinstance(config, corpus_pb2.Corpus):
      self.config    = corpus_pb2.Corpus()
      self.pre_train = False
    else:
      self.config    = corpus_pb2.PreTrainCorpus()
      self.pre_train = True

    self.config.CopyFrom(AssertConfigIsValid(config))
    self._tokenizer = None
    self._created = False

    # An in-memory cache of the encoded contentfiles indices arrays.
    # Set and used in GetTrainingData().
    self._indices_arrays: typing.Optional[typing.List[np.array]] = None

    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
    distrib.barrier()
    self.content_id = ResolveContentId(self.config)
    # Database of pre-processed files.
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                           preprocessed_id, "preprocessed.db")

    if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
      raise ValueError(f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{preprocessed_db_path}"
    )
    # Create symlink to contentfiles.
    if environment.WORLD_RANK == 0:
      symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent / "contentfiles")
      if not symlink.is_symlink():
        if config.HasField("local_directory"):
          os.symlink(
            str(ExpandConfigPath(config.local_directory,   path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("local_tar_archive"):
          os.symlink(
            str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("bq_database"):
          os.symlink(
            str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )  
        # elif config.HasField("fetch_github"):
        #   os.symlink(
        #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
        #     symlink,
        #   )
    distrib.barrier()
    # Database of encoded pre-processed files.
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
    if self.config.HasField("pre_encoded_corpus_url"):
      self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url, self.pre_train)
    else:
      self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}", self.pre_train)
    self.tokenizer_path = cache.cachepath(
      "corpus", "encoded", encoded_id, "tokenizer.pkl"
    )
    if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
      symlink = (pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent / "preprocessed")
      if not symlink.is_symlink():
        os.symlink(
          os.path.relpath(
            pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent,
            pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent,
            ),
          symlink,
        )
    self.hash = encoded_id
    self.cache = cache.mkcache("corpus", "encoded", encoded_id)
    if environment.WORLD_RANK == 0:
      commit.saveCommit(self.cache.path)
      commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
    distrib.barrier()
    l.logger().info("Initialized {}train corpus in {}".format("pre_" if self.pre_train else "", self.cache.path))
    return
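A usage sketch for this constructor. The import paths, the class name Corpus, and the archive location are assumptions rather than verified against the repository; the proto field names come from the branches handled above, and Create() is the initialization step that Example 3 requires before the tokenizer can be read.

# Hypothetical module paths; adjust to your checkout.
from deeplearning.benchpress.proto import corpus_pb2
from deeplearning.benchpress.corpuses import corpuses

cfg = corpus_pb2.Corpus()
cfg.local_tar_archive = "$HOME/corpora/opencl_kernels.tar.bz2"  # illustrative path

corpus = corpuses.Corpus(cfg)   # rank-aware setup shown in the constructor above
corpus.Create()                 # assumed to populate the preprocessed/encoded DBs
tokenizer = corpus.tokenizer    # valid only after Create(), per Example 3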