コード例 #1
0
def test_EncodedContentFiles_empty_preprocessed_db(
        temp_db: encoded.EncodedContentFiles,
        abc_atomizer: atomizers.AsciiCharacterAtomizer):
    """Creating from an empty preprocessed db raises EmptyCorpusException."""
    with tempfile.TemporaryDirectory() as workdir:
        # A freshly created preprocessed database; nothing is added to it.
        empty_db_url = f'sqlite:///{pathlib.Path(workdir)}/preprocessed.db'
        empty_preprocessed = preprocessed.PreprocessedContentFiles(
            empty_db_url)
        with pytest.raises(errors.EmptyCorpusException):
            temp_db.Create(empty_preprocessed, abc_atomizer, '\n\n')
コード例 #2
0
ファイル: corpuses.py プロジェクト: whatsmyname/clgen
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
          config: A Corpus message.

        Raises:
          TypeError: If the config argument is not a Corpus proto.
          UserError: If the config sets content_id but no preprocessed
            database exists for it. Config validation is delegated to
            AssertConfigIsValid(), which presumably raises this for invalid
            options (not visible here).
          EmptyCorpusException: In case the corpus contains no data
            (not raised directly in this method; presumably by a helper).
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False
        self.dashboard_db = dashboard_db.GetDatabase()
        self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()

        # An in-memory cache of the encoded contentfiles indices arrays.
        # Set and used in GetTrainingData().
        self._indices_arrays: typing.Optional[typing.List[np.array]] = None

        cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
        self.content_id = ResolveContentId(self.config, hc)

        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath("corpus", "preprocessed",
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                               preprocessed_id,
                                               "preprocessed.db")
        # A config that names an existing content_id must already have a
        # preprocessed database on disk.
        if (self.config.HasField("content_id")
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            f"sqlite:///{preprocessed_db_path}")

        # Directory holding the preprocessed database, recovered from the
        # database URL by dropping the sqlite scheme prefix.
        url_prefix = "sqlite:///"
        preprocessed_dir = pathlib.Path(
            self.preprocessed.url[len(url_prefix):]).parent

        # Create symlink to contentfiles. The first content source field that
        # is set on the config determines the link target; if neither is set,
        # no link is created.
        symlink = preprocessed_dir / "contentfiles"
        if not symlink.is_symlink():
            for field in ("local_directory", "local_tar_archive"):
                if config.HasField(field):
                    target = ExpandConfigPath(
                        getattr(config, field),
                        path_prefix=FLAGS.clgen_local_path_prefix)
                    os.symlink(str(target), symlink)
                    break

        # Data of encoded pre-preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        db_path = cache.cachepath("corpus", "encoded", encoded_id,
                                  "encoded.db")
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic by making Corpus an abstract class and creating concrete subclasses
        # for the different types of corpus.
        uses_pre_encoded = self.config.HasField("pre_encoded_corpus_url")
        if uses_pre_encoded:
            self.encoded = encoded.EncodedContentFiles(
                config.pre_encoded_corpus_url)
        else:
            self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
        self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                             "atomizer.pkl")

        # Create symlink to preprocessed files. Skipped for pre-encoded
        # corpuses, whose URL is taken verbatim from the config.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic after splitting Corpus class.
        if not uses_pre_encoded:
            encoded_dir = pathlib.Path(
                self.encoded.url[len(url_prefix):]).parent
            symlink = encoded_dir / "preprocessed"
            if not symlink.is_symlink():
                # Relative target, expressed from the encoded directory.
                os.symlink(os.path.relpath(preprocessed_dir, encoded_dir),
                           symlink)

        self.hash = encoded_id
        self.cache = cache.mkcache("corpus", "encoded", encoded_id)
コード例 #3
0
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
          config: A Corpus message.

        Raises:
          TypeError: If the config argument is not a Corpus proto.
          UserError: If the config sets content_id but no preprocessed
            database exists for it. Config validation is delegated to
            AssertConfigIsValid(), which presumably raises this for invalid
            options (not visible here).
          EmptyCorpusException: In case the corpus contains no data
            (not raised directly in this method; presumably by a helper).
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
        self.content_id = ResolveContentId(self.config, hc)

        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath('corpus', 'preprocessed',
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                               preprocessed_id,
                                               'preprocessed.db')
        # A config that names an existing content_id must already have a
        # preprocessed database on disk.
        if (self.config.HasField('content_id')
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)

        # Create symlink to contentfiles. The first content source field that
        # is set on the config determines the link target; if neither is set,
        # no link is created.
        symlink = self.preprocessed.database_path.parent / 'contentfiles'
        if not symlink.is_symlink():
            for field in ('local_directory', 'local_tar_archive'):
                if config.HasField(field):
                    target = ExpandConfigPath(
                        getattr(config, field),
                        path_prefix=FLAGS.clgen_local_path_prefix)
                    os.symlink(str(target), symlink)
                    break

        # Data of encoded pre-preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        self.encoded = encoded.EncodedContentFiles(
            cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
        self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                             'atomizer.pkl')

        # Create symlink to preprocessed files, using a target path relative
        # to the encoded directory.
        encoded_dir = self.encoded.database_path.parent
        symlink = encoded_dir / 'preprocessed'
        if not symlink.is_symlink():
            relative_target = os.path.relpath(
                self.preprocessed.database_path.parent, encoded_dir)
            os.symlink(relative_target, symlink)

        self.hash = encoded_id
        self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
コード例 #4
0
def preprocessed_db(
    tempdir: pathlib.Path, ) -> preprocessed.PreprocessedContentFiles:
    """Yield a preprocessed database populated with four rows.

    relpath  sha256    text                 preprocessing_succeeded
    a        00000000  Hello, world         True
    a2       00000000  This is a duplicate  True   (same sha256 as 'a')
    b        11111111  Hello, foo           True
    c        22222222  ERROR: failure       False

    Every row shares the same character/line counts and timings; only the
    columns listed above differ.
    """
    # (input_relpath, sha256, text, preprocessing_succeeded)
    row_specs = (
        ("a", "00000000", "Hello, world", True),
        ("a2", "00000000", "This is a duplicate", True),
        ("b", "11111111", "Hello, foo", True),
        ("c", "22222222", "ERROR: failure", False),
    )

    db = preprocessed.PreprocessedContentFiles(
        f"sqlite:///{tempdir}/preprocessed.db")

    with db.Session(commit=True) as session:
        session.add_all([
            preprocessed.PreprocessedContentFile(
                input_relpath=relpath,
                input_sha256=digest,
                input_charcount=10,
                input_linecount=10,
                sha256=digest,
                charcount=10,
                linecount=1,
                text=text,
                preprocessing_succeeded=succeeded,
                preprocess_time_ms=4,
                wall_time_ms=4,
            ) for relpath, digest, text, succeeded in row_specs
        ])
    yield db
コード例 #5
0
def preprocessed_db(
    tempdir: pathlib.Path, ) -> preprocessed.PreprocessedContentFiles:
    """Return a preprocessed database located under `tempdir`.

    Nothing is added to the database here.

    Args:
      tempdir: Directory in which the sqlite database file is placed.

    Returns:
      A PreprocessedContentFiles instance for 'sqlite:///<tempdir>/preprocessed'.
    """
    return preprocessed.PreprocessedContentFiles(
        f"sqlite:///{tempdir}/preprocessed")
コード例 #6
0
ファイル: corpuses.py プロジェクト: fivosts/clgen
  def __init__(self, config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]):
    """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Filesystem setup (cache directories, symlinks, commit records) is done
    only when environment.WORLD_RANK == 0; every process then calls
    distrib.barrier() after each such step.

    Args:
      config: A Corpus or PreTrainCorpus message.

    Raises:
      TypeError: If the config argument is neither a Corpus nor a
        PreTrainCorpus proto.
      ValueError: If the config sets content_id but no preprocessed database
        exists for it.

    Config validation is delegated to AssertConfigIsValid(); the errors it
    may raise are not visible here.
    """
    if not isinstance(config, (corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus)):
      raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")

    # Make a local copy of the configuration, of the same message type as
    # the one received.
    if isinstance(config, corpus_pb2.Corpus):
      self.config    = corpus_pb2.Corpus()
      self.pre_train = False
    else:
      self.config    = corpus_pb2.PreTrainCorpus()
      self.pre_train = True

    self.config.CopyFrom(AssertConfigIsValid(config))
    self._tokenizer = None
    self._created = False

    # An in-memory cache of the encoded contentfiles indices arrays.
    # Set and used in GetTrainingData().
    self._indices_arrays: typing.Optional[typing.List[np.array]] = None

    is_master = environment.WORLD_RANK == 0
    url_prefix = "sqlite:///"

    if is_master:
      cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
    distrib.barrier()
    self.content_id = ResolveContentId(self.config)

    # Database of pre-processed files.
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    if is_master:
      cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                           preprocessed_id, "preprocessed.db")

    # A config that names an existing content_id must already have a
    # preprocessed database on disk.
    if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
      raise ValueError(f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{preprocessed_db_path}"
    )

    # Create symlink to contentfiles. The first content source field that is
    # set on the config determines the link target; configs with none of
    # these fields set get no link.
    if is_master:
      symlink = pathlib.Path(self.preprocessed.url[len(url_prefix):]).parent / "contentfiles"
      if not symlink.is_symlink():
        for field in ("local_directory", "local_tar_archive", "bq_database"):
          if config.HasField(field):
            target = ExpandConfigPath(
              getattr(config, field), path_prefix=FLAGS.clgen_local_path_prefix
            )
            os.symlink(str(target), symlink)
            break
    distrib.barrier()

    # Data of encoded pre-preprocessed files.
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    if is_master:
      cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
    uses_pre_encoded = self.config.HasField("pre_encoded_corpus_url")
    if uses_pre_encoded:
      self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url, self.pre_train)
    else:
      self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}", self.pre_train)
    self.tokenizer_path = cache.cachepath(
      "corpus", "encoded", encoded_id, "tokenizer.pkl"
    )

    # Create symlink to preprocessed files. Skipped for pre-encoded corpuses,
    # whose URL is taken verbatim from the config.
    if is_master and not uses_pre_encoded:
      preprocessed_dir = pathlib.Path(self.preprocessed.url[len(url_prefix):]).parent
      encoded_dir = pathlib.Path(self.encoded.url[len(url_prefix):]).parent
      symlink = encoded_dir / "preprocessed"
      if not symlink.is_symlink():
        # Relative target, expressed from the encoded directory.
        os.symlink(os.path.relpath(preprocessed_dir, encoded_dir), symlink)

    self.hash = encoded_id
    self.cache = cache.mkcache("corpus", "encoded", encoded_id)
    if is_master:
      commit.saveCommit(self.cache.path)
      commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
    distrib.barrier()
    l.logger().info(f"Initialized {'pre_' if self.pre_train else ''}train corpus in {self.cache.path}")