Ejemplo n.º 1
0
def test_HashCache_GetHash_unmodified_directory(database_path, hash_fn):
  """Test that hashing an unmodified directory twice gives equal hashes."""
  hasher = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as tmpdir:
    directory = pathlib.Path(tmpdir)
    first_hash = hasher.GetHash(directory)
    # Nothing touched the directory in between, so the hash must not change.
    second_hash = hasher.GetHash(directory)
    assert first_hash == second_hash
Ejemplo n.º 2
0
def test_HashCache_GetHash_empty_directory(database_path, hash_fn):
  """Test that an empty directory hashes to EMPTY_FILE_HASHES[hash_fn]."""
  hasher = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as tmpdir:
    directory = pathlib.Path(tmpdir)
    expected = EMPTY_FILE_HASHES[hash_fn]
    # First lookup of this directory.
    assert expected == hasher.GetHash(directory)
    # Second lookup of the same, unchanged directory to test a cache hit.
    assert expected == hasher.GetHash(directory)
Ejemplo n.º 3
0
def test_HashCache_GetHash_non_existent_path(database_path, hash_fn):
  """Test that hashing a non-existent path raises FileNotFoundError.

  The expected message is built from the same pathlib.Path object that is
  passed to GetHash(), rather than from a hard-coded '/' separator, so the
  comparison does not depend on the platform's path separator.
  """
  c = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as d:
    # The temporary directory is freshly created, so 'a' cannot exist in it.
    missing_path = pathlib.Path(d) / 'a'
    with pytest.raises(FileNotFoundError) as e_info:
      c.GetHash(missing_path)
    assert f"File not found: '{missing_path}'" == str(e_info.value)
Ejemplo n.º 4
0
def test_HashCache_GetHash_modified_directory(database_path, hash_fn):
  """Test that adding a file to a directory changes the directory's hash."""
  hasher = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as tmpdir:
    directory = pathlib.Path(tmpdir)
    hash_before = hasher.GetHash(directory)
    # NOTE(review): the pause presumably lets the modification land on a
    # later timestamp than the first hash - confirm against HashCache.
    time.sleep(1)
    new_file = directory / 'a'
    new_file.touch()
    hash_after = hasher.GetHash(directory)
    assert hash_before != hash_after
Ejemplo n.º 5
0
def test_HashCache_GetHash_unmodified_file(database_path, hash_fn):
  """Test that a file with unchanged contents returns the same hash."""
  hasher = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as tmpdir:
    target = pathlib.Path(tmpdir) / 'a'
    target.touch()
    first_hash = hasher.GetHash(target)
    # A second touch leaves the contents of the file as they are, but will
    # cause a cache miss because of the changed mtime timestamp.
    target.touch()
    second_hash = hasher.GetHash(target)
    assert first_hash == second_hash
Ejemplo n.º 6
0
def test_HashCache_GetHash_modified_file(database_path, hash_fn):
  """Test that rewriting a file's contents changes its hash."""
  hasher = hashcache.HashCache(database_path, hash_fn)
  with tempfile.TemporaryDirectory() as tmpdir:
    target = pathlib.Path(tmpdir) / 'a'
    # Start from an empty file.
    target.touch()
    hash_before = hasher.GetHash(target)
    # NOTE(review): the pause presumably lets the write land on a later
    # timestamp than the first hash - confirm against HashCache.
    time.sleep(1)
    target.write_text('Hello')
    hash_after = hasher.GetHash(target)
    assert hash_before != hash_after
Ejemplo n.º 7
0
def test_HashCache_GetHash_in_memory_modified_file(database_path, hash_fn):
  """Test that an in-memory cache keeps returning the hash it first saw.

  With keep_in_memory=True, rewriting the file does not change the value that
  GetHash() returns until Clear() is called. This test emphasizes the danger of
  the in-memory hash, as it means that the validity of the cache is tied to the
  lifecycle of the process.
  """
  hasher = hashcache.HashCache(database_path, hash_fn, keep_in_memory=True)
  with tempfile.TemporaryDirectory() as tmpdir:
    target = pathlib.Path(tmpdir) / 'a'
    target.touch()
    original_hash = hasher.GetHash(target)
    time.sleep(1)
    target.write_text('Hello')
    # The contents changed, yet the same hash as before comes back.
    stale_hash = hasher.GetHash(target)
    assert original_hash == stale_hash
    # Clear the in-memory cache and hash the file again. Now it will be a
    # cache miss and the correct hash will be returned.
    hasher.Clear()
    fresh_hash = hasher.GetHash(target)
    assert original_hash != fresh_hash
Ejemplo n.º 8
0
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
          config: A Corpus message.

        Raises:
          TypeError: If the config argument is not a Corpus proto.
          UserError: In case the corpus is not found, or config contains
            invalid options.
          EmptyCorpusException: In case the corpus contains no data.
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Validate the caller's config and keep a private copy of it.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        self.content_id = ResolveContentId(
            self.config,
            hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1'))

        # Database of pre-processed files.
        preprocessed_parts = (
            'corpus', 'preprocessed',
            ResolvePreprocessedId(self.content_id, self.config))
        cache.cachepath(*preprocessed_parts).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath(*preprocessed_parts,
                                               'preprocessed.db')
        # A config that names an explicit content_id must refer to a
        # preprocessed database that is already on disk.
        if self.config.HasField('content_id'):
            if not preprocessed_db_path.is_file():
                raise errors.UserError(
                    f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)

        # Create symlink to contentfiles, pointing at whichever local source
        # field the config has set (if any).
        contentfiles_link = (
            self.preprocessed.database_path.parent / 'contentfiles')
        if not contentfiles_link.is_symlink():
            for source_field in ('local_directory', 'local_tar_archive'):
                if config.HasField(source_field):
                    source_path = ExpandConfigPath(
                        getattr(config, source_field),
                        path_prefix=FLAGS.clgen_local_path_prefix)
                    os.symlink(str(source_path), contentfiles_link)
                    break

        # Data of encoded pre-preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        encoded_parts = ('corpus', 'encoded', encoded_id)
        cache.cachepath(*encoded_parts).mkdir(exist_ok=True, parents=True)
        self.encoded = encoded.EncodedContentFiles(
            cache.cachepath(*encoded_parts, 'encoded.db'))
        self.atomizer_path = cache.cachepath(*encoded_parts, 'atomizer.pkl')

        # Create symlink to preprocessed files, relative to the encoded
        # directory.
        preprocessed_link = self.encoded.database_path.parent / 'preprocessed'
        if not preprocessed_link.is_symlink():
            relative_target = os.path.relpath(
                self.preprocessed.database_path.parent,
                self.encoded.database_path.parent)
            os.symlink(relative_target, preprocessed_link)

        self.hash = encoded_id
        self.cache = cache.mkcache(*encoded_parts)
Ejemplo n.º 9
0
def test_HashCache_unrecognized_hash_fn(database_path, hash_fn):
  """Test that an unrecognized hash function name raises a ValueError."""
  with pytest.raises(ValueError) as raised:
    hashcache.HashCache(database_path, 'null')
  error_message = str(raised.value)
  assert "Hash function not recognized: 'null'" == error_message