# NOTE: `storage` and `logs` are assumed to be project-level helper modules
# (ClusterFuzz-style cloud-storage and logging wrappers), not stdlib imports.
import random


def _limit_corpus_size(corpus_url, size_limit):
  """Limit the number of files stored under a corpus URL."""
  files_list = list(storage.list_blobs(corpus_url))
  corpus_size = len(files_list)

  if corpus_size <= size_limit:
    # Corpus directory size is within limit, no more work to do.
    return

  logs.log(
      'Limiting corpus at {corpus_url} from {corpus_size} to {size_limit} '
      'files.'.format(
          corpus_url=corpus_url, corpus_size=corpus_size,
          size_limit=size_limit))
  # Delete a uniformly random sample of the excess files so pruning is
  # unbiased with respect to listing order.
  files_to_delete = random.sample(files_list, corpus_size - size_limit)
  bucket, _ = storage.get_bucket_name_and_path(corpus_url)
  for file_to_delete in files_to_delete:
    path_to_delete = storage.get_cloud_storage_file_path(bucket, file_to_delete)
    storage.delete(path_to_delete)
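
A minimal usage sketch; the bucket path and limit below are hypothetical, for illustration only:

_limit_corpus_size('gs://my-fuzzer-corpora/my_target', size_limit=5000)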
Example #2
def _limit_corpus_size(corpus_url):
  """Limit both the number of files and the total size of a corpus."""
  corpus_count = 0
  corpus_size = 0
  deleted_corpus_count = 0
  bucket, _ = storage.get_bucket_name_and_path(corpus_url)
  for corpus_file in storage.get_blobs(corpus_url):
    corpus_count += 1
    corpus_size += corpus_file['size']
    # Once either cap is exceeded, delete every blob enumerated afterwards.
    if (corpus_count > CORPUS_FILES_LIMIT_FOR_FAILURES or
        corpus_size > CORPUS_SIZE_LIMIT_FOR_FAILURES):
      path_to_delete = storage.get_cloud_storage_file_path(
          bucket, corpus_file['name'])
      storage.delete(path_to_delete)
      deleted_corpus_count += 1

  if deleted_corpus_count:
    logs.log('Removed %d files from oversized corpus: %s.' %
             (deleted_corpus_count, corpus_url))
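
Unlike Example #1, this variant streams blob metadata instead of materializing the full listing, and it enforces both a file-count cap and a byte-size cap; once either cap is crossed, blobs are deleted in listing order rather than sampled at random. The two limit constants are defined elsewhere in the source module; the values below are hypothetical, for illustration only:

CORPUS_FILES_LIMIT_FOR_FAILURES = 10000       # hypothetical file-count cap
CORPUS_SIZE_LIMIT_FOR_FAILURES = 2 * 1024**3  # hypothetical 2 GiB byte cap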