Code Example #1
import logging
from typing import Any
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError

# ContentSizeInfo and S3Error are defined in the surrounding project modules.

logger = logging.getLogger(__name__)


def s3_estimate_size(src: str, s3: Any = None) -> ContentSizeInfo:
    s3 = s3 or boto3.resource("s3")

    dst_url = urlparse(src)

    bucket_name = dst_url.netloc
    key = dst_url.path[1:]

    bucket = s3.Bucket(bucket_name)

    # Are we able to access the key on its own?
    obj = bucket.Object(key)
    try:
        if obj.content_length > 0:
            return ContentSizeInfo(obj.content_length, 1, obj.content_length)
    except ClientError:
        logger.info(
            f"Failed to get content_length for {obj}. May be not an object at all"
        )

    cnt = 0
    total_size = 0
    max_size = 0

    try:
        for sub_obj in bucket.objects.filter(Prefix=obj.key):
            if not sub_obj.size:
                continue

            cnt += 1
            total_size += sub_obj.size
            max_size = max(max_size, sub_obj.size)
    except ClientError as e:
        raise S3Error(str(e)) from e
    return ContentSizeInfo(int(total_size), cnt, int(max_size))
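The function above tries the exact key first (a cheap per-object lookup) and only falls back to listing everything under the prefix when that fails. A minimal usage sketch, assuming s3_estimate_size and ContentSizeInfo are importable from the module above; the mock layout and the example URI are illustrative, not part of the original repo:

from unittest.mock import MagicMock

fake_s3 = MagicMock()
# Pretend the key resolves to a single 1024-byte object, so the fast path wins.
fake_s3.Bucket.return_value.Object.return_value.content_length = 1024

info = s3_estimate_size("s3://some-bucket/some/key", fake_s3)
assert info == ContentSizeInfo(1024, 1, 1024)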
Code Example #2
def fetcher_event(descriptor_as_adict) -> FetcherBenchmarkEvent:
    return FetcherBenchmarkEvent(
        action_id=ACTION_ID,
        message_id="MESSAGE_ID",
        client_id="CLIENT_ID",
        client_version="CLIENT_VERSION",
        client_username="******",
        authenticated=False,
        tstamp=42,
        visited=[],
        type="PRODUCER_TOPIC",
        payload=FetcherPayload(
            toml=BenchmarkDoc(contents=descriptor_as_adict.to_dict(),
                              doc="",
                              sha1="SHA"),
            scripts=SCRIPTS,
            datasets=[
                DownloadableContent(
                    src="http://someserver.com/somedata.zip",
                    dst=DATASET_S3_URI,
                    path="/mount/path",
                    id=DATASET_ID,
                    size_info=ContentSizeInfo(total_size=42,
                                              file_count=1,
                                              max_size=42),
                    type=FetchedType.FILE,
                )
            ],
        ),
    )
Code Example #3
def test_http_estimator(mock_curl):
    size_info = http_estimate_size(SOME_DATASET_SRC)

    mock_curl.setopt.assert_has_calls(
        [
            call(pycurl.URL, SOME_DATASET_SRC),
            call(pycurl.NOBODY, 1),
            call(pycurl.HEADER, 1),
        ],
        any_order=True,
    )
    mock_curl.getinfo.assert_called_with(pycurl.CONTENT_LENGTH_DOWNLOAD)

    assert size_info == ContentSizeInfo(DATA_SIZE, 1, DATA_SIZE)
Code Example #4
import pycurl

# http_perform is a helper from the surrounding module that runs curl.perform()
# with the project's error handling.


def http_estimate_size(src) -> ContentSizeInfo:
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, src)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
    curl.setopt(pycurl.TIMEOUT, 60)  # 60s should be enough to send HEAD and get the response back
    curl.setopt(pycurl.HEADER, 1)
    curl.setopt(pycurl.NOBODY, 1)

    http_perform(curl)

    content_length = curl.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD)

    return ContentSizeInfo(int(content_length), 1, int(content_length))
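One caveat worth hedging: when the server omits Content-Length, pycurl reports CONTENT_LENGTH_DOWNLOAD as -1.0, so the function above would return a negative size. A caller-side guard sketch; the wrapper name and the zero fallback are assumptions, not code from the repo:

def safe_http_estimate_size(src: str) -> ContentSizeInfo:
    info = http_estimate_size(src)
    if info.total_size <= 0:
        # Size unknown: report a zero-sized single file rather than -1.
        return ContentSizeInfo(0, 1, 0)
    return info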
Code Example #5
def test_http_estimator():
    assert http_estimate_size(BIG_FILE) == ContentSizeInfo(
        HUGE_SIZE, 1, HUGE_SIZE)
Code Example #6
def test_s3_estimator_folder(mock_s3_with_folder):
    size_info = s3_estimate_size(SOME_S3_FOLDER, mock_s3_with_folder)
    assert size_info == ContentSizeInfo(SOME_SIZE, 1, SOME_SIZE)
Code Example #7
def test_s3_estimator_file(mock_s3_with_file):
    size_info = s3_estimate_size(SOME_S3_FILE, mock_s3_with_file)
    assert size_info == ContentSizeInfo(SOME_SIZE, 1, SOME_SIZE)
Code Example #8
def test_s3_with_a_folder(s3_with_a_folder):
    assert s3_estimate_size(S3_FOLDER, s3_with_a_folder) == ContentSizeInfo(
        FILE_SIZE * FILE_COUNT, FILE_COUNT, FILE_SIZE
    )
Code Example #9
def test_s3_with_a_file(s3_with_a_file):
    assert s3_estimate_size(S3_SINGLE_FILE, s3_with_a_file) == ContentSizeInfo(
        FILE_SIZE, 1, FILE_SIZE
    )
Code Example #10
# This test can be executed from an IDE


# API boundary test - should just not fail starting the job - the job itself can fail
from bai_k8s_utils.kubernetes_tests_client import KubernetesTestUtilsClient


S3_DST = "s3://dst"
SOMEDATA_BIG = "http://*****:*****@pytest.mark.parametrize("size_info", [BIG_SIZE, SMALL_SIZE], ids=["big", "small"])
def test_kubernetes_client(
    k8s_dispatcher: KubernetesDispatcher,
    benchmark_event_dummy_payload: BenchmarkEvent,
    k8s_test_client: KubernetesTestUtilsClient,
    fetcher_job_config: FetcherJobConfig,
    size_info: ContentSizeInfo,
):
    data_set = DownloadableContent(src=SOMEDATA_BIG, path="/mount/path", dst=S3_DST, md5=None, size_info=size_info)

    k8s_dispatcher.dispatch_fetch(data_set, benchmark_event_dummy_payload, "/data/sets/fake")
Code Example #11
    DownloadDispatcher,
    DownloadOnDone,
    ContentSizeEstimator,
)

FILE_SIZE = 42

ZK_VERSION = 1

CLIENT_ID = "CLIENT_ID"

ACTION_ID = "ACTION_ID"

SOME_PATH = "/some/path"

SOME_SIZE_INFO = ContentSizeInfo(FILE_SIZE, 1, FILE_SIZE)


def mock_size_estimator(src: str) -> ContentSizeInfo:
    return SOME_SIZE_INFO


@fixture
def failing_size_estimator() -> ContentSizeEstimator:
    mock = create_autospec(ContentSizeEstimator)
    mock.side_effect = UnRetryableError()
    return mock


def data_set_to_path(client_id: str,
                     action_id: str = None,
Code Example #12
NODE_SELECTOR = {"label1": "val1", "label2": "val2"}

NAMESPACE = "internal"

PULL_POLICY = "OnFailure"

RESTART_POLICY = "OnFailure"

TTL = 42

SMALL_DATA_SET_SIZE = 1 * MB

MIN_VOLUME_SIZE_MB = 64


SMALL_DATA_SET_SIZE_INFO = ContentSizeInfo(SMALL_DATA_SET_SIZE, 1, SMALL_DATA_SET_SIZE)
BIG_DATA_SET_SIZE_INFO = ContentSizeInfo(MIN_VOLUME_SIZE_MB * MB, 1, MIN_VOLUME_SIZE_MB * MB)

FETCHER_JOB_CONFIG = FetcherJobConfig(
    namespace=NAMESPACE,
    image=FETCHER_JOB_IMAGE,
    node_selector=NODE_SELECTOR,
    pull_policy=PULL_POLICY,
    ttl=TTL,
    restart_policy=RESTART_POLICY,
    volume=FetcherVolumeConfig(MIN_VOLUME_SIZE_MB),
)

KUBECONFIG = "path/cfg"
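These constants read like fixtures for a volume-sizing rule: a dataset smaller than the minimum still gets a MIN_VOLUME_SIZE_MB volume. A sketch of that rule as the test data suggests it; this is an assumption about the dispatcher's logic, and the function name is hypothetical:

import math

def fetcher_volume_size_mb(size_info: ContentSizeInfo) -> int:
    # Round the content size up to whole megabytes, but never go below the floor.
    return max(MIN_VOLUME_SIZE_MB, math.ceil(size_info.total_size / MB))

assert fetcher_volume_size_mb(SMALL_DATA_SET_SIZE_INFO) == MIN_VOLUME_SIZE_MB
assert fetcher_volume_size_mb(BIG_DATA_SET_SIZE_INFO) == MIN_VOLUME_SIZE_MB
assert fetcher_volume_size_mb(ContentSizeInfo(100 * MB, 1, 100 * MB)) == 100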