Example #1
0
def run(cloudburst: CloudburstConnection, num_requests: int, num_fns: int,
        data_size: str, do_optimize: bool):
    """Measure end-to-end latency of a linear chain of pass-through maps.

    Builds a PUSH flow of ``num_fns`` chained ``fusion_op`` operators,
    optionally runs it through ``optimize`` with the module-level
    ``optimize_rules``, deploys it, and then issues ``num_requests``
    sequential requests carrying a random byte payload.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: How many timed requests to issue.
        num_fns: Number of map operators chained one after another.
        data_size: Key into ``DATA_SIZES`` selecting the payload size.
        do_optimize: Whether to optimize the flow before deploying it.
    """
    def fusion_op(self, row: Row) -> bytes:
        return row['data']

    print(f'Creating flow with {num_fns} operators and {data_size}' +
          f' ({DATA_SIZES[data_size]}) inputs.')

    flow = Flow('fusion-benchmark', FlowType.PUSH, cloudburst)

    # Grow the chain by always attaching the next map to the current tail.
    tail = flow
    for _ in range(num_fns):
        tail = tail.map(fusion_op, names=['data'])

    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    # A single-row table with one random payload is reused for every request.
    payload = os.urandom(DATA_SIZES[data_size])
    request_table = Table([('data', BtsType)])
    request_table.insert([payload])

    elapsed = []
    print('Starting benchmark...')
    for i in range(num_requests):
        if i > 0 and i % 100 == 0:
            print(f'On request {i}...')

        began = time.time()
        flow.run(request_table).get()
        elapsed.append(time.time() - began)

    print_latency_stats(elapsed, 'E2E')
Example #2
0
def run(cloudburst: CloudburstConnection, num_requests: int, gamma: int,
        num_replicas: int):
    """Measure end-to-end latency of a flow with a high-variance stage.

    The flow is three maps: an increment, a stage that sleeps for a
    gamma-distributed delay (marked ``high_variance``), and a pass-through.
    ``optimize_rules['compete_replicas']`` is set to ``num_replicas`` before
    the flow is optimized and deployed.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: How many timed requests to issue.
        gamma: Key into ``GAMMA_VALS`` selecting the distribution scale.
        num_replicas: Value stored under ``compete_replicas`` in the
            module-level ``optimize_rules`` dict.
    """
    def stage1(self, val: int) -> int:
        return val + 1

    def stage2(self, row: Row) -> float:
        import time
        from scipy.stats import gamma

        # The drawn value is handed straight to time.sleep, so after the
        # scaling below it is interpreted as seconds.
        delay = gamma.rvs(3.0, scale=row['scale']) * 10 / 1000
        time.sleep(delay)

        return delay

    def stage3(self, row: Row) -> float:
        return row['val']

    print(f'Creating flow with {num_replicas} replicas and' +
          f' gamma={GAMMA_VALS[gamma]}')

    flow = Flow('fusion-benchmark', FlowType.PUSH, cloudburst)
    incremented = flow.map(stage1, col='val')
    delayed = incremented.map(stage2, names=['val'], high_variance=True)
    delayed.map(stage3, names=['val'])

    # Note: this writes into the shared module-level rules dict.
    optimize_rules['compete_replicas'] = num_replicas
    flow = optimize(flow, rules=optimize_rules)
    print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    request_table = Table([('val', IntType), ('scale', FloatType)])
    request_table.insert([1, GAMMA_VALS[gamma]])

    elapsed = []
    print('Starting benchmark...')
    for i in range(num_requests):
        if i > 0 and i % 100 == 0:
            print(f'On request {i}...')

        # Pause outside the timed window so the queue can drain.
        time.sleep(.300)

        began = time.time()
        flow.run(request_table).get()
        elapsed.append(time.time() - began)

    print_latency_stats(elapsed, 'E2E')
Example #3
0
def run(cloudburst: CloudburstConnection,
        num_requests: int,
        data_size: str,
        do_optimize: bool):
    """Measure end-to-end latency of a fan-out / join flow.

    One map (``stage1``) produces a random array whose length comes from the
    input row; five ``stage2`` maps consume its output in parallel and are
    joined back together. The flow is optionally optimized with the
    module-level ``optimize_rules`` before being deployed.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: How many timed requests to issue.
        data_size: Key into ``DATA_SIZES`` selecting the array length.
        do_optimize: Whether to optimize the flow before deploying it.
    """

    def stage1(self, row: Row) -> bytes:
        import numpy as np

        return np.random.rand(row['size'])

    def stage2(self, row: Row) -> int:
        return 3

    print(f'Creating flow with {data_size} ({DATA_SIZES[data_size]}) inputs.')

    flow = Flow('colocate-benchmark', FlowType.PUSH, cloudburst)
    producer = flow.map(stage1)

    # One consumer branch per output column; add more names here
    # (e.g. 'val6' .. 'val8') to widen the fan-out.
    branch_columns = ['val1', 'val2', 'val3', 'val4', 'val5']
    branches = [producer.map(stage2, names=[column])
                for column in branch_columns]

    # Fold the branches together left to right.
    joined = branches[0]
    for branch in branches[1:]:
        joined = joined.join(branch)

    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    request_table = Table([('size', IntType)])
    request_table.insert([DATA_SIZES[data_size]])

    elapsed = []
    print('Starting benchmark...')
    for i in range(num_requests):
        if i > 0 and i % 100 == 0:
            print(f'On request {i}...')

        began = time.time()
        flow.run(request_table).get()
        elapsed.append(time.time() - began)

    print_latency_stats(elapsed, 'E2E')
Example #4
0
def optimize(flow, rules: dict = DEFAULT_RULES):
    """Return a new, optimized ``Flow`` built from ``flow``.

    The input flow is always cloned operator by operator into a fresh
    ``Flow``; the entries of ``rules`` decide which rewrites are applied
    while (and after) cloning:

    * ``whole``: wrap the entire (un-optimized) clone in a single multi
      operator and return immediately.
    * ``fusion``: replace each chain reported by ``find_chain`` with one
      multi operator.
    * ``breakpoint``: mark lookup operators (or multi operators containing
      one) in linear portions of the flow with ``breakpoint = True``.
    * ``compete`` / ``compete_replicas``: add that many replicas of every
      ``high_variance`` map or multi operator.
    * ``colocate``: at the first operator with more than one downstream,
      record the downstream function names on the flow and flip the
      broadcast flags in the operators' ``init_args``.

    Args:
        flow: The flow to optimize. It is read, not modified, except that
            operators reached through the ``whole`` path are the clone's.
        rules: Mapping of rule name to setting. Keys of ``DEFAULT_RULES``
            that are missing are treated as ``False``. The mapping itself is
            never modified.

    Returns:
        The newly constructed optimized ``Flow``.

    Raises:
        FlowError: If both ``colocate`` and ``breakpoint`` are enabled.
        RuntimeError: If a competitively executed operator feeds an operator
            with several upstreams, or if colocation is requested at an
            operator that does not support broadcast.
    """
    # Fill in defaults on a private copy so that neither the caller's dict
    # nor the shared DEFAULT_RULES default argument is ever mutated.
    rules = dict(rules)
    for key in DEFAULT_RULES:
        rules.setdefault(key, False)

    if rules['colocate'] and rules['breakpoint']:
        raise FlowError('Cannot enable the colocate and breakpoint rules' +
                        ' together.')

    optimized = Flow(flow.flowname, flow.typ, flow.cloudburst, flow.source)

    if rules['whole']:
        # Clone the flow with every rewrite disabled, then ship the clone as
        # the single sub-flow of one multi operator.
        cloned = optimize(
            flow, {
                'fusion': False,
                'compete': False,
                'compete_replicas': 1,
                'colocate': False,
                'breakpoint': False,
                'whole': False
            })

        cloned.cloudburst = None  # Remove sockets to serialize and send flow.
        queue = [cloned]
        gpu = False
        batching = []
        while len(queue) > 0:
            op = queue.pop(0)
            op.cb_fn = None

            if type(op) != Flow:
                batching.append(op.batching)
            # Sticky flag: once any operator needs a GPU, the whole flow does.
            gpu = op.gpu if not gpu else gpu
            queue.extend(op.downstreams)

        if all(batching):
            cloned.batching = True

        optimized.multi([cloned], whole=True)
        multi_op = optimized.downstreams[0]
        multi_op.batching = all(batching)
        multi_op.gpu = gpu

        if gpu:
            multi_op.fn_name += '-gpu'

        return optimized

    ### OPERATOR FUSION ###
    # Work queue of (original operator, already-cloned upstream) pairs.
    queue = []
    # Maps a join's fn_name to the cloned upstream seen on its first visit.
    join_tracker = {}
    processed = set()

    for ds in flow.downstreams:
        queue.append((ds, optimized))

    # NOTE: We clone the whole flow regardless. If fusion is turned on,
    # then we will fuse operators, and otherwise, we simply find chains,
    # throw them away, and add operators to the optimized flow.
    while len(queue) > 0:
        op, upstream = queue.pop(0)

        if op.fn_name in processed:
            continue

        chain = find_chain(op)

        if len(chain) == 0 or not rules['fusion']:
            downstreams = op.downstreams
            processed.add(op.fn_name)

            if type(op) == MapOperator:
                marker = upstream.map(op.fn, op.col, op.names,
                                      op.logic.preprocess, op.high_variance,
                                      op.gpu, op.batching, op.multi)
            if type(op) == FilterOperator:
                marker = upstream.filter(op.fn, op.group, op.logic.preprocess)
            if type(op) == GroupbyOperator:
                marker = upstream.groupby(op.groupby_key, op.logic.preprocess)
            if type(op) == CombineOperator:
                marker = upstream.combine()
            if type(op) == LookupOperator:
                # Merge lookup operators with their successors.
                # The successors are consumed here, so the generic
                # downstream loop at the bottom must not enqueue them again.
                downstreams = []
                for ds in op.downstreams:
                    if isinstance(ds, MultiOperator):
                        ops = [op] + ds.ops
                    else:
                        ops = [op, ds]
                    marker = upstream.multi(ops)

                    for next_ds in ds.downstreams:
                        queue.append((next_ds, marker))
            if type(op) == AggOperator:
                marker = upstream.agg(op.aggregate, op.column)
            if type(op) == MultiOperator:
                # This will only happen in the case where the previous operator
                # was a LookupHelperOperator combined with something else.
                marker = upstream.multi(op.ops)
            if type(op) == JoinOperator:
                if op.fn_name not in join_tracker:
                    # First side of the join: remember its cloned upstream
                    # and wait until the join is reached from the other side.
                    join_tracker[op.fn_name] = upstream
                    downstreams = []
                    processed.discard(op.fn_name)
                else:
                    other = join_tracker[op.fn_name]
                    marker = other.join(upstream, op.on, op.how,
                                        op.logic.preprocess)
        else:
            marker = upstream.multi(chain)
            downstreams = chain[-1].downstreams

            for op in chain:
                # Set the multi operator to have various properties.
                if op.high_variance:
                    optimized.operators[marker.position].high_variance = True
                if op.gpu:
                    optimized.operators[marker.position].gpu = True

                    # Hack for autoscaling...
                    optimized.operators[marker.position].fn_name += '-gpu'
                if op.batching:
                    optimized.operators[marker.position].batching = True

            # Batching is only kept if every operator in the chain batches.
            if optimized.operators[marker.position].batching:
                for old in chain:
                    if not old.batching:
                        print('Cannot create a fused operator with' +
                              ' batching enabled if all operators do' +
                              ' not batch.')
                        optimized.operators[marker.position].batching = False

        for ds in downstreams:
            queue.append((ds, marker))

    ### LOCALITY BREAKPOINTS ###
    if rules['breakpoint']:
        queue = [optimized]
        processed = set()

        while len(queue) > 0:
            op = queue.pop(0)

            if op.fn_name in processed:
                continue

            # We only set breakpoints if we are in a linear chain portion of the
            # flow. This will only be true if there is only one operator in the
            # queue at a time. After pop, the length should be 0 until we add this
            # op's downstreams.
            if len(queue) == 0:
                if isinstance(op, LookupOperator):
                    op.breakpoint = True
                if isinstance(op, MultiOperator):
                    for sub in op.ops:
                        if isinstance(sub, LookupOperator):
                            op.breakpoint = True

            processed.add(op.fn_name)
            queue.extend(op.downstreams)

    ### COMPETITIVE EXECUTION ###
    if rules['compete']:
        # Replicas are collected first and registered afterwards so that the
        # operators dict is not modified while it is being iterated.
        new_ops = []
        for operator in optimized.operators.values():
            if operator.high_variance:
                for downstream in operator.downstreams:
                    if len(downstream.upstreams) > 1:
                        raise RuntimeError("Cannot have a competitive" +
                                           " execution map feed into an " +
                                           "operator with multiple upstreams.")
                    downstream.multi_exec = True

                for _ in range(rules['compete_replicas']):
                    # Create a new operator that is an exact replica.
                    # NOTE(review): new_op is only assigned for map and multi
                    # operators; a high-variance operator of any other type
                    # would reuse a stale value or raise NameError below.
                    if isinstance(operator, MapOperator):
                        new_op = MapOperator(operator.fn, operator.fntype,
                                             operator.flowname, operator.col,
                                             operator.names,
                                             operator.logic.preprocess,
                                             operator.high_variance,
                                             operator.gpu, operator.batching,
                                             operator.multi, optimized.sink)

                    if isinstance(operator, MultiOperator):
                        new_op = MultiOperator(operator.ops, operator.flowname,
                                               optimized.sink)

                    # Hook it into the DAG by updating all up/downstreams.
                    new_op.downstreams = list(operator.downstreams)
                    new_op.upstreams = list(operator.upstreams)

                    for op in new_op.downstreams:
                        op.upstreams.append(new_op)

                    for op in new_op.upstreams:
                        op.downstreams.append(new_op)

                    new_ops.append(new_op)
        for new_op in new_ops:
            optimized.operators[str(uuid.uuid4())] = new_op

    if rules['colocate']:
        curr_op = optimized

        # Walk down the linear prefix until the first fan-out point.
        while len(curr_op.downstreams) > 0:
            if len(curr_op.downstreams) == 1:
                curr_op = curr_op.downstreams[0]
            else:  # We only support one colocation for now.
                if not curr_op.supports_broadcast:
                    raise RuntimeError('Unsupported broadcast attempt.')

                colocates = list(
                    map(lambda op: op.fn_name, curr_op.downstreams))
                optimized.colocates = colocates

                for op in curr_op.downstreams:
                    # NOTE(review): this re-checks curr_op (already verified
                    # above) rather than op -- confirm whether
                    # op.supports_broadcast was intended.
                    if not curr_op.supports_broadcast:
                        raise RuntimeError('Unsupported broadcast attempt.')
                    args = list(op.init_args)
                    args[1] = True  # Receive broadcast.
                    op.init_args = tuple(args)

                args = list(curr_op.init_args)
                args[0] = True  # Send broadcast.
                curr_op.init_args = tuple(args)
                break

    return optimized
Example #5
0
        incept = inceptionv3_model_gpu
        incept_cons = inceptionv3_init_gpu
        trans = transform_batch
    else:
        resnet = resnet_model
        resnet_cons = resnet_init
        incept = inceptionv3_model
        incept_cons = inceptionv3_init
        trans = transform

    with open('imagenet_classes.txt', 'r') as f:
        classes = [line.strip() for line in f.readlines()]

    cloudburst.put_object('imagenet-classes', classes)

    flow = Flow('cascade-flow', FlowType.PUSH, cloudburst)
    rnet = flow.map(trans,
                    init=transform_init,
                    names=['img'],
                    batching=gpu) \
        .map(resnet,
             init=resnet_cons,
             names=['img', 'resnet_index', 'resnet_max_prob'],
             gpu=gpu,
             batching=gpu)

    incept = rnet.filter(low_prob) \
        .map(incept,
             init=incept_cons,
             names=['incept_index', 'incept_max_prob'],
             gpu=gpu,
Example #6
0
    indices = np.argsort(all_percentages)[::-1]
    return classes[indices[0]]


import base64
import sys

from cloudburst.client.client import CloudburstConnection

table = Table([('img', StrType)])
img = base64.b64encode(open('panda.jpg', "rb").read()).decode('ascii')

table.insert([img])

cloudburst = CloudburstConnection(sys.argv[1], '3.226.122.35')
flow = Flow('ensemble-flow', FlowType.PUSH, cloudburst)
img = flow.map(transform, init=transform_init, names=['img'])

anet = img.map(alexnet_model, init=alexnet_init, names=['alexnet_index', 'alexnet_perc'])
rnet = img.map(resnet_model, init=resnet_init, names=['resnet_index', 'resnet_perc'])
anet.join(rnet).map(ensemble_predict, names=['class'])

flow.deploy()

from cloudburst.server.benchmarks.utils import print_latency_stats
import time

print('Starting benchmark...')

latencies = []
for _ in range(100):
Example #7
0
# List what is currently registered with Cloudburst (result is discarded).
cloudburst.list()

import random
import string
# Random 6-letter suffix appended to the function and flow names below.
# presumably this avoids clashes with names left over from earlier runs
# -- TODO confirm.
salt = "".join(random.choices(string.ascii_letters, k=6))

# --- Sanity check: a plain function call and a single-function DAG. ---
print("Running sanity check")
cloud_sq = cloudburst.register(lambda _, x: x * x, "square-2"+salt)
print(cloud_sq(2).get())
# The DAG name is fixed (not salted), so any previous "dag" is deleted first.
cloudburst.delete_dag("dag")
cloudburst.register_dag("dag", ["square-2"+salt], [])
print(cloudburst.call_dag("dag", {"square-2"+salt: [2]}).get())

# Debugging leftover: uncommenting `1 / 0` aborts the script at this point.
# 1 / 0
# --- Example flow: a map producing a "sum" column followed by a filter. ---
print("Running example flow")
dataflow = Flow("example-flow"+salt, FlowType.PUSH, cloudburst)
dataflow.map(map_fn, names=["sum"]).filter(filter_fn)

# Input table with two integer columns and three rows.
table = Table([("a", IntType), ("b", IntType)])

table.insert([1, 2])
table.insert([1, 3])
table.insert([1, 4])

dataflow.register()
dataflow.deploy()

print(dataflow)
print("deployed")
# Run the deployed flow on the table and print the fetched result.
print(dataflow.run(table).get())
Example #8
0
def run(cloudburst: CloudburstConnection, num_requests: int, batch_size: int,
        gpu: bool):
    """Benchmark batched ResNet-101 image classification through a flow.

    Uploads the ImageNet class names, deploys a one-operator flow (GPU or CPU
    variant, both with batching enabled), and then, for each of
    ``num_requests`` rounds, submits ``batch_size`` requests and polls the
    KVS until every result is present. Prints throughput and latency stats.

    Args:
        cloudburst: Connection used for the flow, ``put_object`` and the
            KVS client.
        num_requests: Number of timed rounds.
        batch_size: Number of requests submitted per round.
        gpu: Use the CUDA variant of the model (and warm it up first).
    """

    with open('imagenet_classes.txt', 'r') as f:
        classes = [line.strip() for line in f.readlines()]

    cloudburst.put_object('imagenet-classes', classes)

    def resnet_init_gpu(self, cloudburst):
        import os

        import torch
        import torchvision
        from torchvision import transforms

        tpath = os.path.join(os.getenv('TORCH_HOME'), 'checkpoints')
        self.resnet = torch.load(os.path.join(tpath, 'resnet101.model')).cuda()
        self.resnet.eval()

        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

        self.classes = cloudburst.get('imagenet-classes')

    def resnet_model_gpu(self, table: Table) -> str:
        """
        ResNet-101 for image classification on ImageNet (GPU, batched)
        """
        import torch

        inputs = []
        for row in table.get():
            img = self.transforms(row['img'])
            inputs.append(img)

        inputs = torch.stack(inputs, dim=0).cuda()
        output = self.resnet(inputs)
        _, indices = torch.sort(output, descending=True)
        indices = indices.cpu().detach().numpy()

        # Keep only the top-ranked class for every image in the batch.
        result = []
        for idx_set in indices:
            index = idx_set[0]
            result.append(self.classes[index])

        return result

    def resnet_init_cpu(self, cloudburst):
        import os

        import torch
        import torchvision
        from torchvision import transforms

        tpath = os.path.join(os.getenv('TORCH_HOME'), 'checkpoints')
        self.resnet = torch.load(os.path.join(tpath, 'resnet101.model'))

        self.resnet.eval()

        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

        self.classes = cloudburst.get('imagenet-classes')

    def resnet_model_cpu(self, table: Table) -> str:
        """
        ResNet-101 for image classification on ImageNet (CPU, batched)
        """
        import torch

        inputs = []
        for row in table.get():
            img = self.transforms(row['img'])
            inputs.append(img)

        inputs = torch.stack(inputs, dim=0)
        output = self.resnet(inputs)
        _, indices = torch.sort(output, descending=True)
        indices = indices.detach().numpy()

        # Keep only the top-ranked class for every image in the batch.
        result = []
        for idx_set in indices:
            index = idx_set[0]
            result.append(self.classes[index])

        return result

    print(f'Creating flow with size {batch_size} batches.')

    flow = Flow('batching-benchmark', FlowType.PUSH, cloudburst)
    if gpu:
        flow.map(resnet_model_gpu,
                 init=resnet_init_gpu,
                 names=['class'],
                 gpu=True,
                 batching=True)
    else:
        flow.map(resnet_model_cpu,
                 init=resnet_init_cpu,
                 names=['class'],
                 batching=True)

    flow.deploy()
    print('Flow successfully deployed!')

    latencies = []
    inp = Table([('img', NumpyType)])
    img = np.array(Image.open('panda.jpg').convert('RGB').resize((224, 224)))

    inp.insert([img])

    kvs = cloudburst.kvs_client

    if gpu:
        print('Starting GPU warmup...')
        for _ in range(50):
            flow.run(inp).get()
        print('Finished warmup...')

    print('Starting benchmark...')
    for i in range(num_requests):
        if i % 100 == 0 and i > 0:
            print(f'On request {i}...')

        futs = []
        for _ in range(batch_size):
            futs.append(flow.run(inp))
        pending = {fut.obj_id for fut in futs}

        # Break these apart to batch the KVS get requests.
        # The clock starts after submission, so only the wait is timed.
        start = time.time()
        while len(pending) > 0:
            response = kvs.get(list(pending))

            for key in response:
                if response[key] is not None:
                    pending.discard(key)

        end = time.time()
        latencies.append(end - start)

    # Total time spent waiting across all rounds.
    compute_time = np.mean(latencies) * num_requests
    tput = (batch_size * num_requests) / (compute_time)
    print('THROUGHPUT: %.2f' % (tput))
    print_latency_stats(latencies, 'E2E')
Example #9
0
                        type=str,
                        metavar='O',
                        help='The name of the file with the benchmark IPs',
                        dest='benchmarks',
                        required=True)

    args = parser.parse_args()

    benchmark_ips = []
    with open(args.benchmarks[0], 'r') as f:
        benchmark_ips = f.readlines()

    cloudburst = CloudburstConnection(args.cloudburst[0], args.ip[0])
    print('Successfully connected to Cloudburst')

    flow = Flow('scaling-benchmark', FlowType.PUSH, cloudburst)
    flow.map(stage1, names=['val']).map(stage2, names=['val'])

    table = Table([('val', IntType)])

    table.insert([1])

    num_bench = len(benchmark_ips)
    num_start = int(start_percent * num_bench)

    flow.cloudburst = None  # Hack to serialize and send flow.
    queue = [flow]
    while len(queue) > 0:
        op = queue.pop(0)
        op.cb_fn = None
Example #10
0
                        dest='threads',
                        required=True)
    parser.add_argument('-l',
                        '--local',
                        nargs=1,
                        type=str,
                        metavar='L',
                        help='Whether to run in local mode (required)',
                        dest='local',
                        required=True)

    args = parser.parse_args()
    print('Connecting to Cloudburst...')
    cloudburst = CloudburstConnection(args.cloudburst[0], args.ip[0])

    flow = Flow('recsys-flow', FlowType.PUSH, cloudburst)
    flow.lookup('user', dynamic=True) \
        .map(pick_category, names=['user', 'weights', 'category']) \
        .lookup('category', dynamic=True) \
        .map(get_topk, names=['1', '2', '3', '4', '5'])

    flow = optimize(flow, rules=optimize_rules)

    print('Creating data...')
    # for i in range(NUM_USERS):
    #     if i % 10000 == 0:
    #         print(f'On user {i}...')

    #     user_vector = np.random.randn(512)
    #     cloudburst.put_object(str(i), user_vector)
Example #11
0
def run(cloudburst: CloudburstConnection,
        num_requests: int,
        data_size: str,
        breakpoint: bool,
        do_optimize: bool):
    """Measure end-to-end latency of a map -> dynamic lookup -> map flow.

    Stores ``NUM_DATA_POINTS`` random arrays under ``data-1`` ..
    ``data-<NUM_DATA_POINTS>``, builds a flow that derives a key from the
    request number, looks that key up, and sums the array, then times
    ``num_requests`` sequential requests. The raw latencies are also
    serialized to ``data.bts``.

    Args:
        cloudburst: Connection used for ``put_object`` and the ``Flow``.
        num_requests: How many timed requests to issue.
        data_size: Key into ``DATA_SIZES`` selecting the array length.
        breakpoint: Stored into ``optimize_rules['breakpoint']``; when true
            a warmup pass and a 15 second pause precede the benchmark.
        do_optimize: Whether to optimize the flow before deploying it.
    """

    print('Creating data...')
    size = DATA_SIZES[data_size]
    for point in range(1, NUM_DATA_POINTS+1):
        cloudburst.put_object('data-' + str(point), np.random.rand(size))

    def stage1(self, row: Row) -> (int, str):
        idx = int(row['req_num'] / 10) + 1
        key = 'data-%d' % (idx)

        return idx, key

    def stage2(self, row: Row) -> str:
        import numpy as np
        arr = row[row['key']]

        return float(np.sum(arr))

    print(f'Creating flow with {data_size} ({DATA_SIZES[data_size]}) inputs.')

    flow = Flow('locality-benchmark', FlowType.PUSH, cloudburst)
    keyed = flow.map(stage1, names=['index', 'key'])
    fetched = keyed.lookup('key', dynamic=True)
    fetched.map(stage2, names=['sum'])

    # The rule is recorded in the shared dict even when optimization is off.
    optimize_rules['breakpoint'] = breakpoint
    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    inp = Table([('req_num', IntType)])
    elapsed = []

    if breakpoint:
        print('Starting warmup...')
        # One request per stored object: request number n * 10 maps to
        # key data-(n + 1) in stage1.
        for warm_idx in range(NUM_DATA_POINTS):
            inp = Table([('req_num', IntType)])
            inp.insert([warm_idx * 10])

            flow.run(inp).get()

        print('Pausing to let cache metadata propagate...')
        time.sleep(15)

    print('Starting benchmark...')
    for i in range(num_requests):
        if i > 0 and i % 100 == 0:
            print(f'On request {i}...')

        # A fresh single-row table per request, built outside the timed span.
        inp = Table([('req_num', IntType)])
        inp.insert([i])

        began = time.time()
        flow.run(inp).get()
        elapsed.append(time.time() - began)

    with open('data.bts', 'wb') as f:
        from cloudburst.shared.serializer import Serializer
        f.write(Serializer().dump(elapsed))

    print_latency_stats(elapsed, 'E2E')
Example #12
0
        german_init = english_to_german_init_gpu
        french_init = english_to_french_init_gpu
        german = english_to_german_gpu
        french = english_to_french_gpu
    else:
        german_init = english_to_german_init
        french_init = english_to_french_init
        german = english_to_german
        french = english_to_french

    with open('imagenet_classes.txt', 'r') as f:
        classes = [line.strip() for line in f.readlines()]

    cloudburst.put_object('imagenet-classes', classes)

    flow = Flow('nmt-flow', FlowType.PUSH, cloudburst)
    classified = flow.map(classify_language,
                          init=classify_language_init,
                          names=['language', 'translate'],
                          batching=True)

    french = classified.filter(filter_french) \
        .map(french,
             init=french_init,
             names=['french'],
             gpu=gpu,
             high_variance=True,
             batching=gpu) \
        .filter(true_filter)

    german = classified.filter(filter_german) \