def handler(event, context):

    tuner_function_name = "lambda_tuner"
    trial_function_name = "lambda_trial"
    function_start = time.time()
    function_duration = 14 * 60
    n_submit_trial = event.get('n_submit_trial', 0)

    # dataset setting
    dataset_name = 'higgs'
    data_bucket = "higgs-10"
    dataset_type = "dense_libsvm"   # dense_libsvm or sparse_libsvm
    n_features = 30
    n_classes = 2
    tmp_bucket = "tmp-params"
    merged_bucket = "merged-params"

    # training setting
    model = "lr"    # lr, svm, sparse_lr, or sparse_svm
    optim = "grad_avg"  # grad_avg, model_avg, or admm
    sync_mode = "reduce"    # async, reduce or reduce_scatter
    n_workers = 10

    # tuner configs
    tuner_strategy = "random_search"
    tuner_concurrency = 5
    lr_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
    lr_disc = DiscHyper("lr_discrete", lr_values)

    # hyper-parameters
    lr = 0.01
    batch_size = 100000
    n_epochs = 2
    valid_ratio = .2
    n_admm_epochs = 2
    lam = 0.01
    rho = 0.01

    # clear s3 bucket
    s3_client = s3_operator.get_client()
    s3_operator.clear_bucket(s3_client, tmp_bucket)
    s3_operator.clear_bucket(s3_client, merged_bucket)

    # set dynamodb table
    recorder_table_name = "recoder"
    dynamo_client = dynamo_operator.get_client()
    recorder_tb = DynamoTable(dynamo_client, recorder_table_name)
    items = recorder_tb.list()
    print("{} items in the recorder".format(len(items)))

    # lambda payload
    payload = dict()
    payload['dataset'] = dataset_name
    payload['data_bucket'] = data_bucket
    payload['dataset_type'] = dataset_type
    payload['n_features'] = n_features
    payload['n_classes'] = n_classes
    payload['n_workers'] = n_workers
    payload['tmp_bucket'] = tmp_bucket
    payload['merged_bucket'] = merged_bucket
    payload['model'] = model
    payload['optim'] = optim
    payload['sync_mode'] = sync_mode
    payload['lr'] = lr
    payload['batch_size'] = batch_size
    payload['n_epochs'] = n_epochs
    payload['valid_ratio'] = valid_ratio
    payload['n_admm_epochs'] = n_admm_epochs
    payload['lambda'] = lam
    payload['rho'] = rho

    # invoke functions
    lambda_client = boto3.client('lambda')

    n_trial = 10
    trial_counter = n_submit_trial

    for i in range(n_trial):
        n_recorder_items = len(recorder_tb.list())
        n_running_tail = trial_counter - n_recorder_items
        while n_running_tail >= tuner_concurrency:
            time.sleep(1)
            n_recorder_items = len(recorder_tb.list())
            n_running_tail = trial_counter - n_recorder_items
        for j in range(n_workers):
            payload = dict()
            payload['dataset'] = dataset_name
            payload['data_bucket'] = data_bucket
            payload['dataset_type'] = dataset_type
            payload['n_features'] = n_features
            payload['n_classes'] = n_classes
            payload['n_workers'] = n_workers
            payload['tmp_bucket'] = tmp_bucket
            payload['merged_bucket'] = merged_bucket
            payload['model'] = model
            payload['optim'] = optim
            payload['sync_mode'] = sync_mode
            payload['batch_size'] = batch_size
            payload['n_epochs'] = n_epochs
            payload['valid_ratio'] = valid_ratio
            payload['n_admm_epochs'] = n_admm_epochs
            payload['lambda'] = lam
            payload['rho'] = rho
            payload['function_name'] = trial_function_name

            payload['tmp_bucket'] = tmp_bucket + "-i"
            payload['merged_bucket'] = merged_bucket + "-i"
            payload['lr'] = lr_disc.next() if tuner_strategy == "grid_search" else lr_disc.sample()
            payload['worker_index'] = j
            payload['train_file'] = 'training_{}.pt'.format(j)
            payload['test_file'] = 'test.pt'
            lambda_client.invoke(FunctionName=trial_function_name,
                                 InvocationType='Event',
                                 Payload=json.dumps(payload))
        trial_counter += 1
        if time.time() - function_start > function_duration:
            # revoke itself
            print("Invoking the next round of tuner functions, total trials {}, submitted trials {}"
                  .format(n_trial, trial_counter))
            lambda_client = boto3.client('lambda')
            payload = {
                'n_submit_trial': n_submit_trial
            }
            lambda_client.invoke(FunctionName=tuner_function_name,
                                 InvocationType='Event',
                                 Payload=json.dumps(payload))
Exemple #2
0
def handler(event, context):

    tuner_function_name = "lambda_tuner"
    trial_function_name = "lambda_trial"
    function_start = time.time()
    function_duration = 14 * 60
    n_submit_trial = event.get('n_submit_trial', 0)

    # dataset setting
    dataset_name = 'cifar10'
    data_bucket = "cifar10dataset"
    n_features = 32 * 32
    n_classes = 10
    host = "127.0.0.1"
    port = 11211
    tmp_bucket = "tmp-params"
    merged_bucket = "merged-params"
    cp_bucket = "cp-model"

    # training setting
    model = "mobilenet"  # mobilenet or resnet
    optim = "grad_avg"  # grad_avg or model_avg
    sync_mode = "reduce"  # async, reduce or reduce_scatter
    n_workers = 10

    # tuner configs
    tuner_strategy = "random_search"
    tuner_concurrency = 5
    lr_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
    lr_disc = DiscHyper("lr_discrete", lr_values)

    # hyper-parameters
    batch_size = 256
    n_epochs = 5
    start_epoch = 0
    run_epochs = 3

    # set dynamodb table
    recorder_table_name = "recoder"
    dynamo_client = dynamo_operator.get_client()
    recorder_tb = DynamoTable(dynamo_client, recorder_table_name)
    items = recorder_tb.list()
    print("{} items in the recorder".format(len(items)))

    # lambda payload
    payload = dict()
    payload['dataset'] = dataset_name
    payload['data_bucket'] = data_bucket
    payload['n_features'] = n_features
    payload['n_classes'] = n_classes
    payload['n_workers'] = n_workers
    payload['host'] = host
    payload['port'] = port
    payload['tmp_bucket'] = tmp_bucket
    payload['merged_bucket'] = merged_bucket
    payload['cp_bucket'] = cp_bucket
    payload['model'] = model
    payload['optim'] = optim
    payload['sync_mode'] = sync_mode
    payload['lr'] = lr
    payload['batch_size'] = batch_size
    payload['n_epochs'] = n_epochs
    payload['start_epoch'] = start_epoch
    payload['run_epochs'] = run_epochs
    payload['function_name'] = function_name

    # invoke functions
    lambda_client = boto3.client('lambda')
    for i in range(n_workers):
        payload['worker_index'] = i
        payload['train_file'] = 'training_{}.pt'.format(i)
        payload['test_file'] = 'test.pt'
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))

    # invoke functions
    lambda_client = boto3.client('lambda')

    n_trial = 10
    trial_counter = n_submit_trial

    for i in range(n_trial):
        n_recorder_items = len(recorder_tb.list())
        n_running_tail = trial_counter - n_recorder_items
        while n_running_tail >= tuner_concurrency:
            time.sleep(1)
            n_recorder_items = len(recorder_tb.list())
            n_running_tail = trial_counter - n_recorder_items
        for j in range(n_workers):
            # lambda payload
            payload = dict()
            payload['dataset'] = dataset_name
            payload['data_bucket'] = data_bucket
            payload['n_features'] = n_features
            payload['n_classes'] = n_classes
            payload['n_workers'] = n_workers
            payload['host'] = host
            payload['port'] = port
            payload['model'] = model
            payload['optim'] = optim
            payload['sync_mode'] = sync_mode
            payload['lr'] = lr
            payload['batch_size'] = batch_size
            payload['n_epochs'] = n_epochs
            payload['start_epoch'] = start_epoch
            payload['run_epochs'] = run_epochs
            payload['function_name'] = function_name

            payload['tmp_bucket'] = tmp_bucket + "-i"
            payload['merged_bucket'] = merged_bucket + "-i"
            payload['cp_bucket'] = cp_bucket + "-i"
            payload['lr'] = lr_disc.next(
            ) if tuner_strategy == "grid_search" else lr_disc.sample()
            payload['worker_index'] = j
            payload['train_file'] = 'training_{}.pt'.format(j)
            payload['test_file'] = 'test.pt'
            lambda_client.invoke(FunctionName=trial_function_name,
                                 InvocationType='Event',
                                 Payload=json.dumps(payload))
        trial_counter += 1
        if time.time() - function_start > function_duration:
            # revoke itself
            print(
                "Invoking the next round of tuner functions, total trials {}, submitted trials {}"
                .format(n_trial, trial_counter))
            lambda_client = boto3.client('lambda')
            payload = {'n_submit_trial': n_submit_trial}
            lambda_client.invoke(FunctionName=tuner_function_name,
                                 InvocationType='Event',
                                 Payload=json.dumps(payload))