Example #1
def upload(filename,
           prefix=False,
           bucket_name=False,
           key=None,
           secret=None,
           host=None):
    """
    Upload a file to Amazon S3.
    """
    s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret, host=host)
    name = None

    if isinstance(filename, basestring):
        name = path.basename(filename)
        filename = open(filename, 'rb')
    elif isinstance(filename, (file, File)):
        name = filename.name

    if not name:
        raise TypeError('Filename must be file or string instance.')

    if prefix:
        if prefix.endswith('/'):
            full_path = prefix + name
        else:
            full_path = prefix + '/' + name
    else:
        full_path = name

    s3.save(full_path, filename)

    return s3.url(full_path)
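A minimal usage sketch for the helper above; the file path, bucket name, and credentials are hypothetical placeholders, and the function is assumed to be importable from the surrounding module.

# Hypothetical call to upload(); every value below is a placeholder.
url = upload('reports/summary.pdf',
             prefix='uploads',
             bucket_name='my-example-bucket',
             key='AWS_ACCESS_KEY_ID',
             secret='AWS_SECRET_ACCESS_KEY')
# The returned value is the S3 URL of 'uploads/summary.pdf'.
print(url)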
Example #2
def storage_engine():
    with mock_s3():
        # Create a test bucket and put some test content.
        boto.connect_s3().create_bucket(_TEST_BUCKET)
        engine = S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET, _TEST_USER, _TEST_PASSWORD)
        engine.put_content(_TEST_PATH, _TEST_CONTENT)

        yield engine
Example #3
def test_copy(bucket, username, password, storage_engine):
    # Copy the content to another engine.
    another_engine = S3Storage(_TEST_CONTEXT, "another/path", _TEST_BUCKET,
                               _TEST_USER, _TEST_PASSWORD)
    boto.connect_s3().create_bucket("another_bucket")
    storage_engine.copy_to(another_engine, _TEST_PATH)

    # Verify it can be retrieved.
    assert another_engine.get_content(_TEST_PATH) == _TEST_CONTENT
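The tests in this listing refer to module-level constants that are not shown. A minimal set of placeholder definitions such as the following would make the snippets self-contained; the names are taken from the snippets, the values are assumptions.

# Hypothetical test constants; values are placeholders, not the project's real fixtures.
_TEST_CONTEXT = None            # storage context object expected by S3Storage
_TEST_BUCKET = "test-bucket"
_TEST_USER = "access-key"
_TEST_PASSWORD = "secret-key"
_TEST_REGION = "us-east-1"
_TEST_PATH = "some/file"
_TEST_CONTENT = b"hello world"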
Example #4
def test_stream_write_error():
    with mock_s3():
        # Create an engine but not the bucket.
        engine = S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET, _TEST_USER, _TEST_PASSWORD)

        # Attempt to write to the uncreated bucket, which should raise an error.
        with pytest.raises(IOError):
            engine.stream_write(_TEST_PATH, StringIO("hello world"), content_type="Cool/Type")

        assert not engine.exists(_TEST_PATH)
Example #5
def storage_engine():
    with mock_s3():
        # Create a test bucket and put some test content.
        boto3.client("s3").create_bucket(Bucket=_TEST_BUCKET)
        engine = S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET,
                           _TEST_USER, _TEST_PASSWORD, _TEST_REGION)
        assert engine._connect_kwargs[
            "endpoint_url"] == "https://s3.{}.amazonaws.com".format(
                _TEST_REGION)
        engine.put_content(_TEST_PATH, _TEST_CONTENT)

        yield engine
Example #6
def storage_engine(request):
    if request.param == 'test':
        yield test_storage
    else:
        with mock_s3():
            # Create a test bucket and put some test content.
            boto.connect_s3().create_bucket(_TEST_BUCKET)
            engine = DistributedStorage(
                {
                    'foo':
                    S3Storage(_TEST_CONTEXT, 'some/path', _TEST_BUCKET,
                              _TEST_USER, _TEST_PASSWORD)
                }, ['foo'])
            yield engine
Example #7
def remove(name=None,
           prefix=False,
           bucket_name=False,
           key=None,
           secret=None,
           host=None):
    """
    Delete a file from Amazon S3.
    """
    full_path = _get_name(name, prefix)

    s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret, host=host)

    s3.delete(full_path)
Example #8
def test_stream_write_error():
    with mock_s3():
        # Create an engine but not the bucket.
        engine = S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET,
                           _TEST_USER, _TEST_PASSWORD)

        # Attempt to write to the uncreated bucket, which should raise an error.
        with pytest.raises(IOError):
            engine.stream_write(_TEST_PATH,
                                BytesIO(b"hello world"),
                                content_type="Cool/Type")

        with pytest.raises(botocore.exceptions.ClientError) as excinfo:
            engine.exists(_TEST_PATH)
        assert excinfo.value.response["Error"]["Code"] == "NoSuchBucket"
Example #9
def download(name=None,
             prefix=False,
             bucket_name=False,
             key=None,
             secret=None,
             host=None):
    """
    Download a file from Amazon S3.
    Returns a TemporaryFile().
    """
    full_path = _get_name(name, prefix)

    s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret, host=host)

    return s3.open(full_path)
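A usage sketch mirroring the upload example; all values are placeholders, and per the docstring the return value is a temporary file-like object.

# Hypothetical call to download(); every value below is a placeholder.
tmp_file = download(name='summary.pdf',
                    prefix='uploads',
                    bucket_name='my-example-bucket',
                    key='AWS_ACCESS_KEY_ID',
                    secret='AWS_SECRET_ACCESS_KEY')
data = tmp_file.read()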
Example #10
def storage_engine(request):
    if request.param == "test":
        yield test_storage
    else:
        with mock_s3():
            # Create a test bucket and put some test content.
            boto3.client("s3").create_bucket(Bucket=_TEST_BUCKET)
            engine = DistributedStorage(
                {
                    "foo":
                    S3Storage(_TEST_CONTEXT, "some/path", _TEST_BUCKET,
                              _TEST_USER, _TEST_PASSWORD)
                },
                ["foo"],
            )
            yield engine
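The fixture above reads request.param, so it is presumably registered with @pytest.fixture and driven by indirect parametrization. The sketch below shows how a test could exercise both branches; the test name and body are hypothetical.

import pytest

# "test" selects the in-memory test_storage branch; any other value, such as
# "s3", selects the mocked S3 branch of the fixture above.
@pytest.mark.parametrize("storage_engine", ["test", "s3"], indirect=True)
def test_with_both_backends(storage_engine):
    assert storage_engine is not None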
Example #11
def get_url(name=None,
            prefix=False,
            bucket_name=False,
            key=None,
            secret=None,
            host=None,
            expires=30,
            query_auth=False,
            force_http=False):
    """
    Get the URL for a key on Amazon S3.
    Returns a string.
    """
    full_path = _get_name(name, prefix)

    s3 = S3Storage(bucket_name=bucket_name, key=key, secret=secret, host=host)

    return s3.url(full_path, expires, query_auth, force_http)
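A usage sketch with placeholder values; with query_auth=True the underlying storage backend presumably returns a signed URL that expires after the given number of seconds.

# Hypothetical call to get_url(); every value below is a placeholder.
signed_url = get_url(name='summary.pdf',
                     prefix='uploads',
                     bucket_name='my-example-bucket',
                     key='AWS_ACCESS_KEY_ID',
                     secret='AWS_SECRET_ACCESS_KEY',
                     expires=60,
                     query_auth=True)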
Example #12
    def init(self):
        self.read_file_map()

        self.input_dir = self.config['input_dir']
        self.output_dir = self.config['output_dir']

        s_cfg = self.config['storage']
        self.storage = S3Storage(**s_cfg)

        self.minify_dir = os.path.join(self.output_dir, 'minify')
        self.hash_input_dir = os.path.join(self.output_dir, 'hash_input')
        self.hash_output_dir = os.path.join(self.output_dir, 'hash_output')
        self.prepare_dir(self.minify_dir)
        self.prepare_dir(self.hash_input_dir)
        self.prepare_dir(self.hash_output_dir)

        self.mini_js_ext = '.mini.js'
        self.mini_css_ext = '.mini.css'
        self.gzip_ext = '.gzip'
        # init JS groups
        self.js_config = FileConfig(self.input_dir)
        for group in self.config['js_groups']:
            name = group['name']
            files = group['files']
            gzip = group.get('gzip', True)
            self.js_config.add_group(name, files, gzip)

        # init CSS groups
        self.css_config = FileConfig(self.input_dir)
        for group in self.config['css_groups']:
            name = group['name']
            files = group['files']
            gzip = group.get('gzip', True)
            self.css_config.add_group(name, files, gzip)

        self.hash_file = HashFile(self.hash_input_dir,
                                  self.hash_output_dir,
                                  hash_version=self.config.get(
                                      'hash_version', ''))
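The storage section of the config is unpacked directly into S3Storage. Given the keyword arguments used elsewhere in this listing, the relevant part of the config presumably looks roughly like the following sketch; all values are placeholders.

# Hypothetical config fragment; keys mirror what init() reads above.
config = {
    'input_dir': 'static/src',
    'output_dir': 'static/build',
    'storage': {
        'bucket_name': 'my-example-bucket',
        'key': 'AWS_ACCESS_KEY_ID',
        'secret': 'AWS_SECRET_ACCESS_KEY',
        'host': 's3.amazonaws.com',
    },
    'js_groups': [
        {'name': 'app', 'files': ['app.js'], 'gzip': True},
    ],
    'css_groups': [
        {'name': 'app', 'files': ['app.css']},
    ],
    'hash_version': 'v1',
}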
Example #13
def upload(filename,
           name=None,
           prefix=False,
           bucket_name=False,
           key=None,
           secret=None,
           host=None,
           expires=30,
           query_auth=False,
           force_http=False,
           policy=None,
           replace=True):
    """
    Upload a file to Amazon S3.
    Returns the file URL as a string.
    """
    if isinstance(filename, basestring):
        fl = open(filename, 'rb')
    elif isinstance(filename, (file, File)):
        fl = filename
    else:
        raise TypeError('File must be file or string instance.')

    if not name:
        name = fl.name

    full_path = _get_name(name, prefix)

    s3 = S3Storage(bucket_name=bucket_name,
                   key=key,
                   secret=secret,
                   host=host,
                   policy=policy,
                   replace=replace)
    s3.save(full_path, fl)

    return s3.url(full_path, expires, query_auth, force_http)
Example #14
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    n_train_batch = len(train_loader)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_model(model_name, n_features, n_classes)

    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0
        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.
            for batch_index, (items, labels) in enumerate(train_loader):
                batch_start = time.time()
                items = Variable(items.view(-1, n_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items)
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.item()
                u_z = torch.from_numpy(u) - torch.from_numpy(z)
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        loss += rho / 2.0 * torch.norm(param + u_z, p=2)
                        # loss = classify_loss + rho / 2.0 * torch.norm(torch.sum(model.linear.weight, u_z))
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            n_test_correct = 0
            n_test = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, n_features))
                labels = Variable(labels)
                outputs = model(items)
                test_loss += criterion(outputs, labels).item()
                _, predicted = torch.max(outputs.data, 1)
                n_test += labels.size(0)
                n_test_correct += (predicted == labels).sum()
            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            print(
                'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                'cal cost %.4f s, test cost %.4f s, accuracy of the model on the %d test samples: %d %%, loss = %f'
                % (epoch + 1, n_epochs, batch_index + 1, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time, epoch_test_time,
                   n_test, 100. * n_test_correct / n_test, test_loss / n_test))

        sync_start = time.time()
        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        # admm does not support async
        if sync_mode == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, admm_epoch)
        elif sync_mode == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, admm_epoch)

        u_mean = u_w_b_merge[:u_shape[0] *
                             u_shape[1]].reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]: u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]]\
                     .reshape(w_shape) / float(n_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:]\
                     .reshape(b_shape[0]) / float(n_workers)

        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))

        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
        u = u + model.linear.weight.data.numpy() - z

        print(
            "ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, sync cost {} s, test cost {} s"
            .format(admm_epoch,
                    time.time() - admm_epoch_start, admm_epoch_cal_time,
                    admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    n_test_correct = 0
    n_test = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, n_features))
        labels = Variable(labels)
        outputs = model(items)
        test_loss += criterion(outputs, labels).item()
        _, predicted = torch.max(outputs.data, 1)
        n_test += labels.size(0)
        n_test_correct += (predicted == labels).sum()

    print(
        'Train finish, time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
        % (time.time() - train_start, n_test, 100. * n_test_correct / n_test,
           test_loss / n_test))

    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #15
def get_storage(config, verify=True):
    return S3Storage(config,
                     verify=verify) if config.USE_S3 else LocalStorage()
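A usage sketch; the only attribute the factory reads is USE_S3, so any config object exposing that flag works. The class below is a placeholder.

# Hypothetical config object; USE_S3 is the only attribute get_storage() reads.
class DummyConfig(object):
    USE_S3 = False

storage = get_storage(DummyConfig(), verify=False)  # returns LocalStorage() here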
Example #16
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Reading data from S3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type).ins_np
    data_type = dataset.dtype
    centroid_shape = (n_clusters, dataset.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

    init_centroids_start = time.time()
    if worker_index == 0:
        centroids = dataset[0:n_clusters]
        merged_table.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1",
                          key_col)
    else:
        centroid_bytes = (merged_table.load_or_wait(
            Prefix.KMeans_Init_Cent + "-1", key_col, 0.1))['value'].value
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")

    print("initialize centroids takes {} s".format(time.time() -
                                                   init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_comm_start = time.time()

        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, epoch)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, epoch)

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        epoch_comm_time = time.time() - epoch_comm_start

        print("one {} round cost {} s".format(sync_mode, epoch_comm_time))

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_comm_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return
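The k-means handlers rely on a centroid_bytes2np helper that is not shown. Since the centroids are serialized with numpy's tobytes() above, it presumably amounts to the following sketch; the implementation here is an assumption, not the project's code.

import numpy as np

def centroid_bytes2np(centroid_bytes, n_clusters, data_type):
    # Rebuild the (n_clusters, n_features) centroid matrix flattened by tobytes().
    flat = np.frombuffer(centroid_bytes, dtype=data_type)
    return flat.reshape(n_clusters, -1)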
Example #17
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Linear_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    n_train_batch = len(train_loader)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_model(model_name, n_features, n_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0
        for batch_idx, (items, labels) in enumerate(train_loader):
            # print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, n_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()
            loss.backward()

            if optim == "grad_avg":
                if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                    w_grad = model.linear.weight.grad.data.numpy()
                    w_grad_shape = w_grad.shape
                    b_grad = model.linear.bias.grad.data.numpy()
                    b_grad_shape = b_grad.shape
                    w_b_grad = np.concatenate(
                        (w_grad.flatten(), b_grad.flatten()))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()

                    if sync_mode == "reduce":
                        w_b_grad_merge = communicator.reduce_batch(
                            w_b_grad, epoch, batch_idx)
                    elif sync_mode == "reduce_scatter":
                        w_b_grad_merge = communicator.reduce_scatter_batch(
                            w_b_grad, epoch, batch_idx)

                    w_grad_merge = w_b_grad_merge[:w_grad_shape[0] * w_grad_shape[1]]\
                                       .reshape(w_grad_shape) / float(n_workers)
                    b_grad_merge = w_b_grad_merge[w_grad_shape[0] * w_grad_shape[1]:]\
                                       .reshape(b_grad_shape[0]) / float(n_workers)

                    model.linear.weight.grad = Variable(
                        torch.from_numpy(w_grad_merge))
                    model.linear.bias.grad = Variable(
                        torch.from_numpy(b_grad_merge))
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_mode == "async":
                    # async does step before sync
                    optimizer.step()
                    w = model.linear.weight.data.numpy()
                    w_shape = w.shape
                    b = model.linear.bias.data.numpy()
                    b_shape = b.shape
                    w_b = np.concatenate((w.flatten(), b.flatten()))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # init model
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        merged_table.save(w_b.tobytes(), Prefix.w_b_prefix,
                                          key_col)

                    w_b_merge = communicator.async_reduce(
                        w_b, Prefix.w_b_prefix)
                    # do not need average
                    w_merge = w_b_merge[:w_shape[0] *
                                        w_shape[1]].reshape(w_shape)
                    b_merge = w_b_merge[w_shape[0] * w_shape[1]:].reshape(
                        b_shape[0])
                    model.linear.weight.data = torch.from_numpy(w_merge)
                    model.linear.bias.data = torch.from_numpy(b_merge)
                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            if sync_mode != "async":
                step_start = time.time()
                optimizer.step()
                epoch_cal_time += time.time() - step_start

            if batch_idx % 10 == 0:
                print(
                    "Epoch: [%d/%d], Step: [%d/%d], Time: %.4f s, Loss: %.4f, batch cost %.4f s"
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, loss.item(),
                       time.time() - batch_start))

        if optim == "model_avg":
            w = model.linear.weight.data.numpy()
            w_shape = w.shape
            b = model.linear.bias.data.numpy()
            b_shape = b.shape
            w_b = np.concatenate((w.flatten(), b.flatten()))
            epoch_cal_time += time.time() - epoch_start

            epoch_comm_start = time.time()

            if sync_mode == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, epoch)
            elif sync_mode == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, epoch)
            elif sync_mode == "async":
                if worker_index == 0 and epoch == 0:
                    merged_table.save(w_b.tobytes(), Prefix.w_b_prefix,
                                      key_col)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

            w_merge = w_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape)
            b_merge = w_b_merge[w_shape[0] * w_shape[1]:].reshape(b_shape[0])
            if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.linear.weight.data = torch.from_numpy(w_merge)
            model.linear.bias.data = torch.from_numpy(b_merge)
            print("one {} round cost {} s".format(
                sync_mode,
                time.time() - epoch_comm_start))
            epoch_comm_time += time.time() - epoch_comm_start

        if worker_index == 0:
            delete_start = time.time()
            # model avg delete by epoch
            if optim == "model_avg" and sync_mode != "async":
                communicator.delete_expired_epoch(epoch)
            elif optim == "grad_avg" and sync_mode != "async":
                communicator.delete_expired_batch(epoch, batch_idx)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        n_test_correct = 0
        n_test = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, n_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            n_test += labels.size(0)
            n_test_correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print(
            'Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f: '
            'calculation cost = %.4f s, communication cost %.4f s, test cost %.4f s, '
            'accuracy of the model on the %d test samples: %d %%, loss = %f' %
            (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
             time.time() - train_start, epoch_loss, time.time() - epoch_start,
             epoch_cal_time, epoch_comm_time, test_time, n_test,
             100. * n_test_correct / n_test, test_loss / n_test))

    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #18
def handler(event, context):
    start_time = time.time()

    # dataset setting
    train_file = event['train_file']
    test_file = event['test_file']
    data_bucket = event['data_bucket']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    cp_bucket = event['cp_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Deep_Models
    assert optim.lower() in Optimization.All
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    start_epoch = event['start_epoch']
    run_epochs = event['run_epochs']

    function_name = event['function_name']

    print('data bucket = {}'.format(data_bucket))
    print("train file = {}".format(train_file))
    print("test file = {}".format(test_file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))
    print('start epoch = {}'.format(start_epoch))
    print('run epochs = {}'.format(run_epochs))

    print("Run function {}, round: {}/{}, epoch: {}/{} to {}/{}"
          .format(function_name, int(start_epoch/run_epochs) + 1, math.ceil(n_epochs / run_epochs),
                  start_epoch + 1, n_epochs, start_epoch + run_epochs, n_epochs))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # download file from s3
    local_dir = "/tmp"
    read_start = time.time()
    storage.download(data_bucket, train_file, os.path.join(local_dir, train_file))
    storage.download(data_bucket, test_file, os.path.join(local_dir, test_file))
    print("download file from s3 cost {} s".format(time.time() - read_start))

    train_set = torch.load(os.path.join(local_dir, train_file))
    test_set = torch.load(os.path.join(local_dir, test_file))
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=100, shuffle=False)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    print("read data cost {} s".format(time.time() - read_start))

    random_seed = 100
    torch.manual_seed(random_seed)

    device = 'cpu'
    net = deep_models.get_models(model_name).to(device)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)

    # load checkpoint model if it is not the first round
    if start_epoch != 0:
        checked_file = 'checkpoint_{}.pt'.format(start_epoch - 1)
        storage.download(cp_bucket, checked_file, os.path.join(local_dir, checked_file))
        checkpoint_model = torch.load(os.path.join(local_dir, checked_file))

        net.load_state_dict(checkpoint_model['model_state_dict'])
        optimizer.load_state_dict(checkpoint_model['optimizer_state_dict'])
        print("load checkpoint model at epoch {}".format(start_epoch - 1))

    for epoch in range(start_epoch, min(start_epoch + run_epochs, n_epochs)):

        train_loss, train_acc = train_one_epoch(epoch, net, train_loader, optimizer, worker_index,
                                                communicator, optim, sync_mode)
        test_loss, test_acc = test(epoch, net, test_loader)

        print('Epoch: {}/{},'.format(epoch + 1, n_epochs),
              'train loss: {}'.format(train_loss),
              'train acc: {},'.format(train_acc),
              'test loss: {}'.format(test_loss),
              'test acc: {}.'.format(test_acc), )

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    # training is not finished yet, invoke next round
    if epoch < n_epochs - 1:
        checkpoint_model = {
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss.average
        }

        checked_file = 'checkpoint_{}.pt'.format(epoch)

        if worker_index == 0:
            torch.save(checkpoint_model, os.path.join(local_dir, checked_file))
            storage.upload_file(cp_bucket, checked_file, os.path.join(local_dir, checked_file))
            print("checkpoint model at epoch {} saved!".format(epoch))

        print("Invoking the next round of functions. round: {}/{}, start epoch: {}, run epoch: {}"
              .format(int((epoch + 1) / run_epochs) + 1, math.ceil(n_epochs / run_epochs),
                      epoch + 1, run_epochs))
        lambda_client = boto3.client('lambda')
        payload = {
            'train_file': event['train_file'],
            'test_file': event['test_file'],
            'data_bucket': event['data_bucket'],
            'n_features': event['n_features'],
            'n_classes': event['n_classes'],
            'n_workers': event['n_workers'],
            'worker_index': event['worker_index'],
            'tmp_bucket': event['tmp_bucket'],
            'merged_bucket': event['merged_bucket'],
            'cp_bucket': event['cp_bucket'],
            'model': event['model'],
            'optim': event['optim'],
            'sync_mode': event['sync_mode'],
            'lr': event['lr'],
            'batch_size': event['batch_size'],
            'n_epochs': event['n_epochs'],
            'start_epoch': epoch + 1,
            'run_epochs': event['run_epochs'],
            'function_name': event['function_name']
        }
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #19
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    host = event['host']
    port = event['port']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() == Optimization.ADMM
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']
    n_admm_epochs = event['n_admm_epochs']
    lam = event['lambda']
    rho = event['rho']

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    shuffle_dataset = True
    random_seed = 100
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs, learning_rate,
                                           batch_size)

    z, u = initialize_z_and_u(model.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Training the Model
    train_start = time.time()
    for admm_epoch in range(n_admm_epochs):
        print(">>> ADMM Epoch[{}]".format(admm_epoch + 1))
        admm_epoch_start = time.time()
        admm_epoch_cal_time = 0
        admm_epoch_comm_time = 0
        admm_epoch_test_time = 0
        for epoch in range(n_epochs):
            epoch_start = time.time()
            epoch_loss = 0.

            for batch_idx in range(n_train_batch):
                batch_start = time.time()
                batch_loss, batch_acc = model.one_batch()

                u_z = torch.from_numpy(u) - torch.from_numpy(z)
                new_grad = torch.add(model.weight, u_z).mul(rho)
                new_grad.mul_(-1.0 * learning_rate)

                model.weight.add_(new_grad)
                batch_loss = batch_loss.average + rho / 2.0 * torch.norm(
                    model.weight + u_z, p=2).item()
                epoch_loss += batch_loss

                if batch_idx % 10 == 0:
                    print(
                        "ADMM Epoch: [{}/{}], Epoch: [{}/{}], Batch: [{}/{}], "
                        "time: {:.4f} s, batch cost {:.4f} s, loss: {}, accuracy: {}"
                        .format(admm_epoch + 1, n_admm_epochs, epoch + 1,
                                n_epochs, batch_idx + 1, n_train_batch,
                                time.time() - train_start,
                                time.time() - batch_start, batch_loss,
                                batch_acc))

            epoch_cal_time = time.time() - epoch_start
            admm_epoch_cal_time += epoch_cal_time

            # Test the Model
            test_start = time.time()
            test_loss, test_acc = model.evaluate()
            epoch_test_time = time.time() - test_start
            admm_epoch_test_time += epoch_test_time

            print(
                "ADMM Epoch: [{}/{}] Epoch: [{}/{}] finishes, Batch: [{}/{}], "
                "Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
                "calculation cost = {:.4f} s, test cost {:.4f} s, "
                "accuracy of the model on the {} test samples: {}, loss = {}".
                format(admm_epoch + 1, n_admm_epochs, epoch + 1, n_epochs,
                       batch_idx + 1, n_train_batch,
                       time.time() - train_start, epoch_loss,
                       time.time() - epoch_start, epoch_cal_time,
                       epoch_test_time, len(val_set), test_acc, test_loss))

        sync_start = time.time()
        w = model.weight.numpy()
        w_shape = w.shape
        b = np.array([model.bias], dtype=np.float32)
        b_shape = b.shape
        u_shape = u.shape

        w_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_b.flatten()))

        # admm does not support async
        if sync_mode == "reduce":
            u_w_b_merge = communicator.reduce_epoch(u_w_b, admm_epoch)
        elif sync_mode == "reduce_scatter":
            u_w_b_merge = communicator.reduce_scatter_epoch(u_w_b, admm_epoch)

        u_mean = u_w_b_merge[:u_shape[0] *
                             u_shape[1]].reshape(u_shape) / float(n_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]: u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]]\
                     .reshape(w_shape) / float(n_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:]\
                     .reshape(b_shape[0]) / float(n_workers)

        model.weight = torch.from_numpy(w_mean)
        model.bias = torch.from_numpy(b_mean)
        admm_epoch_comm_time += time.time() - sync_start
        print("one {} round cost {} s".format(sync_mode, admm_epoch_comm_time))

        if worker_index == 0:
            delete_start = time.time()
            communicator.delete_expired_epoch(admm_epoch)
            admm_epoch_comm_time += time.time() - delete_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))

        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, n_workers, lam)
        u = u + model.weight.data.numpy() - z

        print(
            "ADMM Epoch[{}] finishes, cost {} s, cal cost {} s, comm cost {} s, test cost {} s"
            .format(admm_epoch,
                    time.time() - admm_epoch_start, admm_epoch_cal_time,
                    admm_epoch_comm_time, admm_epoch_test_time))

    # Test the Model
    test_loss, test_acc = model.evaluate()

    print(
        "Train finish, cost {} s, accuracy of the model on the {} test samples = {}, loss = {}"
        .format(time.time() - train_start, len(val_set), test_acc, test_loss))

    if worker_index == 0:
        mem_storage.clear()

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
Example #20
def handler(event, context):

    function_name = "lambda_core"

    # dataset setting
    dataset_name = 'cifar10'
    data_bucket = "cifar10dataset"
    n_features = 32 * 32
    n_classes = 10
    tmp_table_name = "tmp-params"
    merged_table_name = "merged-params"
    cp_bucket = "cp-model"
    key_col = "key"

    # training setting
    model = "mobilenet"     # mobilenet or resnet
    optim = "grad_avg"  # grad_avg or model_avg
    sync_mode = "reduce"    # async, reduce or reduce_scatter
    n_workers = 10

    # hyper-parameters
    lr = 0.01
    batch_size = 256
    n_epochs = 5
    start_epoch = 0
    run_epochs = 3

    # clear the checkpoint bucket and the DynamoDB tables
    s3_storage = S3Storage()
    s3_storage.clear(cp_bucket)
    dynamo_client = dynamo_operator.get_client()
    tmp_tb = DynamoTable(dynamo_client, tmp_table_name)
    merged_tb = DynamoTable(dynamo_client, merged_table_name)
    tmp_tb.clear(key_col)
    merged_tb.clear(key_col)

    # lambda payload
    payload = dict()
    payload['dataset'] = dataset_name
    payload['data_bucket'] = data_bucket
    payload['n_features'] = n_features
    payload['n_classes'] = n_classes
    payload['n_workers'] = n_workers
    payload['tmp_table_name'] = tmp_table_name
    payload['merged_table_name'] = merged_table_name
    payload['key_col'] = key_col
    payload['cp_bucket'] = cp_bucket
    payload['model'] = model
    payload['optim'] = optim
    payload['sync_mode'] = sync_mode
    payload['lr'] = lr
    payload['batch_size'] = batch_size
    payload['n_epochs'] = n_epochs
    payload['start_epoch'] = start_epoch
    payload['run_epochs'] = run_epochs
    payload['function_name'] = function_name

    # invoke functions
    lambda_client = boto3.client('lambda')
    for i in range(n_workers):
        payload['worker_index'] = i
        payload['train_file'] = 'training_{}.pt'.format(i)
        payload['test_file'] = 'test.pt'
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))
Example #21
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']

    host = event['host']
    port = event['port']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Reading data from S3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    if dataset_type == "dense_libsvm":
        dataset = dataset.ins_np
        data_type = dataset.dtype
        centroid_shape = (n_clusters, dataset.shape[1])
    elif dataset_type == "sparse_libsvm":
        dataset = dataset.ins_list
        first_entry = dataset[0].to_dense().numpy()
        data_type = first_entry.dtype
        centroid_shape = (n_clusters, first_entry.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

    init_centroids_start = time.time()
    if worker_index == 0:
        if dataset_type == "dense_libsvm":
            centroids = dataset[0:n_clusters]
        elif dataset_type == "sparse_libsvm":
            centroids = sparse_centroid_to_numpy(dataset[0:n_clusters],
                                                 n_clusters)
        mem_storage.save_v2(centroids.tobytes(),
                            Prefix.KMeans_Init_Cent + "-1", merged_bucket)
        print("generate initial centroids takes {} s".format(
            time.time() - init_centroids_start))
    else:
        centroid_bytes = mem_storage.load_or_wait_v2(
            Prefix.KMeans_Init_Cent + "-1", merged_bucket)
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
        print("Waiting for initial centroids takes {} s".format(
            time.time() - init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error])))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, epoch)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, epoch)

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        print("one {} round cost {} s".format(sync_mode,
                                              time.time() - epoch_sync_start))
        epoch_sync_time = time.time() - epoch_sync_start

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_sync_time))

        if model.error < threshold:
            break

    #if worker_index == 0:
    #    mem_storage.clear()

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return
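The sparse branch above calls sparse_centroid_to_numpy, which is not included in this listing. Based on how single instances are densified with to_dense().numpy() in the same handler, it presumably does something like the following sketch; the implementation is an assumption.

import numpy as np

def sparse_centroid_to_numpy(sparse_points, n_clusters):
    # Densify the first n_clusters sparse instances into one centroid matrix.
    dense_rows = [p.to_dense().numpy().reshape(1, -1) for p in sparse_points[:n_clusters]]
    return np.concatenate(dense_rows, axis=0)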
Example #22
def handler(event, context):
    start_time = time.time()

    # dataset setting
    file = event['file']
    data_bucket = event['data_bucket']
    dataset_type = event['dataset_type']
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    host = event['host']
    port = event['port']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Sparse_Linear_Models
    assert optim.lower() in [Optimization.Grad_Avg, Optimization.Model_Avg]
    assert sync_mode.lower() in Synchronization.All

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    valid_ratio = event['valid_ratio']

    shuffle_dataset = True
    random_seed = 100

    print('bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    mem_storage = MemcachedStorage(host, port)
    communicator = MemcachedCommunicator(mem_storage, tmp_bucket,
                                         merged_bucket, n_workers,
                                         worker_index)
    if worker_index == 0:
        mem_storage.clear()

    # Read file from s3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(valid_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # split train set and test set
    train_set = [dataset[i] for i in train_indices]
    n_train_batch = math.floor(len(train_set) / batch_size)
    val_set = [dataset[i] for i in val_indices]
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = linear_models.get_sparse_model(model_name, train_set, val_set,
                                           n_features, n_epochs, learning_rate,
                                           batch_size)

    train_start = time.time()
    # Training the Model
    for epoch in range(n_epochs):
        epoch_start = time.time()
        epoch_cal_time = 0
        epoch_comm_time = 0
        epoch_loss = 0.

        for batch_idx in range(n_train_batch):
            batch_start = time.time()
            batch_loss, batch_acc = model.one_batch()
            epoch_loss += batch_loss.average

            if optim == "grad_avg":
                if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time
                    print("batch cal cost {} s".format(batch_cal_time))

                    batch_comm_start = time.time()

                    if sync_mode == "reduce":
                        w_b_merge = communicator.reduce_batch(
                            w_b, epoch, batch_idx)
                    elif sync_mode == "reduce_scatter":
                        w_b_merge = communicator.reduce_scatter_batch(
                            w_b, epoch, batch_idx)

                    w_merge = w_b_merge[:n_features] / float(n_workers)
                    b_merge = w_b_merge[-1] / float(n_workers)
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time
                elif sync_mode == "async":
                    w_b = np.concatenate((model.weight.numpy().flatten(),
                                          np.array([model.bias],
                                                   dtype=np.float32)))
                    batch_cal_time = time.time() - batch_start
                    epoch_cal_time += batch_cal_time

                    batch_comm_start = time.time()
                    # worker 0 seeds the shared model on the very first batch
                    if worker_index == 0 and epoch == 0 and batch_idx == 0:
                        mem_storage.save_v2(w_b.tobytes(), Prefix.w_b_prefix,
                                            merged_bucket)

                    w_b_merge = communicator.async_reduce(
                        w_b, Prefix.w_b_prefix)
                    # async mode does not average over workers
                    w_merge = w_b_merge[:n_features]
                    b_merge = w_b_merge[-1]
                    model.weight = torch.from_numpy(w_merge).reshape(
                        n_features, 1)
                    model.bias = float(b_merge)

                    batch_comm_time = time.time() - batch_comm_start
                    print("one {} round cost {} s".format(
                        sync_mode, batch_comm_time))
                    epoch_comm_time += batch_comm_time

            if batch_idx % 10 == 0:
                print(
                    'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f s, Loss: %.4f, Accuracy: %.4f, batch cost %.4f s'
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, batch_loss.average,
                       batch_acc.accuracy, time.time() - batch_start))

        if optim == "model_avg":
            w_b = np.concatenate((model.weight.numpy().flatten(),
                                  np.array([model.bias], dtype=np.float32)))
            epoch_cal_time += time.time() - epoch_start

            epoch_sync_start = time.time()

            if sync_mode == "reduce":
                w_b_merge = communicator.reduce_epoch(w_b, epoch)
            elif sync_mode == "reduce_scatter":
                w_b_merge = communicator.reduce_scatter_epoch(w_b, epoch)
            elif sync_mode == "async":
                if worker_index == 0 and epoch == 0:
                    mem_storage.save_v2(w_b.tobytes(), Prefix.w_b_prefix,
                                        merged_bucket)
                w_b_merge = communicator.async_reduce(w_b, Prefix.w_b_prefix)

            w_merge = w_b_merge[:n_features]
            b_merge = w_b_merge[-1]
            # async mode does not average over workers
            if sync_mode == "reduce" or sync_mode == "reduce_scatter":
                w_merge = w_merge / float(n_workers)
                b_merge = b_merge / float(n_workers)
            model.weight = torch.from_numpy(w_merge).reshape(n_features, 1)
            model.bias = float(b_merge)
            print("one {} round cost {} s".format(
                sync_mode,
                time.time() - epoch_sync_start))
            epoch_comm_time += time.time() - epoch_sync_start

        if worker_index == 0:
            delete_start = time.time()
            # model_avg cleans up expired keys per epoch, grad_avg per batch
            if optim == "model_avg" and sync_mode != "async":
                communicator.delete_expired_epoch(epoch)
            elif optim == "grad_avg" and sync_mode != "async":
                communicator.delete_expired_batch(epoch, batch_idx)
            epoch_comm_time += time.time() - delete_start

        # Test the Model
        test_start = time.time()
        test_loss, test_acc = model.evaluate()
        test_time = time.time() - test_start

        print(
            "Epoch: [{}/{}] finishes, Batch: [{}/{}], Time: {:.4f}, Loss: {:.4f}, epoch cost {:.4f} s, "
            "calculation cost = {:.4f} s, synchronization cost {:.4f} s, test cost {:.4f} s, "
            "accuracy of the model on the {} test samples: {}, loss = {}".
            format(epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                   time.time() - train_start, epoch_loss,
                   time.time() - epoch_start, epoch_cal_time, epoch_comm_time,
                   test_time, len(val_set), test_acc.accuracy,
                   test_loss.average))

    if worker_index == 0:
        mem_storage.clear()

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
def test_copy_with_error(storage_engine):
    another_engine = S3Storage(_TEST_CONTEXT, "another/path", "anotherbucket",
                               "foo", "bar")

    with pytest.raises(IOError):
        storage_engine.copy_to(another_engine, _TEST_PATH)
Beispiel #24
0
def handler(event, context):
    start_time = time.time()

    # dataset setting
    train_file = event['train_file']
    test_file = event['test_file']
    data_bucket = event['data_bucket']
    n_features = event['n_features']
    n_classes = event['n_classes']
    n_workers = event['n_workers']
    worker_index = event['worker_index']
    cp_bucket = event['cp_bucket']

    # ps setting
    host = event['host']
    port = event['port']

    # training setting
    model_name = event['model']
    optim = event['optim']
    sync_mode = event['sync_mode']
    assert model_name.lower() in MLModel.Deep_Models
    assert optim.lower() == Optimization.Grad_Avg
    assert sync_mode.lower() == Synchronization.Reduce

    # hyper-parameter
    learning_rate = event['lr']
    batch_size = event['batch_size']
    n_epochs = event['n_epochs']
    start_epoch = event['start_epoch']
    run_epochs = event['run_epochs']

    function_name = event['function_name']

    print('data bucket = {}'.format(data_bucket))
    print("train file = {}".format(train_file))
    print("test file = {}".format(test_file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('model = {}'.format(model_name))
    print('optimization = {}'.format(optim))
    print('sync mode = {}'.format(sync_mode))
    print('start epoch = {}'.format(start_epoch))
    print('run epochs = {}'.format(run_epochs))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    print("Run function {}, round: {}/{}, epoch: {}/{} to {}/{}".format(
        function_name,
        int(start_epoch / run_epochs) + 1, math.ceil(n_epochs / run_epochs),
        start_epoch + 1, n_epochs, start_epoch + run_epochs, n_epochs))

    # download file from s3
    storage = S3Storage()
    local_dir = "/tmp"
    read_start = time.time()
    storage.download(data_bucket, train_file,
                     os.path.join(local_dir, train_file))
    storage.download(data_bucket, test_file,
                     os.path.join(local_dir, test_file))
    print("download file from s3 cost {} s".format(time.time() - read_start))

    train_set = torch.load(os.path.join(local_dir, train_file))
    test_set = torch.load(os.path.join(local_dir, test_file))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    n_train_batch = len(train_loader)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=100,
                                              shuffle=False)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    print("read data cost {} s".format(time.time() - read_start))

    random_seed = 100
    torch.manual_seed(random_seed)

    device = 'cpu'
    model = deep_models.get_models(model_name).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # load checkpoint model if it is not the first round
    if start_epoch != 0:
        checked_file = 'checkpoint_{}.pt'.format(start_epoch - 1)
        storage.download(cp_bucket, checked_file,
                         os.path.join(local_dir, checked_file))
        checkpoint_model = torch.load(os.path.join(local_dir, checked_file))

        model.load_state_dict(checkpoint_model['model_state_dict'])
        optimizer.load_state_dict(checkpoint_model['optimizer_state_dict'])
        print("load checkpoint model at epoch {}".format(start_epoch - 1))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()
    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(
        host, port))

    # register model
    parameter_shape = []
    parameter_length = []
    model_length = 0
    for param in model.parameters():
        tmp_shape = 1
        parameter_shape.append(param.data.numpy().shape)
        for w in param.data.numpy().shape:
            tmp_shape *= w
        parameter_length.append(tmp_shape)
        model_length += tmp_shape

    ps_client.register_model(t_client, worker_index, model_name, model_length,
                             n_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(
        model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(start_epoch, min(start_epoch + run_epochs, n_epochs)):

        model.train()
        epoch_start = time.time()

        train_acc = Accuracy()
        train_loss = Average()

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            batch_start = time.time()
            batch_cal_time = 0
            batch_comm_time = 0

            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter,
                               worker_index)
            latest_model = ps_client.pull_model(t_client, model_name,
                                                iter_counter, worker_index)
            pos = 0
            for layer_index, param in enumerate(model.parameters()):
                param.data = Variable(
                    torch.from_numpy(
                        np.asarray(latest_model[pos:pos +
                                                parameter_length[layer_index]],
                                   dtype=np.float32).reshape(
                                       parameter_shape[layer_index])))
                pos += parameter_length[layer_index]
            batch_comm_time += time.time() - batch_start

            batch_cal_start = time.time()
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, targets)
            optimizer.zero_grad()
            loss.backward()

            # flatten and concat gradients of weight and bias
            param_grad = np.zeros((1))
            for param in model.parameters():
                # print("shape of layer = {}".format(param.data.numpy().flatten().shape))
                param_grad = np.concatenate(
                    (param_grad, param.grad.data.numpy().flatten()))
            param_grad = np.delete(param_grad, 0)
            #print("model_length = {}".format(param_grad.shape))
            batch_cal_time += time.time() - batch_cal_start

            # push gradient to PS
            batch_push_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter,
                               worker_index)
            ps_client.push_grad(t_client, model_name, param_grad,
                                -1. * learning_rate / n_workers, iter_counter,
                                worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1,
                               worker_index)  # sync all workers
            batch_comm_time += time.time() - batch_push_start

            train_acc.update(outputs, targets)
            train_loss.update(loss.item(), inputs.size(0))

            optimizer.step()
            iter_counter += 1

            if batch_idx % 10 == 0:
                print(
                    'Epoch: [%d/%d], Batch: [%d/%d], Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                    'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                    % (epoch + 1, n_epochs, batch_idx + 1, n_train_batch,
                       time.time() - train_start, loss.item(),
                       time.time() - epoch_start, time.time() - batch_start,
                       batch_cal_time, batch_comm_time))

        test_loss, test_acc = test(epoch, model, test_loader)

        print(
            'Epoch: {}/{},'.format(epoch + 1, n_epochs),
            'train loss: {},'.format(train_loss),
            'train acc: {},'.format(train_acc),
            'test loss: {},'.format(test_loss),
            'test acc: {}.'.format(test_acc),
        )

    # training is not finished yet, invoke next round
    if epoch < n_epochs - 1:
        checkpoint_model = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss.average
        }

        checked_file = 'checkpoint_{}.pt'.format(epoch)

        if worker_index == 0:
            torch.save(checkpoint_model, os.path.join(local_dir, checked_file))
            storage.upload(cp_bucket, checked_file,
                           os.path.join(local_dir, checked_file))
            print("checkpoint model at epoch {} saved!".format(epoch))

        print(
            "Invoking the next round of functions. round: {}/{}, start epoch: {}, run epoch: {}"
            .format(
                int((epoch + 1) / run_epochs) + 1,
                math.ceil(n_epochs / run_epochs), epoch + 1, run_epochs))
        lambda_client = boto3.client('lambda')
        payload = {
            'train_file': event['train_file'],
            'test_file': event['test_file'],
            'data_bucket': event['data_bucket'],
            'n_features': event['n_features'],
            'n_classes': event['n_classes'],
            'n_workers': event['n_workers'],
            'worker_index': event['worker_index'],
            'cp_bucket': event['cp_bucket'],
            'host': event['host'],
            'port': event['port'],
            'model': event['model'],
            'optim': event['optim'],
            'sync_mode': event['sync_mode'],
            'lr': event['lr'],
            'batch_size': event['batch_size'],
            'n_epochs': event['n_epochs'],
            'start_epoch': epoch + 1,
            'run_epochs': event['run_epochs'],
            'function_name': event['function_name']
        }
        lambda_client.invoke(FunctionName=function_name,
                             InvocationType='Event',
                             Payload=json.dumps(payload))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))