Example 1
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']

    # ps setting
    host = event['host']
    port = event['port']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() == Synchronization.Reduce

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()
    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(host, port))

    # Reading data from S3
    read_start = time.time()
    storage = S3Storage()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    train_set = dataset.ins_list
    np_dtype = train_set[0].to_dense().numpy().dtype
    centroid_shape = (n_clusters, n_features)
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, data type: {}, centroids shape: {}"
          .format(dataset_type, np_dtype, centroid_shape))

    # register model
    model_name = Prefix.KMeans_Cent
    model_length = centroid_shape[0] * centroid_shape[1] + 1
    ps_client.register_model(t_client, worker_index, model_name, model_length, n_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(model_name, model_length))

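    # Initialize centroids through the parameter server: worker 0 pushes its
    # sampled centroids (plus a large initial error term) with weight 1, the
    # other workers push zeros with weight 0, so the merged model after
    # iteration 0 holds worker 0's centroids.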
    init_centroids_start = time.time()
    ps_client.can_pull(t_client, model_name, 0, worker_index)
    ps_model = ps_client.pull_model(t_client, model_name, 0, worker_index)
    if worker_index == 0:
        centroids_np = sparse_centroid_to_numpy(train_set[0:n_clusters], n_clusters)
        ps_client.can_push(t_client, model_name, 0, worker_index)
        ps_client.push_grad(t_client, model_name,
                            np.append(centroids_np.flatten(), 1000.).astype(np.double) - np.asarray(ps_model).astype(np.double),
                            1., 0, worker_index)
    else:
        centroids_np = np.zeros(centroid_shape)
        ps_client.can_push(t_client, model_name, 0, worker_index)
        ps_client.push_grad(t_client, model_name,
                            np.append(centroids_np.flatten(), 0).astype(np.double),
                            0, 0, worker_index)
    ps_client.can_pull(t_client, model_name, 1, worker_index)
    ps_model = ps_client.pull_model(t_client, model_name, 1, worker_index)
    cur_centroids = np.array(ps_model[0:-1]).astype(np.float32).reshape(centroid_shape)
    cur_error = float(ps_model[-1])
    print("initial centroids cost {} s".format(time.time() - init_centroids_start))

    model = cluster_models.get_model(train_set, torch.from_numpy(cur_centroids), dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
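    # Epochs start at 1 because iteration 0 of the parameter server was
    # consumed by the centroid initialization round above.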
    for epoch in range(1, n_epochs + 1):
        epoch_start = time.time()

        # local computation
        model.find_nearest_cluster()
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate((local_cent.astype(np.double).flatten(),
                                           np.array([model.error], dtype=np.double)))
        epoch_cal_time = time.time() - epoch_start

        # push updates
        epoch_comm_start = time.time()
        last_cent_error = np.concatenate((cur_centroids.astype(np.double).flatten(),
                                          np.array([cur_error], dtype=np.double)))
        ps_model_inc = local_cent_error - last_cent_error
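        # Each worker pushes its increment scaled by 1/n_workers, so the
        # server-side sum yields the average of all local centroids and errors.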
        ps_client.can_push(t_client, model_name, epoch, worker_index)
        ps_client.push_grad(t_client, model_name,
                            ps_model_inc, 1. / n_workers, epoch, worker_index)

        # pull new model
        ps_client.can_pull(t_client, model_name, epoch + 1, worker_index)   # sync all workers
        ps_model = ps_client.pull_model(t_client, model_name, epoch + 1, worker_index)
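        # Rebuild each merged centroid row as a sparse tensor for the sparse k-means model.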
        model.centroids = [torch.from_numpy(c).reshape(1, n_features).to_sparse()
                           for c in np.array(ps_model[0:-1]).astype(np.float32).reshape(centroid_shape)]

        model.error = float(ps_model[-1])
        cur_centroids = model.get_centroids("numpy")
        cur_error = model.error

        epoch_comm_time = time.time() - epoch_comm_start

        print("Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
              .format(epoch, worker_index, model.error,
                      time.time() - epoch_start, epoch_cal_time, epoch_comm_time))

        if model.error < threshold:
            break

    print("Worker[{}] finishes training: Error = {}, cost {} s"
          .format(worker_index, model.error, time.time() - train_start))
    return
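For reference, a minimal sketch of an invocation event for the handler above. The key names are the ones the function reads from event; all values are illustrative placeholders, and sync_mode is assumed to match the lowercase value of Synchronization.Reduce.

# Hypothetical payload for Example 1 (Thrift parameter-server variant).
# All values are illustrative placeholders.
sample_event = {
    "data_bucket": "my-dataset-bucket",   # placeholder S3 bucket
    "file": "training_part_0.libsvm",     # placeholder object key
    "dataset_type": "sparse_libsvm",      # required by the assert in the handler
    "n_features": 30,
    "host": "172.31.0.10",                # parameter-server address
    "port": 27000,
    "n_clusters": 10,
    "n_epochs": 20,
    "threshold": 0.02,
    "sync_mode": "reduce",                # assumed value of Synchronization.Reduce
    "n_workers": 8,
    "worker_index": 0,                    # each worker gets a distinct index
}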
Example 2
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    n_features = event['n_features']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket,
                                  n_workers, worker_index)

    # Reading data from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    if dataset_type == "dense_libsvm":
        dataset = dataset.ins_np
        data_type = dataset.dtype
        centroid_shape = (n_clusters, dataset.shape[1])
    elif dataset_type == "sparse_libsvm":
        dataset = dataset.ins_list
        first_entry = dataset[0].to_dense().numpy()
        data_type = first_entry.dtype
        centroid_shape = (n_clusters, first_entry.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

    init_centroids_start = time.time()
    if worker_index == 0:
        if dataset_type == "dense_libsvm":
            centroids = dataset[0:n_clusters]
        elif dataset_type == "sparse_libsvm":
            centroids = sparse_centroid_to_numpy(dataset[0:n_clusters],
                                                 n_clusters)
        storage.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1",
                     merged_bucket)
        print("generate initial centroids takes {} s".format(
            time.time() - init_centroids_start))
    else:
        centroid_bytes = storage.load_or_wait(Prefix.KMeans_Init_Cent + "-1",
                                              merged_bucket).read()
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
        print("Waiting for initial centroids takes {} s".format(
            time.time() - init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error])))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        postfix = str(epoch)
        if sync_mode.lower() == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, postfix)
        elif sync_mode.lower() == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, postfix)

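        # The reduced vector is the element-wise sum over workers; divide by
        # n_workers to obtain the averaged centroids and error.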
        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        epoch_sync_time = time.time() - epoch_sync_start

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_sync_time))

        if model.error < threshold:
            break

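    # Only worker 0 cleans up the intermediate objects in the temporary and merged buckets.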
    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return
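Likewise, a minimal sketch of an invocation event for this S3-based handler; key names follow the event reads above, and all values (buckets, hyper-parameters) are placeholders.

# Hypothetical payload for Example 2 (S3 reduce / reduce-scatter variant).
# All values are illustrative placeholders.
sample_event = {
    "data_bucket": "my-dataset-bucket",
    "file": "training_part_0.libsvm",
    "dataset_type": "dense_libsvm",       # or "sparse_libsvm"
    "n_features": 30,
    "tmp_bucket": "my-tmp-bucket",        # scratch space for per-worker updates
    "merged_bucket": "my-merged-bucket",  # holds merged centroids
    "n_clusters": 10,
    "n_epochs": 20,
    "threshold": 0.02,
    "sync_mode": "reduce",                # or "reduce_scatter"
    "n_workers": 8,
    "worker_index": 0,
}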
Example 3
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

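    # Training data is read from S3, while per-epoch updates are exchanged
    # through two DynamoDB tables (temporary and merged).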
    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Reading data from S3
    read_start = time.time()
    lines = s3_storage.load(file,
                            data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type).ins_np
    data_type = dataset.dtype
    centroid_shape = (n_clusters, dataset.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, Centroids shape: {}, num_features: {}".
          format(dataset_type, data_type, centroid_shape, n_features))

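    # Worker 0 writes the initial centroids to the merged table; the other
    # workers wait until the item becomes available.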
    init_centroids_start = time.time()
    if worker_index == 0:
        centroids = dataset[0:n_clusters]
        merged_table.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1",
                          key_col)
    else:
        centroid_bytes = (merged_table.load_or_wait(
            Prefix.KMeans_Init_Cent + "-1", key_col, 0.1))['value'].value
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")

    print("initialize centroids takes {} s".format(time.time() -
                                                   init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type,
                                     n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_comm_start = time.time()

        if sync_mode.lower() == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, epoch)
        elif sync_mode.lower() == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, epoch)

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)

        model.centroids = cent_merge
        model.error = error_merge
        epoch_comm_time = time.time() - epoch_comm_start

        print("one {} round cost {} s".format(sync_mode, epoch_comm_time))

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_comm_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return
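A minimal sketch of an invocation event for this DynamoDB-based handler; key names follow the event reads above, and the table names, key column, and hyper-parameters are placeholder assumptions.

# Hypothetical payload for Example 3 (DynamoDB variant).
# All values are illustrative placeholders.
sample_event = {
    "data_bucket": "my-dataset-bucket",
    "file": "training_part_0.libsvm",
    "dataset_type": "dense_libsvm",       # required by the assert in the handler
    "n_features": 30,
    "n_workers": 8,
    "worker_index": 0,
    "tmp_table_name": "kmeans-tmp",       # DynamoDB table for per-worker updates
    "merged_table_name": "kmeans-merged", # DynamoDB table for merged results
    "key_col": "key",                     # partition-key attribute name
    "n_clusters": 10,
    "n_epochs": 20,
    "threshold": 0.02,
    "sync_mode": "reduce",                # or "reduce_scatter"
}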
Example 4
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    n_features = event['n_features']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]
    assert dataset_type == "sparse_libsvm"

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [
        Synchronization.Reduce, Synchronization.Reduce_Scatter
    ]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket,
                                  n_workers, worker_index)

    # Reading data from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    train_set = dataset.ins_list
    np_dtype = train_set[0].to_dense().numpy().dtype
    centroid_shape = (n_clusters, n_features)

    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, data type: {}, centroids shape: {}".format(
        dataset_type, np_dtype, centroid_shape))

    # initialize centroids
    init_centroids_start = time.time()
    if worker_index == 0:
        centroids_np = sparse_centroid_to_numpy(train_set[0:n_clusters],
                                                n_clusters)
        storage.save(centroids_np.tobytes(), Prefix.KMeans_Init_Cent + "-1",
                     merged_bucket)
    else:
        centroid_bytes = storage.load_or_wait(Prefix.KMeans_Init_Cent + "-1",
                                              merged_bucket).read()
        centroids_np = np.frombuffer(centroid_bytes,
                                     dtype=np_dtype).reshape(centroid_shape)

    centroids = torch.from_numpy(centroids_np)
    print("initial centroids cost {} s".format(time.time() -
                                               init_centroids_start))

    model = cluster_models.get_model(train_set, centroids, dataset_type,
                                     n_features, n_clusters)
    assert isinstance(model, SparseKMeans)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()

        local_cent = model.get_centroids("numpy").astype(
            np.float32).reshape(-1)
        local_cent_error = np.concatenate(
            (local_cent.flatten(), np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        postfix = str(epoch)
        if sync_mode.lower() == "reduce":
            cent_error_merge = communicator.reduce_epoch(
                local_cent_error, postfix)
        elif sync_mode.lower() == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(
                local_cent_error, postfix)

        print("one {} round cost {} s".format(sync_mode,
                                              time.time() - epoch_sync_start))

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(
            n_workers)
        print("merged centroids shape = {}".format(cent_merge.shape))
        error_merge = cent_error_merge[-1] / float(n_workers)

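        # Wrap each merged centroid row as a sparse tensor, as expected by SparseKMeans.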
        model.centroids = [
            torch.from_numpy(c).reshape(1, n_features).to_sparse()
            for c in cent_merge
        ]
        model.error = error_merge
        epoch_sync_time = time.time() - epoch_sync_start

        print(
            "Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
            .format(epoch, worker_index, model.error,
                    time.time() - epoch_start, epoch_cal_time,
                    epoch_sync_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    print("Worker[{}] finishes training: Error = {}, cost {} s".format(
        worker_index, model.error,
        time.time() - train_start))
    return