Example #1
0
def partition_agaricus(batch_size, train_file, test_file):
    """Split the 127-feature agaricus training data across distributed workers.

    Returns a tuple (train_partition, train_loader, bsz, test_loader),
    where bsz is the per-worker batch size derived from the global
    batch_size and the world size.
    """
    train_dataset = SparseDatasetWithLines(train_file, 127)
    test_dataset = SparseDatasetWithLines(test_file, 127)

    world_size = dist.get_world_size()
    # Divide the global batch size evenly among workers (minimum of 1).
    bsz = 1 if batch_size == 1 else int(batch_size / float(world_size))
    partition_fractions = [1.0 / world_size] * world_size
    partitioner = DataPartitioner(train_dataset, partition_fractions)
    train_partition = partitioner.use(dist.get_rank())
    train_loader = DataLoader(train_partition, batch_size=bsz, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_partition, train_loader, bsz, test_loader
Example #2
0
def run(args):
    """Run one distributed sparse k-means worker.

    Each worker loads its own shard (file "<root>/<rank>_<world_size>"),
    receives the initial centroids broadcast from rank 0, then repeats
    assign/average steps until the globally averaged error falls below
    args.threshold or args.epochs is exhausted.
    """
    # NOTE(review): `device` is computed but never used below — presumably
    # left over from a GPU code path; confirm before removing.
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(1234)
    read_start = time.time()
    # Sentinel error larger than any realistic threshold so the first
    # epoch always trains.
    avg_error = np.iinfo(np.int16).max
    logging.info(f"{args.rank}-th worker starts.")

    # Each worker reads its own pre-sharded file named "<rank>_<world_size>".
    file_name = "{}/{}_{}".format(args.root, args.rank, args.world_size)
    train_file = open(file_name, 'r').readlines()

    train_set = SparseDatasetWithLines(train_file, args.features)
    # Keep only the first element of each item — assumes items are
    # (features, label) pairs; TODO confirm against SparseDatasetWithLines.
    train_set = [t[0] for t in train_set]
    logging.info(f"Loading dataset costs {time.time() - read_start}s")

    # initialize centroids
    # Rank 0 seeds the centroids from its first num_clusters instances;
    # all other ranks allocate an empty buffer to receive the broadcast.
    init_cent_start = time.time()
    if args.rank == 0:
        c_dense_list = [t.to_dense() for t in train_set[:args.num_clusters]]
        centroids = torch.stack(c_dense_list).reshape(args.num_clusters,
                                                      args.features)
    else:
        centroids = torch.empty(args.num_clusters, args.features)

    if dist_is_initialized():
        dist.broadcast(centroids, 0)
    logging.info(
        f"Receiving initial centroids costs {time.time() - init_cent_start}s")

    training_start = time.time()
    for epoch in range(args.epochs):
        if avg_error >= args.threshold:
            start_compute = time.time()
            # Local step: assign every instance to its nearest centroid.
            model = SparseKmeans(train_set, centroids, args.features,
                                 args.num_clusters)
            model.find_nearest_cluster()
            error = torch.tensor(model.error)
            end_compute = time.time()
            logging.info(
                f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s"
            )
            sync_start = time.time()
            # Global step: average the centroids and the error across
            # all workers.
            if dist_is_initialized():
                centroids, avg_error = broadcast_average(
                    args, model.get_centroids("dense_tensor"), error)
            logging.info(
                f"{args.rank}-th worker finished {epoch} epoch. "
                f"Computing takes {end_compute - start_compute}s. "
                f"Communicating takes {time.time() - sync_start}s. "
                # f"Centroids: {model.get_centroids('dense_tensor')}. "
                f"Loss: {model.error}")
        else:
            # Converged: log the final state and stop.
            # NOTE(review): if all epochs run without converging, the loop
            # falls through and the function returns silently with no
            # final summary log.
            logging.info(
                f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}"
            )
            logging.info(
                f"Whole process time : {time.time() - training_start}")
            return
Example #3
0
def partition_sparse(file, num_feature):
    """Return this worker's partition of a sparse libsvm dataset.

    Falls back to a single partition (rank 0 of world size 1) when
    torch.distributed has not been initialized.
    """
    dataset = SparseDatasetWithLines(file, num_feature)
    size, rank = 1, 0
    if dist_is_initialized():
        size, rank = dist.get_world_size(), dist.get_rank()
    # Equal-sized fractions, one per worker.
    partitioner = DataPartitioner(dataset, [1.0 / size] * size)
    return partitioner.use(rank)
Example #4
0
        return

    def get_centroids(self, centroids_type):
        """Return the current centroids in the requested representation.

        centroids_type is one of:
          - "sparse_tensor": the internal list of sparse tensors, as-is;
          - "numpy": a dense ndarray of shape (nr_cluster, nr_feature);
          - "dense_tensor": a stacked dense torch tensor.
        Any other value yields None (falls through).
        """
        if centroids_type == "sparse_tensor":
            return self.centroids
        if centroids_type == "numpy":
            dense_rows = [
                self.centroids[idx].to_dense().numpy()
                for idx in range(self.nr_cluster)
            ]
            return np.array(dense_rows).reshape(self.nr_cluster,
                                                self.nr_feature)
        if centroids_type == "dense_tensor":
            dense_tensors = [
                self.centroids[idx].to_dense()
                for idx in range((self.nr_cluster))
            ]
            return torch.stack(dense_tensors)


if __name__ == "__main__":
    # Smoke-test SparseKmeans on the 127-dimensional agaricus dataset.
    dim = 127
    nr_cluster = 10
    train_file = "../dataset/agaricus_127d_train.libsvm"
    test_file = "../dataset/agaricus_127d_test.libsvm"
    train_data = SparseDatasetWithLines(train_file, dim)
    test_data = SparseDatasetWithLines(test_file, dim)
    # Seed the centroids with the first nr_cluster training instances.
    centroids = train_data.ins_list[:nr_cluster]
    kmeans_model = SparseKmeans(train_data, centroids, dim, nr_cluster)
    kmeans_model.find_nearest_cluster()
Example #5
0
def handler(event, context):
    """Serverless k-means worker: each epoch computes local centroids and
    synchronizes them with the other workers through memcached buckets.

    Expected event keys: num_features, num_clusters, worker_cent_bucket,
    avg_cent_bucket, num_epochs, threshold, dataset_type ("dense" or
    sparse), elasticache, bucket_name, key.
    """
    # Sentinel error larger than any realistic threshold so the first
    # epoch always trains.
    avg_error = np.iinfo(np.int16).max

    num_features = event['num_features']
    num_clusters = event['num_clusters']
    worker_cent_bucket = event["worker_cent_bucket"]
    avg_cent_bucket = event["avg_cent_bucket"]
    num_epochs = event["num_epochs"]
    threshold = event["threshold"]
    dataset_type = event["dataset_type"]
    elastic_location = event["elasticache"]
    elastic_endpoint = memcached_init(elastic_location)
    print(elastic_endpoint)
    # Reading data from S3
    bucket_name = event['bucket_name']
    key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
    logger.info(
        f"Reading training data from bucket = {bucket_name}, key = {key}")
    # Key layout assumed: "<worker_index>_..._<num_workers>" — TODO confirm
    # against the sharding code that writes these S3 keys.
    key_splits = key.split("_")
    num_worker = int(key_splits[-1])
    worker_index = int(key_splits[0])

    event_start = time.time()
    file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
    s3_end = time.time()
    logger.info(f"Getting object from s3 takes {s3_end - event_start}s")

    if dataset_type == "dense":
        # dataset is stored as numpy array
        dataset = DenseDatasetWithLines(file, num_features).ins_np
        dt = dataset.dtype
        centroid_shape = (num_clusters, dataset.shape[1])
    else:
        # dataset is sparse, stored as sparse tensor
        dataset = SparseDatasetWithLines(file, num_features)
        first_entry = dataset.ins_list[0].to_dense().numpy()
        dt = first_entry.dtype
        centroid_shape = (num_clusters, first_entry.shape[1])
    parse_end = time.time()
    logger.info(f"Parsing dataset takes {parse_end - s3_end}s")
    logger.info(
        f"worker index: {worker_index},Dataset: {dataset_type}, dtype: {dt}. Centroids shape: {centroid_shape}. num_features: {num_features}"
    )

    # Worker 0 seeds the centroids from its first num_clusters instances
    # and publishes them; every other worker blocks until they appear.
    if worker_index == 0:
        if dataset_type == "dense":
            centroids = dataset[0:num_clusters].reshape(-1)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial",
                        centroids.tobytes())
            centroids = centroids.reshape(centroid_shape)
        else:
            centroids = store_centroid_as_numpy(
                dataset.ins_list[0:num_clusters], num_clusters)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial",
                        centroids.tobytes())
    else:
        cent = hget_object_or_wait(elastic_endpoint, avg_cent_bucket,
                                   "initial", 0.00001)
        centroids = process_centroid(cent, num_clusters, dt)
        if centroid_shape != centroids.shape:
            logger.error("The shape of centroids does not match.")
        logger.info(
            f"Waiting for initial centroids takes {time.time() - parse_end} s")

    training_start = time.time()
    sync_time = 0
    for epoch in range(num_epochs):
        logger.info(f"{worker_index}-th worker in {epoch}-th epoch")
        epoch_start = time.time()
        if epoch != 0:
            # Fetch the averaged centroids (and error) computed for the
            # previous epoch before running the next local step.
            last_epoch = epoch - 1
            cent_with_error = hget_object_or_wait(elastic_endpoint,
                                                  avg_cent_bucket,
                                                  f"avg-{last_epoch}", 0.00001)
            wait_end = time.time()
            if worker_index != 0:
                logger.info(
                    f"Wait for centroid for {epoch}-th epoch. Takes {wait_end - epoch_start}"
                )
                sync_time += wait_end - epoch_start
            avg_error, centroids = process_centroid(cent_with_error,
                                                    num_clusters, dt, True)
        if avg_error >= threshold:
            print("get new centro")
            res = get_new_centroids(dataset, dataset_type, centroids, epoch,
                                    num_features, num_clusters)
            sync_start = time.time()
            # Publish this worker's centroids for the current epoch.
            success = hset_object(elastic_endpoint, worker_cent_bucket,
                                  f"{worker_index}_{epoch}", res.tobytes())

            # Worker 0 waits for every worker's centroids, averages them,
            # and publishes the result for the next epoch.
            if worker_index == 0 and success:

                compute_average_centroids(elastic_endpoint, avg_cent_bucket,
                                          worker_cent_bucket, num_worker,
                                          centroid_shape, epoch, dt)
                logger.info(
                    f"Waiting for all workers takes {time.time() - sync_start} s"
                )
                if epoch != 0:
                    sync_time += time.time() - sync_start

        else:
            # Converged: log the final state and stop.
            print("sync time = {}".format(sync_time))
            logger.info(
                f"{worker_index}-th worker finished training. Error = {avg_error}, centroids = {centroids}"
            )
            logger.info(f"Whole process time : {time.time() - training_start}")
            return
        print("sync time = {}".format(sync_time))
        # BUGFIX: ndarray.tostring() was deprecated since NumPy 1.19 and
        # removed in 1.23; tobytes() returns the identical byte string.
        put_object("kmeans-time", "time_{}".format(worker_index),
                   np.asarray(sync_time).tobytes())
Example #6
0
def run(args):
    """Train logistic regression with hand-computed gradients, averaging
    weights across distributed workers after every batch.

    args must provide: rank, no_cuda, train_file, features, shuffle,
    epochs, learning_rate, batch_size.
    """
    # NOTE(review): `device` is computed but never used below — presumably
    # left over from a GPU code path; confirm before removing.
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    logging.info(f"{args.rank}-th worker starts.")

    read_start = time.time()
    torch.manual_seed(1234)
    train_file = open(args.train_file, 'r').readlines()
    dataset = SparseDatasetWithLines(train_file, args.features)
    logging.info(f"Loading dataset costs {time.time() - read_start}s")

    preprocess_start = time.time()
    # Hold out 20% of the (optionally shuffled) data for validation.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(0.2 * dataset_size))
    if args.shuffle:
        np.random.seed(42)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_set = [dataset[i] for i in train_indices]
    val_set = [dataset[i] for i in val_indices]
    logging.info("preprocess data cost {} s".format(time.time() -
                                                    preprocess_start))

    lr = LogisticRegression(train_set, val_set, args.features, args.epochs,
                            args.learning_rate, args.batch_size)
    training_start = time.time()
    for epoch in range(args.epochs):
        epoch_start = time.time()
        num_batches = math.floor(len(train_set) / args.batch_size)
        for batch_idx in range(num_batches):
            batch_start = time.time()
            batch_ins, batch_label = lr.next_batch(batch_idx)
            batch_grad = torch.zeros(lr.n_input, 1, requires_grad=False)
            # BUGFIX: was np.float(0) — the np.float alias was deprecated
            # in NumPy 1.20 and removed in 1.24; float(0) is equivalent.
            batch_bias = float(0)
            train_loss = Loss()
            train_acc = Accuracy()
            # Accumulate gradients over the batch one instance at a time.
            for i in range(len(batch_ins)):
                z = lr.forward(batch_ins[i])
                h = lr.sigmoid(z)
                loss = lr.loss(h, batch_label[i])
                # print("z= {}, h= {}, loss = {}".format(z, h, loss))
                train_loss.update(loss, 1)
                train_acc.update(h, batch_label[i])
                g = lr.backward(batch_ins[i], h.item(), batch_label[i])
                batch_grad.add_(g)
                batch_bias += np.sum(h.item() - batch_label[i])
            batch_grad = batch_grad.div(len(batch_ins))
            batch_bias = batch_bias / len(batch_ins)
            # Scale by the negative learning rate, then apply the update.
            batch_grad.mul_(-1.0 * args.learning_rate)
            lr.grad.add_(batch_grad)
            lr.bias = lr.bias - batch_bias * args.learning_rate
            end_compute = time.time()
            logging.info(
                f"Train loss: {train_loss}, train accurary: {train_acc}")
            logging.info(
                f"{args.rank}-th worker finishes computing one batch. Takes {time.time() - batch_start}"
            )

            # Average the weight vector + bias across all workers; the
            # bias is appended as the final element of the flat vector.
            weights = np.append(lr.grad.numpy().flatten(), lr.bias)
            weights_merged = broadcast_average(args, torch.tensor(weights))
            lr.grad, lr.bias = weights_merged[:-1].reshape(
                args.features, 1), float(weights_merged[-1])
            logging.info(
                f"{args.rank}-th worker finishes sychronizing. Takes {time.time() - end_compute}"
            )

        val_loss, val_acc = lr.evaluate()
        logging.info(
            f"Validation loss: {val_loss}, validation accuracy: {val_acc}")
        logging.info(f"Epoch takes {time.time() - epoch_start}s")

    logging.info(
        f"Finishes training. {args.epochs} takes {time.time() - training_start}s."
    )
Example #7
0
def handler(event, context):
    """AWS Lambda worker: trains a sparse SVM on one S3 shard, merging
    weight vectors across workers through S3 objects each batch.

    NOTE(review): `shuffle_dataset`, `random_seed`, `grad_bucket`,
    `model_bucket` and `w_grad_prefix` are referenced but not defined in
    this function — presumably module-level constants; confirm they exist.
    """
    try:
        start_time = time.time()
        num_features = event['num_features']
        learning_rate = event["learning_rate"]
        batch_size = event["batch_size"]
        num_epochs = event["num_epochs"]
        validation_ratio = event["validation_ratio"]

        # Reading data from S3
        bucket_name = event['bucket_name']
        key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
        print(f"Reading training data from bucket = {bucket_name}, key = {key}")
        # Key layout assumed: "<worker_index>_<num_workers>..." — TODO confirm.
        key_splits = key.split("_")
        worker_index = int(key_splits[0])
        num_worker = int(key_splits[1])

        # read file from s3
        file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
        print("read data cost {} s".format(time.time() - start_time))

        parse_start = time.time()
        dataset = SparseDatasetWithLines(file, num_features)
        print("parse data cost {} s".format(time.time() - parse_start))

        preprocess_start = time.time()
        # Split off `validation_ratio` of the data for validation.
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_ratio * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]

        train_set = [dataset[i] for i in train_indices]
        val_set = [dataset[i] for i in val_indices]

        print("preprocess data cost {} s".format(time.time() - preprocess_start))
        svm = SparseSVM(train_set, val_set, num_features, num_epochs, learning_rate, batch_size)

        # Training the Model
        for epoch in range(num_epochs):
            epoch_start = time.time()
            num_batches = math.floor(len(train_set) / batch_size)
            print(f"worker {worker_index} epoch {epoch}")
            for batch_idx in range(num_batches):
                batch_start = time.time()
                # NOTE(review): batch_ins/batch_label are unused here —
                # one_epoch() below appears to perform the training step.
                batch_ins, batch_label = svm.next_batch(batch_idx)
                acc = svm.one_epoch(batch_idx, epoch)

                np_grad = svm.weights.numpy().flatten()
                print(f"computation takes {time.time() - batch_start}s")

                # Publish local weights; worker 0 merges every worker's
                # vector, the rest poll S3 for the merged result.
                sync_start = time.time()
                put_object(grad_bucket, w_grad_prefix + str(worker_index), np_grad.tobytes())

                file_postfix = "{}_{}".format(epoch, batch_idx)
                if worker_index == 0:
                    w_grad_merge = merge_weights(grad_bucket, num_worker, np_grad.dtype, np_grad.shape)
                    put_object(model_bucket, w_grad_prefix + file_postfix, w_grad_merge.tobytes())
                    # delete_expired_w_b(model_bucket, epoch, batch_idx, w_grad_prefix)
                    svm.weights = torch.from_numpy(w_grad_merge).reshape(num_features, 1)
                else:
                    # Poll every 0.1 s until worker 0 has published the merge.
                    w_data = get_object_or_wait(model_bucket, w_grad_prefix + file_postfix, 0.1).read()
                    w_grad_merge = np.frombuffer(w_data, dtype=np_grad.dtype).reshape(np_grad.shape)
                    svm.weights = torch.from_numpy(w_grad_merge).reshape(num_features, 1)
                print(f"synchronization cost {time.time() - sync_start}s")
                print(f"batch takes {time.time() - batch_start}s")

                if (batch_idx + 1) % 10 == 0:
                    print(f"Epoch: {epoch + 1}/{num_epochs}, Step: {batch_idx + 1}/{len(train_indices) / batch_size}, "
                          f"train acc: {acc}")

            val_acc = svm.evaluate()
            print(f"validation accuracy: {val_acc}")
            print(f"Epoch takes {time.time() - epoch_start}s")

        # Only worker 0 cleans up the shared buckets.
        if worker_index == 0:
            clear_bucket(model_bucket)
            clear_bucket(grad_bucket)
        print("elapsed time = {} s".format(time.time() - start_time))

    except Exception as e:
        # NOTE(review): catch-all that only prints — failures are swallowed
        # and the Lambda reports success; consider logging and re-raising.
        print("Error {}".format(e))
Example #8
0
def handler(event, context):
    """AWS Lambda worker: trains a sparse SVM on one S3 shard, merging
    weight vectors across workers once per epoch via memcached
    (ElastiCache).

    NOTE(review): `shuffle_dataset` and `random_seed` are referenced but
    not defined in this function — presumably module-level constants;
    confirm they exist.
    """
    try:
        start_time = time.time()
        bucket_name = event['bucket_name']
        worker_index = event['rank']
        num_workers = event['num_workers']
        key = event['file']
        merged_bucket = event['merged_bucket']
        num_features = event['num_features']
        learning_rate = event["learning_rate"]
        batch_size = event["batch_size"]
        num_epochs = event["num_epochs"]
        validation_ratio = event["validation_ratio"]
        elasti_location = event['elasticache']
        endpoint = memcached_init(elasti_location)

        # Reading data from S3
        print(f"Reading training data from bucket = {bucket_name}, key = {key}")
        file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
        print("read data cost {} s".format(time.time() - start_time))

        parse_start = time.time()
        dataset = SparseDatasetWithLines(file, num_features)
        print("parse data cost {} s".format(time.time() - parse_start))

        preprocess_start = time.time()
        # Split off `validation_ratio` of the data for validation.
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_ratio * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]

        train_set = [dataset[i] for i in train_indices]
        val_set = [dataset[i] for i in val_indices]

        print("preprocess data cost {} s".format(time.time() - preprocess_start))
        svm = SparseSVM(train_set, val_set, num_features, num_epochs, learning_rate, batch_size)

        # Training the Model
        train_start = time.time()
        for epoch in range(num_epochs):
            epoch_start = time.time()
            num_batches = math.floor(len(train_set) / batch_size)
            print("worker {} epoch {}".format(worker_index, epoch))
            # Local training: one pass over all batches before syncing.
            for batch_idx in range(num_batches):
                batch_start = time.time()
                # NOTE(review): batch_ins/batch_label are unused here —
                # one_epoch() below appears to perform the training step.
                batch_ins, batch_label = svm.next_batch(batch_idx)
                acc = svm.one_epoch(batch_idx, epoch)
                if (batch_idx + 1) % 10 == 0:
                    print("Epoch: {}/{}, Step: {}/{}, train acc: {}"
                          .format(epoch + 1, num_epochs, batch_idx + 1, num_batches, acc))
            cal_time = time.time() - epoch_start

            # Epoch-level synchronization: average weights across workers
            # through the memcached-backed reduce.
            sync_start = time.time()
            np_w = svm.weights.numpy().flatten()
            postfix = str(epoch)
            w_merge = reduce_epoch(endpoint, np_w, merged_bucket, num_workers, worker_index, postfix)
            svm.weights = torch.from_numpy(w_merge).reshape(num_features, 1)
            sync_time = time.time() - sync_start

            test_start = time.time()
            val_acc = svm.evaluate()
            test_time = time.time() - test_start

            print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, epoch cost %.4f, '
                  'cal cost %.4f s, sync cost %.4f s, test cost %.4f s, test accuracy: %s %%'
                  % (epoch + 1, num_epochs, batch_idx + 1, num_batches,
                     time.time() - train_start, time.time() - epoch_start,
                     cal_time, sync_time, test_time, val_acc))

        # Only worker 0 cleans up the shared storage.
        if worker_index == 0:
            clear_bucket(endpoint)
        print("elapsed time = {} s".format(time.time() - start_time))

    except Exception as e:
        # NOTE(review): catch-all that only prints — failures are swallowed
        # and the Lambda reports success; consider logging and re-raising.
        print("Error {}".format(e))
Example #9
0
def handler(event, context):
    """AWS Lambda worker: trains logistic regression with PyTorch,
    merging per-worker weight/bias gradients through S3 after every batch.

    NOTE(review): `num_features`, `num_classes`, `validation_ratio`,
    `shuffle_dataset`, `random_seed`, `grad_bucket`, `model_bucket`,
    `w_grad_prefix` and `b_grad_prefix` are referenced but not defined in
    this function — presumably module-level constants; verify they exist.
    """
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = SparseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(
                worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()

            print("forward and backward cost {} s".format(time.time() -
                                                          batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()

            # Publish this worker's gradients; worker 0 merges all
            # workers' gradients while the rest poll for the result.
            sync_start = time.time()
            put_object(grad_bucket, w_grad_prefix + str(worker_index),
                       w_grad.tobytes())
            put_object(grad_bucket, b_grad_prefix + str(worker_index),
                       b_grad.tobytes())

            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                # BUGFIX: was `num_worker` (never defined in this scope);
                # the event supplies `num_workers`.
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(grad_bucket, num_workers, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grad(model_bucket, w_grad_merge, b_grad_merge,
                                    file_postfix, w_grad_prefix, b_grad_prefix)
                delete_expired_w_b(model_bucket, epoch, batch_index,
                                   w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))
            else:
                w_grad_merge, b_grad_merge = get_merged_w_b_grad(
                    model_bucket, file_postfix, w_grad.dtype, w_grad.shape,
                    b_grad.shape, w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(
                    torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(
                    torch.from_numpy(b_grad_merge))

            print("synchronization cost {} s".format(time.time() - sync_start))

            # Apply the merged gradients.
            optimizer.step()

            print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, batch_index + 1,
                       len(train_indices) / batch_size, loss.data))

    # Only worker 0 cleans up the shared buckets.
    if worker_index == 0:
        clear_bucket(model_bucket)
        clear_bucket(grad_bucket)

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%' %
          (len(val_indices), 100 * correct / total))

    endTs = time.time()
    # BUGFIX: was `endTs - startTs` — `startTs` is never defined; the
    # start timestamp recorded above is `start_time`.
    print("elapsed time = {} s".format(endTs - start_time))