Esempio n. 1
0
def main():
    """Parse CLI arguments, configure this worker's environment from the
    YAML config, and run training via ``worker``.

    The config file is expected to contain a ``w<rank>`` mapping whose
    entries are exported as environment variables (PS/DMLC settings).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="model to be tested")
    parser.add_argument("--config",
                        type=str,
                        default="../config/local_w8.yml",
                        help="configuration for ps")
    parser.add_argument("--val",
                        action="store_true",
                        help="whether to use validation")
    parser.add_argument("--cache", default=None, help="cache policy")
    parser.add_argument("--bsp",
                        action="store_true",
                        help="whether to use bsp instead of asp")
    parser.add_argument("--bound", default=100, help="cache bound")
    args = parser.parse_args()
    config = args.config
    import models
    # getattr instead of eval: looks up the model by name without
    # executing arbitrary code supplied via --model.
    model = getattr(models, args.model)
    # Context manager closes the config file deterministically.
    with open(config) as f:
        settings = yaml.load(f.read(), Loader=yaml.FullLoader)
    comm, device_id = ad.mpi_nccl_init()
    print('Model:', args.model, '; rank:', device_id)
    value = settings['w' + str(device_id)]
    for k, v in value.items():
        os.environ[k] = str(v)
    worker(model, device_id, args)
    ad.mpi_nccl_finish(comm)
Esempio n. 2
0
def worker(args):
    """Train and evaluate a neural matrix-factorization model on MovieLens
    data, optionally sharded across distributed workers.

    ``args`` is expected to carry at least: comm (None/'PS'/'Hybrid'),
    all, val, cache, bsp, bound.
    """
    def validate():
        # Compute mean HR@topK and NDCG@topK over the test set. Each test
        # case is a block of 100 candidate items; the ground-truth item is
        # taken from the first position of each block — assumes the data
        # loader emits blocks in that layout (TODO confirm against getdata).
        hits, ndcgs = [], []
        for idx in range(testData.shape[0]):
            start_index = idx * 100
            predictions = val_executor.run(convert_to_numpy_ret_vals=True)
            # Map each candidate item id to its predicted score.
            map_item_score = {testItemInput[start_index + i]: predictions[0][i] for i in range(100)}
            gtItem = testItemInput[start_index]
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
            hr = getHitRatio(ranklist, gtItem)
            ndcg = getNDCG(ranklist, gtItem)
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg
    def get_current_shard(data):
        # In distributed mode, give this worker a contiguous slice of the
        # data; the last rank absorbs the remainder rows.
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    # Resolve this worker's rank / world size / GPU from the comm mode.
    device_id = 0
    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8

    from movielens import getdata
    if args.all:
        # Full ml-25m dataset.
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'])
        trainItems = get_current_shard(trainData['item_input'])
        trainLabels = get_current_shard(trainData['labels'])
        testData = get_current_shard(testData)
        # 100 candidate items per test user.
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))
    else:
        # Truncated subset for quick runs.
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'][:1024000])
        trainItems = get_current_shard(trainData['item_input'][:1024000])
        trainLabels = get_current_shard(trainData['labels'][:1024000])
        testData = get_current_shard(testData[:1470])
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))

    # Embedding-table sizes per dataset; only ml-25m is used here.
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    # assert not args.all or num_users == testData.shape[0]
    batch_size = 1024
    num_negatives = 4
    topK = 10
    # Dataloaders expose both 'train' and 'validate' splits; labels only train.
    user_input = dl.dataloader_op([
        dl.Dataloader(trainUsers, batch_size, 'train'),
        dl.Dataloader(testUserInput, 100, 'validate'),
    ])
    item_input = dl.dataloader_op([
        dl.Dataloader(trainItems, batch_size, 'train'),
        dl.Dataloader(testItemInput, 100, 'validate'),
    ])
    y_ = dl.dataloader_op([
        dl.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'),
    ])

    loss, y, train_op = neural_mf(user_input, item_input, y_, num_users, num_items)

    # Training executor (loss + optimizer step) and inference-only
    # validation executor sharing the same graph.
    executor = ad.Executor([loss, train_op], ctx=ndarray.gpu(device_id), dataloader_name='train', \
        comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123)
    val_executor = ad.Executor([y], ctx=ndarray.gpu(device_id), inference=True, dataloader_name='validate', comm_mode=args.comm, bsp=args.bsp)

    # Per-rank log file in distributed modes, single file otherwise.
    path = 'logs/hetulog_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm])
    path += '_%d.txt' % rank if args.comm else '.txt'
    log = Logging(path=path)
    epoch = 7
    start = time.time()
    for ep in range(epoch):
        ep_st = time.time()
        log.write('epoch %d' % ep)
        train_loss = []
        for idx in tqdm(range(executor.batch_num)):
            loss_val = executor.run(convert_to_numpy_ret_vals=True)
            train_loss.append(loss_val[0])

            # if idx % 10000 == 0:
            #     hr, ndcg = validate()
            #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
            #     log.write(printstr)

        tra_loss = np.mean(train_loss)
        ep_en = time.time()

        # validate phase
        if args.val:
            hr, ndcg = validate()
            printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (tra_loss, hr, ndcg, ep_en - ep_st)
        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)
    log.write('all time: %f' % (time.time() - start))
Esempio n. 3
0
def test(args):
    """Train a 2-layer distributed GCN (1.5D partitioning) and report
    per-epoch timing plus train/test loss and accuracy.

    ``args`` is expected to carry at least: dataset, replication.
    """
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    # dataset -> [node count, feature dim, class count]
    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }

    node_count, num_features, num_classes = dataset_info[args.dataset]

    # Smaller hidden layer for low-dimensional features.
    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication

    # Number of nodes owned by this worker under 1.5D partitioning.
    node_Count_Self = row_num(node_count, rank // replication,
                              size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)

    executor_ctx = ndarray.gpu(device_id)

    # Load this worker's graph partition (whole graph if single process).
    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape,
                                      ctx=executor_ctx)

    # train:val:test=6:2:2
    # Our optimization on distributed GNN algorithm does NOT affect the correctness!
    # Here due to the limitation of current slice_op, data is split continuously.
    # Continuous split is unfriendly for reordered graph data where nodes are already clustered.
    # Specifically, training on some node clusters and testing on other clusters may cause poor test accuracy.
    # The better way is to split data randomly!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")
    # Glorot/Xavier-uniform initialization for both weight matrices.
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[num_features,
                                     hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)
    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[hidden_layer_size,
                                     num_classes]).astype(np.float32)

    W2 = ad.Variable(name="W2", value=W2_val)
    y_ = ad.Variable(name="y_")

    # Two distributed GCN layers with ReLU in between.
    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)

    # Continuous split: first 60% of local nodes train, last 20% test.
    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))

    y_test = ad.slice_op(y, (test_node, 0),
                         (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0),
                             (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)

    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A:
        adj_matrix,
        H:
        ndarray.array(input_part, ctx=executor_ctx),
        y_:
        ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                      ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        # NOTE: rebinds `loss`/`loss_test` from graph ops to numpy scalars
        # for the rest of the loop body.
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f" %
              (i, rank, epoch_time, epoch_all))

        # Predicted class per node; first train_node rows vs rows from
        # test_node onward (matching the slice_op split above).
        y_out_train, y_predict = y_out.asnumpy().argmax(
            axis=1)[:train_node], y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[
            test_node:]
        # Wrap local correct-counts/losses in device arrays so they can be
        # all-reduced across workers below.
        train_acc = ndarray.array(np.array([(y_out_train == label_train).sum()
                                            ]),
                                  ctx=executor_ctx)
        test_acc = ndarray.array(np.array([(y_predict == label_test).sum()]),
                                 ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        # Sum metrics across workers: within the column group under
        # replication, otherwise over the global communicator.
        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)

        # Normalize the global sums by the global split sizes.
        test_acc = float(
            test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count -
                                              test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)

        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f"\
            %(i,train_loss, train_acc, test_loss, test_acc))

    # Average epoch time excludes the first (warm-up) epoch.
    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results,
                              results,
                              ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size

    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" %
              (results[0], results[1]))
Esempio n. 4
0
            train_state.sync_and_clear()
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt


def signal_handler(signum, frame):
    """SIGINT handler: announce the interrupt and terminate cleanly.

    Parameters follow the ``signal.signal`` callback convention
    (signal number, current stack frame); renamed from ``signal`` so the
    stdlib ``signal`` module is not shadowed inside the handler.
    """
    import sys
    print("SIGINT signal caught, stop Training")
    # sys.exit over the builtin exit(): the builtin is provided by the
    # site module and is not guaranteed to exist in all interpreters.
    sys.exit(0)


if __name__ == '__main__':
    # CLI entry point: parse arguments, export the shared config section
    # as environment variables, install a SIGINT handler, and train.
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--path", "-p", required=True)
    parser.add_argument("--num_epoch", default=300, type=int)
    parser.add_argument("--hidden_size", default=128, type=int)
    parser.add_argument("--learning_rate", default=1, type=float)
    parser.add_argument("--batch_size", default=128, type=int)
    parser.add_argument("--cache", default="LFUOpt", type=str)
    args = parser.parse_args()
    comm, device_id = ad.mpi_nccl_init()
    file_path = args.config
    # Context manager closes the config file instead of leaking the handle.
    with open(file_path) as f:
        settings = yaml.load(f.read(), Loader=yaml.FullLoader)
    # Export the 'shared' section as environment variables for the PS backend.
    for k, v in settings['shared'].items():
        os.environ[k] = str(v)
    os.environ["DMLC_ROLE"] = "worker"
    # Allow Ctrl-C to stop training gracefully.
    signal.signal(signal.SIGINT, signal_handler)
    train_main(args)
    ad.mpi_nccl_finish(comm)
Esempio n. 5
0
def worker(args):
    """Train (and optionally validate) a CTR model on criteo or adult data.

    Supports local, 'PS', and 'Hybrid' communication modes; in distributed
    modes each worker trains on a contiguous shard of the data.

    ``args`` is expected to carry at least: comm, dataset, model,
    raw_model, all, val, cache, bsp, bound.
    """
    def train(iterations, auc_enabled=True, tqdm_enabled=False):
        # Run `iterations` training steps; return mean loss and accuracy
        # (plus mean AUC when auc_enabled).
        localiter = tqdm(
            range(iterations)) if tqdm_enabled else range(iterations)
        train_loss = []
        train_acc = []
        if auc_enabled:
            train_auc = []
        for it in localiter:
            loss_val, predict_y, y_val, _ = executor.run(
                convert_to_numpy_ret_vals=True)
            if y_val.shape[1] == 1:  # for criteo case
                # np.float32 instead of np.float: the bare alias was
                # deprecated in NumPy 1.20 and removed in 1.24.
                acc_val = np.equal(y_val, predict_y > 0.5).astype(np.float32)
            else:
                acc_val = np.equal(np.argmax(y_val, 1),
                                   np.argmax(predict_y, 1)).astype(np.float32)
            train_loss.append(loss_val[0])
            train_acc.append(acc_val)
            if auc_enabled:
                train_auc.append(metrics.roc_auc_score(y_val, predict_y))
        if auc_enabled:
            return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
        else:
            return np.mean(train_loss), np.mean(train_acc)

    def validate(iterations, tqdm_enabled=False):
        # Run `iterations` validation steps; return mean loss/accuracy/AUC.
        localiter = tqdm(
            range(iterations)) if tqdm_enabled else range(iterations)
        test_loss = []
        test_acc = []
        test_auc = []
        for it in localiter:
            loss_val, test_y_predicted, y_test_val = val_executor.run(
                convert_to_numpy_ret_vals=True)
            if y_test_val.shape[1] == 1:  # for criteo case
                correct_prediction = np.equal(
                    y_test_val, test_y_predicted > 0.5).astype(np.float32)
            else:
                correct_prediction = np.equal(np.argmax(y_test_val, 1),
                                              np.argmax(test_y_predicted,
                                                        1)).astype(np.float32)
            test_loss.append(loss_val[0])
            test_acc.append(correct_prediction)
            test_auc.append(metrics.roc_auc_score(y_test_val,
                                                  test_y_predicted))
        return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

    def get_current_shard(data):
        # In distributed mode, give this worker a contiguous slice of the
        # data; the last rank absorbs the remainder rows.
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    batch_size = 128
    dataset = args.dataset
    model = args.model
    device_id = 0

    # Resolve rank / world size / GPU from the communication mode.
    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8

    if dataset == 'criteo':
        # define models for criteo
        if args.all:
            from models.load_data import process_all_criteo_data
            dense, sparse, labels = process_all_criteo_data(
                return_val=args.val)
        elif args.val:
            from models.load_data import process_head_criteo_data
            dense, sparse, labels = process_head_criteo_data(return_val=True)
        else:
            from models.load_data import process_sampled_criteo_data
            dense, sparse, labels = process_sampled_criteo_data()
        # A tuple means (train, validation) splits were returned.
        if isinstance(dense, tuple):
            dense_input = dl.dataloader_op(
                [[get_current_shard(dense[0]), batch_size, 'train'],
                 [get_current_shard(dense[1]), batch_size, 'validate']])
            sparse_input = dl.dataloader_op(
                [[get_current_shard(sparse[0]), batch_size, 'train'],
                 [get_current_shard(sparse[1]), batch_size, 'validate']])
            y_ = dl.dataloader_op(
                [[get_current_shard(labels[0]), batch_size, 'train'],
                 [get_current_shard(labels[1]), batch_size, 'validate']])
        else:
            dense_input = dl.dataloader_op(
                [[get_current_shard(dense), batch_size, 'train']])
            sparse_input = dl.dataloader_op(
                [[get_current_shard(sparse), batch_size, 'train']])
            y_ = dl.dataloader_op(
                [[get_current_shard(labels), batch_size, 'train']])
    elif dataset == 'adult':
        from models.load_data import load_adult_data
        x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data(
        )
        # One dataloader per deep feature column (12 columns).
        dense_input = [
            dl.dataloader_op([
                [get_current_shard(x_train_deep[:, i]), batch_size, 'train'],
                [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'],
            ]) for i in range(12)
        ]
        sparse_input = dl.dataloader_op([
            [get_current_shard(x_train_wide), batch_size, 'train'],
            [get_current_shard(x_test_wide), batch_size, 'validate'],
        ])
        y_ = dl.dataloader_op([
            [get_current_shard(y_train), batch_size, 'train'],
            [get_current_shard(y_test), batch_size, 'validate'],
        ])
    else:
        raise NotImplementedError
    print("Data loaded.")

    loss, prediction, y_, train_op = model(dense_input, sparse_input, y_)

    # Training executor and (optional) inference-only validation executor
    # sharing the same graph.
    executor = ad.Executor([loss, prediction, y_, train_op], ctx=ndarray.gpu(device_id),\
        dataloader_name='train', stream_mode='AllStreams', comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path='./logs/')
    if args.val:
        print('Validation enabled...')
        val_executor = ad.Executor([loss, prediction, y_], ctx=ndarray.gpu(device_id),\
            dataloader_name='validate', stream_mode='AllStreams', inference=True, comm_mode=args.comm)

    if args.all and dataset == 'criteo':
        print('Processing all data...')
        # Per-rank log file in distributed modes, single file otherwise.
        file_path = '%s_%s' % ({
            None: 'local',
            'PS': 'ps',
            'Hybrid': 'hybrid'
        }[args.comm], args.raw_model)
        file_path += '%d.log' % rank if args.comm else '.log'
        file_path = os.path.join(
            os.path.split(os.path.abspath(__file__))[0], 'logs', file_path)
        log_file = open(file_path, 'w')
        total_epoch = 11
        for ep in range(total_epoch):
            print("ep: %d" % ep)
            ep_st = time.time()
            # Each "epoch" covers ~1/10 of the batches; every 10th epoch
            # also consumes the remainder so 10 epochs sweep the data once.
            train_loss, train_acc, train_auc = train(executor.batch_num // 10 +
                                                     (ep % 10 == 9) *
                                                     (executor.batch_num % 10),
                                                     tqdm_enabled=True)
            ep_en = time.time()
            if args.val:
                val_loss, val_acc, val_auc = validate(val_executor.batch_num)
                printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
                        % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st)
            else:
                printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                        % (train_loss, train_acc, train_auc, ep_en - ep_st)
            print(printstr)
            log_file.write(printstr + '\n')
            log_file.flush()
    else:
        total_epoch = 50
        for ep in range(total_epoch):
            # Timing starts at epoch 5 to exclude warm-up epochs.
            if ep == 5:
                start = time.time()
            print("epoch %d" % ep)
            ep_st = time.time()
            train_loss, train_acc = train(executor.batch_num,
                                          auc_enabled=False)
            ep_en = time.time()
            if args.val:
                val_loss, val_acc, val_auc = validate(val_executor.batch_num)
                print(
                    "train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f"
                    % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc,
                       val_auc))
            else:
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" %
                      (train_loss, train_acc, ep_en - ep_st))
        print('all time:', time.time() - start)
    if args.comm == 'Hybrid':
        ad.mpi_nccl_finish(comm)