def partition_yfcc100m(file_list, n_features, pos_tag, batch_size, validation_ratio):
    parse_start = time.time()
    f = open(file_list[0]).readlines()
    dataset = DenseLibsvmDataset(f, n_features, pos_tag)
    if len(file_list) > 1:
        for file_name in file_list[1:]:
            f = open(file_name).readlines()
            dataset.add_more(f)
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    random_seed = 42
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    return train_loader, test_loader
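# Usage sketch for partition_yfcc100m (illustrative only): the file paths and
# hyperparameter values below are assumptions, not values taken from this repo.
def _example_partition_yfcc100m():
    file_list = ["/tmp/yfcc100m/part_0.libsvm", "/tmp/yfcc100m/part_1.libsvm"]  # hypothetical paths
    train_loader, test_loader = partition_yfcc100m(
        file_list, n_features=4096, pos_tag=1, batch_size=100, validation_ratio=0.1)
    # Inspect one training batch to confirm the loaders were built correctly.
    items, labels = next(iter(train_loader))
    print(items.size(), labels.size())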
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    num_admm_epochs = event['num_admm_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    lam = event['lambda']
    rho = event['rho']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num admm epochs = {}'.format(num_admm_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))
    print("lambda = {}".format(lam))
    print("rho = {}".format(rho))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # validation_ratio, shuffle_dataset and random_seed are expected to be module-level constants.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}"
          .format(time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes).double()
    print("size of w = {}".format(model.linear.weight.data.size()))
    z, u = initialize_z_and_u(model.linear.weight.data.size())
    print("size of z = {}".format(z.shape))
    print("size of u = {}".format(u.shape))

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    stop = False
    for admm_epoch in range(num_admm_epochs):
        print("ADMM Epoch >>> {}".format(admm_epoch))
        admm_epoch_start = time.time()
        for epoch in range(num_epochs):
            epoch_start = time.time()
            epoch_loss = 0
            for batch_index, (items, labels) in enumerate(train_loader):
                # print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
                batch_start = time.time()
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(items.double())
                classify_loss = criterion(outputs, labels)
                epoch_loss += classify_loss.data
                u_z = torch.from_numpy(u).double() - torch.from_numpy(z).double()
                loss = classify_loss
                for name, param in model.named_parameters():
                    if name.split('.')[-1] == "weight":
                        # add the augmented-Lagrangian penalty on the weight
                        loss += rho / 2.0 * torch.norm(param + u_z, p=2)
                # loss = classify_loss + rho / 2.0 * torch.norm(torch.sum(model.linear.weight, u_z))
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            # Test the Model
            test_start = time.time()
            correct = 0
            total = 0
            test_loss = 0
            for items, labels in validation_loader:
                items = Variable(items.view(-1, num_features))
                labels = Variable(labels)
                outputs = model(items.double())
                test_loss += criterion(outputs, labels).data
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            test_time = time.time() - test_start

            print('ADMM Epoch: [%d/%d], Epoch: [%d/%d], Batch [%d], '
                  'Time: %.4f, Loss: %.4f, epoch cost %.4f, test cost %.4f s, '
                  'accuracy of the model on the %d test samples: %d %%, loss = %f'
                  % (admm_epoch, num_admm_epochs, epoch, num_epochs, batch_index,
                     time.time() - train_start, epoch_loss.data, time.time() - epoch_start,
                     test_time, len(val_indices), 100 * correct / total, test_loss / total))

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        u_shape = u.shape

        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        u_w_b = np.concatenate((u.flatten(), w_and_b.flatten()))
        cal_time = time.time() - admm_epoch_start

        sync_start = time.time()
        postfix = "{}".format(admm_epoch)
        u_w_b_merge = reduce_epoch(endpoint, u_w_b, merged_bucket, num_workers, worker_index, postfix)

        u_mean = u_w_b_merge[:u_shape[0] * u_shape[1]].reshape(u_shape) / float(num_workers)
        w_mean = u_w_b_merge[u_shape[0] * u_shape[1]:
                             u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[u_shape[0] * u_shape[1] + w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
        # model.linear.weight.data = torch.from_numpy(w)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # z, u, r, s = update_z_u(w, z, u, rho, num_workers, lam)
        # stop = check_stop(ep_abs, ep_rel, r, s, dataset_size, num_features, w, z, u, rho)
        # print("stop = {}".format(stop))
        # z = num_workers * rho / (2 * lam + num_workers * rho) * (w + u_mean)
        z = update_z(w_mean, u_mean, rho, num_workers, lam)
        # print(z)
        u = u + model.linear.weight.data.numpy() - z
        # print(u)

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items.double())
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print('ADMM Epoch: [%d/%d], Time: %.4f, Loss: %.4f, '
              'ADMM epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (admm_epoch, num_admm_epochs, time.time() - train_start, epoch_loss.data,
                 time.time() - admm_epoch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

    # Test the Model
    correct = 0
    total = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        labels = Variable(labels)
        outputs = model(items.double())
        test_loss += criterion(outputs, labels).data
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
          % (time.time() - train_start, len(val_indices), 100 * correct / total, test_loss / total))

    if worker_index == 0:
        clear_bucket(endpoint)
    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
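# A sample invocation event for the ADMM handler above. The keys mirror the
# fields the handler reads from `event`; every value is a hypothetical
# placeholder (bucket names, file keys, ElastiCache endpoint, hyperparameters).
EXAMPLE_ADMM_EVENT = {
    "bucket_name": "yfcc100m-train",            # hypothetical S3 bucket with libsvm files
    "rank": 0,                                  # index of this worker
    "num_workers": 10,
    "file": "part_0.libsvm,part_1.libsvm",      # comma-separated S3 keys for this worker
    "merged_bucket": "yfcc100m-merged",         # hypothetical bucket/namespace for reduced tensors
    "num_classes": 2,
    "num_features": 4096,
    "pos_tag": 1,
    "num_epochs": 2,                            # SGD epochs per ADMM round
    "num_admm_epochs": 10,                      # ADMM rounds
    "learning_rate": 0.01,
    "batch_size": 100,
    "lambda": 0.01,                             # regularization strength used in update_z
    "rho": 0.01,                                # ADMM penalty parameter
    "elasticache": "cluster.xxxxxx.cfg.use1.cache.amazonaws.com:11211",  # hypothetical endpoint
}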
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    host = event['host']
    port = event['port']

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))
    print("host = {}".format(host))
    print("port = {}".format(port))

    # Set thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()

    # test thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(host, port))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # The UPPERCASE names (VALIDATION_RATIO, SHUFFLE_DATASET, RANDOM_SEED, NUM_FEATURES,
    # NUM_CLASSES, LEARNING_RATE, NUM_EPOCHS, BATCH_SIZE) are expected to be module-level
    # constants; the event values above are only parsed and logged.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(VALIDATION_RATIO * dataset_size))
    if SHUFFLE_DATASET:
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = SVM(NUM_FEATURES, NUM_CLASSES)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    # register model
    model_name = "w.b"
    weight_shape = model.linear.weight.data.numpy().shape
    weight_length = weight_shape[0] * weight_shape[1]
    bias_shape = model.linear.bias.data.numpy().shape
    bias_length = bias_shape[0]
    model_length = weight_length + bias_length
    ps_client.register_model(t_client, worker_index, model_name, model_length, num_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(model_name, model_length))

    # Training the Model
    train_start = time.time()
    iter_counter = 0
    for epoch in range(NUM_EPOCHS):
        epoch_start = time.time()
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()

            # pull latest model
            ps_client.can_pull(t_client, model_name, iter_counter, worker_index)
            latest_model = ps_client.pull_model(t_client, model_name, iter_counter, worker_index)
            model.linear.weight = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[:weight_length], dtype=np.double).reshape(weight_shape)))
            model.linear.bias = Parameter(
                torch.from_numpy(
                    np.asarray(latest_model[weight_length:], dtype=np.double).reshape(bias_shape[0])))

            items = Variable(items.view(-1, NUM_FEATURES))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items.double())
            loss = criterion(outputs, labels)
            loss.backward()

            # flatten and concat gradients of weight and bias
            w_b_grad = np.concatenate(
                (model.linear.weight.grad.data.numpy().flatten(),
                 model.linear.bias.grad.data.numpy().flatten()))
            cal_time = time.time() - batch_start

            # push gradient to PS
            sync_start = time.time()
            ps_client.can_push(t_client, model_name, iter_counter, worker_index)
            ps_client.push_grad(t_client, model_name, w_b_grad, LEARNING_RATE, iter_counter, worker_index)
            ps_client.can_pull(t_client, model_name, iter_counter + 1, worker_index)  # sync all workers
            sync_time = time.time() - sync_start

            print('Epoch: [%d/%d], Step: [%d/%d] >>> Time: %.4f, Loss: %.4f, epoch cost %.4f, '
                  'batch cost %.4f s: cal cost %.4f s and communication cost %.4f s'
                  % (epoch + 1, NUM_EPOCHS, batch_index + 1, len(train_indices) / BATCH_SIZE,
                     time.time() - train_start, loss.data, time.time() - epoch_start,
                     time.time() - batch_start, cal_time, sync_time))
            iter_counter += 1

    # Test the Model
    correct = 0
    total = 0
    test_loss = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, NUM_FEATURES))
        labels = Variable(labels)
        outputs = model(items.double())  # cast to double to match the pulled parameters
        test_loss += criterion(outputs, labels).data
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Time = %.4f, accuracy of the model on the %d test samples: %d %%, loss = %f'
          % (time.time() - train_start, len(val_indices), 100 * correct / total, test_loss))

    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
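# A sample invocation event for the parameter-server handler above. The keys match
# the fields read from `event`; the values (bucket, file keys, and the host/port of
# the Thrift parameter server) are hypothetical placeholders.
EXAMPLE_PS_EVENT = {
    "bucket_name": "yfcc100m-train",        # hypothetical S3 bucket with libsvm files
    "rank": 0,
    "num_workers": 10,
    "file": "part_0.libsvm,part_1.libsvm",  # comma-separated S3 keys for this worker
    "num_classes": 2,
    "num_features": 4096,
    "pos_tag": 1,
    "num_epochs": 10,
    "learning_rate": 0.01,
    "batch_size": 100,
    "host": "10.0.0.1",                     # hypothetical parameter-server host
    "port": 27000,                          # hypothetical Thrift port
}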
def handler(event, context):
    start_time = time.time()
    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file'].split(",")
    merged_bucket = event['merged_bucket']
    num_classes = event['num_classes']
    num_features = event['num_features']
    pos_tag = event['pos_tag']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    elasti_location = event['elasticache']
    endpoint = memcached_init(elasti_location)

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('merge bucket = {}'.format(merged_bucket))
    print('num epochs = {}'.format(num_epochs))
    print('num classes = {}'.format(num_classes))
    print('num features = {}'.format(num_features))
    print('positive tag = {}'.format(pos_tag))
    print('learning rate = {}'.format(learning_rate))
    print("batch_size = {}".format(batch_size))

    # read file from s3
    file = get_object(bucket, key[0]).read().decode('utf-8').split("\n")
    dataset = DenseLibsvmDataset(file, num_features, pos_tag)
    if len(key) > 1:
        for more_key in key[1:]:
            file = get_object(bucket, more_key).read().decode('utf-8').split("\n")
            dataset.add_more(file)
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    # validation_ratio, shuffle_dataset and random_seed are expected to be module-level constants.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s, dataset size = {}".format(
        time.time() - preprocess_start, dataset_size))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    train_start = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_loss = 0
        for batch_index, (items, labels) in enumerate(train_loader):
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            epoch_loss += loss.data
            loss.backward()
            optimizer.step()

        w = model.linear.weight.data.numpy()
        w_shape = w.shape
        b = model.linear.bias.data.numpy()
        b_shape = b.shape
        w_and_b = np.concatenate((w.flatten(), b.flatten()))
        cal_time = time.time() - epoch_start

        sync_start = time.time()
        postfix = str(epoch)
        u_w_b_merge = reduce_epoch(endpoint, w_and_b, merged_bucket, num_workers, worker_index, postfix)
        w_mean = u_w_b_merge[:w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
        b_mean = u_w_b_merge[w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
        model.linear.weight.data = torch.from_numpy(w_mean)
        model.linear.bias.data = torch.from_numpy(b_mean)
        sync_time = time.time() - sync_start

        # Test the Model
        test_start = time.time()
        correct = 0
        total = 0
        test_loss = 0
        for items, labels in validation_loader:
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)
            outputs = model(items)
            test_loss += criterion(outputs, labels).data
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        test_time = time.time() - test_start

        print('Epoch: [%d/%d] has %d batches, Time: %.4f, Loss: %.4f, '
              'epoch cost %.4f: computation cost %.4f s communication cost %.4f s test cost %.4f s, '
              'accuracy of the model on the %d test samples: %d %%, loss = %f'
              % (epoch + 1, num_epochs, batch_index, time.time() - train_start, epoch_loss.data,
                 time.time() - epoch_start, cal_time, sync_time, test_time,
                 len(val_indices), 100 * correct / total, test_loss / total))

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        # items = Variable(items)
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    print('Accuracy of the model on the %d test samples: %d %%'
          % (len(val_indices), 100 * correct / total))

    if worker_index == 0:
        clear_bucket(endpoint)
    end_time = time.time()
    print("Elapsed time = {} s".format(end_time - start_time))
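# A minimal, self-contained sketch of the flatten/average/unflatten step used by the
# handler above. It assumes reduce_epoch returns the element-wise sum of the workers'
# flattened [weight | bias] vectors (the handler divides the merged result by
# num_workers, which suggests a sum); here the reduction is simulated locally.
def _example_weight_bias_merge():
    import numpy as np
    num_workers = 2
    w_shape, b_shape = (2, 4), (2,)                        # hypothetical model sizes
    workers = [np.concatenate((np.random.rand(*w_shape).flatten(),
                               np.random.rand(*b_shape).flatten()))
               for _ in range(num_workers)]
    merged = np.sum(workers, axis=0)                       # stand-in for reduce_epoch's output
    w_mean = merged[:w_shape[0] * w_shape[1]].reshape(w_shape) / float(num_workers)
    b_mean = merged[w_shape[0] * w_shape[1]:].reshape(b_shape[0]) / float(num_workers)
    print(w_mean.shape, b_mean.shape)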
def run(args):
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(1234)
    logging.info(f"{args.rank}-th worker starts.")

    read_start = time.time()
    f_id_start = args.rank * args.num_files
    f_id_end = f_id_start + args.num_files
    f_path_list = ["{}/{}".format(args.root, i) for i in range(f_id_start, f_id_end)]
    f = open(f_path_list[0]).readlines()
    dataset = DenseLibsvmDataset(f, args.features, args.pos_tag)
    if len(f_path_list) > 1:
        for file_name in f_path_list[1:]:
            f = open(file_name).readlines()
            dataset.add_more(f)
    total_count = len(dataset)
    pos_count = 0
    for i in range(total_count):
        if dataset[i][1] == 1:
            pos_count += 1
    print("{} positive observations out of {}".format(pos_count, total_count))

    train_set = np.array(dataset.ins_list)
    dt = train_set.dtype
    centroid_shape = (args.num_clusters, train_set.shape[1])
    logging.info(f"Loading dataset costs {time.time() - read_start}s")
    logging.info(f"centroid shape: {centroid_shape}")

    # initialize centroids
    init_cent_start = time.time()
    if args.rank == 0:
        centroids = torch.tensor(train_set[0:args.num_clusters])
    else:
        centroids = torch.empty(args.num_clusters, args.features)
    if dist_is_initialized():
        dist.broadcast(centroids, 0)
    logging.info(f"Receiving initial centroids costs {time.time() - init_cent_start}s")

    training_start = time.time()
    avg_error = np.iinfo(np.int16).max
    for epoch in range(args.epochs):
        if avg_error >= args.threshold:
            start_compute = time.time()
            model = Kmeans(train_set, centroids, avg_error, centroid_type='tensor')
            model.find_nearest_cluster()
            end_compute = time.time()
            # logging.info(f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s")
            sync_start = time.time()
            if dist_is_initialized():
                centroids, avg_error = broadcast_average(
                    args, model.get_centroids("dense_tensor"), torch.tensor(model.error))
            logging.info(f"{args.rank}-th worker finished {epoch} epoch. "
                         f"Computing takes {end_compute - start_compute}s. "
                         f"Communicating takes {time.time() - sync_start}s. "
                         # f"Centroids: {model.get_centroids('dense_tensor')}. "
                         f"Loss: {model.error}")
        else:
            logging.info(
                f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}")
            logging.info(f"Whole process time : {time.time() - training_start}")
            return
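# A hedged sketch of the argument parser that run() appears to expect, based on the
# attributes it reads (rank, num_files, root, features, pos_tag, num_clusters, epochs,
# threshold, no_cuda). The flag names and default values below are illustrative
# assumptions, not taken from this repo.
def _example_args():
    import argparse
    parser = argparse.ArgumentParser(description="distributed k-means on YFCC100M features")
    parser.add_argument("--rank", type=int, default=0)                # index of this worker
    parser.add_argument("--num-files", type=int, default=2)           # data files per worker
    parser.add_argument("--root", type=str, default="/tmp/yfcc100m")  # hypothetical data directory
    parser.add_argument("--features", type=int, default=4096)
    parser.add_argument("--pos-tag", type=int, default=1)
    parser.add_argument("--num-clusters", type=int, default=10)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--threshold", type=float, default=0.02)      # stop once avg_error drops below this
    parser.add_argument("--no-cuda", action="store_true")
    return parser.parse_args()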