Example #1
def train_finetune(
    epoch,
    train_loader,
    model,
    output_layer,
    criterion,
    optimizer,
    output_layer_optimizer,
    sw,
    opt,
):
    """
    one epoch of fine-tuning on the downstream graph classification task
    """
    n_batch = len(train_loader)
    model.train()
    output_layer.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    f1_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    epoch_f1_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()
    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, y = batch

        graph_q.to(torch.device(opt.gpu))
        y = y.to(torch.device(opt.gpu))

        bsz = graph_q.batch_size

        # ===================forward=====================

        feat_q = model(graph_q)

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)
        out = output_layer(feat_q)

        loss = criterion(out, y)

        # ===================backward=====================
        optimizer.zero_grad()
        output_layer_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), 1)
        torch.nn.utils.clip_grad_value_(output_layer.parameters(), 1)
        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1)
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()
        output_layer_optimizer.step()

        preds = out.argmax(dim=1)
        f1 = f1_score(y.cpu().numpy(), preds.cpu().numpy(), average="micro")

        # ===================meters=====================
        f1_meter.update(f1, bsz)
        epoch_f1_meter.update(f1, bsz)
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        graph_size.update(graph_q.number_of_nodes() / bsz, bsz)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            #  print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            #  mem_used.append(mem.used/1024**3)
            print("Train: [{0}][{1}/{2}]\t"
                  "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                  "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                  "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                  "f1 {f1.val:.3f} ({f1.avg:.3f})\t"
                  "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                  "mem {mem:.3f}".format(
                      epoch,
                      idx + 1,
                      n_batch,
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=loss_meter,
                      f1=f1_meter,
                      graph_size=graph_size,
                      mem=mem.used / 1024**3,
                  ))
            #  print(out[0].abs().max())

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            sw.add_scalar("ft_loss", loss_meter.avg, global_step)
            sw.add_scalar("ft_f1", f1_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("lr", lr_this_step, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            #  sw.add_scalar(
            #      "learning_rate", optimizer.param_groups[0]["lr"], global_step
            #  )
            loss_meter.reset()
            f1_meter.reset()
            graph_size.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg, epoch_f1_meter.avg
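
All three examples lean on helpers that are not shown here: AverageMeter (a running-average tracker in the style of the PyTorch ImageNet example) and warmup_linear (a linear warmup-then-decay factor used to rescale the base learning rate at every step), plus the usual time, torch, psutil, numpy and sklearn.metrics.f1_score imports. A minimal sketch of the two helpers, assuming those conventional definitions rather than the project's own:

class AverageMeter(object):
    """Tracks the latest value and a count-weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def warmup_linear(x, warmup=0.002):
    """Ramp linearly from 0 to 1 over the first `warmup` fraction of total
    training progress x in [0, 1], then decay linearly back towards 0.
    One common definition; the source may use a slightly different decay."""
    if x < warmup:
        return x / warmup
    return max((1.0 - x) / (1.0 - warmup), 0.0)

In the training loops above this factor is evaluated at global_step / (opt.epochs * n_batch) with warmup fraction 0.1, so the learning rate ramps up over the first 10% of training and decays afterwards.
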
Example #2
def train_moco(epoch, train_loader, model, model_ema, contrast, criterion,
               optimizer, sw, opt):
    """
    one epoch training for moco
    """
    n_batch = train_loader.dataset.total // opt.batch_size
    model.train()
    model_ema.eval()

    def set_bn_train(m):
        # keep BatchNorm layers of the momentum encoder in training mode so
        # their running statistics keep updating during pre-training
        classname = m.__class__.__name__
        if classname.find("BatchNorm") != -1:
            m.train()

    model_ema.apply(set_bn_train)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    gnorm_meter = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()
    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, graph_k = batch

        graph_q.to(torch.device(opt.gpu))
        graph_k.to(torch.device(opt.gpu))

        bsz = graph_q.batch_size

        if opt.moco:
            # ===================Moco forward=====================
            feat_q = model(graph_q)
            with torch.no_grad():
                feat_k = model_ema(graph_k)

            out = contrast(feat_q, feat_k)
            prob = out[:, 0].mean()
        else:
            # ===================Negative sampling forward=====================
            feat_q = model(graph_q)
            feat_k = model(graph_k)

            out = torch.matmul(feat_k, feat_q.t()) / opt.nce_t
            prob = out[range(graph_q.batch_size),
                       range(graph_q.batch_size)].mean()

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)

        # ===================backward=====================
        optimizer.zero_grad()
        loss = criterion(out)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   opt.clip_norm)

        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1)
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()

        # ===================meters=====================
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        prob_meter.update(prob.item(), bsz)
        graph_size.update(
            (graph_q.number_of_nodes() + graph_k.number_of_nodes()) / 2.0 /
            bsz, 2 * bsz)
        gnorm_meter.update(grad_norm, 1)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        if opt.moco:
            moment_update(model, model_ema, opt.alpha)

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            #  print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            #  mem_used.append(mem.used/1024**3)
            print("Train: [{0}][{1}/{2}]\t"
                  "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                  "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                  "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                  "prob {prob.val:.3f} ({prob.avg:.3f})\t"
                  "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                  "mem {mem:.3f}".format(
                      epoch,
                      idx + 1,
                      n_batch,
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=loss_meter,
                      prob=prob_meter,
                      graph_size=graph_size,
                      mem=mem.used / 1024**3,
                  ))
            #  print(out[0].abs().max())

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            global_step = epoch * n_batch + idx
            sw.add_scalar("moco_loss", loss_meter.avg, global_step)
            sw.add_scalar("moco_prob", prob_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            sw.add_scalar("gnorm", gnorm_meter.avg, global_step)
            sw.add_scalar("learning_rate", optimizer.param_groups[0]["lr"],
                          global_step)
            loss_meter.reset()
            prob_meter.reset()
            graph_size.reset()
            gnorm_meter.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg
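
Two more pieces are assumed by the MoCo branch above and not shown: moment_update, the momentum (EMA) update of the key encoder called once per step with momentum opt.alpha, and the criterion, which takes only the logits returned by contrast and is consistent with a softmax-NCE loss in which column 0 holds the positive key (hence prob = out[:, 0].mean() as a monitoring statistic). A minimal sketch under those assumptions; the class name NCESoftmaxLoss is illustrative, not taken from the source:

import torch
import torch.nn as nn


def moment_update(model, model_ema, m):
    """Momentum update of the key encoder: p_ema = m * p_ema + (1 - m) * p."""
    for p, p_ema in zip(model.parameters(), model_ema.parameters()):
        p_ema.data.mul_(m).add_(p.detach().data, alpha=1 - m)


class NCESoftmaxLoss(nn.Module):
    """Cross-entropy over (1 + K) logits per sample, with the positive key
    assumed to sit at index 0."""

    def __init__(self):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, out):
        labels = torch.zeros(out.shape[0], dtype=torch.long, device=out.device)
        return self.criterion(out, labels)

With opt.alpha close to 1 (for example 0.999) the key encoder drifts slowly, which keeps the negatives cached by contrast consistent across batches; keeping its BatchNorm layers in train mode via set_bn_train lets their running statistics follow the query encoder.
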
Example #3
def train_moco(epoch, model_name, train_loader, model, model_ema, contrast,
               criterion, optimizer, sw, opt, output_layer,
               output_layer_optimizer, global_output_layer,
               global_output_layer_optimizer):
    """
    one epoch of pre-training with auxiliary label- and degree-prediction heads
    """
    n_batch = train_loader.dataset.total // opt.batch_size
    model.train()
    model_ema.eval()
    print("pretrain")

    def set_bn_train(m):
        classname = m.__class__.__name__
        if classname.find("BatchNorm") != -1:
            m.train()

    model_ema.apply(set_bn_train)
    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    global_loss_meter = AverageMeter()
    epoch_global_loss_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    gnorm_meter = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0
    end = time.time()

    # read the global motif-count label and normalize it into a distribution
    graph_list = np.zeros(15)
    with open("./motifs/" + model_name + "-counts.out") as f:
        for line in f:
            nums = [int(x) for x in line.split()]
            graph_list += np.array(nums)
    global_label = torch.FloatTensor(graph_list / graph_list.sum())
    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, label, degree_label = batch
        graph_q.to(torch.device(opt.gpu))
        label = label.to(torch.device(opt.gpu))
        degree_label = degree_label.to(torch.device(opt.gpu))
        bsz = graph_q.batch_size
        # ===================Negative sampling forward=====================
        feat_q = model(graph_q)
        out = output_layer(feat_q)

        #global_feature.append(feat_q.detach().cpu())
        #mean_t = torch.mean(torch.cat(global_feature), dim=0, keepdim=True).squeeze()
        #print(len(global_feature), mean_t.shape)
        #continue
        #print(global_feature[0].shape)
        #print(global_feature, len(global_feature))
        #mean_t = mean_t.to(torch.device(opt.gpu))
        #global_out = global_output_layer(mean_t)
        degree_out = global_output_layer(feat_q)
        # print(feat_q.size(), feat_k.size())
        #print("negative sampling")
        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)

        # ===================backward=====================
        optimizer.zero_grad()
        output_layer_optimizer.zero_grad()
        global_output_layer_optimizer.zero_grad()
        loss = criterion(out, label)
        global_loss = criterion(degree_out, degree_label)
        # a single backward pass on the combined objective updates the encoder
        # and both heads without accumulating global_loss gradients twice
        total_loss = loss + global_loss
        total_loss.backward()

        torch.nn.utils.clip_grad_value_(output_layer.parameters(), 1)
        torch.nn.utils.clip_grad_value_(global_output_layer.parameters(), 1)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   opt.clip_norm)
        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1)
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in global_output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()
        output_layer_optimizer.step()
        global_output_layer_optimizer.step()
        # ===================meters=====================
        loss_meter.update(total_loss.item(), bsz)
        epoch_loss_meter.update(total_loss.item(), bsz)
        global_loss_meter.update(global_loss.item(), bsz)
        epoch_global_loss_meter.update(global_loss.item(), bsz)
        graph_size.update(graph_q.number_of_nodes() / bsz, bsz)
        gnorm_meter.update(grad_norm, 1)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()
        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            #  print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            #  mem_used.append(mem.used/1024**3)
            print("Train: [{0}][{1}/{2}]\t"
                  "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                  "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                  "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                  "prob {prob.val:.3f} ({prob.avg:.3f})\t"
                  "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                  "mem {mem:.3f}".format(
                      epoch,
                      idx + 1,
                      n_batch,
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=loss_meter,
                      prob=prob_meter,
                      graph_size=graph_size,
                      mem=mem.used / 1024**3,
                  ))
        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            global_step = epoch * n_batch + idx
            sw.add_scalar("moco_loss", loss_meter.avg, global_step)
            sw.add_scalar("global_moco_loss", global_loss_meter.avg,
                          global_step)
            sw.add_scalar("moco_prob", prob_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            sw.add_scalar("gnorm", gnorm_meter.avg, global_step)
            sw.add_scalar("learning_rate", optimizer.param_groups[0]["lr"],
                          global_step)
            loss_meter.reset()
            global_loss_meter.reset()
            prob_meter.reset()
            graph_size.reset()
            gnorm_meter.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg, epoch_global_loss_meter.avg
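
None of the examples show the surrounding setup. Below is a hedged sketch of a driver loop for the MoCo variant in Example #2; build_model, build_loader, MemoryMoCo and the opt fields are placeholders inferred from the function signature, not code from the source:

import torch
from torch.utils.tensorboard import SummaryWriter

# hypothetical setup: build_model, build_loader and MemoryMoCo stand in for
# whatever encoder, data loader and memory bank the surrounding project defines
model = build_model(opt).cuda(opt.gpu)
model_ema = build_model(opt).cuda(opt.gpu)
moment_update(model, model_ema, 0)  # start the EMA encoder as an exact copy
contrast = MemoryMoCo(opt.hidden_size, opt.nce_k, opt.nce_t).cuda(opt.gpu)
criterion = NCESoftmaxLoss().cuda(opt.gpu)
optimizer = torch.optim.Adam(model.parameters(), lr=opt.learning_rate)
sw = SummaryWriter(opt.tb_folder)

for epoch in range(opt.epochs):
    train_loader = build_loader(opt)
    epoch_loss = train_moco(epoch, train_loader, model, model_ema, contrast,
                            criterion, optimizer, sw, opt)
    print("epoch {}: moco loss {:.4f}".format(epoch, epoch_loss))
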