def inference(self, mode='validation', verbose=False):
        assert mode in ['validation', 'testing'], "got mode {}".format(mode)
        from dgl.dataloading import NodeDataLoader, MultiLayerNeighborSampler
        self.eval()
        if mode == 'testing':
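            # A fanout of None keeps every neighbor, i.e. full-neighborhood sampling for the single block at test time.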
            sampler = MultiLayerNeighborSampler([None])
        else:
            sampler = MultiLayerNeighborSampler(self.fans)
        g = self.cpu_graph
        kwargs = {
            'batch_size': 64,
            'shuffle': True,
            'drop_last': False,
            'num_workers': 6,
        }
        dataloader = NodeDataLoader(g, th.arange(g.number_of_nodes()), sampler,
                                    **kwargs)
        if verbose:
            dataloader = tqdm(dataloader)

        x = self.embedding.weight
        x = th.cat((self.W1(x[:self.num_users]), self.W2(x[self.num_users:])),
                   dim=0)

        # Within a layer, iterate over nodes in batches
        for input_nodes, output_nodes, blocks in dataloader:
            block = blocks[0].to(commons.device)
            h = self.forward_block(block, x[input_nodes])
            self.check_point[output_nodes] = h

        if verbose:
            print('Inference Done Successfully')
    def inference(self, mode='validation', verbose=False):
        assert mode in ['validation', 'testing'], "got mode {}".format(mode)
        from dgl.dataloading import NodeDataLoader, MultiLayerNeighborSampler
        self.eval()
        if mode == 'testing':
            sampler = MultiLayerNeighborSampler([None] * self.num_layers)
        else:
            sampler = MultiLayerNeighborSampler(self.fans)

        g = self.cpu_graph
        kwargs = {
            'batch_size': 1024,
            'shuffle': True,
            'drop_last': False,
            'num_workers': commons.workers,
        }

        dataloader = NodeDataLoader(g, th.arange(g.number_of_nodes()), sampler,
                                    **kwargs)
        if verbose:
            dataloader = tqdm(dataloader)
        # Within a layer, iterate over nodes in batches
        for input_nodes, output_nodes, blocks in dataloader:
            blocks = [x.to(commons.device) for x in blocks]
            users = th.arange(output_nodes.shape[0]).long().to(self.device)
            d1 = th.zeros((0, )).long().to(self.device)
            d2 = th.zeros((0, )).long().to(self.device)
            h = self.forward_blocks(blocks, users, d1, d2)[0]
            self.check_point[output_nodes] = h
        if verbose:
            print('Inference Done Successfully')
Example #3
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    g, author_rank, field_ids, true_relevance = load_rank_data(device)
    field_paper = recall_paper(g.cpu(), field_ids, args.num_recall)
    data = RatingKnowledgeGraphDataset()
    user_item_graph = data.user_item_graph
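    # Pre-sample a fixed-size neighborhood with replacement so that every KG entity
    # has exactly args.neighbor_size neighbors, as the KGCN aggregation expects.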
    knowledge_graph = dgl.sampling.sample_neighbors(
        data.knowledge_graph, data.knowledge_graph.nodes(), args.neighbor_size, replace=True
    )

    sampler = MultiLayerNeighborSampler([args.neighbor_size] * args.num_hops)
    train_loader = KGCNEdgeDataLoader(
        user_item_graph, torch.arange(user_item_graph.num_edges()), sampler, knowledge_graph,
        device=device, batch_size=args.batch_size
    )

    model = KGCN(args.num_hidden, args.neighbor_size, 'sum', args.num_hops, *data.get_num()).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for _, pair_graph, blocks in train_loader:
            scores = model(pair_graph, blocks)
            loss = F.binary_cross_entropy(scores, pair_graph.edata['label'])
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch, sum(losses) / len(losses)))
        print(METRICS_STR.format(*evaluate(
            model, g, knowledge_graph, sampler, field_ids, author_rank, true_relevance, field_paper
        )))
Example #4
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    data, g, _, labels, predict_ntype, train_idx, val_idx, test_idx, evaluator = \
        load_data(args.dataset, device)
    add_node_feat(g, 'pretrained', args.node_embed_path, True)

    sampler = MultiLayerNeighborSampler(
        list(range(args.neighbor_size, args.neighbor_size + args.num_layers)))
    train_loader = NodeDataLoader(g, {predict_ntype: train_idx},
                                  sampler,
                                  device=device,
                                  batch_size=args.batch_size)
    loader = NodeDataLoader(g, {predict_ntype: g.nodes(predict_ntype)},
                            sampler,
                            device=device,
                            batch_size=args.batch_size)

    model = RHGNN(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, data.num_classes,
        args.num_rel_hidden, args.num_rel_hidden, args.num_heads, g.ntypes,
        g.canonical_etypes, predict_ntype, args.num_layers,
        args.dropout).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     T_max=len(train_loader) *
                                                     args.epochs,
                                                     eta_min=args.lr / 100)
    warnings.filterwarnings(
        'ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for input_nodes, output_nodes, blocks in tqdm(train_loader):
            batch_logits = model(blocks, blocks[0].srcdata['feat'])
            batch_labels = labels[output_nodes[predict_ntype]]
            loss = F.cross_entropy(batch_logits, batch_labels)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            torch.cuda.empty_cache()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch,
                                                sum(losses) / len(losses)))
        if epoch % args.eval_every == 0 or epoch == args.epochs - 1:
            print(
                METRICS_STR.format(*evaluate(
                    model, loader, g, labels, data.num_classes, predict_ntype,
                    train_idx, val_idx, test_idx, evaluator)))
    if args.save_path:
        torch.save(model.cpu().state_dict(), args.save_path)
        print('Model saved to', args.save_path)
Example #5
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    data, g, _, labels, predict_ntype, train_idx, val_idx, test_idx, evaluator = \
        load_data(args.dataset, device)
    add_node_feat(g, args.node_feat, args.node_embed_path)

    sampler = MultiLayerNeighborSampler([args.neighbor_size] * args.num_layers)
    train_loader = NodeDataLoader(g, {predict_ntype: train_idx},
                                  sampler,
                                  device=device,
                                  batch_size=args.batch_size)
    loader = NodeDataLoader(g, {predict_ntype: g.nodes(predict_ntype)},
                            sampler,
                            device=device,
                            batch_size=args.batch_size)

    model = HGT(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, data.num_classes,
        args.num_heads, g.ntypes, g.canonical_etypes, predict_ntype,
        args.num_layers, args.dropout).to(device)
    optimizer = optim.AdamW(model.parameters(), eps=1e-6)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        args.max_lr,
        epochs=args.epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.05,
        anneal_strategy='linear',
        final_div_factor=10.0)
    warnings.filterwarnings(
        'ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for input_nodes, output_nodes, blocks in tqdm(train_loader):
            batch_logits = model(blocks, blocks[0].srcdata['feat'])
            batch_labels = labels[output_nodes[predict_ntype]]
            loss = F.cross_entropy(batch_logits, batch_labels)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            torch.cuda.empty_cache()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch,
                                                sum(losses) / len(losses)))
        if epoch % args.eval_every == 0 or epoch == args.epochs - 1:
            print(
                METRICS_STR.format(*evaluate(
                    model, loader, g, labels, data.num_classes, predict_ntype,
                    train_idx, val_idx, test_idx, evaluator)))
    if args.save_path:
        torch.save(model.cpu().state_dict(), args.save_path)
        print('Model saved to', args.save_path)
Example #6
def calc_attn_pos(g, num_classes, predict_ntype, num_samples, device, args):
    """使用预训练的HGT模型计算的注意力权重选择目标顶点的正样本。"""
    # 第1层只保留AB边,第2层只保留BA边,其中A是目标顶点类型,B是中间顶点类型
    num_neighbors = [{}, {}]
    # 形如ABA的元路径,其中A是目标顶点类型
    metapaths = []
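    # Map every canonical edge type to its reverse edge type (same endpoints, opposite direction).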
    rev_etype = {
        e: next(re for rs, re, rd in g.canonical_etypes
                if rs == d and rd == s and re != e)
        for s, e, d in g.canonical_etypes
    }
    for s, e, d in g.canonical_etypes:
        if d == predict_ntype:
            re = rev_etype[e]
            num_neighbors[0][re] = num_neighbors[1][e] = 10
            metapaths.append((re, e))
    for i in range(len(num_neighbors)):
        d = dict.fromkeys(g.etypes, 0)
        d.update(num_neighbors[i])
        num_neighbors[i] = d
    sampler = MultiLayerNeighborSampler(num_neighbors)
    loader = NodeDataLoader(g, {predict_ntype: g.nodes(predict_ntype)},
                            sampler,
                            device=device,
                            batch_size=args.batch_size)

    model = HGT(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, num_classes, args.num_heads,
        g.ntypes, g.canonical_etypes, predict_ntype, 2,
        args.dropout).to(device)
    model.load_state_dict(torch.load(args.hgt_model_path, map_location=device))

    # Each metapath A-B-A gets its own positive-sample graph G_ABA, plus one overall positive-sample graph G_pos
    pos = [
        torch.zeros(g.num_nodes(predict_ntype),
                    num_samples,
                    dtype=torch.long,
                    device=device) for _ in range(len(metapaths) + 1)
    ]
    with torch.no_grad():
        for input_nodes, output_nodes, blocks in tqdm(loader):
            _ = model(blocks, blocks[0].srcdata['feat'])
            # List[tensor(N_src, N_dst)]
            attn = [
                calc_attn(mp, model, blocks, device).t() for mp in metapaths
            ]
            for i in range(len(attn)):
                _, nid = torch.topk(attn[i], num_samples)  # (N_dst, T_pos)
                # nid holds source node ids within blocks[0]; convert them to node ids in the original heterogeneous graph
                pos[i][output_nodes[predict_ntype]] = input_nodes[
                    predict_ntype][nid]
            _, nid = torch.topk(sum(attn), num_samples)
            pos[-1][
                output_nodes[predict_ntype]] = input_nodes[predict_ntype][nid]
    return [p.cpu() for p in pos]
Example #7
def init_dataloaders(args,
                     g,
                     train_idx,
                     test_idx,
                     target_idx,
                     device,
                     use_ddp=False):
    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    sampler = MultiLayerNeighborSampler(fanouts)

    train_loader = DataLoader(g,
                              target_idx[train_idx],
                              sampler,
                              use_ddp=use_ddp,
                              device=device,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=False)

    # The datasets do not have a validation subset, use the train subset
    val_loader = DataLoader(g,
                            target_idx[train_idx],
                            sampler,
                            use_ddp=use_ddp,
                            device=device,
                            batch_size=args.batch_size,
                            shuffle=False,
                            drop_last=False)

    # -1 for sampling all neighbors
    test_sampler = MultiLayerNeighborSampler([-1] * len(fanouts))
    test_loader = DataLoader(g,
                             target_idx[test_idx],
                             test_sampler,
                             use_ddp=use_ddp,
                             device=device,
                             batch_size=32,
                             shuffle=False,
                             drop_last=False)

    return train_loader, val_loader, test_loader
Example #8
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    g, labels, num_classes, train_idx, val_idx, test_idx, evaluator = \
        load_data(args.ogb_path, device)
    load_pretrained_node_embed(g, args.node_embed_path)
    g = g.to(device)

    sampler = MultiLayerNeighborSampler(
        list(range(args.neighbor_size, args.neighbor_size + args.num_layers))
    )
    train_loader = NodeDataLoader(g, {'paper': train_idx}, sampler, device=device, batch_size=args.batch_size)
    val_loader = NodeDataLoader(g, {'paper': val_idx}, sampler, device=device, batch_size=args.batch_size)
    test_loader = NodeDataLoader(g, {'paper': test_idx}, sampler, device=device, batch_size=args.batch_size)

    model = RHGNN(
        {ntype: g.nodes[ntype].data['feat'].shape[1] for ntype in g.ntypes},
        args.num_hidden, num_classes, args.num_rel_hidden, args.num_rel_hidden, args.num_heads,
        g.ntypes, g.canonical_etypes, 'paper', args.num_layers, args.dropout, residual=args.residual
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=len(train_loader) * args.epochs, eta_min=args.lr / 100
    )
    warnings.filterwarnings('ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        logits, train_labels, losses = [], [], []
        for input_nodes, output_nodes, blocks in tqdm(train_loader):
            batch_labels = labels[output_nodes['paper']]
            batch_logits = model(blocks, blocks[0].srcdata['feat'])
            loss = F.cross_entropy(batch_logits, batch_labels.squeeze(dim=1))

            logits.append(batch_logits.detach().cpu())
            train_labels.append(batch_labels.detach().cpu())
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            torch.cuda.empty_cache()

        train_acc = accuracy(torch.cat(logits, dim=0), torch.cat(train_labels, dim=0), evaluator)
        val_acc = evaluate(val_loader, device, model, labels, evaluator)
        test_acc = evaluate(test_loader, device, model, labels, evaluator)
        print('Epoch {:d} | Train Loss {:.4f} | Train Acc {:.4f} | Val Acc {:.4f} | Test Acc {:.4f}'.format(
            epoch, torch.tensor(losses).mean().item(), train_acc, val_acc, test_acc
        ))
    # embed = model.inference(g, g.ndata['feat'], device, args.batch_size)
    # test_acc = accuracy(embed[test_idx], labels[test_idx], evaluator)
    test_acc = evaluate(test_loader, device, model, labels, evaluator)
    print('Test Acc {:.4f}'.format(test_acc))
Example #9
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    data = RatingKnowledgeGraphDataset(args.dataset)
    user_item_graph = data.user_item_graph
    knowledge_graph = dgl.sampling.sample_neighbors(
        data.knowledge_graph, data.knowledge_graph.nodes(), args.neighbor_size, replace=True
    )

    train_eids, test_eids = train_test_split(
        torch.arange(user_item_graph.num_edges()), train_size=args.train_size,
        random_state=args.seed
    )
    sampler = MultiLayerNeighborSampler([args.neighbor_size] * args.num_hops)
    train_loader = KGCNEdgeDataLoader(
        user_item_graph, train_eids, sampler, knowledge_graph,
        device=device, batch_size=args.batch_size
    )
    test_loader = KGCNEdgeDataLoader(
        user_item_graph, test_eids, sampler, knowledge_graph,
        device=device, batch_size=args.batch_size
    )

    model = KGCN(args.num_hidden, args.neighbor_size, args.aggregator, args.num_hops, *data.get_num()).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for _, pair_graph, blocks in train_loader:
            scores = model(pair_graph, blocks)
            loss = F.binary_cross_entropy(scores, pair_graph.edata['label'])
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Train Loss {:.4f} | Train AUC {:.4f} | Train F1 {:.4f} | Test AUC {:.4f} | Test F1 {:.4f}'.format(
            epoch, sum(losses) / len(losses), *evaluate(model, train_loader), *evaluate(model, test_loader)
        ))
Example #10
def run(args, graph, labels, train_idx, val_idx, test_idx, evaluator,
        n_running):
    evaluator_wrapper = lambda pred, labels: evaluator.eval({
        "y_pred": pred,
        "y_true": labels
    })["rocauc"]

    train_batch_size = (len(train_idx) + 9) // 10
    # batch_size = len(train_idx)
    train_sampler = MultiLayerNeighborSampler(
        [16 for _ in range(args.n_layers)])
    # sampler = MultiLayerFullNeighborSampler(args.n_layers)
    train_dataloader = DataLoaderWrapper(
        NodeDataLoader(
            graph.cpu(),
            train_idx.cpu(),
            train_sampler,
            batch_sampler=BatchSampler(len(train_idx),
                                       batch_size=train_batch_size),
            num_workers=4,
        ))

    eval_sampler = MultiLayerNeighborSampler(
        [60 for _ in range(args.n_layers)])
    # sampler = MultiLayerFullNeighborSampler(args.n_layers)
    eval_dataloader = DataLoaderWrapper(
        NodeDataLoader(
            graph.cpu(),
            torch.cat([train_idx.cpu(),
                       val_idx.cpu(),
                       test_idx.cpu()]),
            eval_sampler,
            batch_sampler=BatchSampler(graph.number_of_nodes(),
                                       batch_size=32768),
            num_workers=4,
        ))

    criterion = nn.BCEWithLogitsLoss()

    model = gen_model(args).to(device)

    optimizer = optim.AdamW(model.parameters(),
                            lr=args.lr,
                            weight_decay=args.wd)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        mode="max",
                                                        factor=0.75,
                                                        patience=50,
                                                        verbose=True)

    total_time = 0
    val_score, best_val_score, final_test_score = 0, 0, 0

    train_scores, val_scores, test_scores = [], [], []
    losses, train_losses, val_losses, test_losses = [], [], [], []
    final_pred = None

    for epoch in range(1, args.n_epochs + 1):
        tic = time.time()

        loss = train(args, model, train_dataloader, labels, train_idx,
                     criterion, optimizer, evaluator_wrapper)

        toc = time.time()
        total_time += toc - tic

        if epoch == args.n_epochs or epoch % args.eval_every == 0 or epoch % args.log_every == 0:
            train_score, val_score, test_score, train_loss, val_loss, test_loss, pred = evaluate(
                args, model, eval_dataloader, labels, train_idx, val_idx,
                test_idx, criterion, evaluator_wrapper)

            if val_score > best_val_score:
                best_val_score = val_score
                final_test_score = test_score
                final_pred = pred

            if epoch % args.log_every == 0:
                print(
                    f"Run: {n_running}/{args.n_runs}, Epoch: {epoch}/{args.n_epochs}, Average epoch time: {total_time / epoch:.2f}s"
                )
                print(
                    f"Loss: {loss:.4f}\n"
                    f"Train/Val/Test loss: {train_loss:.4f}/{val_loss:.4f}/{test_loss:.4f}\n"
                    f"Train/Val/Test/Best val/Final test score: {train_score:.4f}/{val_score:.4f}/{test_score:.4f}/{best_val_score:.4f}/{final_test_score:.4f}"
                )

            for l, e in zip(
                [
                    train_scores, val_scores, test_scores, losses,
                    train_losses, val_losses, test_losses
                ],
                [
                    train_score, val_score, test_score, loss, train_loss,
                    val_loss, test_loss
                ],
            ):
                l.append(e)

        lr_scheduler.step(val_score)

    print("*" * 50)
    print(
        f"Best val score: {best_val_score}, Final test score: {final_test_score}"
    )
    print("*" * 50)

    if args.plot_curves:
        fig = plt.figure(figsize=(24, 24))
        ax = fig.gca()
        ax.set_xticks(np.arange(0, args.n_epochs, 100))
        ax.set_yticks(np.linspace(0, 1.0, 101))
        ax.tick_params(labeltop=True, labelright=True)
        for y, label in zip([train_scores, val_scores, test_scores],
                            ["train score", "val score", "test score"]):
            plt.plot(range(1, args.n_epochs + 1, args.log_every),
                     y,
                     label=label,
                     linewidth=1)
        ax.xaxis.set_major_locator(MultipleLocator(100))
        ax.xaxis.set_minor_locator(AutoMinorLocator(1))
        ax.yaxis.set_major_locator(MultipleLocator(0.01))
        ax.yaxis.set_minor_locator(AutoMinorLocator(2))
        plt.grid(which="major", color="red", linestyle="dotted")
        plt.grid(which="minor", color="orange", linestyle="dotted")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"gat_score_{n_running}.png")

        fig = plt.figure(figsize=(24, 24))
        ax = fig.gca()
        ax.set_xticks(np.arange(0, args.n_epochs, 100))
        ax.tick_params(labeltop=True, labelright=True)
        for y, label in zip([losses, train_losses, val_losses, test_losses],
                            ["loss", "train loss", "val loss", "test loss"]):
            plt.plot(range(1, args.n_epochs + 1, args.log_every),
                     y,
                     label=label,
                     linewidth=1)
        ax.xaxis.set_major_locator(MultipleLocator(100))
        ax.xaxis.set_minor_locator(AutoMinorLocator(1))
        ax.yaxis.set_major_locator(MultipleLocator(0.1))
        ax.yaxis.set_minor_locator(AutoMinorLocator(5))
        plt.grid(which="major", color="red", linestyle="dotted")
        plt.grid(which="minor", color="orange", linestyle="dotted")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"gat_loss_{n_running}.png")

    if args.save_pred:
        os.makedirs("./output", exist_ok=True)
        torch.save(F.softmax(final_pred, dim=1), f"./output/{n_running}.pt")

    return best_val_score, final_test_score
Example #11
def train(args):
    set_random_seed(args.seed)
    data = DBLPFourAreaDataset()
    g = data[0]
    metapaths = data.metapaths
    predict_ntype = data.predict_ntype
    generate_one_hot_id(g)
    features = g.ndata['feat']  # Dict[str, tensor(N_i, d_i)]
    labels = g.nodes[predict_ntype].data['label']
    train_idx = g.nodes[predict_ntype].data['train_mask'].nonzero(
        as_tuple=True)[0]
    val_idx = g.nodes[predict_ntype].data['val_mask'].nonzero(as_tuple=True)[0]
    test_idx = g.nodes[predict_ntype].data['test_mask'].nonzero(
        as_tuple=True)[0]
    out_shape = (g.num_nodes(predict_ntype), data.num_classes)

    print('Generating metapath-based graphs (this is a bit slow)...')
    mgs = [metapath_based_graph(g, metapath) for metapath in metapaths]
    mgs[0].ndata['feat'] = features[predict_ntype]
    sampler = MultiLayerNeighborSampler([args.neighbor_size])
    collators = [NodeCollator(mg, None, sampler) for mg in mgs]
    train_dataloader = DataLoader(train_idx, batch_size=args.batch_size)
    val_dataloader = DataLoader(val_idx, batch_size=args.batch_size)
    test_dataloader = DataLoader(test_idx, batch_size=args.batch_size)

    metapaths_ntype = [to_ntype_list(g, metapath) for metapath in metapaths]
    model = MAGNNMinibatch(
        predict_ntype, metapaths_ntype,
        {ntype: feat.shape[1]
         for ntype, feat in features.items()}, args.num_hidden,
        data.num_classes, args.num_heads, args.encoder, args.dropout)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        losses = []
        train_logits = torch.zeros(out_shape)
        for batch in train_dataloader:
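            # NodeCollator.collate(batch) returns (input_nodes, output_nodes, blocks);
            # [2][0] picks the single sampled block from each metapath-based graph.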
            gs = [collator.collate(batch)[2][0] for collator in collators]
            train_logits[batch] = logits = model(gs, features)
            loss = F.cross_entropy(logits, labels[batch])
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_metrics = micro_macro_f1_score(train_logits[train_idx],
                                             labels[train_idx])
        print(
            'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}'
            .format(epoch,
                    torch.tensor(losses).mean().item(), *train_metrics))
        if (epoch + 1) % 10 == 0:
            val_metrics = evaluate(out_shape, collators, val_dataloader, model,
                                   features, labels)
            print('Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}'.format(
                *val_metrics))

    test_metrics = evaluate(out_shape, collators, test_dataloader, model,
                            features, labels)
    print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(*test_metrics))
Example #12
def run(args, graph, labels, train_idx, val_idx, test_idx, evaluator,
        n_running):
    evaluator_wrapper = lambda pred, labels: evaluator.eval(
        {
            "y_pred": pred.argmax(dim=-1, keepdim=True),
            "y_true": labels
        })["acc"]
    criterion = custom_loss_function

    n_train_samples = train_idx.shape[0]
    train_batch_size = (n_train_samples + 29) // 30
    train_sampler = MultiLayerNeighborSampler(
        [10 for _ in range(args.n_layers)])
    train_dataloader = DataLoaderWrapper(
        DataLoader(
            graph.cpu(),
            train_idx.cpu(),
            train_sampler,
            batch_sampler=BatchSampler(len(train_idx),
                                       batch_size=train_batch_size,
                                       shuffle=True),
            num_workers=4,
        ))

    eval_batch_size = 32768
    eval_sampler = MultiLayerNeighborSampler(
        [15 for _ in range(args.n_layers)])

    if args.estimation_mode:
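        # Estimation mode scores only every 45th test node during training to keep evaluation cheap;
        # the full test set is re-evaluated once training finishes.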
        test_idx_during_training = test_idx[torch.arange(start=0,
                                                         end=len(test_idx),
                                                         step=45)]
    else:
        test_idx_during_training = test_idx

    eval_idx = torch.cat(
        [train_idx.cpu(),
         val_idx.cpu(),
         test_idx_during_training.cpu()])
    eval_dataloader = DataLoaderWrapper(
        DataLoader(
            graph.cpu(),
            eval_idx,
            eval_sampler,
            batch_sampler=BatchSampler(len(eval_idx),
                                       batch_size=eval_batch_size,
                                       shuffle=False),
            num_workers=4,
        ))

    model = gen_model(args).to(device)

    optimizer = optim.AdamW(model.parameters(),
                            lr=args.lr,
                            weight_decay=args.wd)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        mode="max",
                                                        factor=0.7,
                                                        patience=20,
                                                        verbose=True,
                                                        min_lr=1e-4)

    best_model_state_dict = None

    total_time = 0
    val_score, best_val_score, final_test_score = 0, 0, 0

    scores, train_scores, val_scores, test_scores = [], [], [], []
    losses, train_losses, val_losses, test_losses = [], [], [], []

    for epoch in range(1, args.n_epochs + 1):
        tic = time.time()

        score, loss = train(args, model, train_dataloader, labels, train_idx,
                            criterion, optimizer, evaluator_wrapper)

        toc = time.time()
        total_time += toc - tic

        if epoch == args.n_epochs or epoch % args.eval_every == 0 or epoch % args.log_every == 0:
            train_score, val_score, test_score, train_loss, val_loss, test_loss = evaluate(
                args,
                model,
                eval_dataloader,
                labels,
                train_idx,
                val_idx,
                test_idx_during_training,
                criterion,
                evaluator_wrapper,
            )

            if val_score > best_val_score:
                best_val_score = val_score
                final_test_score = test_score
                if args.estimation_mode:
                    best_model_state_dict = {
                        k: v.to("cpu")
                        for k, v in model.state_dict().items()
                    }

            if epoch == args.n_epochs or epoch % args.log_every == 0:
                print(
                    f"Run: {n_running}/{args.n_runs}, Epoch: {epoch}/{args.n_epochs}, Average epoch time: {total_time / epoch:.2s}\n"
                    f"Loss: {loss:.4f}, Score: {score:.4f}\n"
                    f"Train/Val/Test loss: {train_loss:.4f}/{val_loss:.4f}/{test_loss:.4f}\n"
                    f"Train/Val/Test/Best val/Final test score: {train_score:.4f}/{val_score:.4f}/{test_score:.4f}/{best_val_score:.4f}/{final_test_score:.4f}"
                )

            for l, e in zip(
                [
                    scores, train_scores, val_scores, test_scores, losses,
                    train_losses, val_losses, test_losses
                ],
                [
                    score, train_score, val_score, test_score, loss,
                    train_loss, val_loss, test_loss
                ],
            ):
                l.append(e)

        lr_scheduler.step(val_score)

    if args.estimation_mode:
        model.load_state_dict(best_model_state_dict)
        eval_dataloader = DataLoaderWrapper(
            DataLoader(
                graph.cpu(),
                test_idx.cpu(),
                eval_sampler,
                batch_sampler=BatchSampler(len(test_idx),
                                           batch_size=eval_batch_size,
                                           shuffle=False),
                num_workers=4,
            ))
        final_test_score = evaluate(args, model, eval_dataloader, labels,
                                    train_idx, val_idx, test_idx, criterion,
                                    evaluator_wrapper)[2]

    print("*" * 50)
    print(
        f"Best val score: {best_val_score}, Final test score: {final_test_score}"
    )
    print("*" * 50)

    if args.plot_curves:
        fig = plt.figure(figsize=(24, 24))
        ax = fig.gca()
        ax.set_xticks(np.arange(0, args.n_epochs, 100))
        ax.set_yticks(np.linspace(0, 1.0, 101))
        ax.tick_params(labeltop=True, labelright=True)
        for y, label in zip([train_scores, val_scores, test_scores],
                            ["train score", "val score", "test score"]):
            plt.plot(range(1, args.n_epochs + 1, args.log_every),
                     y,
                     label=label,
                     linewidth=1)
        ax.xaxis.set_major_locator(MultipleLocator(10))
        ax.xaxis.set_minor_locator(AutoMinorLocator(1))
        ax.yaxis.set_major_locator(MultipleLocator(0.01))
        ax.yaxis.set_minor_locator(AutoMinorLocator(2))
        plt.grid(which="major", color="red", linestyle="dotted")
        plt.grid(which="minor", color="orange", linestyle="dotted")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"gat_score_{n_running}.png")

        fig = plt.figure(figsize=(24, 24))
        ax = fig.gca()
        ax.set_xticks(np.arange(0, args.n_epochs, 100))
        ax.tick_params(labeltop=True, labelright=True)
        for y, label in zip([losses, train_losses, val_losses, test_losses],
                            ["loss", "train loss", "val loss", "test loss"]):
            plt.plot(range(1, args.n_epochs + 1, args.log_every),
                     y,
                     label=label,
                     linewidth=1)
        ax.xaxis.set_major_locator(MultipleLocator(10))
        ax.xaxis.set_minor_locator(AutoMinorLocator(1))
        ax.yaxis.set_major_locator(MultipleLocator(0.1))
        ax.yaxis.set_minor_locator(AutoMinorLocator(5))
        plt.grid(which="major", color="red", linestyle="dotted")
        plt.grid(which="minor", color="orange", linestyle="dotted")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"gat_loss_{n_running}.png")

    return best_val_score, final_test_score
Example #13
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    g, author_rank, field_ids, true_relevance = load_rank_data(device)
    out_dim = g.nodes['field'].data['feat'].shape[1]
    add_node_feat(g, 'pretrained', args.node_embed_path, use_raw_id=True)
    field_paper = recall_paper(g.cpu(), field_ids,
                               args.num_recall)  # {field_id: [paper_id]}

    sampler = MultiLayerNeighborSampler([args.neighbor_size] * args.num_layers)
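    # Presumably makes the sampler produce blocks directly on the target device, avoiding explicit .to(device) calls.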
    sampler.set_output_context(to_dgl_context(device))
    triplet_collator = TripletNodeCollator(g, sampler)

    model = RHGNN(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, out_dim, args.num_rel_hidden,
        args.num_rel_hidden, args.num_heads, g.ntypes, g.canonical_etypes,
        'author', args.num_layers, args.dropout).to(device)
    if args.load_path:
        model.load_state_dict(torch.load(args.load_path, map_location=device))
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     T_max=len(field_ids) *
                                                     args.epochs,
                                                     eta_min=args.lr / 100)
    warnings.filterwarnings(
        'ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for f in tqdm(field_ids):
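            # Negative candidates: authors who wrote a recalled paper for field f but do not appear in its ground-truth ranking.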
            false_author_ids = list(
                set(g.in_edges(field_paper[f], etype='writes')[0].tolist()) -
                set(author_rank[f]))
            triplets = sample_triplets(f, author_rank[f], false_author_ids,
                                       args.num_triplets).to(device)
            aid, blocks = triplet_collator.collate(triplets)
            author_embeds = model(blocks, blocks[0].srcdata['feat'])
            author_embeds = author_embeds / author_embeds.norm(dim=1,
                                                               keepdim=True)
            aid_map = {a: i for i, a in enumerate(aid.tolist())}
            anchor = g.nodes['field'].data['feat'][triplets[:, 0]]
            positive = author_embeds[[
                aid_map[a] for a in triplets[:, 1].tolist()
            ]]
            negative = author_embeds[[
                aid_map[a] for a in triplets[:, 2].tolist()
            ]]
            loss = F.triplet_margin_loss(anchor, positive, negative,
                                         args.margin)

            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            torch.cuda.empty_cache()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch,
                                                sum(losses) / len(losses)))
        torch.save(model.state_dict(), args.model_save_path)
        if epoch % args.eval_every == 0 or epoch == args.epochs - 1:
            print(
                METRICS_STR.format(*evaluate(
                    model, g, out_dim, sampler, args.batch_size, device,
                    field_ids, field_paper, author_rank, true_relevance)))
    torch.save(model.state_dict(), args.model_save_path)
    print('Model saved to', args.model_save_path)

    embeds = infer(model, g, 'author', out_dim, sampler, args.batch_size,
                   device)
    author_embed_save_path = DATA_DIR / 'rank/author_embed.pkl'
    torch.save(embeds.cpu(), author_embed_save_path)
    print('Author embeddings saved to', author_embed_save_path)