Example #1
import torch

from torch_geometric.nn import MetaPath2Vec


def test_metapath2vec():
    edge_index_dict = {
        ('author', 'writes', 'paper'):
        torch.tensor([[0, 1, 1, 2], [0, 0, 1, 1]]),
        ('paper', 'written_by', 'author'):
        torch.tensor([[0, 0, 1, 1], [0, 1, 1, 2]])
    }

    metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'written_by', 'author'),
    ]

    model = MetaPath2Vec(edge_index_dict, embedding_dim=16, metapath=metapath,
                         walk_length=2, context_size=2)
    assert model.__repr__() == 'MetaPath2Vec(5, 16)'

    z = model('author')
    assert z.size() == (3, 16)

    z = model('paper')
    assert z.size() == (2, 16)

    z = model('author', torch.arange(2))
    assert z.size() == (2, 16)

    pos_rw, neg_rw = model.sample(torch.arange(3))

    loss = model.loss(pos_rw, neg_rw)
    assert 0 <= loss.item()

    acc = model.test(torch.ones(20, 16), torch.randint(10, (20, )),
                     torch.ones(20, 16), torch.randint(10, (20, )))
    assert 0 <= acc and acc <= 1
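The test above constructs the model and calls sample() by hand; in normal use the positive and negative walks come from the model's built-in loader. Below is a minimal training sketch for the same toy graph (the batch size, learning rate and optimizer are illustrative choices, not part of the original test):

import torch
from torch_geometric.nn import MetaPath2Vec

# Same toy heterogeneous graph and metapath as in the test above.
edge_index_dict = {
    ('author', 'writes', 'paper'):
    torch.tensor([[0, 1, 1, 2], [0, 0, 1, 1]]),
    ('paper', 'written_by', 'author'):
    torch.tensor([[0, 0, 1, 1], [0, 1, 1, 2]]),
}
metapath = [('author', 'writes', 'paper'), ('paper', 'written_by', 'author')]

model = MetaPath2Vec(edge_index_dict, embedding_dim=16, metapath=metapath,
                     walk_length=2, context_size=2)
loader = model.loader(batch_size=4, shuffle=True)  # yields (pos_rw, neg_rw)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()
for pos_rw, neg_rw in loader:
    optimizer.zero_grad()
    loss = model.loss(pos_rw, neg_rw)
    loss.backward()
    optimizer.step()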
Example #2
        print(pyg_data)

        metapath = [('author', 'writes', 'paper'),
                    ('paper', 'has_topic', 'field_of_study'),
                    ('field_of_study', 'rev_has_topic', 'paper'),
                    ('paper', 'rev_cites', 'paper'),
                    ('paper', 'rev_writes', 'author'),
                    ('author', 'affiliated_with', 'institution'),
                    ('institution', 'rev_affiliated_with', 'author'),
                    ('author', 'writes', 'paper'), ('paper', 'cites', 'paper'),
                    ('paper', 'rev_writes', 'author')]

        metapath2vec_model = MetaPath2Vec(
            pyg_data.edge_index_dict,
            embedding_dim=args['embedding_dim'],
            metapath=metapath,
            walk_length=args['walk_length'],
            context_size=args['context_size'],
            walks_per_node=args['walks_per_node'],
            num_negative_samples=args['num_negative_samples']).to(args['cuda'])

        loader = metapath2vec_model.loader(batch_size=128,
                                           shuffle=True,
                                           num_workers=4)
        optimizer = torch.optim.Adam(metapath2vec_model.parameters(), lr=0.01)

        metapath2vec_model.train()
        for epoch in range(1, args['epochs'] + 1):
            for i, (pos_rw, neg_rw) in enumerate(loader):
                optimizer.zero_grad()
                loss = metapath2vec_model.loss(pos_rw.to(args['cuda']),
                                               neg_rw.to(args['cuda']))
                loss.backward()
                optimizer.step()
Example #3
path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/AMiner')
dataset = AMiner(path)
data = dataset[0]

metapath = [
    ('author', 'writes', 'paper'),
    ('paper', 'published_in', 'venue'),
    ('venue', 'publishes', 'paper'),
    ('paper', 'written_by', 'author'),
]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MetaPath2Vec(data.edge_index_dict,
                     embedding_dim=128,
                     metapath=metapath,
                     walk_length=50,
                     context_size=7,
                     walks_per_node=5,
                     num_negative_samples=5,
                     sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
Example #4
import os.path as osp

import torch

from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec


def main():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    'AMiner')
    dataset = AMiner(path)
    data = dataset[0]
    print(data)

    metapath = [
        ('author', 'wrote', 'paper'),
        ('paper', 'published in', 'venue'),
        ('venue', 'published', 'paper'),
        ('paper', 'written by', 'author'),
    ]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                         metapath=metapath, walk_length=50, context_size=7,
                         walks_per_node=5, num_negative_samples=5,
                         sparse=True).to(device)

    loader = model.loader(batch_size=128, shuffle=True, num_workers=12)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    def train(epoch, log_steps=100, eval_steps=2000):
        model.train()

        total_loss = 0
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % log_steps == 0:
                print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                       f'Loss: {total_loss / log_steps:.4f}'))
                total_loss = 0

            if (i + 1) % eval_steps == 0:
                acc = test()
                print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                       f'Acc: {acc:.4f}'))

    @torch.no_grad()
    def test(train_ratio=0.1):
        model.eval()

        z = model('author', batch=data.y_index_dict['author'])
        y = data.y_dict['author']

        perm = torch.randperm(z.size(0))
        train_perm = perm[:int(z.size(0) * train_ratio)]
        test_perm = perm[int(z.size(0) * train_ratio):]

        return model.test(z[train_perm], y[train_perm], z[test_perm],
                          y[test_perm], max_iter=150)

    for epoch in range(1, 6):
        train(epoch)
        acc = test()
        print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')
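As excerpted, the script defines main() but never calls it; the usual entry-point guard is assumed:

if __name__ == '__main__':
    main()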
Example #5
import argparse

import torch
from ogb.nodeproppred import PygNodePropPredDataset
from torch_sparse import transpose

from torch_geometric.nn import MetaPath2Vec


def main():
    parser = argparse.ArgumentParser(description='OGBN-MAG (MetaPath2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=64)
    parser.add_argument('--context_size', type=int, default=7)
    parser.add_argument('--walks_per_node', type=int, default=5)
    parser.add_argument('--num_negative_samples', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--log_steps', type=int, default=100)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset('ogbn-mag')
    data = dataset[0]

    # We need to add reverse edges to the heterogeneous graph.
    data.edge_index_dict[('institution', 'employs', 'author')] = transpose(
        data.edge_index_dict[('author', 'affiliated_with', 'institution')],
        None,
        m=data.num_nodes_dict['author'],
        n=data.num_nodes_dict['institution'])[0]
    data.edge_index_dict[('paper', 'written_by', 'author')] = transpose(
        data.edge_index_dict[('author', 'writes', 'paper')],
        None,
        m=data.num_nodes_dict['author'],
        n=data.num_nodes_dict['paper'])[0]
    data.edge_index_dict[('field_of_study', 'contains', 'paper')] = transpose(
        data.edge_index_dict[('paper', 'has_topic', 'field_of_study')],
        None,
        m=data.num_nodes_dict['paper'],
        n=data.num_nodes_dict['field_of_study'])[0]
    print(data)

    metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'has_topic', 'field_of_study'),
        ('field_of_study', 'contains', 'paper'),
        ('paper', 'written_by', 'author'),
        ('author', 'affiliated_with', 'institution'),
        ('institution', 'employs', 'author'),
        ('author', 'writes', 'paper'),
        ('paper', 'cites', 'paper'),
        ('paper', 'written_by', 'author'),
    ]

    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=args.embedding_dim,
                         metapath=metapath,
                         walk_length=args.walk_length,
                         context_size=args.context_size,
                         walks_per_node=args.walks_per_node,
                         num_negative_samples=args.num_negative_samples,
                         sparse=True).to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if (i + 1) % 1000 == 0:  # Save model every 1000 steps.
                save_embedding(model)
        save_embedding(model)
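The script calls save_embedding() without showing its definition and, as excerpted, never invokes main(). A minimal placeholder is sketched below; the output file name is an assumption:

def save_embedding(model):
    # Persist the full (num_nodes x embedding_dim) embedding table to disk.
    torch.save(model.embedding.weight.data.cpu(), 'embedding.pt')


if __name__ == '__main__':
    main()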
Example #6
def metapath2vec(fp, PARAMS):
    """[function to generate metapath2vec]

    Args:
        fp ([string]): [the file path of the root of the data]
        PARAMS ([dict]): [the parameters of the node2vec model,
                        KEYS:{
                                GRAPH_NAME: the name of the graph file
                                EMBEDDING_DIM: dimension of embedding, 
                                WALK_LENGTH: random walk length, 
                                CONTEXT_SIZE: context size, 
                                WALKS_PER_NODE: number of walks per node, 
                                NUM_NEG_SAMPLES: number of negative samples,
                                LEARNING_RATE: learning rate, 
                                BATCH_SIZE: batch size of each batch, 
                                NUM_EPOCH: number of epoch to be trained,
                                CUDA: use GPU
                                }]
    Returns:
        [np.array]: [numpy array format of the metapath2vec embedding]
    """
    g = io.loadmat(osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME']))
    user_user = from_scipy_sparse_matrix(g['U'])
    author_post = from_scipy_sparse_matrix(g['A'])
    post_user = from_scipy_sparse_matrix(g['P'])
    data = Data(edge_index_dict={
        ('user', 'replied by', 'user'): user_user[0],
        ('user', 'wrote', 'post'): author_post[0],
        ('post', 'commented by', 'user'): post_user[0],
    },
                num_nodes_dict={
                    'post': g['post_indx'].shape[1],
                    'user': g['user_indx'].shape[1]
                })
    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'
    # NOTE: ``metapath`` is used below but never defined in the original
    # snippet; a plausible walk over the edge types declared above is
    # assumed here.
    metapath = [
        ('user', 'wrote', 'post'),
        ('post', 'commented by', 'user'),
    ]
    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=PARAMS['EMBEDDING_DIM'],
                         metapath=metapath,
                         walk_length=PARAMS['WALK_LENGTH'],
                         context_size=PARAMS['CONTEXT_SIZE'],
                         walks_per_node=PARAMS['WALKS_PER_NODE'],
                         num_negative_samples=PARAMS['NUM_NEG_SAMPLES'],
                         sparse=True).to(device)
    losses = []
    if not PARAMS["TEST"]:
        loader = model.loader(batch_size=PARAMS['BATCH_SIZE'],
                              shuffle=True,
                              num_workers=8)
        optimizer = torch.optim.SparseAdam(model.parameters(),
                                           lr=PARAMS['LEARNING_RATE'])

        def train(epoch, log_steps=100):
            model.train()
            total_loss = 0
            store = []
            i = 1
            loading = iter(loader)
            while loading is not None:
                try:
                    pos_rw, neg_rw = next(loading)
                except IndexError:
                    continue
                except StopIteration:
                    # Loader exhausted: stop instead of reusing the last batch.
                    loading = None
                    continue
                optimizer.zero_grad()
                loss = model.loss(pos_rw.to(device), neg_rw.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                if (i + 1) % log_steps == 0:
                    print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                           f'Loss: {total_loss / log_steps:.4f}'))
                    store.append(total_loss / log_steps)
                    total_loss = 0
                i += 1
            return store

        for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
            losses.append(train(epoch))
    model.eval()
    with torch.no_grad():
        z = model('post').detach().cpu().numpy()
    os.makedirs(os.path.join(fp, 'processed', 'metapath2vec'), exist_ok=True)
    with open(
            osp.join(fp, 'processed', 'metapath2vec',
                     PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)
    np.save(
        osp.join(fp, 'processed', 'metapath2vec', PARAMS['EMBEDDING_NAME']), z)
    print('successfully saved embedding')
    return z
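A hypothetical call of the function above with illustrative parameter values (the keys mirror the docstring; the graph file name, embedding name and data root are assumptions):

PARAMS = {
    'GRAPH_NAME': 'graph.mat',
    'EMBEDDING_NAME': 'post_embedding',
    'EMBEDDING_DIM': 128,
    'WALK_LENGTH': 30,
    'CONTEXT_SIZE': 5,
    'WALKS_PER_NODE': 3,
    'NUM_NEG_SAMPLES': 5,
    'LEARNING_RATE': 0.01,
    'BATCH_SIZE': 128,
    'NUM_EPOCH': 5,
    'CUDA': True,
    'TEST': False,
}
z = metapath2vec('data', PARAMS)  # returns the post embeddings as an np.array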