Example #1
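 # Loads ogbl-ppa, its edge split, and an OGB Evaluator; keeps the node count for later use.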
 def __init__(self):
     d_name = "ogbl-ppa"
     dataset = LinkPropPredDataset(name=d_name)
     splitted_edge = dataset.get_edge_split()
     graph = dataset[0]
     self.num_nodes = graph["num_nodes"]
     self.ogb_evaluator = Evaluator(name="ogbl-ppa")
Example #2
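 # Wraps ogbl-citation2 as a general static graph; the train/valid/test positive and negative edges from the split become separate edge types.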
 def __init__(self, path: str):
     ogbl_dataset = LinkPropPredDataset("ogbl-citation2", path)
     edge_split = ogbl_dataset.get_edge_split()
     super(OGBLCitation2Dataset, self).__init__([
         _OGBLDatasetUtil.ogbl_data_to_general_static_graph(
             ogbl_dataset[0], {
                 ('', '', ''):
                 torch.from_numpy(ogbl_dataset[0]['edge_index']),
                 ('', 'train_pos_edge', ''):
                 torch.from_numpy(edge_split['train']['edge']),
                 ('', 'val_pos_edge', ''):
                 torch.from_numpy(edge_split['valid']['edge']),
                 ('', 'val_neg_edge', ''):
                 torch.from_numpy(edge_split['valid']['edge_neg']),
                 ('', 'test_pos_edge', ''):
                 torch.from_numpy(edge_split['test']['edge']),
                 ('', 'test_neg_edge', ''):
                 torch.from_numpy(edge_split['test']['edge_neg'])
             }, ({
                 'node_feat': 'feat',
                 'node_year': 'year'
             } if _backend.DependentBackend.is_dgl() else {
                 'node_feat': 'x',
                 'node_year': 'year'
             }))
     ])
Example #3
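    # PGL data generator for ogbl-ppa: yields (src, dst, label) examples for the train, valid, or test split.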
    def __init__(self,
                 graph_wrapper=None,
                 buf_size=1000,
                 batch_size=128,
                 num_workers=1,
                 shuffle=True,
                 phase="train"):
        super(PPADataGenerator, self).__init__(buf_size=buf_size,
                                               num_workers=num_workers,
                                               batch_size=batch_size,
                                               shuffle=shuffle)

        self.d_name = "ogbl-ppa"
        self.graph_wrapper = graph_wrapper
        dataset = LinkPropPredDataset(name=self.d_name)
        splitted_edge = dataset.get_edge_split()
        self.phase = phase
        graph = dataset[0]
        edges = graph["edge_index"].T
        #self.graph = pgl.graph.Graph(num_nodes=graph["num_nodes"],
        #       edges=edges,
        #       node_feat={"nfeat": graph["node_feat"],
        #             "node_id": np.arange(0, graph["num_nodes"], dtype="int64").reshape(-1, 1) })

        #self.graph.indegree()
        self.num_nodes = graph["num_nodes"]
        if self.phase == 'train':
            edges = splitted_edge["train"]["edge"]
            labels = np.ones(len(edges))
        elif self.phase == "valid":
            # Label positive and negative validation edges
            pos_edges = splitted_edge["valid"]["edge"]
            neg_edges = splitted_edge["valid"]["edge_neg"]
            pos_labels = np.ones(len(pos_edges))
            neg_labels = np.zeros(len(neg_edges))
            edges = np.vstack([pos_edges, neg_edges])
            labels = pos_labels.tolist() + neg_labels.tolist()
        elif self.phase == "test":
            # Label positive and negative test edges
            pos_edges = splitted_edge["test"]["edge"]
            neg_edges = splitted_edge["test"]["edge_neg"]
            pos_labels = np.ones(len(pos_edges))
            neg_labels = np.zeros(len(neg_edges))
            edges = np.vstack([pos_edges, neg_edges])
            labels = pos_labels.tolist() + neg_labels.tolist()

        self.line_examples = []
        Example = namedtuple('Example', ['src', "dst", "label"])
        for edge, label in zip(edges, labels):
            self.line_examples.append(
                Example(src=edge[0], dst=edge[1], label=label))
        print("Phase", self.phase)
        print("Len Examples", len(self.line_examples))
Example #4
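    # pykeen loader: builds training/testing/validation triples factories from the OGB edge split, reusing the training entity/relation mappings.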
    def _load(self) -> None:
        try:
            from ogb.linkproppred import LinkPropPredDataset
        except ImportError as e:
            raise ModuleNotFoundError(
                f'Need to `pip install ogb` to use pykeen.datasets.{self.__class__.__name__}.',
            ) from e

        dataset = LinkPropPredDataset(name=self.name, root=self.cache_root)
        edge_split = dataset.get_edge_split()
        self._training = self._make_tf(edge_split["train"])
        self._testing = self._make_tf(
            edge_split["test"],
            entity_to_id=self._training.entity_to_id,
            relation_to_id=self._training.relation_to_id,
        )
        self._validation = self._make_tf(
            edge_split["valid"],
            entity_to_id=self._training.entity_to_id,
            relation_to_id=self._training.relation_to_id,
        )
Example #5
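 # Wraps ogbl-ddi as a heterogeneous static graph; the edge split is attached as additional edge types.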
 def __init__(self, path: str):
     ogbl_dataset = LinkPropPredDataset("ogbl-ddi", path)
     edge_split = ogbl_dataset.get_edge_split()
     super(OGBLDDIDataset, self).__init__([
         GeneralStaticGraphGenerator.create_heterogeneous_static_graph(
             {'': {
                 '_NID': torch.arange(ogbl_dataset[0]['num_nodes'])
             }}, {
                 ('', '', ''):
                 torch.from_numpy(ogbl_dataset[0]['edge_index']),
                 ('', 'train_pos_edge', ''):
                 torch.from_numpy(edge_split['train']['edge']),
                 ('', 'val_pos_edge', ''):
                 torch.from_numpy(edge_split['valid']['edge']),
                 ('', 'val_neg_edge', ''):
                 torch.from_numpy(edge_split['valid']['edge_neg']),
                 ('', 'test_pos_edge', ''):
                 torch.from_numpy(edge_split['test']['edge']),
                 ('', 'test_neg_edge', ''):
                 torch.from_numpy(edge_split['test']['edge_neg'])
             })
     ])
Example #6
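# Factorizes a DeepWalk-style matrix of the training graph with functional SVD (tf_fsvd) and scores edges by the dot product (u * s)[src] . v[dst], reporting Hits@K.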
def main(_):
    ds = LinkPropPredDataset(FLAGS.dataset)
    split_edge = ds.get_edge_split()
    train_edges = split_edge['train']['edge']
    train_edges = np.concatenate([train_edges, train_edges[:, ::-1]], axis=0)

    spa = scipy.sparse.csr_matrix(
        (np.ones([len(train_edges)]), (train_edges[:, 0], train_edges[:, 1])))
    mult_f = tf_fsvd.WYSDeepWalkPF(spa,
                                   window=FLAGS.wys_window,
                                   mult_degrees=False,
                                   neg_sample_coef=FLAGS.wys_neg_coef)

    tt = tqdm.tqdm(range(FLAGS.num_runs))
    test_metrics = []
    val_metrics = []
    for run in tt:
        u, s, v = tf_fsvd.fsvd(mult_f,
                               FLAGS.k,
                               n_iter=FLAGS.svd_iters,
                               n_redundancy=FLAGS.k * 3)

        dataset = LinkPropPredDataset(FLAGS.dataset)
        evaluator = Evaluator(name=FLAGS.dataset)
        evaluator.K = FLAGS.hits
        split_edge = dataset.get_edge_split()

        metrics = []
        for split in ('test', 'valid'):
            pos_edges = split_edge[split]['edge']
            neg_edges = split_edge[split]['edge_neg']

            pos_scores = tf.reduce_sum(tf.gather(u * s, pos_edges[:, 0]) *
                                       tf.gather(v, pos_edges[:, 1]),
                                       axis=1).numpy()
            neg_scores = tf.reduce_sum(tf.gather(u * s, neg_edges[:, 0]) *
                                       tf.gather(v, neg_edges[:, 1]),
                                       axis=1).numpy()
            metric = evaluator.eval({
                'y_pred_pos': pos_scores,
                'y_pred_neg': neg_scores
            })
            metrics.append(metric['hits@%i' % FLAGS.hits])
        test_metrics.append(metrics[0])
        val_metrics.append(metrics[1])

        tt.set_description(
            'HITS@%i: validate=%g; test=%g' %
            (FLAGS.hits, np.mean(val_metrics), np.mean(test_metrics)))

    print('\n\n *** Trained for %i times and average metrics are:' % FLAGS.num_runs)
    print('HITS@%i test: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(test_metrics), np.std(test_metrics)))
    print('HITS@%i validate: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(val_metrics), np.std(val_metrics)))
Example #7
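    # Computes Laplacian eigenvector features per connected component of ogbl-collab and stores them in self.graph.ndata['eig'].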
    def _add_eig(self, norm='none', number=6):

        dataset = LinkPropPredDataset(name='ogbl-collab')
        graph = dataset[0]
        G = nx.Graph()
        G.add_nodes_from(range(graph['num_nodes']))  # 235868 nodes in ogbl-collab

        for nod1, nod2 in zip(graph['edge_index'][0], graph['edge_index'][1]):
            G.add_edge(nod1, nod2)

        components = list(nx.connected_components(G))
        list_G = []
        list_nodes = []

        for component in components:
            G_new = nx.Graph()
            G_new.add_nodes_from(list(component))
            list_G.append(G_new)
            list_nodes.append(list(component))
        for i in range(len(list_G)):
            for nod1, nod2 in list(G.edges(list_nodes[i])):
                list_G[i].add_edge(nod1, nod2)

        EigVec_global = np.ones((graph['num_nodes'], number))
        for g in list_G:
            node_list = list(g.nodes)
            A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
            if norm == 'none':
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D - A
            elif norm == 'sym':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-0.5), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A) * D_norm
            elif norm == 'walk':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-1), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A)

            if len(node_list) > 2:
                EigVal, EigVec = sp.linalg.eigs(L, k=min(len(node_list) - 2, number), which='SR', tol=0)
                EigVec = EigVec[:, EigVal.argsort()] / np.max(EigVec[:, EigVal.argsort()], 0)
                EigVec_global[node_list, : min(len(node_list) - 2, number)] = EigVec[:, :]
            elif len(node_list) == 2:
                EigVec_global[node_list[0], :number] = np.zeros((1, number))
        self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
        print(sorted(self.graph.ndata['eig'][1]))
Example #8
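# Full train/eval loop for a KGEModel on ogbl-biokg; typed entities are mapped to global id ranges via entity_dict.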
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (
            not args.evaluate_train):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)

    args.save_path = 'log/%s/%s/%s-%s/%s' % (
        args.dataset, args.model, args.hidden_dim, args.gamma,
        time.time()) if args.save_path is None else args.save_path
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name='ogbl-biokg')
    split_edge = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_edge[
        "train"], split_edge["valid"], split_edge["test"]
    nrelation = int(max(train_triples['relation'])) + 1
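    # ogbl-biokg is heterogeneous: give each node type a contiguous range of global entity ids.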
    entity_dict = dict()
    cur_idx = 0
    for key in dataset[0]['num_nodes_dict']:
        entity_dict[key] = (cur_idx,
                            cur_idx + dataset[0]['num_nodes_dict'][key])
        cur_idx += dataset[0]['num_nodes_dict'][key]
    nentity = sum(dataset[0]['num_nodes_dict'].values())

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Dataset: %s' % args.dataset)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    # train_triples = split_dict['train']
    logging.info('#train: %d' % len(train_triples['head']))
    # valid_triples = split_dict['valid']
    logging.info('#valid: %d' % len(valid_triples['head']))
    # test_triples = split_dict['test']
    logging.info('#test: %d' % len(test_triples['head']))

    train_count, train_true_head, train_true_tail = defaultdict(
        lambda: 4), defaultdict(list), defaultdict(list)
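    # Fill per-(entity, relation) counts and observed head/tail lists from the training triples; they are passed to TrainDataset below.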
    for i in tqdm(range(len(train_triples['head']))):
        head, relation, tail = train_triples['head'][i], train_triples[
            'relation'][i], train_triples['tail'][i]
        head_type, tail_type = train_triples['head_type'][i], train_triples[
            'tail_type'][i]
        train_count[(head, relation, head_type)] += 1
        train_count[(tail, -relation - 1, tail_type)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        entity_dict = checkpoint['entity_dict']

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        # logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        # checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        # entity_dict = checkpoint['entity_dict']
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' %
                 args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %d' % current_learning_rate)

        training_logs = []

        #Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0 and step > 0:  # ~ 41 seconds/saving
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps,
                    'entity_dict': entity_dict
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Train', step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, args,
                                              entity_dict)
                log_metrics('Valid', step, metrics, writer)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, args,
                                      entity_dict)
        log_metrics('Valid', step, metrics, writer)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, args,
                                      entity_dict)
        log_metrics('Test', step, metrics, writer)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        small_train_triples = {}
        indices = np.random.choice(len(train_triples['head']),
                                   args.ntriples_eval_train,
                                   replace=False)
        for i in train_triples:
            if 'type' in i:
                small_train_triples[i] = [train_triples[i][x] for x in indices]
            else:
                small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(kge_model,
                                      small_train_triples,
                                      args,
                                      entity_dict,
                                      random_sampling=True)
        log_metrics('Train', step, metrics, writer)
Example #9
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""ogbl_collab dataset.
"""
import os
import numpy as np
from ogb.linkproppred import LinkPropPredDataset

# load data
dataset = LinkPropPredDataset(name='ogbl-collab')
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge['train'], split_edge[
    'valid'], split_edge['test']
# train_edge['edge'], (1179052, 2)
# train_edge['weight'], (1179052,)
# train_edge['year'], (1179052,)
# valid_edge, 60084
# test_edge, 46329

graph = dataset[0]
num_nodes = graph['num_nodes']  # 235868
node_feat = graph['node_feat']  # shape(235868, 128)

# dump to disk
root = 'ogbl_collab/'
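# The original snippet stops before the dump itself; a minimal sketch of one way
# to finish it (assumption: plain .npy files under `root`, file names illustrative):
os.makedirs(root, exist_ok=True)
np.save(os.path.join(root, 'node_feat.npy'), node_feat)
np.save(os.path.join(root, 'train_edge.npy'), train_edge['edge'])
np.save(os.path.join(root, 'valid_edge.npy'), valid_edge['edge'])
np.save(os.path.join(root, 'test_edge.npy'), test_edge['edge'])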
Example #10
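# Loads ogbl-biokg and derives the relation count plus per-node-type global id offsets.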
from collections import defaultdict

import time
import pdb

import datetime
from ogb.linkproppred import LinkPropPredDataset


def now():
    d = datetime.datetime.now()
    x = d - datetime.timedelta(microseconds=d.microsecond)
    return x


d_name = "ogbl-biokg"
dataset = LinkPropPredDataset(name=d_name)

split_edge = dataset.get_edge_split()
train_triples, valid_triples, test_triples = split_edge["train"], split_edge[
    "valid"], split_edge["test"]

nrelation = int(max(train_triples['relation'])) + 1  #4
nentity = sum(dataset[0]['num_nodes_dict'].values())

entity_dict = dict()
cur_idx = 0
for key in dataset[0][
        'num_nodes_dict']:  #['drug', 'sideeffect', 'protein', 'disease', 'function']:
    entity_dict[key] = (cur_idx, cur_idx + dataset[0]['num_nodes_dict'][key])
    cur_idx += dataset[0]['num_nodes_dict'][key]
nentity = sum(dataset[0]['num_nodes_dict'].values())
Example #11
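# Same KGE train/eval pipeline as Example #8, but for a homogeneous OGB link dataset: entity and relation counts come straight from dataset.graph.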
def main(args):
    if (
        (not args.do_train)
        and (not args.do_valid)
        and (not args.do_test)
        and (not args.evaluate_train)
    ):
        raise ValueError("one of train/val/test mode must be choosed.")

    if args.init_checkpoint:
        override_config(args)

    args.save_path = (
        "log/%s/%s/%s-%s/%s"
        % (args.dataset, args.model, args.hidden_dim, args.gamma, time.time())
        if args.save_path is None
        else args.save_path
    )
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name=args.dataset)
    split_dict = dataset.get_edge_split()
    nentity = dataset.graph["num_nodes"]
    nrelation = int(max(dataset.graph["edge_reltype"])[0]) + 1

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info("Model: %s" % args.model)
    logging.info("Dataset: %s" % args.dataset)
    logging.info("#entity: %d" % nentity)
    logging.info("#relation: %d" % nrelation)

    train_triples = split_dict["train"]
    logging.info("#train: %d" % len(train_triples["head"]))
    valid_triples = split_dict["valid"]
    logging.info("#valid: %d" % len(valid_triples["head"]))
    test_triples = split_dict["test"]
    logging.info("#test: %d" % len(test_triples["head"]))

    train_count, train_true_head, train_true_tail = (
        defaultdict(lambda: 4),
        defaultdict(list),
        defaultdict(list),
    )
    for i in tqdm(range(len(train_triples["head"]))):
        head, relation, tail = (
            train_triples["head"][i],
            train_triples["relation"][i],
            train_triples["tail"][i],
        )
        train_count[(head, relation)] += 1
        train_count[(tail, -relation - 1)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator,
    )

    logging.info("Model Parameter Configuration:")
    for name, param in kge_model.named_parameters():
        logging.info(
            "Parameter %s: %s, require_grad = %s"
            % (name, str(param.size()), str(param.requires_grad))
        )

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "head-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_dataloader_tail = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "tail-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_iterator = BidirectionalOneShotIterator(
            train_dataloader_head, train_dataloader_tail
        )

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()),
            lr=current_learning_rate,
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info("Loading checkpoint %s..." % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, "checkpoint"))
        init_step = checkpoint["step"]
        kge_model.load_state_dict(checkpoint["model_state_dict"])
        if args.do_train:
            current_learning_rate = checkpoint["current_learning_rate"]
            warm_up_steps = checkpoint["warm_up_steps"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logging.info("Ramdomly Initializing %s Model..." % args.model)
        init_step = 0

    step = init_step

    logging.info("Start Training...")
    logging.info("init_step = %d" % init_step)
    logging.info("batch_size = %d" % args.batch_size)
    logging.info(
        "negative_adversarial_sampling = %d" % args.negative_adversarial_sampling
    )
    logging.info("hidden_dim = %d" % args.hidden_dim)
    logging.info("gamma = %f" % args.gamma)
    logging.info(
        "negative_adversarial_sampling = %s" % str(args.negative_adversarial_sampling)
    )
    if args.negative_adversarial_sampling:
        logging.info("adversarial_temperature = %f" % args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info("learning_rate = %d" % current_learning_rate)

        training_logs = []

        # Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info(
                    "Change learning_rate to %f at step %d"
                    % (current_learning_rate, step)
                )
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()),
                    lr=current_learning_rate,
                )
                warm_up_steps = warm_up_steps * 3

            if (
                step % args.save_checkpoint_steps == 0 and step > 0
            ):  # ~ 41 seconds/saving
                save_variable_list = {
                    "step": step,
                    "current_learning_rate": current_learning_rate,
                    "warm_up_steps": warm_up_steps,
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs]) / len(
                        training_logs
                    )
                log_metrics("Train", step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info("Evaluating on Valid Dataset...")
                metrics = kge_model.test_step(kge_model, valid_triples, args)
                log_metrics("Valid", step, metrics, writer)

        save_variable_list = {
            "step": step,
            "current_learning_rate": current_learning_rate,
            "warm_up_steps": warm_up_steps,
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info("Evaluating on Valid Dataset...")
        metrics = kge_model.test_step(kge_model, valid_triples, args)
        log_metrics("Valid", step, metrics, writer)

    if args.do_test:
        logging.info("Evaluating on Test Dataset...")
        metrics = kge_model.test_step(kge_model, test_triples, args)
        log_metrics("Test", step, metrics, writer)

    if args.evaluate_train:
        logging.info("Evaluating on Training Dataset...")
        small_train_triples = {}
        indices = np.random.choice(
            len(train_triples["head"]), args.ntriples_eval_train, replace=False
        )
        for i in train_triples:
            small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(
            kge_model, small_train_triples, args, random_sampling=True
        )
        log_metrics("Train", step, metrics, writer)
Example #12
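# Trains the MADGraph model on ogbl-ddi with random negative edges and evaluates Hits@{10,20,30} with the OGB Evaluator.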
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (MADGraph)')
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=4 * 1024)
    parser.add_argument('--dim', type=int, default=12)
    parser.add_argument('--heads', type=int, default=12)
    parser.add_argument('--samples', type=int, default=8)
    parser.add_argument('--nearest', type=int, default=8)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--sentinels', type=int, default=8)
    parser.add_argument('--memory', type=str, default='all')
    parser.add_argument('--softmin', type=bool, default=True)
    parser.add_argument('--output_csv', type=str, default='')
    args = parser.parse_args()
    print(args)

    DNAME = 'ogbl-ddi'
    dataset = LinkPropPredDataset(name=DNAME)
    graph = dataset[0]
    n_nodes = graph['num_nodes']

    data = dataset.get_edge_split()
    for group in 'train valid test'.split():
        if group in data:
            sets = data[group]
            for key in ('edge', 'edge_neg'):
                if key in sets:
                    sets[key] = gpu(torch.from_numpy(sets[key]))
    data['eval_train'] = {
        'edge':
        data['train']['edge'][torch.randperm(
            data['train']['edge'].shape[0])[:data['valid']['edge'].shape[0]]]
    }

    model = MADGraph(
        n_nodes=n_nodes,
        node_feats=args.dim,
        src=data['train']['edge'][:, 0],
        dst=data['train']['edge'][:, 1],
        n_samples=args.samples,
        n_heads=args.heads,
        n_sentinels=args.sentinels,
        memory=['none', 'stat', 'all'].index(args.memory),
        softmin=args.softmin,
        n_nearest=args.nearest,
    )
    params = [p for net in [model] for p in net.parameters()]
    print('params:', sum(p.numel() for p in params))

    evaluator = Evaluator(name=DNAME)
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.manual_seed(args.seed + run)
        opt = optim.Adam(params, lr=args.lr)

        torch.nn.init.xavier_uniform_(model.pos.data)
        torch.nn.init.xavier_uniform_(model.field.data)
        model.uncertainty.data = model.uncertainty.data * 0 + 1

        for epoch in range(1, args.epochs + 1):
            model.train()
            for chunk in sample(data['train']['edge'], args.batch_size):
                opt.zero_grad()
                p_edge = torch.sigmoid(model(chunk))
                edge_neg_chunk = gpu(torch.randint(0, n_nodes, chunk.shape))
                p_edge_neg = torch.sigmoid(model(edge_neg_chunk))
                loss = (-torch.log(1e-5 + 1 - p_edge_neg).mean() -
                        torch.log(1e-5 + p_edge).mean())
                loss.backward()
                opt.step()

            if epoch % args.eval_steps:
                continue

            with torch.no_grad():
                model.eval()
                p_train = torch.cat([
                    model(chunk) for chunk in sample(
                        data['eval_train']['edge'], args.batch_size)
                ])
                n_train = torch.cat([
                    model(chunk) for chunk in sample(data['valid']['edge_neg'],
                                                     args.batch_size)
                ])
                p_valid = torch.cat([
                    model(chunk)
                    for chunk in sample(data['valid']['edge'], args.batch_size)
                ])
                n_valid = n_train
                p_test = torch.cat([
                    model(chunk)
                    for chunk in sample(data['test']['edge'], args.batch_size)
                ])
                n_test = torch.cat([
                    model(chunk) for chunk in sample(data['test']['edge_neg'],
                                                     args.batch_size)
                ])
                for K in [10, 20, 30]:
                    evaluator.K = K
                    key = f'Hits@{K}'
                    h_train = evaluator.eval({
                        'y_pred_pos': p_train,
                        'y_pred_neg': n_train,
                    })[f'hits@{K}']
                    h_valid = evaluator.eval({
                        'y_pred_pos': p_valid,
                        'y_pred_neg': n_valid,
                    })[f'hits@{K}']
                    h_test = evaluator.eval({
                        'y_pred_pos': p_test,
                        'y_pred_neg': n_test,
                    })[f'hits@{K}']
                    loggers[key].add_result(run, (h_train, h_valid, h_test))
                    print(key)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * h_train:.2f}%, '
                          f'Valid: {100 * h_valid:.2f}%, '
                          f'Test: {100 * h_test:.2f}%')
                print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #13
def test_datasetsaver():
    # Round-trips one dataset family through DatasetSaver and reloads it;
    # test_task below selects graph / node / link / hetero variants.

    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()

    print(meta_dict)

    print('Now testing.')

    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())

    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
Example #14
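# Command-line pipeline: load a graph (ogbl-ppa when --dataset ogb), embed it with deepwalk/node2vec/graphsage, save the embeddings, and report timings.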
def main():
    parser = ArgumentParser(description="ne")
    parser.add_argument("-d", "--dataset", type=str, default="cora", \
                        help="input dataset")
    parser.add_argument("-o", "--coarse", type=str, default="simple", \
                        help="choose either simple_coarse or lamg_coarse, [simple, lamg]")
    parser.add_argument("-c", "--mcr_dir", type=str, default="/opt/matlab/R2018A/", \
                        help="directory of matlab compiler runtime (only required by lamg_coarsen)")
    parser.add_argument("-s", "--search_ratio", type=int, default=12, \
                        help="control the search space in graph fusion process (only required by lamg_coarsen)")
    parser.add_argument("-r", "--reduce_ratio", type=int, default=2, \
                        help="control graph coarsening levels (only required by lamg_coarsen)")
    parser.add_argument("-v", "--level", type=int, default=1, \
                        help="number of coarsening levels (only required by simple_coarsen)")
    parser.add_argument("-n", "--num_neighs", type=int, default=2, \
                        help="control k-nearest neighbors in graph fusion process")
    parser.add_argument("-l", "--lda", type=float, default=0.1, \
                        help="control self loop in adjacency matrix")
    parser.add_argument("-e", "--embed_path", type=str, default="embed_results/embeddings_palone_deepwalk.npy", \
                        help="path of embedding result")
    parser.add_argument("-m", "--embed_method", type=str, default="deepwalk", \
                        help="[deepwalk, node2vec, graphsage]")
    parser.add_argument("-f", "--fusion", default=True, action="store_false", \
                        help="whether use graph fusion")
    parser.add_argument("-p", "--power", default=False, action="store_true", \
                        help="Strong power of graph filter, set True to enhance filter power")
    parser.add_argument("-g", "--sage_model", type=str, default="mean", \
                        help="aggregation function in graphsage")
    parser.add_argument("-w", "--sage_weighted", default=True, action="store_false", \
                        help="whether consider weighted reduced graph")

    args = parser.parse_args()

    dataset = args.dataset
    feature_path = "dataset/{}/{}-feats.npy".format(dataset, dataset)
    fusion_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)
    reduce_results = "reduction_results/"
    mapping_path = "{}Mapping.mtx".format(reduce_results)

    if args.fusion:
        coarsen_input_path = "dataset/{}/fused_{}.mtx".format(dataset, dataset)
    else:
        coarsen_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)

    ######Load Data######
    print("%%%%%% Loading Graph Data %%%%%%")

    if args.dataset == "ogb":
        d_name = "ogbl-ppa"

        from ogb.linkproppred import LinkPropPredDataset

        dataset = LinkPropPredDataset(name=d_name)
        print(dataset)
        print(dataset[0])

        split_edge = dataset.get_edge_split()
        print(split_edge)
        # train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        graph = dataset[0]  # graph: library-agnostic graph object

        print(graph['edge_index'].shape)
        print(graph['edge_feat'])
        print(graph['node_feat'])
        # print((np.array(graph['node_feat']) == 0.0).all())
        graph['directed'] = False
        print(graph)
        graph_nodes = [i for i in range(0, graph['num_nodes'])]
        G = nx.Graph()
        G.add_nodes_from(graph_nodes)
        G.add_edges_from(graph['edge_index'].T)
        # nx.draw(G, with_labels=True)
        print(G.nodes)
        # plt.show()
        laplacian = laplacian_matrix(G)
        print(laplacian)
    else:
        path = "dataset/ppi/ppi.mtx"
        G = mtx2graph(path)
        laplacian, edges = json2mtx(dataset)

    ## whether node features are required
    if args.fusion or args.embed_method == "graphsage":

        if args.dataset == 'ogb':
            feature = graph['node_feat']
        else:
            feature = np.load(feature_path)
        # print(feature[1][0])

    ######Embed Reduced Graph######

    print("%%%%%% Starting Graph Embedding %%%%%%")

    if args.embed_method == "deepwalk":
        embed_start = time.process_time()
        embeddings = deepwalk(G)

    elif args.embed_method == "node2vec":
        embed_start = time.process_time()
        embeddings = node2vec(G)

    elif args.embed_method == "graphsage":
        from embed_methods.graphsage.graphsage import graphsage
        nx.set_node_attributes(G, False, "test")
        nx.set_node_attributes(G, False, "val")

        ## obtain mapping operator
        if args.coarse == "lamg":
            mapping = normalize(mtx2matrix(mapping_path), norm='l1', axis=1)
        else:
            mapping = identity(feature.shape[0])
            for p in projections:
                mapping = mapping @ p
            mapping = normalize(mapping, norm='l1', axis=1).transpose()

        ## control iterations for training
        coarse_ratio = mapping.shape[1] / mapping.shape[0]

        ## map node feats to the coarse graph
        feats = mapping @ feature

        embed_start = time.process_time()
        embeddings = graphsage(G, feats, args.sage_model, args.sage_weighted,
                               int(1000 / coarse_ratio))

    embed_time = time.process_time() - embed_start

    ######Save Embeddings######

    np.save(args.embed_path, embeddings)

    ######Evaluation######
    print("%%%%%% Starting Evaluation %%%%%%")

    # link prediction
    embeds = np.load(args.embed_path)
    '''

    if args.dataset == "ogb":
        acc, pre, sen, mcc, auc = linkprediction_ogb(split_edge, embeds)
    else:
        acc, pre, sen, mcc, auc = linkprediction(edges, embeds, dataset)'''

    print("Running regression..")

    # node prediction
    # run_regression(np.array(train_embeds), np.array(train_labels), np.array(test_embeds), np.array(test_labels))
    # lr("dataset/{}/".format(dataset), args.embed_path, dataset)

    ######Report timing information######
    print("%%%%%% CPU time %%%%%%")
    if args.fusion:
        total_time = embed_time
        print(f"Graph Fusion     Time:")
    else:
        total_time = embed_time
        print("Graph Fusion     Time: 0")

    print(f"Graph Embedding  Time: {embed_time:.3f}")
    print(f"Total Time = Embedding_time = {total_time:.3f}")
Example #15
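# Fragment: the tail of a CSV id-mapping helper, followed by script code that loads a custom LinkPropPredDataset via meta_dict and inspects its test split.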
        return None
    map = dict()
    with open(file, newline='') as csvfile:
        csvfile.readline()
        for (idx, name) in csv.reader(csvfile, delimiter=',', quotechar='|'):
            map[name] = int(idx)
    return map


args = parse_args()
dataset_name = args.dataset

if args.do_test:
    meta = 'dataset_' + re.sub('-', '_', args.dataset) + '/meta_dict.pt'
    meta_dict = load(meta)
    dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
    dsplit = dataset.get_edge_split()
    if args.print_relations:
        np.set_printoptions(threshold=np.inf)
        print('test.relations <- c(')
        print(
            re.sub(r'[\[\]]', '',
                   np.array2string(dsplit['test']['relation'],
                                   separator=', ')))
        print(')')
    elif args.select_head >= 0 or args.select_tail >= 0:
        for k in dsplit.keys():
            for i in range(len(dsplit[k]['head'])):
                (h, t, r) = (dsplit[k]['head'][i], dsplit[k]['tail'][i],
                             dsplit[k]['relation'][i])
                if args.select_head < 0 or args.select_head == h: