Example 1
    def __init__(self, 
            net_file,
            map_file,
            walk_length=80,
            window_size=5,
            num_walks=10,
            batch_size=32,
            negative=5,
            gpus=[0],
            fast_neg=True,
            ):
        """ This class has the following functions:
        1. Transform the txt network file into DGL graph;
        2. Generate random walk sequences for the trainer;
        3. Provide the negative table if the user hopes to sample negative
        nodes according to nodes' degrees;

        Parameter
        ---------
        net_file str : path of the txt network file
        walk_length int : number of nodes in a sequence
        window_size int : context window size
        num_walks int : number of walks for each node
        batch_size int : number of node sequences in each batch
        negative int : negative samples for each positve node pair
        fast_neg bool : whether do negative sampling inside a batch
        """
        self.walk_length = walk_length
        self.window_size = window_size
        self.num_walks = num_walks
        self.batch_size = batch_size
        self.negative = negative
        self.num_procs = len(gpus)
        self.fast_neg = fast_neg
        self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
        self.save_mapping(map_file)
        self.G = net2graph(self.sm)

        # random walk seeds
        start = time.time()
        seeds = torch.cat([torch.LongTensor(self.G.nodes())] * num_walks)
        self.seeds = torch.split(shuffle_walks(seeds), int(np.ceil(len(self.net) * self.num_walks / self.num_procs)), 0)
        end = time.time()
        t = end - start
        print("%d seeds in %.2fs" % (len(seeds), t))

        # negative table for true negative sampling
        if not fast_neg:
            node_degree = np.array(list(map(lambda x: len(self.net[x]), self.net.keys())))
            node_degree = np.power(node_degree, 0.75)
            node_degree /= np.sum(node_degree)
            node_degree = np.array(node_degree * 1e8, dtype=np.int64)
            self.neg_table = []
            for idx, node in enumerate(self.net.keys()):
                self.neg_table += [node] * node_degree[idx]
            self.neg_table_size = len(self.neg_table)
            self.neg_table = np.array(self.neg_table, dtype=np.int64)
            del node_degree
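The negative table built above follows the word2vec-style unigram scheme: node degrees are raised to the power 0.75, normalized, and expanded into a large lookup table, so that sampling uniformly from the table approximates degree-biased negative sampling. Below is a minimal standalone sketch of the idea; the toy adjacency list and small table size stand in for the real data and are not from the source.

import numpy as np

# Toy adjacency list standing in for self.net (illustrative only).
net = {0: [1, 2, 3], 1: [0], 2: [0, 3], 3: [0, 2]}

# Degree^0.75 unigram distribution, as in word2vec negative sampling.
degree = np.array([len(neighbors) for neighbors in net.values()], dtype=np.float64)
prob = degree ** 0.75
prob /= prob.sum()

# Expand the distribution into a lookup table; the real code uses ~1e8 slots.
table_size = 1000
neg_table = np.repeat(list(net.keys()), (prob * table_size).astype(np.int64))

# Drawing negatives is then a cheap uniform choice over the table.
negatives = np.random.choice(neg_table, size=5, replace=True)
print(negatives)

Uniform draws from neg_table hit node i with probability roughly proportional to degree(i) ** 0.75, which is what the constructor's loop over self.net.keys() achieves at scale.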
Example 2
    def fast_train(self):
        """ one process """
        # the number of positive node pairs of a node sequence
        num_pos = 2 * self.args.walk_length * self.args.window_size\
            - self.args.window_size * (self.args.window_size + 1)
        num_pos = int(num_pos)
        num_batches = len(
            self.dataset.net) * self.args.num_walks / self.args.batch_size
        num_batches = int(np.ceil(num_batches))
        print("num batchs: %d" % num_batches)

        self.init_device_emb()

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            i = 0
            max_i = self.args.iterations * num_batches
            for iteration in range(self.args.iterations):
                print("\nIteration: " + str(iteration + 1))
                self.dataset.walks = shuffle_walks(self.dataset.walks)

                while True:
                    # decay learning rate for SGD
                    lr = self.args.lr * (max_i - i) / max_i
                    if lr < 0.00001:
                        lr = 0.00001

                    # multi-sequence input
                    i_ = int(i % num_batches)
                    walks = list(self.dataset.walks[i_ * self.args.batch_size: \
                            (1+i_) * self.args.batch_size])
                    if len(walks) == 0:
                        break

                    if self.args.fast_neg:
                        self.emb_model.fast_learn_super(walks, lr)
                    else:
                        # do negative sampling
                        bs = len(walks)
                        neg_nodes = torch.LongTensor(
                            np.random.choice(self.dataset.neg_table,
                                             bs * num_pos * self.args.negative,
                                             replace=True))
                        self.emb_model.fast_learn_super(walks,
                                                        lr,
                                                        neg_nodes=neg_nodes)

                    i += 1
                    if i > 0 and i % self.args.print_interval == 0:
                        print("Batch %d, training time: %.2fs" %
                              (i, time.time() - start))
                        start = time.time()
                    if i_ == num_batches - 1:
                        break

        print("Training used time: %.2fs" % (time.time() - start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
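The num_pos expression at the top of fast_train, 2 * walk_length * window_size - window_size * (window_size + 1), counts the skip-gram pairs per walk: every interior position contributes 2 * window_size context nodes, and the correction term accounts for the truncated windows at the two ends of the walk. The sketch below checks the closed form by direct enumeration; the helper function and parameter values are illustrative, not part of the source.

def count_pairs(walk_length, window_size):
    # Count (center, context) pairs with |center - context| <= window_size.
    total = 0
    for i in range(walk_length):
        total += min(i, window_size) + min(walk_length - 1 - i, window_size)
    return total

walk_length, window_size = 80, 5  # the defaults used by the dataset above
closed_form = 2 * walk_length * window_size - window_size * (window_size + 1)
assert count_pairs(walk_length, window_size) == closed_form
print(closed_form)  # 770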
Example 3
    def fast_train_mp(self):
        """ multi-cpu-core or mix cpu & multi-gpu """
        self.init_device_emb()
        self.emb_model.share_memory()
        self.dataset.walks = shuffle_walks(self.dataset.walks)

        start_all = time.time()
        ps = []

        l = len(self.dataset.walks)
        np_ = self.args.num_procs
        for i in range(np_):
            walks = self.dataset.walks[int(i * l / np_):int((i + 1) * l / np_)]
            p = mp.Process(target=self.fast_train_sp, args=(walks, i))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time() - start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
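fast_train_mp relies on a Hogwild-style pattern: the embedding tensors are moved into shared memory once via share_memory(), and each worker process then updates them lock-free on its own slice of the walks. Below is a minimal, self-contained sketch of that pattern; the toy embedding and the dummy update step are illustrative stand-ins, not the trainer's actual logic.

import torch
import torch.multiprocessing as mp

def worker(emb, shard):
    # Updates from each process land in the same shared-memory tensor.
    with torch.no_grad():
        for idx in shard:
            emb.weight[idx] += 0.01  # stand-in for a real gradient step

if __name__ == "__main__":
    emb = torch.nn.Embedding(100, 8)
    emb.share_memory()  # same call as self.emb_model.share_memory()

    num_procs = 2
    indices = list(range(100))
    ps = []
    for i in range(num_procs):
        shard = indices[i::num_procs]  # each process gets its own slice
        p = mp.Process(target=worker, args=(emb, shard))
        ps.append(p)
        p.start()
    for p in ps:
        p.join()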
Example 4
    def __init__(self, 
            net_file,
            map_file,
            walk_length,
            window_size,
            num_walks,
            batch_size,
            negative=5,
            gpus=[0],
            fast_neg=True,
            ogbl_name="",
            load_from_ogbl=False,
            ):
        """ This class has the following functions:
        1. Transform the txt network file into DGL graph;
        2. Generate random walk sequences for the trainer;
        3. Provide the negative table if the user hopes to sample negative
        nodes according to nodes' degrees;

        Parameter
        ---------
        net_file str : path of the txt network file
        walk_length int : number of nodes in a sequence
        window_size int : context window size
        num_walks int : number of walks for each node
        batch_size int : number of node sequences in each batch
        negative int : negative samples for each positve node pair
        fast_neg bool : whether do negative sampling inside a batch
        """
        self.walk_length = walk_length
        self.window_size = window_size
        self.num_walks = num_walks
        self.batch_size = batch_size
        self.negative = negative
        self.num_procs = len(gpus)
        self.fast_neg = fast_neg

        if load_from_ogbl:
            assert len(gpus) == 1, "ogb.linkproppred is not compatible with multi-gpu training (CUDA error)."
            from load_dataset import load_from_ogbl_with_name
            self.G = load_from_ogbl_with_name(ogbl_name)
            self.G = make_undirected(self.G)
        else:
            self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
            self.save_mapping(map_file)
            self.G = net2graph(self.sm)

        self.num_nodes = self.G.number_of_nodes()

        # random walk seeds
        start = time.time()
        self.valid_seeds = find_connected_nodes(self.G)
        if len(self.valid_seeds) != self.num_nodes:
            print("WARNING: The node ids are not serial. Some nodes are invalid.")
        
        seeds = torch.cat([torch.LongTensor(self.valid_seeds)] * num_walks)
        self.seeds = torch.split(shuffle_walks(seeds), 
            int(np.ceil(len(self.valid_seeds) * self.num_walks / self.num_procs)), 
            0)
        end = time.time()
        t = end - start
        print("%d seeds in %.2fs" % (len(seeds), t))

        # negative table for true negative sampling
        if not fast_neg:
            node_degree = self.G.out_degrees(self.valid_seeds).numpy()
            node_degree = np.power(node_degree, 0.75)
            node_degree /= np.sum(node_degree)
            node_degree = np.array(node_degree * 1e8, dtype=np.int64)
            self.neg_table = []
            
            for idx, node in enumerate(self.valid_seeds):
                self.neg_table += [node] * node_degree[idx]
            self.neg_table_size = len(self.neg_table)
            self.neg_table = np.array(self.neg_table, dtype=np.int64)
            del node_degree
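For reference, the seed preparation above amounts to repeating every valid node num_walks times, shuffling, and cutting the result into one chunk per process with torch.split. The sketch below shows how the chunks come out with illustrative sizes; torch.randperm stands in for shuffle_walks, and only the last chunk may be shorter.

import numpy as np
import torch

num_walks, num_procs = 10, 4
valid_seeds = torch.arange(25)  # toy node ids

seeds = valid_seeds.repeat(num_walks)      # every seed starts num_walks walks
seeds = seeds[torch.randperm(len(seeds))]  # shuffle (stand-in for shuffle_walks)

chunk = int(np.ceil(len(valid_seeds) * num_walks / num_procs))
parts = torch.split(seeds, chunk, 0)
print([len(p) for p in parts])  # [63, 63, 63, 61]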