def __init__(self,
             net_file,
             map_file,
             walk_length=80,
             window_size=5,
             num_walks=10,
             batch_size=32,
             negative=5,
             gpus=[0],
             fast_neg=True,
             ):
    """ This class has the following functions:
    1. Transform the txt network file into a DGL graph;
    2. Generate random walk sequences for the trainer;
    3. Provide the negative table if the user hopes to sample negative nodes
       according to nodes' degrees.

    Parameters
    ----------
    net_file str : path of the txt network file
    map_file str : path to save the node ID mapping
    walk_length int : number of nodes in a sequence
    window_size int : context window size
    num_walks int : number of walks for each node
    batch_size int : number of node sequences in each batch
    negative int : number of negative samples for each positive node pair
    gpus list : GPU IDs to use; the number of training processes equals len(gpus)
    fast_neg bool : whether to do negative sampling inside a batch
    """
    self.walk_length = walk_length
    self.window_size = window_size
    self.num_walks = num_walks
    self.batch_size = batch_size
    self.negative = negative
    self.num_procs = len(gpus)
    self.fast_neg = fast_neg
    self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
    self.save_mapping(map_file)
    self.G = net2graph(self.sm)

    # random walk seeds, split into one chunk per process
    start = time.time()
    seeds = torch.cat([torch.LongTensor(self.G.nodes())] * num_walks)
    self.seeds = torch.split(
        shuffle_walks(seeds),
        int(np.ceil(len(self.net) * self.num_walks / self.num_procs)),
        0)
    end = time.time()
    t = end - start
    print("%d seeds in %.2fs" % (len(seeds), t))

    # negative table for true negative sampling,
    # with sampling probability proportional to degree^0.75
    if not fast_neg:
        node_degree = np.array(list(map(lambda x: len(self.net[x]), self.net.keys())))
        node_degree = np.power(node_degree, 0.75)
        node_degree /= np.sum(node_degree)
        node_degree = np.array(node_degree * 1e8, dtype=np.int64)
        self.neg_table = []
        for idx, node in enumerate(self.net.keys()):
            self.neg_table += [node] * node_degree[idx]
        self.neg_table_size = len(self.neg_table)
        self.neg_table = np.array(self.neg_table, dtype=np.int64)
        del node_degree
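# Illustrative sketch (not part of the class above) of how the negative table is
# built: each node's sampling probability is proportional to degree**0.75, in the
# style of word2vec unigram negative sampling. The toy `net` dict and the table
# size of 20 are assumptions for illustration; the class uses ~1e8 slots keyed by
# the real node degrees.
import numpy as np

def build_neg_table(net, table_size=20):
    degrees = np.array([len(net[n]) for n in net.keys()], dtype=np.float64)
    prob = np.power(degrees, 0.75)
    prob /= prob.sum()
    counts = np.array(prob * table_size, dtype=np.int64)
    table = []
    for node, c in zip(net.keys(), counts):
        table += [node] * c
    return np.array(table, dtype=np.int64)

# Example: node 0 has 4 neighbors, nodes 1 and 2 have 1 each.
toy_net = {0: [1, 2, 3, 4], 1: [0], 2: [0]}
print(build_neg_table(toy_net))
# node 0 fills 11 slots vs. 4 each for nodes 1 and 2 (ratio close to 4**0.75 ~ 2.8)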
def fast_train(self):
    """ train with a single process """
    # the number of positive node pairs of a node sequence
    num_pos = 2 * self.args.walk_length * self.args.window_size \
        - self.args.window_size * (self.args.window_size + 1)
    num_pos = int(num_pos)
    num_batches = len(self.dataset.net) * self.args.num_walks / self.args.batch_size
    num_batches = int(np.ceil(num_batches))
    print("num batches: %d" % num_batches)

    self.init_device_emb()

    start_all = time.time()
    start = time.time()
    with torch.no_grad():
        i = 0
        max_i = self.args.iterations * num_batches
        for iteration in range(self.args.iterations):
            print("\nIteration: " + str(iteration + 1))
            self.dataset.walks = shuffle_walks(self.dataset.walks)

            while True:
                # linearly decay the learning rate for SGD, with a floor of 1e-5
                lr = self.args.lr * (max_i - i) / max_i
                if lr < 0.00001:
                    lr = 0.00001

                # multi-sequence input
                i_ = int(i % num_batches)
                walks = list(self.dataset.walks[i_ * self.args.batch_size: \
                    (1 + i_) * self.args.batch_size])
                if len(walks) == 0:
                    break

                if self.args.fast_neg:
                    self.emb_model.fast_learn_super(walks, lr)
                else:
                    # true negative sampling from the pre-built negative table
                    bs = len(walks)
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * num_pos * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn_super(walks, lr, neg_nodes=neg_nodes)

                i += 1
                if i > 0 and i % self.args.print_interval == 0:
                    print("Batch %d, training time: %.2fs" % (i, time.time() - start))
                    start = time.time()
                if i_ == num_batches - 1:
                    break

    print("Training used time: %.2fs" % (time.time() - start_all))
    self.emb_model.save_embedding(self.dataset, self.args.emb_file)
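# Illustrative sanity check (not part of the trainer) for the closed-form
# positive-pair count used in fast_train above: for a walk of L nodes and window
# size w with L > 2w, each interior node contributes 2*w (center, context) pairs,
# while the w nodes at either end lose part of their window, giving
# 2*L*w - w*(w+1) pairs in total.
def count_pos_pairs(L, w):
    # brute-force count: position i sees min(w, i) nodes to its left
    # and min(w, L - 1 - i) nodes to its right
    return sum(min(w, i) + min(w, L - 1 - i) for i in range(L))

L, w = 80, 5  # the walk_length and window_size defaults used in this example
assert count_pos_pairs(L, w) == 2 * L * w - w * (w + 1)  # both give 770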
def fast_train_mp(self):
    """ multi-CPU-core or mixed CPU & multi-GPU training """
    self.init_device_emb()
    self.emb_model.share_memory()
    self.dataset.walks = shuffle_walks(self.dataset.walks)

    start_all = time.time()
    ps = []

    # split the shuffled walks into nearly equal, contiguous chunks,
    # one per worker process
    l = len(self.dataset.walks)
    np_ = self.args.num_procs
    for i in range(np_):
        walks = self.dataset.walks[int(i * l / np_):int((i + 1) * l / np_)]
        p = mp.Process(target=self.fast_train_sp, args=(walks, i))
        ps.append(p)
        p.start()

    for p in ps:
        p.join()
    print("Used time: %.2fs" % (time.time() - start_all))

    self.emb_model.save_embedding(self.dataset, self.args.emb_file)
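# Minimal sketch (a plain Python list standing in for self.dataset.walks) of the
# slicing scheme used in fast_train_mp above: the shuffled walks are cut into
# num_procs contiguous chunks whose sizes differ by at most one, so every worker
# process trains on a disjoint subset of the walks.
def split_walks(walks, num_procs):
    l = len(walks)
    return [walks[int(i * l / num_procs):int((i + 1) * l / num_procs)]
            for i in range(num_procs)]

chunks = split_walks(list(range(10)), 3)
print([len(c) for c in chunks])  # [3, 3, 4]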
def __init__(self,
             net_file,
             map_file,
             walk_length,
             window_size,
             num_walks,
             batch_size,
             negative=5,
             gpus=[0],
             fast_neg=True,
             ogbl_name="",
             load_from_ogbl=False,
             ):
    """ This class has the following functions:
    1. Transform the txt network file into a DGL graph;
    2. Generate random walk sequences for the trainer;
    3. Provide the negative table if the user hopes to sample negative nodes
       according to nodes' degrees.

    Parameters
    ----------
    net_file str : path of the txt network file
    map_file str : path to save the node ID mapping
    walk_length int : number of nodes in a sequence
    window_size int : context window size
    num_walks int : number of walks for each node
    batch_size int : number of node sequences in each batch
    negative int : number of negative samples for each positive node pair
    gpus list : GPU IDs to use; the number of training processes equals len(gpus)
    fast_neg bool : whether to do negative sampling inside a batch
    ogbl_name str : name of the ogb.linkproppred dataset to load
    load_from_ogbl bool : whether to load the graph from ogb.linkproppred
        instead of the txt network file
    """
    self.walk_length = walk_length
    self.window_size = window_size
    self.num_walks = num_walks
    self.batch_size = batch_size
    self.negative = negative
    self.num_procs = len(gpus)
    self.fast_neg = fast_neg

    if load_from_ogbl:
        assert len(gpus) == 1, \
            "ogb.linkproppred is not compatible with multi-GPU training (CUDA error)."
        from load_dataset import load_from_ogbl_with_name
        self.G = load_from_ogbl_with_name(ogbl_name)
        self.G = make_undirected(self.G)
    else:
        self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
        self.save_mapping(map_file)
        self.G = net2graph(self.sm)
    self.num_nodes = self.G.number_of_nodes()

    # random walk seeds, split into one chunk per process
    start = time.time()
    self.valid_seeds = find_connected_nodes(self.G)
    if len(self.valid_seeds) != self.num_nodes:
        print("WARNING: The node IDs are not serial; "
              "nodes without edges will not be used as walk seeds.")
    seeds = torch.cat([torch.LongTensor(self.valid_seeds)] * num_walks)
    self.seeds = torch.split(
        shuffle_walks(seeds),
        int(np.ceil(len(self.valid_seeds) * self.num_walks / self.num_procs)),
        0)
    end = time.time()
    t = end - start
    print("%d seeds in %.2fs" % (len(seeds), t))

    # negative table for true negative sampling,
    # with sampling probability proportional to degree^0.75
    if not fast_neg:
        node_degree = self.G.out_degrees(self.valid_seeds).numpy()
        node_degree = np.power(node_degree, 0.75)
        node_degree /= np.sum(node_degree)
        node_degree = np.array(node_degree * 1e8, dtype=np.int64)
        self.neg_table = []
        for idx, node in enumerate(self.valid_seeds):
            self.neg_table += [node] * node_degree[idx]
        self.neg_table_size = len(self.neg_table)
        self.neg_table = np.array(self.neg_table, dtype=np.int64)
        del node_degree
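# Hedged usage sketch. The class name `DeepwalkDataset` is an assumption (the
# enclosing class is not shown here) and "ogbl-collab" is only an example
# ogb.linkproppred dataset name; substitute the names you actually use.
# With load_from_ogbl=True the txt network file and map_file are not read,
# and only a single GPU may be passed (see the assert above).
dataset = DeepwalkDataset(           # hypothetical class name
    net_file=None,                   # ignored when loading from OGB
    map_file="node_mapping.pickle",  # hypothetical path; unused in the OGB branch
    walk_length=80,
    window_size=5,
    num_walks=10,
    batch_size=32,
    negative=5,
    gpus=[0],
    fast_neg=True,
    ogbl_name="ogbl-collab",
    load_from_ogbl=True,
)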