class DeepwalkTrainer:
    def __init__(self, args):
        """Initialize the trainer from parsed command-line arguments.

        Builds the random-walk dataset and records the embedding table
        size (one row per node in the network). The embedding model
        itself is created lazily in init_device_emb().
        """
        self.args = args
        self.dataset = DeepwalkDataset(
            net_file=args.net_file,
            map_file=args.map_file,
            walk_length=args.walk_length,
            window_size=args.window_size,
            num_walks=args.num_walks,
            batch_size=args.batch_size,
            negative=args.negative,
            num_procs=args.num_procs,
            fast_neg=args.fast_neg,
        )
        self.emb_size = len(self.dataset.net)
        self.emb_model = None

    def init_device_emb(self):
        """Create the SkipGram model and place it on the right device.

        Called once from fast_train_mp / fast_train. Exactly one device
        mode in [only_cpu, only_gpu, mix] and exactly one optimizer in
        [sgd, avg_sgd, adam] must be selected.
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
        # fix: the check is `>= 1`, so the message must say "at least 1",
        # not "larger than 1"
        assert self.args.num_procs >= 1, "The number of processes must be at least 1"
        choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
        assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

        # initialize the embedding on CPU first; it may be moved below
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            walk_length=self.args.walk_length,
            window_size=self.args.window_size,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            adam=self.args.adam,
            sgd=self.args.sgd,
            avg_sgd=self.args.avg_sgd,
            fast_neg=self.args.fast_neg,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            self.emb_model.all_to_device(0)
        elif self.args.mix:
            print("Mix CPU with %d GPU" % self.args.num_procs)
            if self.args.num_procs == 1:
                # a single worker owns GPU 0 directly; with more workers
                # each subprocess sets its own device in fast_train_sp
                self.emb_model.set_device(0)
        else:
            print("Run in %d CPU process" % self.args.num_procs)

    def train(self):
        """Train the embedding, dispatching on the number of processes."""
        if self.args.num_procs > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def fast_train_mp(self):
        """Train with multiple CPU cores, or mixed CPU & multi-GPU."""
        self.init_device_emb()
        self.emb_model.share_memory()

        start_all = time.time()
        ps = []

        np_ = self.args.num_procs
        for i in range(np_):
            p = mp.Process(target=self.fast_train_sp, args=(i,))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time() - start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)

    @thread_wrapped_func
    def fast_train_sp(self, gpu_id):
        """A subprocess worker for fast_train_mp.

        Parameters
        ----------
        gpu_id : int
            Rank of this worker; also used as the CUDA device id in
            mix mode.
        """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)

        torch.set_num_threads(self.args.num_threads)

        sampler = self.dataset.create_sampler(gpu_id)
        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )
        num_batches = len(dataloader)
        print("num batchs: %d in subprocess [%d]" % (num_batches, gpu_id))

        # number of positive node pairs in a sequence
        num_pos = int(2 * self.args.walk_length * self.args.window_size
                      - self.args.window_size * (self.args.window_size + 1))

        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            for i, walks in enumerate(dataloader):
                # linearly decay the learning rate for SGD, floored at 1e-5
                lr = self.args.lr * (max_i - i) / max_i
                if lr < 0.00001:
                    lr = 0.00001

                if self.args.fast_neg:
                    self.emb_model.fast_learn(walks, lr)
                else:
                    # sample negatives explicitly from the negative table
                    bs = len(walks)
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * num_pos * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(walks, lr, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    print("Solver [%d] batch %d tt: %.2fs" % (gpu_id, i, time.time() - start))
                    start = time.time()

    def fast_train(self):
        """Fast single-process training with a DataLoader."""
        # the number of positive node pairs of a node sequence
        num_pos = 2 * self.args.walk_length * self.args.window_size \
            - self.args.window_size * (self.args.window_size + 1)
        num_pos = int(num_pos)

        self.init_device_emb()

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )

        num_batches = len(dataloader)
        print("num batchs: %d" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            max_i = self.args.iterations * num_batches
            for iteration in range(self.args.iterations):
                print("\nIteration: " + str(iteration + 1))
                for i, walks in enumerate(dataloader):
                    # fix: decay the learning rate over the *global* step so
                    # the schedule spans all iterations instead of restarting
                    # every epoch (max_i already counts every iteration's
                    # batches); floored at 1e-5
                    step = iteration * num_batches + i
                    lr = self.args.lr * (max_i - step) / max_i
                    if lr < 0.00001:
                        lr = 0.00001

                    if self.args.fast_neg:
                        self.emb_model.fast_learn(walks, lr)
                    else:
                        # sample negatives explicitly from the negative table
                        bs = len(walks)
                        neg_nodes = torch.LongTensor(
                            np.random.choice(self.dataset.neg_table,
                                             bs * num_pos * self.args.negative,
                                             replace=True))
                        self.emb_model.fast_learn(walks, lr, neg_nodes=neg_nodes)

                    if i > 0 and i % self.args.print_interval == 0:
                        print("Batch %d, training time: %.2fs" % (i, time.time() - start))
                        start = time.time()

        print("Training used time: %.2fs" % (time.time() - start_all))
        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
class LineTrainer:
    def __init__(self, args):
        """Initialize the trainer from parsed command-line arguments.

        Builds the edge dataset (optionally loaded from OGB) and records
        the embedding table size (one row per node). The embedding model
        itself is created lazily in init_device_emb().
        """
        self.args = args
        self.dataset = LineDataset(
            net_file=args.data_file,
            batch_size=args.batch_size,
            negative=args.negative,
            gpus=args.gpus,
            fast_neg=args.fast_neg,
            ogbl_name=args.ogbl_name,
            load_from_ogbl=args.load_from_ogbl,
            ogbn_name=args.ogbn_name,
            load_from_ogbn=args.load_from_ogbn,
            num_samples=args.num_samples * 1000000,
        )
        self.emb_size = self.dataset.G.number_of_nodes()
        self.emb_model = None

    def init_device_emb(self):
        """Create the SkipGram model and place it on the right device.

        Called once from fast_train_mp / fast_train. Exactly one device
        mode in [only_cpu, only_gpu, mix] must be selected.
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"

        # initialize the embedding on CPU first; it may be moved below
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            only_fst=self.args.only_fst,
            only_snd=self.args.only_snd,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            fast_neg=self.args.fast_neg,
            record_loss=self.args.print_loss,
            async_update=self.args.async_update,
            num_threads=self.args.num_threads,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            assert self.args.gpus[0] >= 0
            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
            print("Mix CPU with %d GPU" % len(self.args.gpus))
            if len(self.args.gpus) == 1:
                # fix typo in the error message: "avaliable" -> "available"
                assert self.args.gpus[
                    0] >= 0, 'mix CPU with GPU should have available GPU'
                self.emb_model.set_device(self.args.gpus[0])
        else:
            print("Run in CPU process")

    def train(self):
        """Train the embedding, dispatching on the number of GPUs."""
        if len(self.args.gpus) > 1:
            self.fast_train_mp()
        else:
            self.fast_train()

    def _save_embedding(self):
        """Persist the trained embedding, honoring the save_in_pt flag.

        Extracted because fast_train_mp and fast_train previously
        duplicated this branch verbatim.
        """
        if self.args.save_in_pt:
            self.emb_model.save_embedding_pt(self.dataset,
                                             self.args.output_emb_file)
        else:
            self.emb_model.save_embedding(self.dataset,
                                          self.args.output_emb_file)

    def fast_train_mp(self):
        """Train with multiple CPU cores, or mixed CPU & multi-GPU."""
        self.init_device_emb()
        self.emb_model.share_memory()

        sum_up_params(self.emb_model)

        start_all = time.time()
        ps = []

        for i in range(len(self.args.gpus)):
            p = mp.Process(target=self.fast_train_sp,
                           args=(i, self.args.gpus[i]))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()

        print("Used time: %.2fs" % (time.time() - start_all))
        self._save_embedding()

    def fast_train_sp(self, rank, gpu_id):
        """A subprocess worker for fast_train_mp.

        Parameters
        ----------
        rank : int
            Rank of this worker, used to partition the sampler.
        gpu_id : int
            CUDA device id assigned to this worker in mix mode.
        """
        if self.args.mix:
            self.emb_model.set_device(gpu_id)

        torch.set_num_threads(self.args.num_threads)
        if self.args.async_update:
            self.emb_model.create_async_update()

        sampler = self.dataset.create_sampler(rank)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )
        num_batches = len(dataloader)
        print("num batchs: %d in process [%d] GPU [%d]" %
              (num_batches, rank, gpu_id))

        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    self.emb_model.fast_learn(edges)
                else:
                    # sample negatives explicitly from the negative table
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        # report the mean recorded loss since the last print,
                        # then reset the accumulators
                        if self.args.only_fst:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, -sum(self.emb_model.loss_fst)/self.args.print_interval))
                        elif self.args.only_snd:
                            print("GPU-[%d] batch %d time: %.2fs snd-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        else:
                            print("GPU-[%d] batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f" \
                                % (gpu_id, i, time.time()-start, \
                                    -sum(self.emb_model.loss_fst)/self.args.print_interval, \
                                    -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("GPU-[%d] batch %d time: %.2fs" %
                              (gpu_id, i, time.time() - start))
                    start = time.time()

            if self.args.async_update:
                self.emb_model.finish_async_update()

    def fast_train(self):
        """Fast training with a DataLoader, on only-gpu / only-cpu mode."""
        self.init_device_emb()

        if self.args.async_update:
            self.emb_model.share_memory()
            self.emb_model.create_async_update()

        sum_up_params(self.emb_model)

        sampler = self.dataset.create_sampler(0)

        dataloader = DataLoader(
            dataset=sampler.seeds,
            batch_size=self.args.batch_size,
            collate_fn=sampler.sample,
            shuffle=False,
            drop_last=False,
            num_workers=self.args.num_sampler_threads,
        )

        num_batches = len(dataloader)
        print("num batchs: %d\n" % num_batches)

        start_all = time.time()
        start = time.time()
        with torch.no_grad():
            for i, edges in enumerate(dataloader):
                if self.args.fast_neg:
                    self.emb_model.fast_learn(edges)
                else:
                    # sample negatives explicitly from the negative table
                    bs = edges.size()[0]
                    neg_nodes = torch.LongTensor(
                        np.random.choice(self.dataset.neg_table,
                                         bs * self.args.negative,
                                         replace=True))
                    self.emb_model.fast_learn(edges, neg_nodes=neg_nodes)

                if i > 0 and i % self.args.print_interval == 0:
                    if self.args.print_loss:
                        # report the mean recorded loss since the last print,
                        # then reset the accumulators
                        if self.args.only_fst:
                            print("Batch %d time: %.2fs fst-loss: %.4f" \
                                % (i, time.time()-start, -sum(self.emb_model.loss_fst)/self.args.print_interval))
                        elif self.args.only_snd:
                            print("Batch %d time: %.2fs snd-loss: %.4f" \
                                % (i, time.time()-start, -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        else:
                            print("Batch %d time: %.2fs fst-loss: %.4f snd-loss: %.4f" \
                                % (i, time.time()-start, \
                                    -sum(self.emb_model.loss_fst)/self.args.print_interval, \
                                    -sum(self.emb_model.loss_snd)/self.args.print_interval))
                        self.emb_model.loss_fst = []
                        self.emb_model.loss_snd = []
                    else:
                        print("Batch %d, training time: %.2fs" %
                              (i, time.time() - start))
                    start = time.time()

            if self.args.async_update:
                self.emb_model.finish_async_update()

        print("Training used time: %.2fs" % (time.time() - start_all))
        self._save_embedding()