def run(args, logger):
    init_time_start = time.time()
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size, args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval, args.neg_sample_size_eval)

    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, "if negative sampling based on degree, we can't filter positive edges."

    train_data = TrainDataset(dataset, args, ranks=args.num_proc)
    # if there is no cross-partition relation, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and train_data.cross_part
    args.num_workers = 8  # fix num_workers to 8

    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            train_sampler_head = train_data.create_sampler(args.batch_size,
                                                           args.neg_sample_size,
                                                           args.neg_sample_size,
                                                           mode='head',
                                                           num_workers=args.num_workers,
                                                           shuffle=True,
                                                           exclude_positive=False,
                                                           rank=i)
            train_sampler_tail = train_data.create_sampler(args.batch_size,
                                                           args.neg_sample_size,
                                                           args.neg_sample_size,
                                                           mode='tail',
                                                           num_workers=args.num_workers,
                                                           shuffle=True,
                                                           exclude_positive=False,
                                                           rank=i)
            train_samplers.append(NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
                                                                  args.neg_sample_size, args.neg_sample_size,
                                                                  True, dataset.n_entities))

        train_sampler = NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
                                                        args.neg_sample_size, args.neg_sample_size,
                                                        True, dataset.n_entities)
    else:  # This is used for debug
        train_sampler_head = train_data.create_sampler(args.batch_size,
                                                       args.neg_sample_size,
                                                       args.neg_sample_size,
                                                       mode='head',
                                                       num_workers=args.num_workers,
                                                       shuffle=True,
                                                       exclude_positive=False)
        train_sampler_tail = train_data.create_sampler(args.batch_size,
                                                       args.neg_sample_size,
                                                       args.neg_sample_size,
                                                       mode='tail',
                                                       num_workers=args.num_workers,
                                                       shuffle=True,
                                                       exclude_positive=False)
        train_sampler = NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
                                                        args.neg_sample_size, args.neg_sample_size,
                                                        True, dataset.n_entities)

    if args.valid or args.test:
        if len(args.gpu) > 1:
            args.num_test_proc = args.num_proc if args.num_proc < len(args.gpu) else len(args.gpu)
        else:
            args.num_test_proc = args.num_proc
        eval_dataset = EvalDataset(dataset, args)

    if args.valid:
        if args.num_proc > 1:
            valid_sampler_heads = []
            valid_sampler_tails = []
            for i in range(args.num_proc):
                valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
                                                                 args.neg_sample_size_eval,
                                                                 args.neg_sample_size_eval,
                                                                 args.eval_filter,
                                                                 mode='chunk-head',
                                                                 num_workers=args.num_workers,
                                                                 rank=i, ranks=args.num_proc)
                valid_sampler_tail = eval_dataset.create_sampler('valid', args.batch_size_eval,
                                                                 args.neg_sample_size_eval,
                                                                 args.neg_sample_size_eval,
                                                                 args.eval_filter,
                                                                 mode='chunk-tail',
                                                                 num_workers=args.num_workers,
                                                                 rank=i, ranks=args.num_proc)
                valid_sampler_heads.append(valid_sampler_head)
                valid_sampler_tails.append(valid_sampler_tail)
        else:  # This is used for debug
            valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
                                                             args.neg_sample_size_eval,
                                                             args.neg_sample_size_eval,
                                                             args.eval_filter,
                                                             mode='chunk-head',
                                                             num_workers=args.num_workers,
                                                             rank=0, ranks=1)
            valid_sampler_tail = eval_dataset.create_sampler('valid', args.batch_size_eval,
                                                             args.neg_sample_size_eval,
                                                             args.neg_sample_size_eval,
                                                             args.eval_filter,
                                                             mode='chunk-tail',
                                                             num_workers=args.num_workers,
                                                             rank=0, ranks=1)

    if args.test:
        if args.num_test_proc > 1:
            test_sampler_tails = []
            test_sampler_heads = []
            for i in range(args.num_test_proc):
                test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                                args.neg_sample_size_eval,
                                                                args.neg_sample_size_eval,
                                                                args.eval_filter,
                                                                mode='chunk-head',
                                                                num_workers=args.num_workers,
                                                                rank=i, ranks=args.num_test_proc)
                test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                                args.neg_sample_size_eval,
                                                                args.neg_sample_size_eval,
                                                                args.eval_filter,
                                                                mode='chunk-tail',
                                                                num_workers=args.num_workers,
                                                                rank=i, ranks=args.num_test_proc)
                test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)
        else:
            test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.eval_filter,
                                                            mode='chunk-head',
                                                            num_workers=args.num_workers,
                                                            rank=0, ranks=1)
            test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.eval_filter,
                                                            mode='chunk-tail',
                                                            num_workers=args.num_workers,
                                                            rank=0, ranks=1)

    # load model
    model = load_model(logger, args, dataset.n_entities, dataset.n_relations)
    if args.num_proc > 1 or args.async_update:
        model.share_memory()

    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None

    print('Total initialize time {:.3f} seconds'.format(time.time() - init_time_start))

    # train
    start = time.time()
    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    if args.num_proc > 1:
        procs = []
        barrier = mp.Barrier(args.num_proc)
        for i in range(args.num_proc):
            valid_sampler = [valid_sampler_heads[i], valid_sampler_tails[i]] if args.valid else None
            proc = mp.Process(target=train_mp, args=(args,
                                                     model,
                                                     train_samplers[i],
                                                     valid_sampler,
                                                     i,
                                                     rel_parts,
                                                     cross_rels,
                                                     barrier))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_head, valid_sampler_tail] if args.valid else None
        train(args, model, train_sampler, valid_samplers, rel_parts=rel_parts)

    print('training takes {} seconds'.format(time.time() - start))

    if args.save_emb is not None:
        if not os.path.exists(args.save_emb):
            os.mkdir(args.save_emb)
        model.save_emb(args.save_emb, args.dataset)

        # We need to save the model configurations as well.
        conf_file = os.path.join(args.save_emb, 'config.json')
        with open(conf_file, 'w') as outfile:
            json.dump({'dataset': args.dataset,
                       'model': args.model_name,
                       'emb_size': args.hidden_dim,
                       'max_train_step': args.max_step,
                       'batch_size': args.batch_size,
                       'neg_sample_size': args.neg_sample_size,
                       'lr': args.lr,
                       'gamma': args.gamma,
                       'double_ent': args.double_ent,
                       'double_rel': args.double_rel,
                       'neg_adversarial_sampling': args.neg_adversarial_sampling,
                       'adversarial_temperature': args.adversarial_temperature,
                       'regularization_coef': args.regularization_coef,
                       'regularization_norm': args.regularization_norm},
                      outfile, indent=4)

    # test
    if args.test:
        start = time.time()
        if args.num_test_proc > 1:
            queue = mp.Queue(args.num_test_proc)
            procs = []
            for i in range(args.num_test_proc):
                proc = mp.Process(target=test_mp, args=(args,
                                                        model,
                                                        [test_sampler_heads[i], test_sampler_tails[i]],
                                                        i,
                                                        'Test',
                                                        queue))
                procs.append(proc)
                proc.start()

            total_metrics = {}
            metrics = {}
            logs = []
            for i in range(args.num_test_proc):
                log = queue.get()
                logs = logs + log

            for metric in logs[0].keys():
                metrics[metric] = sum([log[metric] for log in logs]) / len(logs)
            for k, v in metrics.items():
                print('Test average {} : {}'.format(k, v))

            for proc in procs:
                proc.join()
        else:
            test(args, model, [test_sampler_head, test_sampler_tail])
        print('testing takes {:.3f} seconds'.format(time.time() - start))
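# `get_compatible_batch_size` is called throughout this file but is not defined in
# this excerpt. A minimal sketch of what it most likely does (an assumption based on
# DGL-KE's utility of the same name): round the batch size up to a multiple of the
# negative sample size, so negatives can be shared across equally sized chunks of a batch.
import math

def get_compatible_batch_size(batch_size, neg_sample_size):
    # Only adjust when negatives are chunked within the batch and the sizes don't divide evenly.
    if neg_sample_size < batch_size and batch_size % neg_sample_size != 0:
        old_batch_size = batch_size
        batch_size = int(math.ceil(batch_size / neg_sample_size) * neg_sample_size)
        print('batch size ({}) is incompatible with the negative sample size ({}); '
              'changing the batch size to {}'.format(old_batch_size, neg_sample_size, batch_size))
    return batch_size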
def start_worker(args, logger):
    """Start kvclient for training
    """
    init_time_start = time.time()
    time.sleep(WAIT_TIME)  # wait for launch script

    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)

    args.machine_id = get_local_machine_id(server_namebook)

    dataset, entity_partition_book, local2global = get_partition_dataset(
        args.data_path,
        args.dataset,
        args.format,
        args.machine_id)

    n_entities = dataset.n_entities
    n_relations = dataset.n_relations

    print('Partition %d n_entities: %d' % (args.machine_id, n_entities))
    print("Partition %d n_relations: %d" % (args.machine_id, n_relations))

    entity_partition_book = F.tensor(entity_partition_book)
    relation_partition_book = get_long_tail_partition(dataset.n_relations, args.total_machine)
    relation_partition_book = F.tensor(relation_partition_book)
    local2global = F.tensor(local2global)

    relation_partition_book.share_memory_()
    entity_partition_book.share_memory_()
    local2global.share_memory_()

    train_data = TrainDataset(dataset, args, ranks=args.num_client)
    # if there is no cross-partition relation, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and train_data.cross_part

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size, args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval, args.neg_sample_size_eval)

    args.num_workers = 8  # fix num_workers to 8

    train_samplers = []
    for i in range(args.num_client):
        train_sampler_head = train_data.create_sampler(args.batch_size,
                                                       args.neg_sample_size,
                                                       args.neg_sample_size,
                                                       mode='head',
                                                       num_workers=args.num_workers,
                                                       shuffle=True,
                                                       exclude_positive=False,
                                                       rank=i)
        train_sampler_tail = train_data.create_sampler(args.batch_size,
                                                       args.neg_sample_size,
                                                       args.neg_sample_size,
                                                       mode='tail',
                                                       num_workers=args.num_workers,
                                                       shuffle=True,
                                                       exclude_positive=False,
                                                       rank=i)
        train_samplers.append(NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
                                                              args.neg_sample_size, args.neg_sample_size,
                                                              True, n_entities))

    dataset = None

    model = load_model(logger, args, n_entities, n_relations)
    model.share_memory()

    print('Total initialize time {:.3f} seconds'.format(time.time() - init_time_start))

    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    procs = []
    barrier = mp.Barrier(args.num_client)
    for i in range(args.num_client):
        proc = mp.Process(target=dist_train_test, args=(args,
                                                        model,
                                                        train_samplers[i],
                                                        entity_partition_book,
                                                        relation_partition_book,
                                                        local2global,
                                                        i,
                                                        rel_parts,
                                                        cross_rels,
                                                        barrier))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
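# `dgl.contrib.read_ip_config(filename=args.ip_config)` above expects a plain-text
# machine list. To the best of my understanding (the file itself is not part of this
# excerpt), DGL-KE's distributed launcher uses one line per machine in the form
# "<ip> <base_port> <number_of_servers>", for example:
#
#   172.31.24.245 30050 8
#   172.31.24.246 30050 8
#
# `get_local_machine_id` then presumably matches the local host against this namebook
# to produce the machine_id used to load the corresponding data partition.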
def main(args):
    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, "if negative sampling based on degree, we can't filter positive edges."

    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)
    args.pickle_graph = False
    args.train = False
    args.valid = False
    args.test = True
    args.strict_rel_part = False
    args.soft_rel_part = False
    args.async_update = False

    logger = get_logger(args)
    # Here we want to use the regular negative sampler because we need to ensure that
    # all positive edges are excluded.
    eval_dataset = EvalDataset(dataset, args)

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = args.neg_sample_size = eval_dataset.g.number_of_nodes()
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval, args.neg_sample_size_eval)

    args.num_workers = 8  # fix num_workers to 8
    if args.num_proc > 1:
        test_sampler_tails = []
        test_sampler_heads = []
        for i in range(args.num_proc):
            test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.eval_filter,
                                                            mode='chunk-head',
                                                            num_workers=args.num_workers,
                                                            rank=i, ranks=args.num_proc)
            test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.neg_sample_size_eval,
                                                            args.eval_filter,
                                                            mode='chunk-tail',
                                                            num_workers=args.num_workers,
                                                            rank=i, ranks=args.num_proc)
            test_sampler_heads.append(test_sampler_head)
            test_sampler_tails.append(test_sampler_tail)
    else:
        test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                        args.neg_sample_size_eval,
                                                        args.neg_sample_size_eval,
                                                        args.eval_filter,
                                                        mode='chunk-head',
                                                        num_workers=args.num_workers,
                                                        rank=0, ranks=1)
        test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
                                                        args.neg_sample_size_eval,
                                                        args.neg_sample_size_eval,
                                                        args.eval_filter,
                                                        mode='chunk-tail',
                                                        num_workers=args.num_workers,
                                                        rank=0, ranks=1)

    # load model
    n_entities = dataset.n_entities
    n_relations = dataset.n_relations
    ckpt_path = args.model_path
    model = load_model_from_checkpoint(logger, args, n_entities, n_relations, ckpt_path)

    if args.num_proc > 1:
        model.share_memory()

    # test
    args.step = 0
    args.max_step = 0
    start = time.time()
    if args.num_proc > 1:
        queue = mp.Queue(args.num_proc)
        procs = []
        for i in range(args.num_proc):
            proc = mp.Process(target=test_mp, args=(args,
                                                    model,
                                                    [test_sampler_heads[i], test_sampler_tails[i]],
                                                    i,
                                                    'Test',
                                                    queue))
            procs.append(proc)
            proc.start()

        total_metrics = {}
        metrics = {}
        logs = []
        for i in range(args.num_proc):
            log = queue.get()
            logs = logs + log

        for metric in logs[0].keys():
            metrics[metric] = sum([log[metric] for log in logs]) / len(logs)
        for k, v in metrics.items():
            print('Test average {} at [{}/{}]: {}'.format(k, args.step, args.max_step, v))

        for proc in procs:
            proc.join()
    else:
        test(args, model, [test_sampler_head, test_sampler_tail])
    print('Test takes {:.3f} seconds'.format(time.time() - start))
def main():
    args = ArgParser().parse_args()
    prepare_save_path(args)
    args.neg_sample_size_eval = 1000
    set_global_seed(args.seed)

    init_time_start = time.time()
    dataset = get_dataset(args, args.data_path, args.dataset, args.format,
                          args.delimiter, args.data_files, args.has_edge_importance)

    args.batch_size = get_compatible_batch_size(args.batch_size, args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval, args.neg_sample_size_eval)
    # print(args)
    set_logger(args)

    print("To build training dataset")
    t1 = time.time()
    train_data = TrainDataset(dataset, args, has_importance=args.has_edge_importance)
    print("Training dataset built, it takes %s seconds" % (time.time() - t1))

    args.num_workers = 8  # fix num_workers to 8
    print("Building training sampler")
    t1 = time.time()
    train_sampler_head = train_data.create_sampler(batch_size=args.batch_size,
                                                   num_workers=args.num_workers,
                                                   neg_sample_size=args.neg_sample_size,
                                                   neg_mode='head')
    train_sampler_tail = train_data.create_sampler(batch_size=args.batch_size,
                                                   num_workers=args.num_workers,
                                                   neg_sample_size=args.neg_sample_size,
                                                   neg_mode='tail')
    train_sampler = NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail)
    print("Training sampler created, it takes %s seconds" % (time.time() - t1))

    if args.valid or args.test:
        if len(args.gpu) > 1:
            args.num_test_proc = args.num_proc if args.num_proc < len(args.gpu) else len(args.gpu)
        else:
            args.num_test_proc = args.num_proc
        print("To create eval_dataset")
        t1 = time.time()
        eval_dataset = EvalDataset(dataset, args)
        print("eval_dataset created, it takes %d seconds" % (time.time() - t1))

    if args.valid:
        if args.num_proc > 1:
            valid_samplers = []
            for i in range(args.num_proc):
                print("creating valid sampler for proc %d" % i)
                t1 = time.time()
                valid_sampler_tail = eval_dataset.create_sampler('valid',
                                                                 args.batch_size_eval,
                                                                 mode='tail',
                                                                 num_workers=args.num_workers,
                                                                 rank=i,
                                                                 ranks=args.num_proc)
                valid_samplers.append(valid_sampler_tail)
                print("Valid sampler for proc %d created, it takes %s seconds" % (i, time.time() - t1))
        else:
            valid_sampler_tail = eval_dataset.create_sampler('valid',
                                                             args.batch_size_eval,
                                                             mode='tail',
                                                             num_workers=args.num_workers,
                                                             rank=0,
                                                             ranks=1)
            valid_samplers = [valid_sampler_tail]

    for arg in vars(args):
        logging.info('{:20}:{}'.format(arg, getattr(args, arg)))

    print("To create model")
    t1 = time.time()
    model = BaseKEModel(args=args,
                        n_entities=dataset.n_entities,
                        n_relations=dataset.n_relations,
                        model_name=args.model_name,
                        hidden_size=args.hidden_dim,
                        entity_feat_dim=dataset.entity_feat.shape[1],
                        relation_feat_dim=dataset.relation_feat.shape[1],
                        gamma=args.gamma,
                        double_entity_emb=args.double_ent,
                        relation_times=args.ote_size,
                        scale_type=args.scale_type)

    model.entity_feat = dataset.entity_feat
    model.relation_feat = dataset.relation_feat
    print(len(model.parameters()))

    if args.cpu_emb:
        print("using cpu emb\n" * 5)
    else:
        print("using gpu emb\n" * 5)
    optimizer = paddle.optimizer.Adam(learning_rate=args.mlp_lr,
                                      parameters=model.parameters())

    lr_tensor = paddle.to_tensor(args.lr)
    global_step = 0
    tic_train = time.time()
    log = {}
    log["loss"] = 0.0
    log["regularization"] = 0.0

    for step in range(0, args.max_step):
        pos_triples, neg_triples, ids, neg_head = next(train_sampler)
        loss = model.forward(pos_triples, neg_triples, ids, neg_head)
        log["loss"] = loss.numpy()[0]

        if args.regularization_coef > 0.0 and args.regularization_norm > 0:
            coef, nm = args.regularization_coef, args.regularization_norm
            reg = coef * norm(model.entity_embedding.curr_emb, nm)
            log['regularization'] = reg.numpy()[0]
            loss = loss + reg

        loss.backward()
        optimizer.step()
        if args.cpu_emb:
            model.entity_embedding.step(lr_tensor)
        optimizer.clear_grad()

        if (step + 1) % args.log_interval == 0:
            speed = args.log_interval / (time.time() - tic_train)
            logging.info(
                "step: %d, train loss: %.5f, regularization: %.4e, speed: %.2f steps/s"
                % (step, log["loss"], log["regularization"], speed))
            log["loss"] = 0.0
            tic_train = time.time()

        if args.valid and (step + 1) % args.eval_interval == 0 and step > 1 \
                and valid_samplers is not None:
            print("Valid begin")
            valid_start = time.time()
            valid_input_dict = test(args, model, valid_samplers, step, rank=0, mode='Valid')
            paddle.save(valid_input_dict,
                        os.path.join(args.save_path, "valid_{}.pkl".format(step)))

        # Save the model for the inference
        if (step + 1) % args.save_step == 0:
            print("The step:{}, save model path:{}".format(step + 1, args.save_path))
            model.save_model()
            print("Save model done.")
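# The `norm` helper used in the regularization branch above is not defined in this
# excerpt. A minimal sketch, under the assumption that it computes the usual KGE
# regularizer, i.e. the sum of |x|^p over the entity embeddings touched in the
# current step (`curr_emb`):
import paddle

def norm(emb, p):
    # Lp^p regularizer: element-wise |x|^p, summed over the whole tensor
    return paddle.sum(paddle.abs(emb) ** p)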
def __init__(self):
    # Step 1 of the process: LOAD THE DATASET
    # the contest's dataset
    self.dataset = WikiKG90MDataset(root='/home/nibiohnproj9/public/dataset/')
    self.n_entities = self.dataset.num_entities
    self.n_relations = self.dataset.num_relations

    # The prediction task is to predict Tail given Head and Relation, i.e. h,r -> t
    #   head: the source node
    #   relation: the type of edge
    #   tail: the destination node
    self.train = self.dataset.train_hrt.T  # the transpose of all the triplets

    # Even though the argument has_edge_importance seems to be set by default
    # to True, the edges appear to carry no additional information, so we keep
    # this as False. Conditionals involving this variable will be removed for simplicity.
    self.has_edge_importance = False

    # neg_sample_size is the number of negative samples we use for each
    # positive sample in training; neg_sample_size_eval is the number of negative
    # samples used to evaluate a positive sample.
    self.neg_sample_size = 100
    self.neg_sample_size_eval = 1000

    # Batch sizes for training and evaluation are both ints and default to
    # 400 and 50 respectively if not specified as an argument in setup.sh.
    self.batch_size = 400
    self.batch_size_eval = 50
    # We correct the values based on the negative sample sizes.
    self.batch_size = get_compatible_batch_size(self.batch_size, self.neg_sample_size)
    self.batch_size_eval = get_compatible_batch_size(self.batch_size_eval, self.neg_sample_size_eval)

    # We should turn on mixed CPU-GPU training for multi-GPU training.
    # mix_cpu_gpu: train a knowledge graph embedding model with both CPUs and GPUs.
    self.mix_cpu_gpu = True

    # num_proc: the number of processes used to train the model in parallel.
    # In multi-GPU training, the number of processes by default matches the number of GPUs.
    self.gpu = [0, 1, 2, 3]
    self.num_proc = len(self.gpu)

    # We force a synchronization between processes every x steps in
    # multiprocessing training. This potentially stabilizes the training process
    # and can yield better performance.
    self.force_sync_interval = 1000

    # Disable filtering of positive edges from randomly constructed negative edges
    # during evaluation (True if included as an argument).
    self.no_eval_filter = False
    self.eval_filter = not self.no_eval_filter

    # Construct negative samples proportional to vertex degree during evaluation
    # (True if included as an argument).
    self.neg_deg_sample_eval = False
    if self.neg_deg_sample_eval:
        assert not self.eval_filter, "if negative sampling based on degree, we can't filter positive edges."

    # args.soft_rel_part = args.mix_cpu_gpu and args.rel_part
    # # if there is no cross-partition relation, we fall back to strict_rel_part
    # args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
    self.num_workers = 8  # fix num_workers to 8
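# Worked example of the batch-size correction above, assuming get_compatible_batch_size
# behaves as sketched earlier (round up to a multiple of neg_sample_size only when
# neg_sample_size < batch_size):
#   batch_size=400,     neg_sample_size=100        -> 400 % 100 == 0, unchanged (400)
#   batch_size_eval=50, neg_sample_size_eval=1000  -> neg_sample_size_eval > batch_size_eval, unchanged (50)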