def save():
    """Load a pretrained TSNE model with the EHR tokenizer vocabularies and invoke it on the output directory."""
    tokenizer = EHRTokenizer(data_dir='../data')
    logger.info("Use Pretraining model")
    model = TSNE.from_pretrained(model_name,
                                 dx_voc=tokenizer.dx_voc,
                                 rx_voc=tokenizer.rx_voc)
    # NOTE(review): calling the model object with output_dir presumably
    # triggers an export/visualization — confirm against TSNE.__call__.
    model(output_dir=output_dir)
    logger.info('# of model parameters: ' + str(get_n_params(model)))
def main(opts):
    """Train a model from scratch on MyDataset according to the flags in *opts*."""
    if not opts.do_train and not opts.do_eval:
        raise ValueError('At least one of `do_train` or `do_eval` must be True.')
    if os.path.exists(opts.output_dir) and os.listdir(opts.output_dir) and opts.do_train:
        raise ValueError('Output directory ({}) already exists and is not empty.'.format(opts.output_dir))

    # Create the output directory (if not exists)
    create_dir_if_not_exists(opts.output_dir)

    # Select compute device and count available GPUs
    opts.device = torch.device('cuda' if torch.cuda.is_available() and not opts.no_cuda else 'cpu')
    opts.n_gpus = torch.cuda.device_count() if str(opts.device) == 'cuda' else 0
    print('Device Type: {} | Number of GPUs: {}'.format(opts.device, opts.n_gpus), flush=True)

    # Load Datasets and Ontology
    dataset = MyDataset()

    if opts.do_train:
        # Load model from scratch
        model = Model.from_scratch('bert-base-chinese')
        model.move_to_device(opts)
        print('Number of model parameters is: {}'.format(get_n_params(model)))

        # Start Training
        print('Start Training', flush=True)
        model.run_train(dataset, opts)

        # Free up all memory pytorch has taken from gpu memory
        del model
        torch.cuda.empty_cache()
n_rnn_layers=args.n_rnn_layers, dropout=args.dropout, att_method='general', device=device) print('_' * 72) print() print(model) print('_' * 72) print() print('Num training samples :', len(train_df)) print('Num validation samples:', len(val_df)) print('Num test samples :', len(test_df)) print('Num labels :', n_labels) print("Max sequence length :", max_seq_len) print('Total parameter :', get_n_params(model)) print('_' * 72) print() try: model = train(model, train_dataset, val_dataset, args) print() except KeyboardInterrupt as e: print(e) print('\nSave last model at {}\n'.format(args.model_dir + '/final_model.pth')) torch.save(model.state_dict(), args.model_dir + '/final_model.pth') finally: model = MultiIntentModel(n_labels=n_labels, vocab_size=vocab_size, padding_idx=padding_idx,
def main(opts):
    """Train and/or evaluate a dialogue model on the dataset + ontology found in opts.data_dir."""
    if not opts.do_train and not opts.do_eval:
        raise ValueError('At least one of `do_train` or `do_eval` must be True.')
    if os.path.exists(opts.output_dir) and os.listdir(opts.output_dir) and opts.do_train:
        raise ValueError('Output directory ({}) already exists and is not empty.'.format(opts.output_dir))

    # Create the output directory (if not exists)
    create_dir_if_not_exists(opts.output_dir)

    # Select compute device and count available GPUs
    opts.device = torch.device('cuda' if torch.cuda.is_available() and not opts.no_cuda else 'cpu')
    opts.n_gpus = torch.cuda.device_count() if str(opts.device) == 'cuda' else 0
    print('Device Type: {} | Number of GPUs: {}'.format(opts.device, opts.n_gpus), flush=True)

    # Load Datasets and Ontology
    dataset, ontology = load_dataset(opts.data_dir)
    print('Loaded Datasets and Ontology', flush=True)
    print('Number of Train Dialogues: {}'.format(len(dataset['train'])), flush=True)
    print('Number of Dev Dialogues: {}'.format(len(dataset['dev'])), flush=True)
    print('Number of Test Dialogues: {}'.format(len(dataset['test'])), flush=True)

    if opts.do_train:
        # Load model from scratch and train it
        model = Model.from_scratch(opts.bert_model)
        model.move_to_device(opts)
        print('Number of model parameters is: {}'.format(get_n_params(model)))
        print('Start Training', flush=True)
        model.run_train(dataset, ontology, opts)
        # Free up all memory pytorch has taken from gpu memory
        del model
        torch.cuda.empty_cache()

    if opts.do_eval:
        if not (os.path.exists(opts.output_dir) and os.listdir(opts.output_dir)):
            raise ValueError('Output directory ({}) is empty. Cannot do evaluation'.format(opts.output_dir))
        # Load the trained model back from the output directory
        model = Model.from_model_path(os.path.abspath(opts.output_dir))
        model.move_to_device(opts)
        print('Number of model parameters is: {}'.format(get_n_params(model)))
        # Evaluate on dev and test splits
        print('Start Evaluating', flush=True)
        print(model.run_dev(dataset, ontology, opts), flush=True)
        print(model.run_test(dataset, ontology, opts), flush=True)
def main():
    """Train (or test-only evaluate) a point-cloud auto-encoder.

    Builds datasets/loaders, instantiates the model (optionally from a
    pretrained checkpoint), sets up the optimizer/scheduler, Visdom
    visualizers and Chamfer loss, then either runs a single test pass or the
    full training/validation loop with checkpointing.
    """
    # ------------------------------- Save Path -------------------------------
    train_writer = None
    valid_writer = None
    test_writer = None
    if args.save:
        save_path = '{},{},{}epochs,b{},lr{}'.format(args.model, args.optim,
                                                     args.epochs,
                                                     args.batch_size, args.lr)
        time_stamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
        save_path = os.path.join(time_stamp, save_path)
        save_path = os.path.join(args.dataName, save_path)
        save_path = os.path.join(args.save_path, save_path)
        # Fix: the original passed save_path as a second print() argument, so
        # the '{}' placeholder was printed literally instead of being filled.
        print('==> Will save Everything to {}'.format(save_path))
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Setting for TensorboardX
        train_writer = SummaryWriter(os.path.join(save_path, 'train'))
        valid_writer = SummaryWriter(os.path.join(save_path, 'valid'))
        test_writer = SummaryWriter(os.path.join(save_path, 'test'))
        # output_writer = SummaryWriter(os.path.join(save_path, 'Output_Writer'))

    # --------------------- Transforms / Data Augmentation --------------------
    co_transforms = pc_transforms.Compose([
        # pc_transforms.Delete(num_points=1466)
        # pc_transforms.Jitter_PC(sigma=0.01,clip=0.05),
        # pc_transforms.Scale(low=0.9,high=1.1),
        # pc_transforms.Shift(low=-0.01,high=0.01),
        # pc_transforms.Random_Rotate(),
        # pc_transforms.Random_Rotate_90(),
        # pc_transforms.Rotate_90(args,axis='x',angle=-1.0),# 1.0,2,3,4
        # pc_transforms.Rotate_90(args, axis='z', angle=2.0),
        # pc_transforms.Rotate_90(args, axis='y', angle=2.0),
        # pc_transforms.Rotate_90(args, axis='shape_complete') TODO this is essential for Angela's data set
    ])
    input_transforms = transforms.Compose([
        pc_transforms.ArrayToTensor(),
        # transforms.Normalize(mean=[0.5,0.5],std=[1,1])
    ])
    target_transforms = transforms.Compose([
        pc_transforms.ArrayToTensor(),
        # transforms.Normalize(mean=[0.5, 0.5], std=[1, 1])
    ])

    # ------------------------------- Data Loader -----------------------------
    if (args.net_name == 'auto_encoder'):
        [train_dataset, valid_dataset] = Datasets.__dict__[args.dataName](
            input_root=args.data,
            target_root=None,
            split=args.split_value,
            net_name=args.net_name,
            input_transforms=input_transforms,
            target_transforms=target_transforms,
            co_transforms=co_transforms)
        [test_dataset, _] = Datasets.__dict__[args.dataName](
            input_root=args.datatest,
            target_root=None,
            split=None,
            net_name=args.net_name,
            input_transforms=input_transforms,
            target_transforms=target_transforms,
            co_transforms=co_transforms)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=args.workers,
                                               shuffle=True,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=args.workers,
                                               shuffle=False,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              num_workers=args.workers,
                                              shuffle=False,
                                              pin_memory=True)

    # ------------------------------ Model Settings ---------------------------
    print('Model:', args.model)
    if args.pretrained:
        network_data = torch.load(args.pretrained)
        args.model = network_data['model']
        print("==> Using Pre-trained Model '{}' saved at {} ".format(
            args.model, args.pretrained))
    else:
        network_data = None

    if (args.model == 'ae_pointnet'):
        model = models.__dict__[args.model](args,
                                            num_points=2048,
                                            global_feat=True,
                                            data=network_data).cuda()
    else:
        model = models.__dict__[args.model](network_data).cuda()

    # model = torch.nn.DataParallel(model.cuda(),device_ids=[0,1]) TODO To make dataparallel run do Nigels Fix
    # https://github.com/pytorch/pytorch/issues/1637#issuecomment-338268158
    params = get_n_params(model)
    print('| Number of parameters [' + str(params) + ']...')

    # ---------------------------- Optimizer Settings -------------------------
    cudnn.benchmark = True
    print('Settings {} Optimizer'.format(args.optim))
    # param_groups = [{'params': model.module.bias_parameters(), 'weight_decay': args.bias_decay},
    #                 {'params': model.module.weight_parameters(), 'weight_decay': args.weight_decay}
    #                 ]
    # NOTE(review): `optimizer` is only bound when args.optim == 'Adam'; any
    # other value raises NameError at the scheduler below — confirm intent.
    if args.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     betas=(args.momentum, args.beta))
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.milestones, gamma=args.gamma)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    # ------------------------- Visualizer Initialization ---------------------
    visualizer = Visualizer(args)
    args.display_id = args.display_id + 10
    args.name = 'Validation'
    vis_Valid = Visualizer(args)
    vis_Valida = []
    args.display_id = args.display_id + 10
    for i in range(1, 12):
        vis_Valida.append(Visualizer(args))
        args.display_id = args.display_id + 10

    # ------------------------------- Loss Setting ----------------------------
    chamfer = ChamferLoss(args)
    best_loss = -1
    valid_loss = 1000

    if args.test_only:
        epoch = 0
        # NOTE(review): test() is fed valid_loader here while validation()
        # below is fed test_loader — the loaders look swapped; confirm before
        # relying on the reported metrics.
        test_loss, _, _ = test(valid_loader, model, epoch, args, chamfer,
                               vis_Valid, vis_Valida, test_writer)
        test_writer.add_scalar('mean Loss', test_loss, epoch)
        print('Average Loss :{}'.format(test_loss))
    else:
        # ----------------------- Training and Validation ---------------------
        for epoch in range(args.start_epoch, args.epochs):
            # NOTE(review): scheduler.step() before the optimizer step is the
            # pre-1.1.0 PyTorch ordering; kept as-is to preserve the original
            # LR schedule.
            scheduler.step()
            train_loss, _, _ = train(train_loader, model, optimizer, epoch,
                                     args, chamfer, visualizer, train_writer)
            train_writer.add_scalar('mean Loss', train_loss, epoch)
            valid_loss, _, _ = validation(test_loader, model, epoch, args,
                                          chamfer, vis_Valid, vis_Valida,
                                          valid_writer)
            valid_writer.add_scalar('mean Loss', valid_loss, epoch)
            if best_loss < 0:
                best_loss = valid_loss
            is_best = valid_loss < best_loss
            best_loss = min(valid_loss, best_loss)
            if args.save:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'model': args.model,
                        # TODO if data parallel is fixed write model.module.state_dict()
                        'state_dict': model.state_dict(),
                        'state_dict_encoder': model.encoder.state_dict(),
                        'state_dict_decoder': model.decoder.state_dict(),
                        'best_loss': best_loss
                    }, is_best, save_path)
def main(args):
    """Train a weighted-aggregator + SIGN model on preprocessed relation-subset features."""
    if args.seed is not None:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
    device = "cpu" if args.gpu < 0 else f"cuda:{args.gpu}"

    # Load dataset
    g, labels, num_classes, train_nid, val_nid, test_nid = load_data(device, args)
    evaluator = get_evaluator(args.dataset)

    # Preprocess neighbor-averaged features over sampled relation subgraphs
    rel_subsets = read_relation_subsets(args.use_relation_subsets)
    with torch.no_grad():
        feats = preprocess_features(g, rel_subsets, args, device)
        print("Done preprocessing")
    labels = labels.to(device)
    # Release the graph since we are not going to use it later
    g = None

    # Set up logging
    logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
    logging.info(str(args))
    _, num_feats, in_feats = feats[0].shape
    logging.info(f"new input size: {num_feats} {in_feats}")

    # Create model: a learned aggregator feeding a SIGN classifier
    num_hops = args.R + 1  # include self feature hop 0
    model = nn.Sequential(
        WeightedAggregator(num_feats, in_feats, num_hops),
        SIGN(in_feats, args.num_hidden, num_classes, num_hops,
             args.ff_layer, args.dropout, args.input_dropout))
    logging.info("# Params: {}".format(get_n_params(model)))
    model.to(device)

    if len(labels.shape) == 1:
        loss_fcn = nn.NLLLoss()  # single label multi-class
    else:
        loss_fcn = nn.KLDivLoss(reduction='batchmean')  # multi-label multi-class
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Start training; track the epoch with the best validation score
    best_epoch = 0
    best_val = 0
    for epoch in range(1, args.num_epochs + 1):
        start = time.time()
        train(model, feats, labels, train_nid, loss_fcn, optimizer,
              args.batch_size)
        if epoch % args.eval_every == 0:
            with torch.no_grad():
                train_res, val_res, test_res = test(
                    model, feats, labels, train_nid, val_nid, test_nid,
                    evaluator, args.eval_batch_size)
            end = time.time()
            val_acc = val_res[0]
            log = "Epoch {}, Times(s): {:.4f}".format(epoch, end - start)
            if args.dataset.startswith("oag"):
                log += ", NDCG: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[0], val_res[0], test_res[0])
                log += ", MRR: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[1], val_res[1], test_res[1])
            else:
                log += ", Accuracy: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[0], val_res[0], test_res[0])
            logging.info(log)
            if val_acc > best_val:
                best_val = val_acc
                best_epoch = epoch

    logging.info("Best Epoch {}, Val {:.4f}".format(best_epoch, best_val))
def main():
    """Run a pretrained encoder/decoder pair over the dataset and report the
    mean test loss (GFV-file extraction path).

    Loads both models from the checkpoint in args.pretrained, builds the data
    loaders without transforms, sets up Visdom visualizers and the Chamfer
    loss, then performs a single test() pass.
    """
    # ----------------------- Path to save the GFV files ----------------------
    if args.save:
        save_path = '{}'.format(args.model_encoder)
        time_stamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
        save_path = os.path.join(time_stamp, save_path)
        save_path = os.path.join(args.dataName, save_path)
        save_path = os.path.join(args.save_path, save_path)
        # Fix: the original passed save_path as a second print() argument, so
        # the '{}' placeholder was printed literally instead of being filled.
        print('==> Will save Everything to {}'.format(save_path))
        if not os.path.exists(save_path):
            os.makedirs(save_path)
    # NOTE(review): when args.save is false, save_path is never bound but is
    # still passed to test() below — that path raises NameError; confirm
    # whether --save is effectively mandatory here.

    # ------------------------------- Data Loader -----------------------------
    [train_dataset, valid_dataset] = Datasets.__dict__[args.dataName](
        input_root=args.data,
        target_root=None,
        split=args.split_value,
        net_name=args.net_name,
        input_transforms=None,
        target_transforms=None,
        co_transforms=None,
        give_name=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=args.workers,
                                               shuffle=True,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=args.workers,
                                               shuffle=False,
                                               pin_memory=True)

    # ------------------------------ Model Settings ---------------------------
    print('Model:', args.model_encoder)
    network_data = torch.load(args.pretrained)
    model_encoder = models.__dict__[args.model_encoder](
        args,
        num_points=2048,
        global_feat=True,
        data=network_data,
        calc_loss=False).cuda()
    model_decoder = models.__dict__[args.model_decoder](
        args, data=network_data).cuda()

    params = get_n_params(model_encoder)
    print('| Number of Encoder parameters [' + str(params) + ']...')
    params = get_n_params(model_decoder)
    print('| Number of Decoder parameters [' + str(params) + ']...')

    # ------------------------- Visualizer Initialization ---------------------
    visualizer = Visualizer(args)
    args.display_id = args.display_id + 10
    args.name = 'Validation'
    vis_Valid = Visualizer(args)
    vis_Valida = []
    args.display_id = args.display_id + 10
    for i in range(1, 12):
        vis_Valida.append(Visualizer(args))
        args.display_id = args.display_id + 10

    # Single evaluation pass with the Chamfer distance
    chamfer = ChamferLoss(args)
    epoch = 0
    test_loss = test(train_loader, valid_loader, model_encoder, model_decoder,
                     epoch, args, chamfer, vis_Valid, vis_Valida, save_path)
    print('Average Loss :{}'.format(test_loss))
def main():
    """Parse CLI flags, then optionally train, evaluate and test a G-Bert
    medication-prediction model, checkpointing the best eval PRAUC."""
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_name", default='GBert-predict', type=str,
                        required=False, help="model name")
    parser.add_argument("--data_dir", default='../data', type=str,
                        required=False, help="The input data dir.")
    parser.add_argument("--pretrain_dir", default='../saved/GBert-pretraining',
                        type=str, required=False, help="pretraining model")
    parser.add_argument("--train_file", default='data-multi-visit.pkl',
                        type=str, required=False, help="training data file.")
    parser.add_argument(
        "--output_dir", default='../saved/', type=str, required=False,
        help="The output directory where the model checkpoints will be written.")

    # Other parameters
    parser.add_argument("--use_pretrain", default=False, action='store_true',
                        help="is use pretrain")
    parser.add_argument("--graph", default=False, action='store_true',
                        help="if use ontology embedding")
    parser.add_argument("--therhold", default=0.3, type=float,
                        help="therhold.")
    parser.add_argument(
        "--max_seq_length", default=55, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
             "Sequences longer than this will be truncated, and sequences shorter \n"
             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    # NOTE(review): default=True combined with action='store_true' means the
    # two flags below can never be switched off from the command line —
    # confirm this is intended.
    parser.add_argument("--do_eval", default=True, action='store_true',
                        help="Whether to run on the dev set.")
    parser.add_argument("--do_test", default=True, action='store_true',
                        help="Whether to run on the test set.")
    parser.add_argument("--train_batch_size", default=1, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=20.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=1203,
                        help="random seed for initialization")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
             "E.g., 0.1 = 10%% of training.")

    args = parser.parse_args()
    args.output_dir = os.path.join(args.output_dir, args.model_name)

    # Seed all RNGs for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError(
    #         "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    print("Loading Dataset")
    tokenizer, (train_dataset, eval_dataset, test_dataset) = load_dataset(args)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=1)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=SequentialSampler(eval_dataset),
                                 batch_size=1)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=1)

    print('Loading Model: ' + args.model_name)
    # config = BertConfig(vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx), side_len=train_dataset.side_len)
    # config.graph = args.graph
    # model = SeperateBertTransModel(config, tokenizer.dx_voc, tokenizer.rx_voc)
    if args.use_pretrain:
        logger.info("Use Pretraining model")
        model = GBERT_Predict.from_pretrained(args.pretrain_dir,
                                              tokenizer=tokenizer)
    else:
        config = BertConfig(
            vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx))
        config.graph = args.graph
        model = GBERT_Predict(config, tokenizer)
    logger.info('# of model parameters: ' + str(get_n_params(model)))

    model.to(device)
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    rx_output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")

    # Prepare optimizer
    # num_train_optimization_steps = int(
    #     len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    # param_optimizer = list(model.named_parameters())
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.0}
    # ]
    # optimizer = BertAdam(optimizer_grouped_parameters,
    #                      lr=args.learning_rate,
    #                      warmup=args.warmup_proportion,
    #                      t_total=num_train_optimization_steps)
    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        writer = SummaryWriter(args.output_dir)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Batch size = %d", 1)

        dx_acc_best, rx_acc_best = 0, 0
        acc_name = 'prauc'
        dx_history = {'prauc': []}
        rx_history = {'prauc': []}

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            print('')
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            prog_iter = tqdm(train_dataloader, leave=False, desc='Training')
            model.train()
            for _, batch in enumerate(prog_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, dx_labels, rx_labels = batch
                # Drop the singleton batch dimension added by the DataLoader
                input_ids, dx_labels, rx_labels = input_ids.squeeze(
                    dim=0), dx_labels.squeeze(dim=0), rx_labels.squeeze(dim=0)
                loss, rx_logits = model(input_ids,
                                        dx_labels=dx_labels,
                                        rx_labels=rx_labels,
                                        epoch=global_step)
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += 1
                nb_tr_steps += 1

                # Display loss
                prog_iter.set_postfix(loss='%.4f' % (tr_loss / nb_tr_steps))

                optimizer.step()
                optimizer.zero_grad()

            writer.add_scalar('train/loss', tr_loss / nb_tr_steps, global_step)
            global_step += 1

            if args.do_eval:
                print('')
                logger.info("***** Running eval *****")
                model.eval()
                dx_y_preds = []
                dx_y_trues = []
                rx_y_preds = []
                rx_y_trues = []
                for eval_input in tqdm(eval_dataloader, desc="Evaluating"):
                    eval_input = tuple(t.to(device) for t in eval_input)
                    input_ids, dx_labels, rx_labels = eval_input
                    input_ids, dx_labels, rx_labels = input_ids.squeeze(
                    ), dx_labels.squeeze(), rx_labels.squeeze(dim=0)
                    with torch.no_grad():
                        loss, rx_logits = model(input_ids,
                                                dx_labels=dx_labels,
                                                rx_labels=rx_labels)
                        rx_y_preds.append(t2n(torch.sigmoid(rx_logits)))
                        rx_y_trues.append(t2n(rx_labels))
                        # dx_y_preds.append(t2n(torch.sigmoid(dx_logits)))
                        # dx_y_trues.append(
                        #     t2n(dx_labels.view(-1, len(tokenizer.dx_voc.word2idx))))
                        # rx_y_preds.append(t2n(torch.sigmoid(rx_logits))[
                        #     :, tokenizer.rx_singe2multi])
                        # rx_y_trues.append(
                        #     t2n(rx_labels)[:, tokenizer.rx_singe2multi])

                print('')
                # dx_acc_container = metric_report(np.concatenate(dx_y_preds, axis=0), np.concatenate(dx_y_trues, axis=0),
                #                                  args.therhold)
                rx_acc_container = metric_report(
                    np.concatenate(rx_y_preds, axis=0),
                    np.concatenate(rx_y_trues, axis=0), args.therhold)
                for k, v in rx_acc_container.items():
                    writer.add_scalar('eval/{}'.format(k), v, global_step)

                # Keep the checkpoint with the best eval PRAUC so far
                if rx_acc_container[acc_name] > rx_acc_best:
                    rx_acc_best = rx_acc_container[acc_name]
                    torch.save(model_to_save.state_dict(),
                               rx_output_model_file)

        with open(os.path.join(args.output_dir, 'bert_config.json'), 'w',
                  encoding='utf-8') as fout:
            fout.write(model.config.to_json_string())

    if args.do_test:
        logger.info("***** Running test *****")
        logger.info(" Num examples = %d", len(test_dataset))
        logger.info(" Batch size = %d", 1)

        def test(task=0):
            # Load a trained model that you have fine-tuned
            model_state_dict = torch.load(rx_output_model_file)
            model.load_state_dict(model_state_dict)
            model.to(device)
            model.eval()

            y_preds = []
            y_trues = []
            for test_input in tqdm(test_dataloader, desc="Testing"):
                test_input = tuple(t.to(device) for t in test_input)
                input_ids, dx_labels, rx_labels = test_input
                input_ids, dx_labels, rx_labels = input_ids.squeeze(
                ), dx_labels.squeeze(), rx_labels.squeeze(dim=0)
                with torch.no_grad():
                    loss, rx_logits = model(input_ids,
                                            dx_labels=dx_labels,
                                            rx_labels=rx_labels)
                    y_preds.append(t2n(torch.sigmoid(rx_logits)))
                    y_trues.append(t2n(rx_labels))

            print('')
            acc_container = metric_report(np.concatenate(y_preds, axis=0),
                                          np.concatenate(y_trues, axis=0),
                                          args.therhold)
            # save report
            if args.do_train:
                for k, v in acc_container.items():
                    writer.add_scalar('test/{}'.format(k), v, 0)
            return acc_container

        test(task=0)
model_conv = Bbox(numPoints) model_conv = torch.nn.DataParallel(model_conv, device_ids=range( torch.cuda.device_count())) model_conv.load_state_dict(torch.load(resume_file)) model_conv = model_conv.cuda() else: model_conv = Bbox(numPoints) if use_gpu: model_conv = torch.nn.DataParallel(model_conv, device_ids=range( torch.cuda.device_count())) model_conv = model_conv.cuda() print("model:", model_conv) print("model params:", get_n_params(model_conv)) # criterion = nn.MSELoss() # optimizer = optim.Adam(model_conv.parameters()) # optimizer_conv = optim.Adam(model_conv.parameters(), lr=0.01) optimizer = optim.SGD(model_conv.parameters(), lr=0.001, momentum=0.9) lrScheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1) dst = ChaLocDataLoader(args["images"].split(','), imgSize) train_loader = DataLoader(dst, batch_size=batchSize, shuffle=True, num_workers=4) def train_model(model, optimizer, num_epochs=5):
def train_iters(self):
    """Outer training loop: iterate epochs and batches, periodically validate,
    plot the loss curves and save (best) checkpoints."""
    logger.info("number of parameters {0}".format(utils.get_n_params(self.model)))
    for epoch in range(self.start_epoch, self.epochs + 1):
        start_time = time.time()
        total_training_loss = 0.0
        training_batch_count = 0
        counter = 0

        # training
        for i, data_dict in enumerate(self.training_dataset):
            if i < 2 or i % 30 == 0 or counter < 1 or counter % 30 == 0:
                logger.info("epoch {0} step {1} counter {2}".format(epoch, i, counter))
            if data_dict is None:
                continue

            # return the predictions from train() if true
            save_song = True if i % self.report_step < 5 else False

            # train on one batch, iterating through the segments
            outputs, loss = self.train(data_dict,
                                       save_song_outputs=save_song,
                                       plot=True if i == 0 or i == 1 else False,
                                       register_hooks=False)
            training_batch_count += 1
            total_training_loss += loss

            # save sample outputs
            if save_song:
                save_outputs(self.results_directory,
                             self.user_prediction_directory,
                             epoch,
                             data_dict['perf_id'],
                             outputs,
                             data_dict['shifts_gt'].detach().cpu().numpy(),
                             data_dict['original_boundaries'],
                             training=True)

            # clear memory
            del data_dict
            del outputs

            # validate and report losses every report_step and at end of epoch.
            # Save the checkpoint if it is better
            if counter % self.report_step == 0:
                # iterate through the full validation set
                mean_validation_loss = self.validate_iters(epoch, self.validation_dataset)
                # running average of training loss
                mean_training_loss = total_training_loss / training_batch_count

                # report current losses and print list of losses so far
                logger.info("***********************************************************************************")
                logger.info("{0}: Training and validation loss epoch {1} step {2} : {3} {4}".format(
                    self.extension, epoch, i, mean_training_loss, mean_validation_loss))
                self.training_losses.append(mean_training_loss)
                self.validation_losses.append(mean_validation_loss)
                logger.info("***********************************************************************************")
                logger.info("Training and validation losses so far:\n{0}\n{1}".format(
                    self.training_losses, self.validation_losses))

                # plot the loss curves
                bplt.output_file(os.path.join(self.plot_directory, "rnn_losses.html"))
                fig_tr = bplt.figure(title="Training losses")
                fig_ev = bplt.figure(title="Evaluation losses")
                fig_cb = bplt.figure(title="Training and evaluation losses")
                fig_fx = bplt.figure(title="Losses with fixed y-axis range", y_range=[0, 6.0e-4])
                fig_tr.circle(np.arange(len(self.training_losses)), self.training_losses, color="red")
                fig_ev.circle(np.arange(len(self.validation_losses)), self.validation_losses, color="red")
                fig_cb.circle(np.arange(len(self.training_losses)), self.training_losses, color="green")
                fig_cb.circle(np.arange(len(self.validation_losses)), self.validation_losses, color="orange")
                fig_fx.circle(np.arange(len(self.training_losses)), self.training_losses, color="green")
                fig_fx.circle(np.arange(len(self.validation_losses)), self.validation_losses, color="orange")
                bplt.save(bplt.gridplot([fig_tr, fig_ev], [fig_cb, fig_fx]))

                # save model and replace best if necessary
                logger.info("is_best before {0} mean loss {1} best prec {2}".format(
                    self.is_best, mean_validation_loss, self.best_prec1))
                self.is_best = True if mean_validation_loss < self.best_prec1 else False
                self.best_prec1 = min(mean_validation_loss, self.best_prec1)
                logger.info("is_best after {0} mean loss {1} best prec {2}".format(
                    self.is_best, mean_validation_loss, self.best_prec1))
                if self.sandbox is False:
                    save_checkpoint({'epoch': epoch + 1,
                                     'state_dict': self.model.state_dict(),
                                     'best_prec1': self.best_prec1,
                                     'optimizer': self.optimizer.state_dict(),
                                     'training_losses': self.training_losses,
                                     'validation_losses': self.validation_losses},
                                    self.is_best,
                                    latest_filename=self.latest_checkpoint_file,
                                    best_filename=self.best_checkpoint_file)
            counter += 1

            # simulated annealing
            # for param_group in self.optimizer.param_groups:
            #     param_group['lr'] *= 0.998

        logger.info("--- {0} time elapsed for one epoch ---".format(time.time() - start_time))
def main(args):
    """Train a SIGN model with a partial weighted aggregator, periodically
    resampling which relation subsets are materialized."""
    if args.seed is not None:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
    device = "cpu" if args.gpu < 0 else f"cuda:{args.gpu}"

    # Load dataset
    g, labels, num_classes, train_nid, val_nid, test_nid = load_data(device, args)
    evaluator = get_evaluator(args.dataset)

    rel_subsets = read_relation_subsets(args.use_relation_subsets)
    num_feats = len(rel_subsets)
    in_feats = g.nodes["paper"].data["feat"].shape[1]
    num_paper = g.number_of_nodes("paper")
    num_hops = args.R + 1  # include self feature hop 0
    aggregator = PartialWeightedAggregator(num_feats, in_feats, num_hops,
                                           args.sample_size)

    # Preprocess neighbor-averaged features over sampled relation subgraphs
    with torch.no_grad():
        history_sum = preprocess_agg(g, rel_subsets, args, device, aggregator)
    print("Done preprocessing")
    labels = labels.to(device)

    # Set up logging
    logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
    logging.info(str(args))

    # Create model: the aggregator feeding a SIGN classifier
    model = nn.Sequential(
        aggregator,
        SIGN(in_feats, args.num_hidden, num_classes, num_hops,
             args.ff_layer, args.dropout, args.input_dropout))
    logging.info("# Params: {}".format(get_n_params(model)))
    model.to(device)

    if len(labels.shape) == 1:
        loss_fcn = nn.NLLLoss()  # single label multi-class
    else:
        loss_fcn = nn.KLDivLoss(reduction="batchmean")  # multi-label multi-class
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Initial random choice of relation subsets to materialize
    with torch.no_grad():
        selected = np.random.choice(num_feats, args.sample_size, replace=False)
        selected_subsets = [rel_subsets[i] for i in selected]
        feats_selected = recompute_selected_subsets(g, selected_subsets, args,
                                                    num_paper, in_feats, device)

    # Start training; track the epoch with the best validation score
    best_epoch = 0
    best_val = 0
    for epoch in range(1, args.num_epochs + 1):
        start = time.time()
        model.train()
        train(model, feats_selected, labels, train_nid, loss_fcn, optimizer,
              args.batch_size, history=history_sum)
        if epoch % args.eval_every == 0:
            with torch.no_grad():
                train_res, val_res, test_res = test(
                    model, feats_selected, labels, train_nid, val_nid,
                    test_nid, evaluator, args.eval_batch_size,
                    history=history_sum)
            end = time.time()
            val_acc = val_res[0]
            log = "Epoch {}, Times(s): {:.4f}".format(epoch, end - start)
            if args.dataset.startswith("oag"):
                log += ", NDCG: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[0], val_res[0], test_res[0])
                log += ", MRR: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[1], val_res[1], test_res[1])
            else:
                log += ", Accuracy: Train {:.4f}, Val {:.4f}, Test {:.4f}".format(
                    train_res[0], val_res[0], test_res[0])
            logging.info(log)
            if val_acc > best_val:
                best_val = val_acc
                best_epoch = epoch

        # update history and aggregation weight and resample
        if epoch % args.resample_every == 0:
            with torch.no_grad():
                aggregator.cpu()
                history_sum = aggregator((feats_selected, history_sum))
                aggregator.update_selected(selected)
                aggregator.to(device)
                selected = np.random.choice(num_feats, args.sample_size,
                                            replace=False)
                selected_subsets = [rel_subsets[i] for i in selected]
                feats_selected = recompute_selected_subsets(
                    g, selected_subsets, args, num_paper, in_feats, device)

    logging.info("Best Epoch {}, Val {:.4f}".format(best_epoch, best_val))
num_workers=12) infer_val_test_loader = NeighborSampler(edge_index, node_idx=paper_val_test_idx, sizes=[-1, -1], batch_size=args.test_batch_size, shuffle=True, num_workers=12) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' model = RGNN(128, args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout, num_nodes_dict, list(x_dict.keys()), len(edge_index_dict.keys()), args).to(device) print('Model #Params: %d' % get_n_params(model)) # Create global label vector. y_global = node_type.new_full((node_type.size(0), 1), -1) y_global[local2global['paper']] = data.y_dict['paper'] # Move everything to the GPU. x_dict = {k: v.to(device) for k, v in x_dict.items()} edge_type = edge_type.to(device) node_type = node_type.to(device) local_node_idx = local_node_idx.to(device) y_global = y_global.to(device) def train_vanilla(epoch): model.train()
def run(args, data, device, stage=0, subset_list=None):
    """Train and evaluate one stage of the (self-)training pipeline.

    Unpacks precomputed node features/labels from ``data``, builds the data
    loaders, trains the model returned by ``get_model`` for
    ``args.epoch_setting[stage]`` epochs while tracking the best checkpoint
    (by validation accuracy or validation loss, per ``args.acc_loss``), then
    runs the best checkpoint over the train+val+test nodes to produce output
    probabilities (and, for attention-weighted SAGN variants, the attention
    weights).

    Args:
        args: hyperparameter namespace (dataset name, batch sizes, lr,
            weight decay, epoch schedule, model selection flags, ...).
        data: tuple unpacked below into features, label embeddings, teacher
            probabilities, labels (raw and with pseudo labels), feature dims,
            class count, node-id splits and an evaluator.
        device: torch device used for training and inference.
        stage: self-training stage index; selects the epoch budget and is
            forwarded to ``get_model``.
        subset_list: optional relation-subset list forwarded to ``get_model``.

    Returns:
        Tuple ``(best_val, best_test, probs, train_time, inference_time,
        val_accs, val_loss, attn_weights)`` where ``probs`` are CPU
        probabilities over the train+val+test nodes and ``attn_weights`` is
        ``None`` unless an attention-weighted SAGN model was used.
    """
    feats, label_emb, teacher_probs, labels, labels_with_pseudos, in_feats, n_classes, \
        train_nid, train_nid_with_pseudos, val_nid, test_nid, evaluator, _ = data

    if args.dataset == "ogbn-papers100M":
        # We only store test/val/test nodes' features for ogbn-papers100M
        labels = labels[torch.cat([train_nid, val_nid, test_nid], dim=0)]
        labels_with_pseudos = labels_with_pseudos[torch.cat(
            [train_nid, val_nid, test_nid], dim=0)]
        # Remap global node ids onto the compact [0, n_stored) index space
        # that matches the truncated labels/features above.
        id_map = dict(
            zip(
                torch.cat([train_nid, val_nid, test_nid],
                          dim=0).cpu().long().numpy(),
                np.arange(len(train_nid) + len(val_nid) + len(test_nid))))
        map_func = lambda x: torch.from_numpy(
            np.array([id_map[a] for a in x.cpu().numpy()])).to(device)
        train_nid = map_func(train_nid)
        val_nid = map_func(val_nid)
        test_nid = map_func(test_nid)
        train_nid_with_pseudos = map_func(train_nid_with_pseudos)

    # Raw training set loader
    train_loader = torch.utils.data.DataLoader(train_nid,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               drop_last=False)
    # Enhanced training set loader (but equal to raw one if stage == 0)
    train_loader_with_pseudos = torch.utils.data.DataLoader(
        train_nid_with_pseudos,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False)
    # Validation set loader
    val_loader = torch.utils.data.DataLoader(val_nid,
                                             batch_size=args.eval_batch_size,
                                             shuffle=False,
                                             drop_last=False)
    # Test set loader (iterates train+val+test nodes, in that order)
    test_loader = torch.utils.data.DataLoader(torch.cat(
        [train_nid, val_nid, test_nid], dim=0),
                                              batch_size=args.eval_batch_size,
                                              shuffle=False,
                                              drop_last=False)
    # All nodes loader (including nodes without labels)
    # NOTE(review): all_loader is never used in this function — candidate for
    # removal, or dead code left from a previous inference path; confirm.
    all_loader = torch.utils.data.DataLoader(torch.arange(len(labels)),
                                             batch_size=args.eval_batch_size,
                                             shuffle=False,
                                             drop_last=False)

    # Initialize model and optimizer for each run
    label_in_feats = label_emb.shape[1] if label_emb is not None else n_classes
    model = get_model(in_feats, label_in_feats, n_classes, stage, args,
                      subset_list=subset_list)
    model = model.to(device)
    print("# Params:", get_n_params(model))
    if args.dataset in ["ppi", "ppi_large", "yelp"]:
        # For multilabel classification
        loss_fcn = nn.BCEWithLogitsLoss()
    else:
        # For multiclass classification
        loss_fcn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Start training
    best_epoch = 0
    best_val = 0
    best_val_loss = 1e9
    best_test = 0
    num_epochs = args.epoch_setting[stage]
    train_time = []
    inference_time = []
    val_accs = []
    val_loss = []
    for epoch in range(1, num_epochs + 1):
        start = time.time()
        train(model, feats, label_emb, teacher_probs, labels_with_pseudos,
              loss_fcn, optimizer, train_loader_with_pseudos, args)
        med = time.time()
        if epoch % args.eval_every == 0:
            with torch.no_grad():
                acc = test(model, feats, label_emb, teacher_probs, labels,
                           loss_fcn, val_loader, test_loader, evaluator,
                           train_nid, val_nid, test_nid, args)
            end = time.time()
            # We can choose val_acc or val_loss to select best model (usually it does not matter)
            # acc layout used below: [train_acc, val_acc, test_acc, val_loss,
            # ..., inference_time] — inferred from the indexing here; confirm
            # against the test() implementation.
            if (acc[1] > best_val and args.acc_loss == "acc") or (
                    acc[3] < best_val_loss and args.acc_loss == "loss"):
                best_epoch = epoch
                best_val = acc[1]
                best_test = acc[2]
                best_val_loss = acc[3]
                # Keep a deep copy so later epochs cannot mutate the best
                # checkpoint in place.
                best_model = deepcopy(model)
            train_time.append(med - start)
            inference_time.append(acc[-1])
            val_accs.append(acc[1])
            val_loss.append(acc[-2])
            log = "Epoch {}, Time(s): {:.4f} {:.4f}, ".format(
                epoch, med - start, acc[-1])
            log += "Best Val loss: {:.4f}, Accs: Train: {:.4f}, Val: {:.4f}, Test: {:.4f}, Best Val: {:.4f}, Best Test: {:.4f}".format(
                best_val_loss, acc[0], acc[1], acc[2], best_val, best_test)
            print(log)
    print("Stage: {}, Best Epoch {}, Val {:.4f}, Test {:.4f}".format(
        stage, best_epoch, best_val, best_test))

    # Final inference with the best checkpoint over train+val+test nodes.
    # NOTE(review): best_model is only bound inside the eval branch above; if
    # no evaluation ever ran (num_epochs < eval_every) this raises NameError.
    with torch.no_grad():
        best_model.eval()
        probs = []
        # Attention weights are only collected for attention-weighted SAGN
        # variants that actually consume node features.
        if (args.model in ["sagn", "plain_sagn"] and args.weight_style ==
                "attention") and (not args.avoid_features):
            attn_weights = []
        else:
            attn_weights = None
        for batch in test_loader:
            if args.dataset == "ogbn-mag":
                # feats is a dict of per-relation-subset feature lists here.
                batch_feats = {
                    rel_subset: [x[batch].to(device) for x in feat]
                    for rel_subset, feat in feats.items()
                }
            else:
                # feats is either a list of hop-feature tensors or one tensor.
                batch_feats = [x[batch].to(device) for x in feats] if isinstance(
                    feats, list) else feats[batch].to(device)
            if label_emb is not None:
                batch_label_emb = label_emb[batch].to(device)
            else:
                batch_label_emb = None
            # SAGN-style models return (logits, attention); others only logits.
            if (args.model in ["sagn", "plain_sagn"
                               ]) and (not args.avoid_features):
                out, a = best_model(batch_feats, batch_label_emb)
            else:
                out = best_model(batch_feats, batch_label_emb)
            if args.dataset in ['yelp', 'ppi', 'ppi_large']:
                # Multilabel datasets: independent per-class probabilities.
                out = out.sigmoid()
            else:
                out = out.softmax(dim=1)
            # remember to transfer output probabilities to cpu
            probs.append(out.cpu())
            if (args.model in ["sagn", "plain_sagn"] and args.weight_style ==
                    "attention") and (not args.avoid_features):
                attn_weights.append(a.cpu().squeeze(1).squeeze(1))
        probs = torch.cat(probs, dim=0)
        if (args.model in ["sagn", "plain_sagn"] and args.weight_style ==
                "attention") and (not args.avoid_features):
            attn_weights = torch.cat(attn_weights)

    # Drop large references and flush the CUDA caching allocator so the next
    # stage/run starts from a clean GPU memory state.
    del model, best_model
    del feats, label_emb, teacher_probs, labels, labels_with_pseudos
    with torch.cuda.device(device):
        torch.cuda.empty_cache()
    return best_val, best_test, probs, train_time, inference_time, val_accs, val_loss, attn_weights