# Assumed imports, inferred from the APIs used below. `CFG`, `db`,
# `cust_model`, `logger`, `FINETUNED_MODEL_PATH`, and the data/metric helpers
# (load_csv, normalize_cols, add_extra_features, train_test_split, validate,
# unnormalize_cols, unnormalize_target, metric) are project-local.
import argparse
import os
import random
import shutil
import time

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from pytorch_transformers import AdamW, BertConfig, WarmupLinearSchedule


def main():
    parser = argparse.ArgumentParser("")
    parser.add_argument("--model", type=str, default='')
    parser.add_argument("--resume", action='store_true')
    parser.add_argument("--eval", action='store_true')
    parser.add_argument("--batch_size", type=int, default=CFG.batch_size)
    parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs)
    parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps)
    parser.add_argument("--nlayers", type=int, default=CFG.num_hidden_layers)
    parser.add_argument("--nahs", type=int, default=CFG.num_attention_heads)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--lr", type=float, default=CFG.learning_rate)
    parser.add_argument("--dropout", type=float, default=CFG.dropout)
    parser.add_argument("--types", nargs='+', type=str,
                        default=['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'],
                        help='3JHC,2JHC,1JHC,3JHH,2JHH,3JHN,2JHN,1JHN')
    parser.add_argument("--train_file", default="train_mute_cp")
    parser.add_argument("--test_file", default="test_mute_cp")
    parser.add_argument("--pseudo_path", default="")
    parser.add_argument("--pseudo", action='store_true')
    parser.add_argument("--gen_pseudo", action='store_true')
    parser.add_argument("--use_all", action='store_true')
    parser.add_argument("--structure_file", default="structures_mu")
    parser.add_argument("--contribution_file", default="scalar_coupling_contributions")
    args = parser.parse_args()
    print(args)

    # Override the config defaults with the command-line arguments.
    CFG.batch_size = args.batch_size
    CFG.num_train_epochs = args.nepochs
    CFG.warmup_steps = args.wsteps
    CFG.num_hidden_layers = args.nlayers
    CFG.num_attention_heads = args.nahs
    CFG.learning_rate = args.lr
    CFG.dropout = args.dropout
    CFG.seed = args.seed
    print(CFG.__dict__)

    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)

    # if not args.eval:
    if True:
        train_df = load_csv(args.train_file)
        structures_df = load_csv(args.structure_file)
        # Center each molecule's atom coordinates at the molecule's centroid.
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        contributions_df = load_csv(args.contribution_file)
        train_df = train_df.merge(contributions_df, how='left')
        train_df = normalize_cols(train_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
        train_df = add_extra_features(train_df, structures_df)
        train_df = train_df.fillna(1e08)
        n_mols = train_df['molecule_name'].nunique()
        train_df, valid_df = train_test_split(train_df, 5000)

        # Keep only molecules that contain at least one of the requested coupling types.
        print(train_df['molecule_name'].nunique())
        mol_names_with_at = train_df[train_df['type'].isin(args.types)]['molecule_name'].unique()
        train_df = train_df[train_df['molecule_name'].isin(mol_names_with_at)].reset_index(drop=True)
        print(train_df['molecule_name'].nunique())

        # Print the first five rows of valid_df to verify that the split
        # matches the one used in the previous experiment.
        print(valid_df.head(5))

        if args.pseudo:
            test_df = load_csv(args.test_file)
            logger.info(f'loading dataset - {args.pseudo_path} ...')
            test_pseudo_df = pd.read_csv(args.pseudo_path)
            # mol_names_jhn = train_df[test_df['type'].isin(['1JHN', '2JHN', '3JHN'])]['molecule_name'].unique()
            # test_df = test_df[test_df['molecule_name'].isin(mol_names_jhn)].reset_index(drop=True)
            test_df = add_extra_features(test_df, structures_df)
            # Align the pseudo-labels with the test rows by id.
            test_df = test_df.set_index('id')
            test_pseudo_df = test_pseudo_df.set_index('id')
            test_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']] = \
                test_pseudo_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']]
            test_df = test_df.reset_index()
            # test_df = normalize_target(test_df)
            test_df = normalize_cols(test_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            # test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0
            test_df['weight'] = 1.0
            n_mols = test_df['molecule_name'].nunique()
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
            train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
        else:
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0

        if args.use_all:
            train_df = pd.concat([train_df, valid_df])
        print(f' n_train:{len(train_df)}, n_valid:{len(valid_df)}')

    config = BertConfig(
        3,  # vocab size -- not used
        hidden_size=CFG.hidden_size,
        num_hidden_layers=CFG.num_hidden_layers,
        num_attention_heads=CFG.num_attention_heads,
        intermediate_size=CFG.intermediate_size,
        hidden_dropout_prob=CFG.dropout,
        attention_probs_dropout_prob=CFG.dropout,
    )
    model = cust_model.SelfAttn(config)
    if args.model != "":
        print("=> loading checkpoint '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        CFG.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.model, checkpoint['epoch']))
    model.cuda()

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('parameters: ', count_parameters(model))

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Produce submission.csv (and optionally a pseudo-label file).
    if args.eval:
        test_df = load_csv(args.test_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        test_df = add_extra_features(test_df, structures_df)
        test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
        test_df['scalar_coupling_constant'] = 0
        test_df['weight'] = 1.0
        test_db = db.MolDB(test_df, CFG.max_seq_length)
        test_loader = DataLoader(
            test_db, batch_size=CFG.batch_size, shuffle=False,
            num_workers=CFG.num_workers)
        res_df = validate(test_loader, model, args.types)
        res_df = unnormalize_cols(res_df, cols=['fc', 'sd', 'pso', 'dso'])
        res_df = unnormalize_target(res_df, 'prediction1')
        if args.gen_pseudo:
            res_df['scalar_coupling_constant'] = res_df['prediction1']
            res_df = res_df[res_df['id'] > -1].sort_values('id')
            res_df[['id', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']].to_csv(
                f'pseudo_{CFG.seed}.csv', index=False)
            return
        # Average the direct prediction with the sum of the four predicted contributions.
        res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
        res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
        res_df['scalar_coupling_constant'] = res_df['prediction']
        res_df = res_df[res_df['id'] > -1].sort_values('id')
        os.makedirs('output', exist_ok=True)
        res_df[['id', 'scalar_coupling_constant']].to_csv(
            f'output/submission_{CFG.seed}.csv', index=False)
        return

    train_db = db.MolDB(train_df, CFG.max_seq_length)
    print('preloading dataset ...')
    train_db = db.MolDB_FromDB(train_db, 10)
    valid_db = db.MolDB(valid_df, CFG.max_seq_length)
    num_train_optimization_steps = int(
        len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps
    ) * (CFG.num_train_epochs - CFG.start_epoch)
    print('num_train_optimization_steps', num_train_optimization_steps)
    train_loader = DataLoader(
        train_db, batch_size=CFG.batch_size, shuffle=True,
        num_workers=CFG.num_workers, pin_memory=True)
    val_loader = DataLoader(
        valid_db, batch_size=CFG.batch_size, shuffle=False,
        num_workers=CFG.num_workers)

    # Prepare optimizer: no weight decay for biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=CFG.learning_rate,
                      weight_decay=CFG.weight_decay)
    scheduler = WarmupLinearSchedule(optimizer, CFG.warmup_steps,
                                     t_total=num_train_optimization_steps)

    def get_lr():
        return scheduler.get_lr()[0]

    if args.model != "":
        if args.resume:
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            # for param_group in optimizer.param_groups:
            #     param_group['lr'] = CFG.learning_rate
        mae_log_df = checkpoint['mae_log']
        del checkpoint
    else:
        mae_log_df = pd.DataFrame(columns=(['EPOCH'] + ['LR'] + args.types + ['OVERALL']))
    os.makedirs('log', exist_ok=True)

    # Validate once before training to log the baseline MAE.
    res_df = validate(val_loader, model, args.types)
    res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
    res_df = unnormalize_target(res_df, 'prediction1')
    res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
    res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
    res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
    overall_mae, maes = metric(res_df, args.types)
    print(overall_mae, maes)

    curr_lr = get_lr()
    print(f'initial learning rate:{curr_lr}')

    for epoch in range(CFG.start_epoch, CFG.num_train_epochs):
        # Train for one epoch.
        # print(adjust_learning_rate(optimizer, epoch))
        train(train_loader, model, optimizer, epoch, args.types, scheduler)

        if epoch % CFG.test_freq == 0:
            # Evaluate on the validation set.
            res_df = validate(val_loader, model, args.types)
            res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            res_df = unnormalize_target(res_df, 'prediction1')
            res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
            res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
            res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
            overall_mae, maes = metric(res_df, args.types)

            # Append a row to the MAE log file.
            mae_row = {typ: [mae] for typ, mae in maes.items() if typ in args.types}
            mae_row.update({'EPOCH': epoch, 'OVERALL': overall_mae, 'LR': curr_lr})
            mae_log_df = pd.concat([mae_log_df, pd.DataFrame(mae_row)], sort=False)
            print(mae_log_df.tail(20))
            mae_log_df.to_csv(f'log/{"_".join(args.types)}.csv', index=False)

            # scheduler.step(overall_mae)
            curr_lr = get_lr()
            print(f'set the learning_rate: {curr_lr}')

            batch_size = CFG.batch_size
            pseudo_path = '' if not args.pseudo else '_' + args.pseudo_path
            curr_model_name = (f'b{batch_size}_l{config.num_hidden_layers}_'
                               f'mh{config.num_attention_heads}_h{config.hidden_size}_'
                               f'd{CFG.dropout}_'
                               f'ep{epoch}_{"_".join(args.types)}_s{CFG.seed}{pseudo_path}.pt')
            # Only save the model itself; unwrap the DataParallel container.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'transformer',
                'state_dict': model_to_save.state_dict(),
                'mae_log': mae_log_df,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, FINETUNED_MODEL_PATH, curr_model_name)

    print('done')
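
# `save_checkpoint` is project-local and not shown in this snippet. A minimal
# sketch, assuming it only needs to serialize the checkpoint dict to
# <model_path>/<model_filename>; the real helper may do more.
def save_checkpoint(state, model_path, model_filename):
    """Persist a checkpoint dict (weights, optimizer, logs) to disk."""
    os.makedirs(model_path, exist_ok=True)
    torch.save(state, os.path.join(model_path, model_filename))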
def train(args, train_dataset, val_dataset, model, tokenizer):
    """ Train the model """
    # The backbone/adapter pair arrives packed as a tuple; only the adapter
    # is optimized, the pretrained backbone stays frozen.
    pretrained_model = model[0]
    adapter_model = model[1]

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size // args.gradient_accumulation_steps)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in adapter_model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in adapter_model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        adapter_model, optimizer = amp.initialize(adapter_model, optimizer, opt_level=args.fp16_opt_level)

    # Multi-GPU training (should be after apex fp16 initialization).
    if args.n_gpu > 1:
        pretrained_model = torch.nn.DataParallel(pretrained_model)
        adapter_model = torch.nn.DataParallel(adapter_model)

    # Distributed training (should be after apex fp16 initialization).
    if args.local_rank != -1:
        pretrained_model = torch.nn.parallel.DistributedDataParallel(
            pretrained_model, device_ids=[args.local_rank], output_device=args.local_rank)
        adapter_model = torch.nn.parallel.DistributedDataParallel(
            adapter_model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num train examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    logger.info("Try resume from checkpoint")
    if args.restore and os.path.exists(os.path.join(args.output_dir, 'global_step.bin')):
        logger.info("Load last checkpoint data")
        global_step = torch.load(os.path.join(args.output_dir, 'global_step.bin'))
        output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
        logger.info("Load from output_dir {}".format(output_dir))
        optimizer.load_state_dict(torch.load(os.path.join(output_dir, 'optimizer.bin')))
        scheduler.load_state_dict(torch.load(os.path.join(output_dir, 'scheduler.bin')))
        # args = torch.load(os.path.join(output_dir, 'training_args.bin'))
        if hasattr(adapter_model, 'module'):  # Take care of distributed/parallel training.
            adapter_model.module.load_state_dict(torch.load(os.path.join(output_dir, 'pytorch_model.bin')))
        else:
            adapter_model.load_state_dict(torch.load(os.path.join(output_dir, 'pytorch_model.bin')))
        global_step += 1
        start_epoch = int(global_step / len(train_dataloader))
        start_step = global_step - start_epoch * len(train_dataloader) - 1
        logger.info("Start from global_step={} epoch={} step={}".format(global_step, start_epoch, start_step))
    else:
        global_step = 0
        start_epoch = 0
        start_step = 0
        logger.info("Start from scratch")
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir="runs/" + args.my_model_name, purge_step=global_step)

    tr_loss, logging_loss = 0.0, 0.0
    pretrained_model.zero_grad()
    adapter_model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3).

    for epoch in range(start_epoch, int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            start = time.time()
            # Skip steps already covered by the restored checkpoint (the epoch
            # check keeps epochs after the resumed one complete).
            if args.restore and epoch == start_epoch and step < start_step:
                continue
            # if args.restore and (flag_count < global_step):
            #     flag_count += 1
            #     continue
            pretrained_model.eval()  # frozen backbone
            adapter_model.train()    # only the adapter is updated
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      # XLM and RoBERTa don't use segment_ids.
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                      'labels': batch[3]}
            pretrained_model_outputs = pretrained_model(**inputs)
            outputs = adapter_model(pretrained_model_outputs, **inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # epoch_iterator.set_description("loss {}".format(loss))
            logger.info("Epoch {}/{} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                epoch, int(args.num_train_epochs), step, len(train_dataloader),
                loss.item(), time.time() - start))

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(adapter_model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Step the optimizer before the scheduler (PyTorch >= 1.1 order).
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                pretrained_model.zero_grad()
                adapter_model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics.
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint.
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training.
                    model_to_save = adapter_model.module if hasattr(adapter_model, 'module') else adapter_model
                    model_to_save.save_pretrained(output_dir)  # writes pytorch_model.bin
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.bin'))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.bin'))
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    torch.save(global_step, os.path.join(args.output_dir, 'global_step.bin'))
                    logger.info("Saving model checkpoint, optimizer, global_step to %s", output_dir)
                    # Prune the oldest checkpoint once more than max_save_checkpoints exist.
                    if (global_step / args.save_steps) > args.max_save_checkpoints:
                        try:
                            shutil.rmtree(os.path.join(
                                args.output_dir,
                                'checkpoint-{}'.format(global_step - args.max_save_checkpoints * args.save_steps)))
                        except OSError as e:
                            print(e)

                if args.local_rank == -1 and args.evaluate_during_training and global_step % args.eval_steps == 0:
                    # Only evaluate on a single GPU, otherwise metrics may not average well.
                    model = (pretrained_model, adapter_model)
                    results = evaluate(args, val_dataset, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)

            if args.max_steps > 0 and global_step > args.max_steps:
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
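
# `set_seed` is likewise project-local. A minimal sketch, assuming the
# conventional implementation from the pytorch-transformers examples:
def set_seed(args):
    """Seed the python, numpy and torch RNGs for reproducible training."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)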