# Promoter classification: train one cross-validation fold.
def train_fold():
    opts = get_args()
    seed_everything(20)

    # GPU selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # instantiate datasets
    df = pd.read_csv(opts.path)
    sequences = np.asarray(df.sequence)
    labels = np.asarray(df.label)
    train_indices, val_indices = iter_split(sequences, labels, opts.fold)
    dataset = PromoterDataset(sequences[train_indices], labels[train_indices])
    val_dataset = PromoterDataset(sequences[val_indices], labels[val_indices])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=opts.batch_size,
                                             shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=opts.batch_size * 2,
                                                 shuffle=False)

    # checkpointing and logging
    checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
    csv_file = 'log_fold{}.csv'.format(opts.fold)
    columns = ['epoch', 'train_loss', 'train_acc',
               'val_loss', 'val_acc', 'val_sens', 'val_spec']
    logger = CSVLogger(columns, csv_file)

    # build model, optimizer, and lr schedule
    model = NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead,
                               opts.nhid, opts.nlayers, opts.kmer_aggregation,
                               kmers=opts.kmers, dropout=opts.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss(reduction='none')
    lr_schedule = lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps, opts.lr_scale)

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))
    print("Starting training for fold {}/{}".format(opts.fold, opts.nfolds))

    # training loop
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        train_preds = []
        ground_truths = []
        total_steps = len(dataloader)
        for step, data in enumerate(dataloader):
            lr = lr_schedule.step()
            src = data['data'].long()
            labels = data['labels'].to(device).long()
            # augment with random point mutations before the forward pass
            mutated_sequence = mutate_dna_sequence(src).to(device).long()
            output = model(mutated_sequence)
            loss = torch.mean(criterion(output, labels))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            predictions = torch.argmax(output, dim=1).squeeze().cpu().numpy()
            train_preds.append(predictions)
            ground_truths.append(labels.cpu().numpy())
            print("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                  .format(epoch + 1, opts.epochs, step + 1, total_steps,
                          total_loss / (step + 1), lr, time.time() - t),
                  end='\r', flush=True)
        print('')

        # epoch-level metrics and validation
        train_preds = np.concatenate(train_preds)
        ground_truths = np.concatenate(ground_truths)
        train_acc = Metrics.accuracy(train_preds, ground_truths)
        train_loss = total_loss / (step + 1)
        val_loss, val_acc, val_sens, val_spec = validate(
            model, device, val_dataloader, batch_size=opts.batch_size * 2)
        print("Epoch {} train acc: {}".format(epoch + 1, train_acc))
        to_log = [epoch + 1, train_loss, train_acc,
                  val_loss, val_acc, val_sens, val_spec]
        logger.log(to_log)
        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)
    get_best_weights_from_fold(opts.fold)
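# The per-batch `lr = lr_schedule.step()` call above suggests lr_AIAYN is the
# "Attention Is All You Need" warmup/decay schedule, parameterized by d_model
# (opts.ninp), warmup_steps, and a scale factor. The helper itself is not
# shown in this file; below is a minimal sketch under that assumption, not
# the repo's actual implementation:
class lr_AIAYN_sketch:
    def __init__(self, optimizer, d_model, warmup_steps=4000, lr_scale=1.0):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.lr_scale = lr_scale
        self.steps = 0

    def step(self):
        # lr rises linearly for warmup_steps, then decays as 1/sqrt(step)
        self.steps += 1
        lr = (self.lr_scale * self.d_model ** -0.5 *
              min(self.steps ** -0.5, self.steps * self.warmup_steps ** -1.5))
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        return lr  # returned so the training loop can log it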
# OpenVaccine degradation regression: fine-tune one fold from pretrained weights.
def train_fold():
    # get arguments
    opts = get_args()

    # GPU selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # instantiate datasets: filter noisy samples, then weight residues by
    # measurement error
    json_path = os.path.join(opts.path, 'train.json')
    json = pd.read_json(json_path, lines=True)
    json = json[json.signal_to_noise > opts.noise_filter]
    ids = np.asarray(json.id.to_list())
    error_weights = get_errors(json)
    error_weights = opts.error_alpha + np.exp(-error_weights * opts.error_beta)
    train_indices, val_indices = get_train_val_indices(json, opts.fold,
                                                       SEED=2020, nfolds=opts.nfolds)
    _, labels = get_data(json)
    sequences = np.asarray(json.sequence)
    train_seqs = sequences[train_indices]
    val_seqs = sequences[val_indices]
    train_labels = labels[train_indices]
    val_labels = labels[val_indices]
    train_ids = ids[train_indices]
    val_ids = ids[val_indices]
    train_ew = error_weights[train_indices]
    val_ew = error_weights[val_indices]

    dataset = RNADataset(train_seqs, train_labels, train_ids, train_ew, opts.path)
    val_dataset = RNADataset(val_seqs, val_labels, val_ids, val_ew, opts.path,
                             training=False)
    dataloader = DataLoader(dataset, batch_size=opts.batch_size, shuffle=True,
                            num_workers=opts.workers)
    val_dataloader = DataLoader(val_dataset, batch_size=opts.batch_size * 2,
                                shuffle=False, num_workers=opts.workers)

    # checkpointing and logging
    checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
    csv_file = 'log_fold{}.csv'.format(opts.fold)
    columns = ['epoch', 'train_loss', 'val_loss']
    logger = CSVLogger(columns, csv_file)

    # build model and load the most recent pretraining checkpoint
    model = NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead,
                               opts.nhid, opts.nlayers, opts.kmer_aggregation,
                               kmers=opts.kmers, stride=opts.stride,
                               dropout=opts.dropout).to(device)
    optimizer = Ranger(model.parameters(), weight_decay=opts.weight_decay)
    criterion = weighted_MCRMSE
    model = nn.DataParallel(model)
    pretrained_df = pd.read_csv('pretrain.csv')
    model.load_state_dict(torch.load('pretrain_weights/epoch{}.ckpt'.format(
        int(pretrained_df.iloc[-1].epoch))))
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))

    # training loop: constant lr for the first ~75% of epochs, then per-step
    # cosine annealing for the remainder
    cos_epoch = int(opts.epochs * 0.75) - 1
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, (opts.epochs - cos_epoch) * len(dataloader))
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        step = 0
        for data in dataloader:
            step += 1
            lr = get_lr(optimizer)
            src = data['data'].to(device)
            labels = data['labels'].to(device)
            bpps = data['bpp'].to(device)
            ew = data['ew'].to(device)
            output = model(src, bpps)
            # only the first 68 positions are scored, so the loss is
            # restricted to them
            loss = criterion(output[:, :68], labels, ew).mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if epoch > cos_epoch:
                lr_schedule.step()
            print("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                  .format(epoch + 1, opts.epochs, step, len(dataloader),
                          total_loss / step, lr, time.time() - t),
                  end='\r', flush=True)
        print('')
        train_loss = total_loss / step
        torch.cuda.empty_cache()
        if (epoch + 1) % opts.val_freq == 0 and epoch > cos_epoch:
            val_loss = validate(model, device, val_dataloader,
                                batch_size=opts.batch_size)
            to_log = [epoch + 1, train_loss, val_loss]
            logger.log(to_log)
        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)
    get_best_weights_from_fold(opts.fold)
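# weighted_MCRMSE is defined elsewhere in the repo. MCRMSE (mean columnwise
# root mean squared error) was the OpenVaccine competition metric; here each
# residue is additionally weighted by ew, computed above as
# error_alpha + exp(-error * error_beta). A plausible sketch, assuming
# output/labels of shape (batch, seq_len, n_targets) and ew of shape
# (batch, seq_len) -- not necessarily the exact implementation:
def weighted_MCRMSE_sketch(output, labels, ew):
    se = (output - labels) ** 2 * ew.unsqueeze(-1)  # weighted squared errors
    # one RMSE per target column; the caller takes .mean() over targets
    return torch.sqrt(se.mean(dim=(0, 1)))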
# Self-supervised pretraining on all available sequences (long and short test
# sequences plus the training sequences); the model reconstructs corrupted
# inputs instead of predicting labels.
def train_fold():
    # get arguments
    opts = get_args()

    # GPU selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # instantiate datasets
    json_path = os.path.join(opts.path, 'train.json')
    json = pd.read_json(json_path, lines=True)
    train_ids = json.id.to_list()
    json_path = os.path.join(opts.path, 'test.json')
    test = pd.read_json(json_path, lines=True)

    # long test sequences (130 nt); labels are dummies since pretraining is
    # self-supervised
    ls_indices = test.seq_length == 130
    long_data = test[ls_indices]
    ids = np.asarray(long_data.id.to_list())
    long_dataset = RNADataset(long_data.sequence.to_list(),
                              np.zeros(len(ls_indices)), ids,
                              np.arange(len(ls_indices)), opts.path)
    long_dataloader = DataLoader(long_dataset, batch_size=opts.batch_size,
                                 shuffle=True, num_workers=opts.workers)

    # short test sequences (107 nt) pooled with the training sequences
    ss_indices = test.seq_length == 107
    short_data = test[ss_indices]
    ids = short_data.id.to_list() + train_ids
    short_sequences = short_data.sequence.to_list() + json.sequence.to_list()
    short_dataset = RNADataset(short_sequences, np.zeros(len(short_sequences)),
                               ids, np.arange(len(short_sequences)), opts.path)
    short_dataloader = DataLoader(short_dataset, batch_size=opts.batch_size,
                                  shuffle=True, num_workers=opts.workers)

    # checkpointing and logging
    checkpoints_folder = 'pretrain_weights'
    csv_file = 'pretrain.csv'
    columns = ['epoch', 'train_loss']
    logger = CSVLogger(columns, csv_file)

    # build model
    model = NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead,
                               opts.nhid, opts.nlayers, opts.kmer_aggregation,
                               kmers=opts.kmers, stride=opts.stride,
                               dropout=opts.dropout, pretrain=True).to(device)
    optimizer = Ranger(model.parameters(), weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss()
    model = nn.DataParallel(model)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))

    # training loop: constant lr for the first ~75% of epochs, then per-step
    # cosine annealing
    cos_epoch = int(opts.epochs * 0.75)
    total_steps = len(long_dataloader) + len(short_dataloader)
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, (opts.epochs - cos_epoch) * total_steps)
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        step = 0
        # both dataloaders share the same self-supervised step
        for loader in (short_dataloader, long_dataloader):
            for data in loader:
                step += 1
                lr = get_lr(optimizer)
                src = data['data']
                bpps = data['bpp'].to(device)
                # corrupt the input: random mutations half the time,
                # BERT-style masking the other half
                if np.random.uniform() > 0.5:
                    masked = mutate_rna_input(src)
                else:
                    masked = mask_rna_input(src)
                src = src.to(device).long()
                masked = masked.to(device).long()
                output = model(masked, bpps)
                # reconstruction loss over the three input tracks:
                # 4 nucleotides, 3 structure symbols, 7 loop types
                loss = (criterion(output[0].reshape(-1, 4), src[:, :, 0].reshape(-1)) +
                        criterion(output[1].reshape(-1, 3), src[:, :, 1].reshape(-1) - 4) +
                        criterion(output[2].reshape(-1, 7), src[:, :, 2].reshape(-1) - 7))
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()
                optimizer.zero_grad()
                total_loss += loss.item()
                if epoch > cos_epoch:
                    lr_schedule.step()
                print("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                      .format(epoch + 1, opts.epochs, step, total_steps,
                              total_loss / step, lr, time.time() - t),
                      end='\r', flush=True)
        print('')
        train_loss = total_loss / step
        torch.cuda.empty_cache()
        to_log = [epoch + 1, train_loss]
        logger.log(to_log)
        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)
    get_best_weights_from_fold(opts.fold)
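# Two helpers used above are defined elsewhere. get_lr presumably just reads
# the current learning rate off the optimizer:
def get_lr_sketch(optimizer):
    return optimizer.param_groups[0]['lr']

# mask_rna_input is the BERT-style half of the corruption scheme. The -4 and
# -7 offsets in the reconstruction loss imply token ids 0-3 for nucleotides,
# 4-6 for the structure track, and 7-13 for the loop-type track, so a sketch
# might hide a random fraction of positions behind a mask id (the 15% rate
# and mask id below are assumptions, not the repo's actual values):
def mask_rna_input_sketch(src, mask_rate=0.15, mask_id=14):
    masked = src.clone()
    hide = torch.rand(src.shape[:2]) < mask_rate  # (batch, seq_len) positions
    masked[hide] = mask_id                        # masks all three tracks
    return masked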
# Viraminer DNA classification: train on the fixed train/validation split,
# with apex mixed precision (O1) enabled.
def train_fold():
    opts = get_args()
    seed_everything(2020)

    # GPU selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # instantiate datasets
    train_df = pd.read_csv(os.path.join("..", "fullset_train.csv"))
    val_df = pd.read_csv(os.path.join("..", "fullset_validation.csv"))
    dataset = ViraminerDataset(train_df.iloc[:, 1], train_df.iloc[:, 2])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=opts.batch_size,
                                             shuffle=True,
                                             num_workers=opts.num_workers)
    val_dataset = ViraminerDataset(val_df.iloc[:, 1], val_df.iloc[:, 2])
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=opts.batch_size * 2,
                                                 shuffle=False)

    # checkpointing and logging (train_loss is logged alongside the
    # validation metrics)
    checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
    csv_file = 'log_fold{}.csv'.format(opts.fold)
    columns = ['epoch', 'train_loss', 'val_loss', 'val_auc',
               'val_acc', 'val_sens', 'val_spec']
    logger = CSVLogger(columns, csv_file)

    # build model, optimizer, and lr schedule
    model = NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead,
                               opts.nhid, opts.nlayers, opts.kmer_aggregation,
                               kmers=opts.kmers, dropout=opts.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss(reduction='none')
    lr_schedule = lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps, opts.lr_scale)

    # mixed-precision initialization
    opt_level = 'O1'
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    model = nn.DataParallel(model)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))
    print("Starting training for fold {}/{}".format(opts.fold, opts.nfolds))

    # training loop
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        total_steps = len(dataloader)
        for step, data in enumerate(dataloader):
            lr = lr_schedule.step()
            src = data['data']
            labels = data['labels'].to(device)
            # augment with nmute random point mutations per sequence
            mutated_sequence = mutate_dna_sequence(src, opts.nmute).to(device)
            output = model(mutated_sequence)
            loss = torch.mean(criterion(output, labels))
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            print("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                  .format(epoch + 1, opts.epochs, step + 1, total_steps,
                          total_loss / (step + 1), lr, time.time() - t),
                  end='\r', flush=True)
        print('')
        train_loss = total_loss / (step + 1)
        if (epoch + 1) % opts.val_freq == 0:
            val_loss, auc, val_acc, val_sens, val_spec = validate(
                model, device, val_dataloader, batch_size=opts.batch_size * 2)
            print("Epoch {} train loss: {}".format(epoch + 1, train_loss))
            to_log = [epoch + 1, train_loss, val_loss, auc,
                      val_acc, val_sens, val_spec]
            logger.log(to_log)
        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)
    get_best_weights_from_fold(opts.fold)
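# mutate_dna_sequence(src, opts.nmute) is not shown in this file; its name
# and the nmute argument indicate random point mutations applied to the
# integer-encoded sequences as augmentation. A minimal sketch, assuming
# tokens 0-3 encode the four bases (the real helper may differ):
def mutate_dna_sequence_sketch(src, nmute):
    mutated = src.clone()
    batch_size, seq_len = src.shape
    for i in range(batch_size):
        sites = torch.randperm(seq_len)[:nmute]            # positions to mutate
        mutated[i, sites] = torch.randint(0, 4, (nmute,))  # random bases
    return mutated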
# DeepSEA chromatin-profile prediction (excerpt; train_seqs/train_labels and
# val_seqs/val_labels are built earlier in the script).
dataset = DeepSeaDataset(train_seqs, train_labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opts.batch_size,
                                         shuffle=True,
                                         num_workers=opts.num_workers)
val_dataset = DeepSeaDataset(val_seqs.transpose(2, 1, 0),
                             val_labels.transpose(1, 0))
val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size * 4,
                                             shuffle=False)

# checkpointing and logging
checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
csv_file = 'log_fold{}.csv'.format(opts.fold)
columns = ['epoch', 'train_loss', 'val_loss', 'val_auc',
           'val_acc', 'val_sens', 'val_spec']
logger = CSVLogger(columns, csv_file)

# build model, optimizer, and lr schedule; the multi-label targets call for
# a binary cross-entropy criterion
model = NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead,
                           opts.nhid, opts.nlayers, opts.kmer_aggregation,
                           kmers=opts.kmers, dropout=opts.dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay)
criterion = nn.BCEWithLogitsLoss(reduction='none')
lr_schedule = lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps, opts.lr_scale)

# mixed-precision initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
model = nn.DataParallel(model)
pytorch_total_params = sum(p.numel() for p in model.parameters())
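# Unlike the single-label scripts above, DeepSEA targets are multi-label
# (many independent binary chromatin features per sequence, 919 in the
# original DeepSEA setup), hence BCEWithLogitsLoss instead of
# CrossEntropyLoss. With reduction='none' the training step must reduce the
# per-element losses itself; illustrative shapes only, not the repo's exact
# training step:
logits = torch.randn(8, 919)                     # batch of 8 sequences
targets = torch.randint(0, 2, (8, 919)).float()  # binary feature labels
loss = nn.BCEWithLogitsLoss(reduction='none')(logits, targets).mean()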