Code example #1
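All five examples are excerpts from the same training codebase and assume module-level imports along these lines (a sketch; repo-specific helpers such as get_args, seed_everything, the dataset classes, CSVLogger, validate, save_weights, and NucleicTransformer are defined elsewhere in that codebase, and Ranger and apex.amp are third-party packages):

import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader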
def train_fold():

    opts = get_args()

    seed_everything(20)
    #gpu selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    df = pd.read_csv(opts.path)

    sequences = np.asarray(df.sequence)
    labels = np.asarray(df.label)

    train_indices, val_indices = iter_split(sequences, labels, opts.fold)
    dataset = PromoterDataset(sequences[train_indices], labels[train_indices])
    val_dataset = PromoterDataset(sequences[val_indices], labels[val_indices])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=opts.batch_size *
                                                 2,
                                                 shuffle=False)

    #checkpointing
    checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
    csv_file = 'log_fold{}.csv'.format(opts.fold)
    columns = [
        'epoch', 'train_loss', 'train_acc', 'val_loss', 'val_acc', 'val_sens',
        'val_spec'
    ]
    logger = CSVLogger(columns, csv_file)

    #build model and logger
    model = NucleicTransformer(opts.ntoken,
                               opts.nclass,
                               opts.ninp,
                               opts.nhead,
                               opts.nhid,
                               opts.nlayers,
                               opts.kmer_aggregation,
                               kmers=opts.kmers,
                               dropout=opts.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss(reduction='none')
    lr_schedule = lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps,
                           opts.lr_scale)

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))

    print("Starting training for fold {}/{}".format(opts.fold, opts.nfolds))
    #training loop
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        train_preds = []
        recon_preds = []
        true_seqs = []
        total_steps = len(dataloader)
        ground_truths = []
        for step, data in enumerate(dataloader):
            lr = lr_schedule.step()
            src = data['data'].long()
            labels = data['labels'].to(device).long()
            mutated_sequence = mutate_dna_sequence(src).to(device).long()
            output = model(mutated_sequence)
            loss = torch.mean(criterion(output, labels))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()  # .item() keeps the running sum a plain float
            predictions = torch.argmax(output, dim=1).squeeze().cpu().numpy()
            train_preds.append(predictions)
            ground_truths.append(labels.cpu().numpy())
            print(
                "Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                .format(epoch + 1, opts.epochs, step + 1, total_steps,
                        total_loss / (step + 1), lr,
                        time.time() - t),
                end='\r',
                flush=True)
        print('')
        train_preds = np.concatenate(train_preds)
        ground_truths = np.concatenate(ground_truths)
        train_acc = Metrics.accuracy(train_preds, ground_truths)
        train_loss = total_loss / (step + 1)

        val_loss, val_acc, val_sens, val_spec = validate(
            model, device, val_dataloader, batch_size=opts.batch_size * 2)
        print("Epoch {} train acc: {}".format(epoch + 1, train_acc))

        to_log = [
            epoch + 1, train_loss, train_acc, val_loss, val_acc, val_sens,
            val_spec
        ]
        logger.log(to_log)

        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)

    get_best_weights_from_fold(opts.fold)
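The lr_AIAYN schedule is defined elsewhere in the codebase. Below is a minimal sketch consistent with how it is constructed (lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps, opts.lr_scale)) and driven (lr = lr_schedule.step() once per batch), assuming it implements the inverse-square-root warmup schedule from "Attention Is All You Need"; the actual implementation may differ:

class lr_AIAYN:
    """Sketch of the Transformer warmup schedule (Vaswani et al., 2017):
    lr = scale * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)."""

    def __init__(self, optimizer, d_model, warmup_steps=4000, scale=1.0):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.scale = scale
        self.current_step = 0

    def step(self):
        # Advance one batch, write the new lr into every param group,
        # and return it so the training loop can log it.
        self.current_step += 1
        lr = (self.scale * self.d_model ** -0.5 *
              min(self.current_step ** -0.5,
                  self.current_step * self.warmup_steps ** -1.5))
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        return lr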
Code example #2
def train_fold():
    #get arguments
    opts=get_args()

    #gpu selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    #instantiate datasets
    json_path=os.path.join(opts.path,'train.json')

    json=pd.read_json(json_path,lines=True)
    json=json[json.signal_to_noise > opts.noise_filter]
    ids=np.asarray(json.id.to_list())


    error_weights=get_errors(json)
    error_weights=opts.error_alpha+np.exp(-error_weights*opts.error_beta)
    train_indices,val_indices=get_train_val_indices(json,opts.fold,SEED=2020,nfolds=opts.nfolds)

    _,labels=get_data(json)
    sequences=np.asarray(json.sequence)
    train_seqs=sequences[train_indices]
    val_seqs=sequences[val_indices]
    train_labels=labels[train_indices]
    val_labels=labels[val_indices]
    train_ids=ids[train_indices]
    val_ids=ids[val_indices]
    train_ew=error_weights[train_indices]
    val_ew=error_weights[val_indices]

    dataset=RNADataset(train_seqs,train_labels,train_ids, train_ew, opts.path)
    val_dataset=RNADataset(val_seqs,val_labels, val_ids, val_ew, opts.path, training=False)
    dataloader = DataLoader(dataset, batch_size=opts.batch_size,
                            shuffle=True, num_workers=opts.workers)
    val_dataloader = DataLoader(val_dataset, batch_size=opts.batch_size*2,
                            shuffle=False, num_workers=opts.workers)

    #checkpointing
    checkpoints_folder='checkpoints_fold{}'.format(opts.fold)
    csv_file='log_fold{}.csv'.format(opts.fold)
    columns=['epoch','train_loss',
             'val_loss']
    logger=CSVLogger(columns,csv_file)

    #build model and logger
    model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid,
                           opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers,stride=opts.stride,
                           dropout=opts.dropout).to(device)
    optimizer=Ranger(model.parameters(), weight_decay=opts.weight_decay)
    criterion=weighted_MCRMSE

    model = nn.DataParallel(model)
    pretrained_df=pd.read_csv('pretrain.csv')
    model.load_state_dict(torch.load('pretrain_weights/epoch{}.ckpt'.format(int(pretrained_df.iloc[-1].epoch))))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))


    #training loop
    cos_epoch=int(opts.epochs*0.75)-1
    lr_schedule=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,(opts.epochs-cos_epoch)*len(dataloader))
    for epoch in range(opts.epochs):
        model.train(True)
        t=time.time()
        total_loss=0
        optimizer.zero_grad()
        train_preds=[]
        ground_truths=[]
        for step, data in enumerate(dataloader):
            lr=get_lr(optimizer)
            src=data['data'].to(device)
            labels=data['labels']
            bpps=data['bpp'].to(device)
            labels=labels.to(device)
            output=model(src,bpps)
            ew=data['ew'].to(device)
            loss=criterion(output[:,:68],labels,ew).mean()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss+=loss.item()  # .item() keeps the running sum a plain float
            print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                           .format(epoch+1, opts.epochs, step+1, len(dataloader), total_loss/(step+1) , lr,time.time()-t),end='\r',flush=True) #total_loss/(step+1)
            if epoch > cos_epoch:
                lr_schedule.step()
        print('')
        train_loss=total_loss/(step+1)
        torch.cuda.empty_cache()
        if (epoch+1)%opts.val_freq==0 and epoch > cos_epoch:
            val_loss=validate(model,device,val_dataloader,batch_size=opts.batch_size)
            to_log=[epoch+1,train_loss,val_loss]
            logger.log(to_log)


        if (epoch+1)%opts.save_freq==0:
            save_weights(model,optimizer,epoch,checkpoints_folder)



    get_best_weights_from_fold(opts.fold)
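weighted_MCRMSE and get_lr are also repo-specific helpers. A hedged sketch of both, assuming the model output has shape (batch, seq_len, n_targets), that ew carries per-position error weights of shape (batch, seq_len), and that the criterion is an error-weighted variant of mean column-wise RMSE (MCRMSE); the real implementations may differ:

import torch

def get_lr(optimizer):
    # Read the current learning rate off the first param group.
    return optimizer.param_groups[0]['lr']

def weighted_MCRMSE(pred, target, ew, eps=1e-8):
    # Error-weighted mean column-wise RMSE (sketch).
    # pred/target: (batch, seq_len, n_targets); ew: (batch, seq_len).
    se = ew.unsqueeze(-1) * (pred - target) ** 2   # weighted squared errors
    rmse = torch.sqrt(se.mean(dim=1) + eps)        # RMSE per (sample, target)
    return rmse.mean(dim=-1)                       # per-sample loss; .mean() in the loop averages the batch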
Code example #3
def train_fold():
    #get arguments
    opts = get_args()

    #gpu selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    #instantiate datasets
    json_path = os.path.join(opts.path, 'train.json')
    json = pd.read_json(json_path, lines=True)
    train_ids = json.id.to_list()

    json_path = os.path.join(opts.path, 'test.json')
    test = pd.read_json(json_path, lines=True)

    #dataloader
    ls_indices = test.seq_length == 130
    long_data = test[ls_indices]
    ids = np.asarray(long_data.id.to_list())
    # dummy labels/indices must be sized to the 130-length subset
    # (ls_indices is a boolean mask over the whole test frame)
    long_dataset = RNADataset(long_data.sequence.to_list(),
                              np.zeros(len(long_data)), ids,
                              np.arange(len(long_data)), opts.path)
    long_dataloader = DataLoader(long_dataset,
                                 batch_size=opts.batch_size,
                                 shuffle=True,
                                 num_workers=opts.workers)

    ss_indices = test.seq_length == 107
    short_data = test[ss_indices]
    ids = short_data.id.to_list()
    ids = ids + train_ids
    short_sequences = short_data.sequence.to_list() + json.sequence.to_list()
    short_dataset = RNADataset(short_sequences, np.zeros(len(short_sequences)),
                               ids, np.arange(len(short_sequences)), opts.path)
    short_dataloader = DataLoader(short_dataset,
                                  batch_size=opts.batch_size,
                                  shuffle=True,
                                  num_workers=opts.workers)

    #checkpointing
    checkpoints_folder = 'pretrain_weights'
    csv_file = 'pretrain.csv'
    columns = ['epoch', 'train_loss']
    logger = CSVLogger(columns, csv_file)

    #build model and logger
    model = NucleicTransformer(opts.ntoken,
                               opts.nclass,
                               opts.ninp,
                               opts.nhead,
                               opts.nhid,
                               opts.nlayers,
                               opts.kmer_aggregation,
                               kmers=opts.kmers,
                               stride=opts.stride,
                               dropout=opts.dropout,
                               pretrain=True).to(device)
    optimizer = Ranger(model.parameters(), weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss()

    model = nn.DataParallel(model)

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))

    #training loop
    cos_epoch = int(opts.epochs * 0.75)
    total_steps = len(long_dataloader) + len(short_dataloader)
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, (opts.epochs - cos_epoch) * (total_steps))
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        train_preds = []
        ground_truths = []
        step = 0
        for data in short_dataloader:
            step += 1
            lr = get_lr(optimizer)
            src = data['data']
            labels = data['labels']
            bpps = data['bpp'].to(device)

            if np.random.uniform() > 0.5:
                masked = mutate_rna_input(src)
            else:
                masked = mask_rna_input(src)

            src = src.to(device).long()
            masked = masked.to(device).long()

            output = model(masked, bpps)


            loss = (criterion(output[0].reshape(-1, 4), src[:, :, 0].reshape(-1)) +
                    criterion(output[1].reshape(-1, 3), src[:, :, 1].reshape(-1) - 4) +
                    criterion(output[2].reshape(-1, 7), src[:, :, 2].reshape(-1) - 7))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()  # .item() keeps the running sum a plain float
            if epoch > cos_epoch:
                lr_schedule.step()
            print(
                "Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                .format(epoch + 1, opts.epochs, step, total_steps,
                        total_loss / step, lr,
                        time.time() - t),
                end='\r',
                flush=True)
        for data in long_dataloader:
            step += 1
            lr = get_lr(optimizer)
            src = data['data']
            labels = data['labels']
            bpps = data['bpp'].to(device)

            if np.random.uniform() > 0.5:
                masked = mutate_rna_input(src)
            else:
                masked = mask_rna_input(src)

            src = src.to(device).long()
            masked = masked.to(device).long()
            output = model(masked, bpps)

            loss = (criterion(output[0].reshape(-1, 4), src[:, :, 0].reshape(-1)) +
                    criterion(output[1].reshape(-1, 3), src[:, :, 1].reshape(-1) - 4) +
                    criterion(output[2].reshape(-1, 7), src[:, :, 2].reshape(-1) - 7))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()  # .item() keeps the running sum a plain float
            if epoch > cos_epoch:
                lr_schedule.step()
            print(
                "Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                .format(epoch + 1, opts.epochs, step, total_steps,
                        total_loss / step, lr,
                        time.time() - t),
                end='\r',
                flush=True)

        print('')
        train_loss = total_loss / step
        torch.cuda.empty_cache()
        to_log = [
            epoch + 1,
            train_loss,
        ]
        logger.log(to_log)

        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)

    get_best_weights_from_fold(opts.fold)
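mutate_rna_input and mask_rna_input are likewise defined elsewhere. A hedged sketch of what the surrounding code implies, assuming inputs of shape (batch, seq_len, 3) whose three channels hold sequence, structure, and loop-type tokens in the ranges 0-3, 4-6, and 7-13 (consistent with the three reconstruction heads and their label offsets), a 15% corruption rate, and a hypothetical mask id of 14; the real augmentation may differ:

import torch

def mutate_rna_input(src, p=0.15):
    # Replace ~p of the tokens in each channel with a random valid token
    # from that channel's vocabulary (sketch; ranges are assumptions).
    ranges = [(0, 4), (4, 7), (7, 14)]
    out = src.clone()
    for c, (lo, hi) in enumerate(ranges):
        pick = torch.rand(out.shape[:2]) < p
        rand_tok = torch.randint(lo, hi, out.shape[:2]).to(out.dtype)
        out[..., c][pick] = rand_tok[pick]
    return out

def mask_rna_input(src, p=0.15, mask_ids=(14, 14, 14)):
    # Replace ~p of the tokens in each channel with a mask id
    # (id 14 is hypothetical; the real vocabulary may reserve another).
    out = src.clone()
    for c, mask_id in enumerate(mask_ids):
        pick = torch.rand(out.shape[:2]) < p
        out[..., c][pick] = mask_id
    return out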
Code example #4
def train_fold():

    opts = get_args()
    seed_everything(2020)
    #gpu selection
    os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_df = pd.read_csv(os.path.join("..", "fullset_train.csv"))
    val_df = pd.read_csv(os.path.join("..", "fullset_validation.csv"))

    dataset = ViraminerDataset(train_df.iloc[:, 1], train_df.iloc[:, 2])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=True,
                                             num_workers=opts.num_workers)
    val_dataset = ViraminerDataset(val_df.iloc[:, 1], val_df.iloc[:, 2])
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=opts.batch_size *
                                                 2,
                                                 shuffle=False)


    #checkpointing
    checkpoints_folder = 'checkpoints_fold{}'.format(opts.fold)
    csv_file = 'log_fold{}.csv'.format(opts.fold)
    columns = [
        'epoch', 'train_loss', 'val_loss', 'val_auc', 'val_acc', 'val_sens',
        'val_spec'
    ]
    logger = CSVLogger(columns, csv_file)

    #build model and logger
    model = NucleicTransformer(opts.ntoken,
                               opts.nclass,
                               opts.ninp,
                               opts.nhead,
                               opts.nhid,
                               opts.nlayers,
                               opts.kmer_aggregation,
                               kmers=opts.kmers,
                               dropout=opts.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 weight_decay=opts.weight_decay)
    criterion = nn.CrossEntropyLoss(reduction='none')
    lr_schedule = lr_AIAYN(optimizer, opts.ninp, opts.warmup_steps,
                           opts.lr_scale)
    # Initialization
    opt_level = 'O1'
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    model = nn.DataParallel(model)
    softmax = nn.Softmax(dim=1)

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print('Total number of parameters: {}'.format(pytorch_total_params))

    print("Starting training for fold {}/{}".format(opts.fold, opts.nfolds))
    #training loop
    for epoch in range(opts.epochs):
        model.train(True)
        t = time.time()
        total_loss = 0
        optimizer.zero_grad()
        total_steps = len(dataloader)
        for step, data in enumerate(dataloader):
            lr = lr_schedule.step()
            src = data['data']
            labels = data['labels'].to(device)
            mutated_sequence = mutate_dna_sequence(src, opts.nmute).to(device)
            output = model(mutated_sequence)
            loss = torch.mean(criterion(output, labels))

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()  # .item() keeps the running sum a plain float
            print(
                "Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}"
                .format(epoch + 1, opts.epochs, step + 1, total_steps,
                        total_loss / (step + 1), lr,
                        time.time() - t),
                end='\r',
                flush=True)
        print('')

        train_loss = total_loss / (step + 1)

        if (epoch + 1) % opts.val_freq == 0:
            val_loss, auc, val_acc, val_sens, val_spec = validate(
                model, device, val_dataloader, batch_size=opts.batch_size * 2)
            print("Epoch {} train loss: {}".format(epoch + 1, train_loss))

            to_log = [
                epoch + 1, train_loss, val_loss, auc, val_acc, val_sens,
                val_spec
            ]
            logger.log(to_log)

        if (epoch + 1) % opts.save_freq == 0:
            save_weights(model, optimizer, epoch, checkpoints_folder)

    get_best_weights_from_fold(opts.fold)
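mutate_dna_sequence (called in examples #1 and #4) is another repo-specific helper. A minimal sketch, assuming a 4-letter integer vocabulary (0-3) and that nmute positions per sequence are resampled at random as augmentation; example #1 calls it without nmute, so a default count is assumed as well:

import torch

def mutate_dna_sequence(src, nmute=15):
    # Resample nmute random positions per sequence with a random base 0-3.
    # Sketch only; the real helper may mutate at a rate rather than a count.
    out = src.clone()
    batch_size, seq_len = out.shape
    n = min(nmute, seq_len)
    for i in range(batch_size):
        pos = torch.randperm(seq_len)[:n]
        out[i, pos] = torch.randint(0, 4, (n,)).to(out.dtype)
    return out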
Code example #5

dataset=DeepSeaDataset(train_seqs,train_labels)
dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True,num_workers=opts.num_workers)
val_dataset=DeepSeaDataset(val_seqs.transpose(2,1,0),val_labels.transpose(1,0))
val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*4,shuffle=False)


#checkpointing
checkpoints_folder='checkpoints_fold{}'.format(opts.fold)
csv_file='log_fold{}.csv'.format(opts.fold)
columns=['epoch','train_loss',
         'val_loss','val_auc','val_acc','val_sens','val_spec']
logger=CSVLogger(columns,csv_file)

#build model and logger
model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid,
                       opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers,
                       dropout=opts.dropout).to(device)
optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay)
criterion=nn.BCEWithLogitsLoss(reduction='none')
lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale)
# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
model = nn.DataParallel(model)

pytorch_total_params = sum(p.numel() for p in model.parameters())
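One more shared helper, CSVLogger, appears in every example. A minimal sketch consistent with CSVLogger(columns, csv_file) and logger.log(row), assuming it writes the header once and appends one row per call; the real class may buffer or flush differently:

import csv

class CSVLogger:
    # Minimal sketch: write the header row once, then append one row per log() call.
    def __init__(self, columns, path):
        self.columns = columns
        self.path = path
        with open(self.path, 'w', newline='') as f:
            csv.writer(f).writerow(self.columns)

    def log(self, row):
        with open(self.path, 'a', newline='') as f:
            csv.writer(f).writerow(row)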