def cross_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]

    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    logFileLoc = args.savedir + os.sep + args.testFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
        logger.flush()

    for length in [10, 9]:

        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)

        for allele in allele_list:  #[9,10]

            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

            data_dict = pickle.load(
                open(
                    args.data_dir + os.sep + 'pickle_' + str(length) + os.sep +
                    allele.replace('*', '.').replace(':', '_') + '.p', 'rb'))

            print('test on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()

            encode_channel = data_dict['channel_encode']
            meas = data_dict['label']
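            # Convert measured affinities to -log10 scale to use as regression targets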
            bind = []
            for i in meas:
                i = (-1) * math.log10(i)
                bind.append(i)
            sequ, label = encode_channel, bind

            if (len(sequ) > 5):

                sequ_ori, label_ori = sequ, label

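                # Take a fixed 10% hold-out split (test_size=0.1, random_state=42) and keep
                # only the held-out portion for evaluating the fold ensemble below.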
                train_sequ_ori, test_sequ_ori, train_label_ori, test_label_ori = train_test_split(
                    sequ_ori,
                    label_ori,
                    test_size=0.1,
                    random_state=42,
                    shuffle=True)
                sequ_ori, label_ori = test_sequ_ori, test_label_ori

                output_list = []
                label_list = []

                test_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(sequ_ori, label_ori),
                    batch_size=args.batch_size,
                    shuffle=True,
                    num_workers=args.num_workers,
                    pin_memory=True)

                model = net.ResNetC1()

                if args.onGPU == True:
                    #model = torch.nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
                    model = model.cuda()

                criteria = MSELoss()

                if args.onGPU == True:
                    criteria = criteria.cuda()

                output_sum, label = [], []

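                # Ensemble the five fold checkpoints: sum their predictions on the held-out
                # split here, then average by 5 below.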
                for fold_num in range(1, 6):

                    best_model_dict = torch.load(model_dir + os.sep + allele +
                                                 '_' + str(length) + '_' +
                                                 str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)

                    if not output_sum:
                        output_sum.extend(output)
                    else:
                        output_sum = [
                            output_sum[i] + output[i]
                            for i in range(len(output_sum))
                        ]

                final_out = [output_sum[i] / 5 for i in range(len(output_sum))]
                output_list.extend(final_out)
                label_list.extend(label)

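                # Map -log10 predictions back to the original affinity scale and binarize
                # at the 500 cutoff before computing AUC.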
                IC_output_list = [
                    math.pow(10, (-1) * value) for value in output_list
                ]
                IC_label_list = [
                    math.pow(10, (-1) * value) for value in label_list
                ]

                bi_output_list = [
                    1 if ic < 500 else 0 for ic in IC_output_list
                ]
                bi_label_list = [1 if ic < 500 else 0 for ic in IC_label_list]

                pearson = pearsonr(IC_output_list, IC_label_list)
                auc = roc_auc_score(bi_label_list, bi_output_list)
                srcc = spearmanr(IC_output_list, IC_label_list)

                logger.write("%s\t%s\t\t%.4f\t\t\t%.4f\t\t\t%.4f\n" %
                             (length, allele, pearson[0], auc, srcc[0]))
                logger.flush()

                prediction = args.savedir + os.sep + args.predict
                if os.path.exists(prediction):
                    append_write = 'a'  # append if already exists
                else:
                    append_write = 'w'

                true_value = open(prediction, append_write)
                true_value.write("%s\n" % (allele))
                for i in range(len(output_list)):
                    true_value.write("%.4f\t%.4f\n" %
                                     (IC_label_list[i], IC_output_list[i]))
                true_value.flush()

    logger.close()
def trainValidateSegmentation(args):
    '''
    Main function for training and validation
    :param args: global arguments
    :return: None
    '''
    # check if processed data file exists or not
    if not os.path.isfile(args.cached_data_file):
        dataLoad = ld.LoadData(args.data_dir, args.classes, args.cached_data_file)
        data = dataLoad.processData()
        if data is None:
            print('Error while pickling data. Please check.')
            exit(-1)
    else:
        data = pickle.load(open(args.cached_data_file, "rb"))

    q = args.q
    p = args.p
    # load the model
    if not args.decoder:
        model = net.ESPNet_Encoder(args.classes, p=p, q=q)
        args.savedir = args.savedir + '_enc_' + str(p) + '_' + str(q) + '/'
    else:
        model = net.ESPNet(args.classes, p=p, q=q, encoderFile=args.pretrained)
        args.savedir = args.savedir + '_dec_' + str(p) + '_' + str(q) + '/'

    if args.onGPU:
        model = model.cuda()

    # create the directory if not exist
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.visualizeNet:
        x = Variable(torch.randn(1, 3, args.inWidth, args.inHeight))

        if args.onGPU:
            x = x.cuda()

        y = model.forward(x)
        g = viz.make_dot(y)
        g.render(args.savedir + 'model.png', view=False)

    total_paramters = netParams(model)
    print('Total network parameters: ' + str(total_paramters))

    # define optimization criteria
    weight = torch.from_numpy(data['classWeights']) # convert the numpy array to torch
    if args.onGPU:
        weight = weight.cuda()

    criteria = CrossEntropyLoss2d(weight) #weight

    if args.onGPU:
        criteria = criteria.cuda()

    print('Data statistics')
    print(data['mean'], data['std'])
    print(data['classWeights'])

    #compose the data with transforms
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1024, 512),
        myTransforms.RandomCropResize(32),
        myTransforms.RandomFlip(),
        #myTransforms.RandomCrop(64).
        myTransforms.ToTensor(args.scaleIn),
        #
    ])

    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1536, 768), # 1536, 768
        myTransforms.RandomCropResize(100),
        myTransforms.RandomFlip(),
        #myTransforms.RandomCrop(64),
        myTransforms.ToTensor(args.scaleIn),
        #
    ])

    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1280, 720), # 1536, 768
        myTransforms.RandomCropResize(100),
        myTransforms.RandomFlip(),
        #myTransforms.RandomCrop(64),
        myTransforms.ToTensor(args.scaleIn),
        #
    ])

    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(768, 384),
        myTransforms.RandomCropResize(32),
        myTransforms.RandomFlip(),
        #myTransforms.RandomCrop(64),
        myTransforms.ToTensor(args.scaleIn),
        #
    ])

    trainDataset_scale4 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(512, 256),
        #myTransforms.RandomCropResize(20),
        myTransforms.RandomFlip(),
        #myTransforms.RandomCrop(64).
        myTransforms.ToTensor(args.scaleIn),
        #
    ])


    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1024, 512),
        myTransforms.ToTensor(args.scaleIn),
        #
    ])

    # since we are training from scratch, we create data loaders at different scales
    # so that we can generate more augmented data and prevent the network from overfitting

    trainLoader = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_main),
        batch_size=args.batch_size + 2, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale1),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale2),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale3),
        batch_size=args.batch_size + 4, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale4 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale4),
        batch_size=args.batch_size + 4, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    valLoader = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['valIm'], data['valAnnot'], transform=valDataset),
        batch_size=args.batch_size + 4, shuffle=False, num_workers=args.num_workers, pin_memory=True)

    if args.onGPU:
        cudnn.benchmark = True

    start_epoch = 0

    if args.resume:
        if os.path.isfile(args.resumeLoc):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resumeLoc)
            start_epoch = checkpoint['epoch']
            #args.lr = checkpoint['lr']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    

    logFileLoc = args.savedir + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write("\n%s\t%s\t%s\t%s\t%s\t" % ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val'))
    logger.flush()

    optimizer = torch.optim.Adam(model.parameters(), args.lr, (0.9, 0.999), eps=1e-08, weight_decay=5e-4)
    # halve the learning rate every args.step_loss epochs (gamma=0.5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_loss, gamma=0.5)


    for epoch in range(start_epoch, args.max_epochs):

        scheduler.step(epoch)
        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print("Learning rate: " +  str(lr))

        # train for one epoch
        # We consider 1 epoch with all the training data (at different scales)
        train(args, trainLoader_scale1, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale4, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr = train(args, trainLoader, model, criteria, optimizer, epoch)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(args, valLoader, model, criteria)
        
            
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lossTr': lossTr,
            'lossVal': lossVal,
            'iouTr': mIOU_tr,
            'iouVal': mIOU_val,
            'lr': lr
        }, args.savedir + 'checkpoint.pth.tar')

        #save the model also
        model_file_name = args.savedir + '/model_' + str(epoch + 1) + '.pth'
        torch.save(model.state_dict(), model_file_name)

        

        with open(args.savedir + 'acc_' + str(epoch) + '.txt', 'w') as log:
            log.write("\nEpoch: %d\t Overall Acc (Tr): %.4f\t Overall Acc (Val): %.4f\t mIOU (Tr): %.4f\t mIOU (Val): %.4f" % (epoch, overall_acc_tr, overall_acc_val, mIOU_tr, mIOU_val))
            log.write('\n')
            log.write('Per Class Training Acc: ' + str(per_class_acc_tr))
            log.write('\n')
            log.write('Per Class Validation Acc: ' + str(per_class_acc_val))
            log.write('\n')
            log.write('Per Class Training mIOU: ' + str(per_class_iu_tr))
            log.write('\n')
            log.write('Per Class Validation mIOU: ' + str(per_class_iu_val))

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print("\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f" % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val))
    logger.close()
def cross_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    csv_path = os.path.join(args.data_dir, args.file_path)
    data_ori = pd.read_csv(csv_path)
    data_ori = data_ori.loc[data_ori['species'] == 'human']
    data_ori = data_ori.loc[(data_ori['peptide_length'] == 9) |
                            (data_ori['peptide_length'] == 10)]

    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]

    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    logFileLoc = args.savedir + os.sep + args.testFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
        logger.flush()

    for length in [10, 9]:

        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)

        for allele in allele_list:  #[9,10]

            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

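            # Select the rows for this allele and peptide length from the pre-filtered human data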
            data = data_ori.loc[(data_ori['peptide_length'] == length)]
            data = data.loc[data['mhc'] == allele]
            sequ = data['sequence'].values.tolist()
            matrix_list = [make_matrix(x) for x in sequ]
            meas = data['meas'].values.tolist()
            bind = []
            positive = [i for i in meas if i < 500]

            for i in meas:
                i = (-1) * math.log10(i)
                bind.append(i)
            sequ, label = matrix_list, bind

            if (len(sequ) > 0):

                sequ_ori, label_ori = sequ, label

                output_list = []
                label_list = []

                fold_num = 0

                kf = KFold(n_splits=5, shuffle=True, random_state=42)

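                # Evaluate each fold's saved checkpoint on that fold's test split and pool the
                # predictions across all five folds before computing the metrics.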
                for train_set, test_set in kf.split(sequ_ori, label_ori):

                    fold_num += 1

                    train_sequ, test_sequ, train_label, test_label = [sequ_ori[i] for i in train_set], [sequ_ori[i] for i in test_set],\
                                                                [label_ori[i] for i in train_set], [label_ori[i] for i in test_set]

                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    model = net.GRU_net()

                    if args.onGPU == True:
                        #model = torch.nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
                        model = model.cuda()

                    criteria = MSELoss()

                    if args.onGPU == True:
                        criteria = criteria.cuda()

                    best_model_dict = torch.load(model_dir + os.sep + allele +
                                                 '_' + str(length) + '_' +
                                                 str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)

                    output_list.extend(output)
                    label_list.extend(label)

                #IC_output_list = [math.pow(10, (-1) * value) for value in output_list]
                #IC_label_list = [math.pow(10, (-1) * value) for value in label_list]

                IC_output_list = output_list
                IC_label_list = label_list

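                # Binarize directly in -log10 space: values above -log10(500) correspond to
                # affinities below 500, i.e. binders.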
                bi_output_list = [
                    1 if ic > (-1) * math.log10(500) else 0
                    for ic in IC_output_list
                ]
                bi_label_list = [
                    1 if ic > (-1) * math.log10(500) else 0
                    for ic in IC_label_list
                ]

                pearson = pearsonr(IC_output_list, IC_label_list)
                auc = roc_auc_score(bi_label_list, bi_output_list)
                srcc = spearmanr(IC_label_list, IC_output_list)

                logger.write("%s\t%s\t\t%.4f\t\t\t%.4f\t\t\t%.4f\n" %
                             (length, allele, pearson[0], auc, srcc[0]))
                logger.flush()

                prediction = args.savedir + os.sep + args.predict
                if os.path.exists(prediction):
                    append_write = 'a'  # append if already exists
                else:
                    append_write = 'w'

                true_value = open(prediction, append_write)
                true_value.write("%s\n" % (allele))
                for i in range(int(len(output_list) / 10)):
                    true_value.write("%.4f\t%.4f\n" %
                                     (IC_label_list[i], IC_output_list[i]))
                true_value.flush()

    logger.close()
def cross_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]

    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    logFileLoc = args.savedir + os.sep + args.testFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()

    for length in [10, 9]:

        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)

        for allele in allele_list:  #[9,10]

            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

            data_dict = pickle.load(
                open(
                    args.data_dir + os.sep + 'pickle_' + str(length) + os.sep +
                    allele.replace('*', '.').replace(':', '_') + '.p', 'rb'))

            print('val on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()

            encode_channel = data_dict['channel_encode']
            meas = data_dict['label']
            bind = []
            for i in meas:
                i = (-1) * math.log10(i)
                bind.append(i)
            sequ, label = encode_channel, bind

            if (len(sequ) > 0):

                sequ_ori, label_ori = sequ, label

                output_list = []
                label_list = []

                fold_num = 0

                kf = KFold(n_splits=5, shuffle=True, random_state=42)

                pearson_list = []

                for train_set, test_set in kf.split(sequ_ori, label_ori):

                    fold_num += 1

                    train_sequ, test_sequ, train_label, test_label = [sequ_ori[i] for i in train_set], [sequ_ori[i] for i in test_set],\
                                                                [label_ori[i] for i in train_set], [label_ori[i] for i in test_set]

                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    train_sequ, val_sequ, train_label, val_label = train_test_split(
                        train_sequ,
                        train_label,
                        test_size=0.1,
                        random_state=42,
                        shuffle=True)

                    val_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(val_sequ, val_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    model = net.ResNetC1()

                    if args.onGPU == True:
                        #model = torch.nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
                        model = model.cuda()

                    criteria = MSELoss()

                    if args.onGPU == True:
                        criteria = criteria.cuda()

                    best_model_dict = torch.load(model_dir + os.sep + allele +
                                                 '_' + str(length) + '_' +
                                                 str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)

                    output_list.extend(output)
                    label_list.extend(label)

                pearson = pearsonr(output_list, label_list)

                pearson_list.append(pearson[0])

                logger.write("%s\t%s\t\t\t%.4f\n" %
                             (length, allele, max(pearson_list)))
                logger.flush()

    logger.close()
def cross_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]

    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.visualizeNet == True:
        x = Variable(torch.randn(1, 5, 174, 18))

        if args.onGPU == True:
            x = x.cuda()

        model = net.pre_train()

        #y = model.cuda().forward(x)
        #g = viz.make_dot(y)
        #g.render(args.savedir + '/model.png', view=False)

        total_paramters = 0
        for parameter in model.parameters():
            i = len(parameter.size())
            p = 1
            for j in range(i):
                p *= parameter.size(j)
            total_paramters += p

        print('Parameters: ' + str(total_paramters))

    logFileLoc = args.savedir + os.sep + args.crossValFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t\t%s\t\t\t\t%s\n" %
                     ('Length', 'Allele', 'train_loss', 'val_loss'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t\t%s\t\t\t\t%s\n" %
                     ('Length', 'Allele', 'train_loss', 'val_loss'))
        logger.flush()

    for length in [10]:

        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)

        for allele in allele_list:  #[9,10]

            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

            data_dict = pickle.load(
                open(
                    args.data_dir + os.sep + 'pickle_' + str(length) + os.sep +
                    allele.replace('*', '.').replace(':', '_') + '.p', 'rb'))

            print('train on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()

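            # Note: the target here is the channel encoding itself, so this pre-training stage
            # appears to train the network to reconstruct its own input.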
            encode_channel = data_dict['channel_encode']
            bind = data_dict['channel_encode']
            sequ, label = encode_channel, bind

            if (len(sequ) > 0):
                sequ_ori, label_ori = sequ, label

                alleleLoc = args.savedir + os.sep + allele + '.txt'

                if os.path.isfile(alleleLoc):
                    log = open(alleleLoc, 'a')
                    log.write("\n")
                    log.write("%s\t\t\t%s\n" % ('Length: ', length))
                    log.write("%s\t\t\t\t%s\t\t\t\t%s\n" %
                              ('Epoch', 'tr_loss', 'val_loss'))
                    log.flush()
                else:
                    log = open(alleleLoc, 'w')
                    log.write("%s\t\t\t%s\n" % ('Allele', allele))
                    log.write("\n")
                    log.write("%s\t\t\t%s\n" % ('Length: ', length))
                    log.write("%s\t\t\t\t%s\t\t\t\t%s\n" %
                              ('Epoch', 'tr_loss', 'val_loss'))
                    log.flush()

                train_sequ, val_sequ, train_label, val_label = train_test_split(
                    sequ_ori,
                    label_ori,
                    test_size=0.1,
                    random_state=42,
                    shuffle=True)

                train_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(train_sequ, train_label),
                    batch_size=args.batch_size,
                    shuffle=True,
                    num_workers=args.num_workers,
                    pin_memory=True)
                val_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(val_sequ, val_label),
                    batch_size=args.batch_size,
                    shuffle=True,
                    num_workers=args.num_workers,
                    pin_memory=True)

                model = net.pre_train()

                if args.onGPU == True:
                    model = torch.nn.DataParallel(model,
                                                  device_ids=[0, 1, 2]).cuda()

                criteria = MSELoss()

                if args.onGPU == True:
                    criteria = criteria.cuda()

                #optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4
                #optimizer = torch.optim.Adam(model.parameters(), args.lr, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=args.lr,
                                             weight_decay=5e-4)

                if args.onGPU == True:
                    cudnn.benchmark = True

                scheduler = torch.optim.lr_scheduler.StepLR(
                    optimizer, step_size=args.step_loss, gamma=0.1)

                start_epoch = 0

                min_val_loss = 100
                loss_not_decay = 0
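                # Early stopping: break if validation loss has not improved for 10 consecutive epochs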

                train_loss_list = []
                val_loss_list = []

                for epoch in range(start_epoch, args.max_epochs):
                    tr_epoch_loss = train(args, train_data_load, model,
                                          criteria, optimizer)
                    val_epoch_loss = val(args, val_data_load, model, criteria)

                    train_loss_list.append(tr_epoch_loss)
                    val_loss_list.append(val_epoch_loss)

                    log.write("%s\t\t\t\t%.4f\t\t\t\t%.4f\n" %
                              (epoch, tr_epoch_loss, val_epoch_loss))

                    #val_epoch_loss = val_epoch_loss.cpu().data.numpy()[0]
                    if val_epoch_loss < min_val_loss:
                        if args.save_model == True:
                            model_file_name = model_dir + os.sep + allele + '_' + str(
                                length) + '.pth'
                            print('==> Saving the best model')
                            torch.save(model.state_dict(), model_file_name)
                        min_val_loss = val_epoch_loss
                        loss_not_decay = 0
                    else:
                        loss_not_decay += 1

                    if loss_not_decay >= 10:
                        break

                    scheduler.step(epoch)

                allele_train_loss = sum(train_loss_list) / len(train_loss_list)
                allele_val_loss = sum(val_loss_list) / len(val_loss_list)

                logger.write(
                    "%s\t%s\t\t\t\t%.4f\t\t\t\t%.4f\n" %
                    (length, allele, allele_train_loss, allele_val_loss))
                logger.flush()
    logger.close()
def main(args):
    if args.apex:
        if sys.version_info < (3, 0):
            raise RuntimeError("Apex currently only supports Python 3. Aborting.")
        if amp is None:
            raise RuntimeError("Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
                               "to enable mixed-precision training.")

    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)

    # device = torch.device(args.device)
    device = torch.device('cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')

    torch.backends.cudnn.benchmark = True

    # Data loading code
    print("Loading data")
    if not os.path.isfile(args.cached_data_file):
        dataLoader = ld.LoadData(args.data_dir, args.classes, args.cached_data_file)
        if dataLoader is None:
            print('Error while processing the data. Please check')
            exit(-1)
        data = dataLoader.processData()
    else:
        data = pickle.load(open(args.cached_data_file, "rb"))
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    trainDataset = transforms.Compose([
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(),
        transforms.Resize(224),
        transforms.ToTensor(),
        normalize,
    ])

    valDataset = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        normalize,
    ])
    dataset = myDataLoader.MyDataset(data['trainIm'], data['trainClass'], transform=trainDataset)
    dataset_test = myDataLoader.MyDataset(data['valIm'], data['valClass'], transform=valDataset)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=args.batch_size,
        sampler=train_sampler, num_workers=args.workers, pin_memory=True)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=args.batch_size,
        sampler=test_sampler, num_workers=args.workers, pin_memory=True)

    print("Creating model")
    # import pdb
    # pdb.set_trace()
    model = torchvision.models.__dict__[args.model](pretrained=args.pretrained)
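    # Replace the final classification layer so the output matches args.classes.
    # The active lines assume a DenseNet-style backbone (uses .classifier); the
    # commented-out blocks show the equivalent for GoogLeNet / ResNet (.fc).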
    # googlenet
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, args.classes)   
    ##densenet161
    num_ftrs = model.classifier.in_features
    model.classifier = nn.Linear(num_ftrs, args.classes)

    ##resnet101
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, args.classes)   
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    if args.apex:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.apex_opt_level
                                          )

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # Changed the resume-argument logic: load the checkpoint only if the file exists
    if args.resume and os.path.exists(args.resume):
        print('Loading resume data from file: {}'.format(args.resume))
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
    else:
        print('Not load resume data.')

    if args.test_only:
        evaluate(model, criterion, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args.print_freq, args.apex)
        lr_scheduler.step()
        evaluate(model, criterion, data_loader_test, device=device)
        if args.output_dir:
            checkpoint = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args}
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'checkpoint.pth'))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def trainValidateSegmentation(args):
    # check if processed data file exists or not
    if not os.path.isfile(args.cached_data_file):
        dataLoader = ld.LoadData(args.data_dir, args.classes,
                                 args.cached_data_file)
        if dataLoader is None:
            print('Error while processing the data. Please check')
            exit(-1)
        data = dataLoader.processData()
    else:
        data = pickle.load(open(args.cached_data_file, "rb"))

    if args.modelType == 'C1':
        model = net.ResNetC1(args.classes)
    elif args.modelType == 'D1':
        model = net.ResNetD1(args.classes)
    else:
        print('Please select the correct model. Exiting!!')
        exit(-1)

    args.savedir = args.savedir + args.modelType + '/'

    if args.onGPU == True:
        model = model.cuda()

    # create the directory if not exist
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.onGPU == True:
        model = model.cuda()

    if args.visualizeNet == True:
        x = Variable(torch.randn(1, 3, args.inWidth, args.inHeight))

        if args.onGPU == True:
            x = x.cuda()

        y = model.forward(x)
        g = viz.make_dot(y)
        g.render(args.savedir + '/model.png', view=False)

    n_param = sum([np.prod(param.size()) for param in model.parameters()])
    print('Network parameters: ' + str(n_param))

    # define optimization criteria
    print('Weights to handle class-imbalance')
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch
    print(weight)
    if args.onGPU == True:
        weight = weight.cuda()

    criteria = CrossEntropyLoss2d(weight)  # weight

    if args.onGPU == True:
        criteria = criteria.cuda()

    trainDatasetNoZoom = myTransforms.Compose([
        # myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(20),
        myTransforms.RandomHorizontalFlip(),
        myTransforms.ToTensor(args.scaleIn)
    ])

    trainDatasetWithZoom = myTransforms.Compose([
        # myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Zoom(512, 512),
        myTransforms.RandomCropResize(20),
        myTransforms.RandomHorizontalFlip(),
        myTransforms.ToTensor(args.scaleIn)
    ])

    valDataset = myTransforms.Compose([
        # myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.ToTensor(args.scaleIn)
    ])

    trainLoaderNoZoom = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDatasetNoZoom),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    trainLoaderWithZoom = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDatasetWithZoom),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    valLoader = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['valIm'], data['valAnnot'], transform=valDataset),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            num_workers=args.num_workers,
                                            pin_memory=True)

    # define the optimizer
    # optimizer = torch.optim.Adam(model.parameters(), args.lr, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)

    if args.onGPU == True:
        cudnn.benchmark = True

    start_epoch = 0

    if args.resume:
        if os.path.isfile(args.resumeLoc):
            print("=> loading checkpoint '{}'".format(args.resumeLoc))
            checkpoint = torch.load(args.resumeLoc)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    logFileLoc = args.savedir + os.sep + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val'))
        logger.flush()

    #lr scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_loss,
                                                gamma=0.1)

    for epoch in range(start_epoch, args.max_epochs):
        scheduler.step(epoch)

        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']

        # run at zoomed images first
        train(args, trainLoaderWithZoom, model, criteria, optimizer, epoch)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr = train(
            args, trainLoaderNoZoom, model, criteria, optimizer, epoch)
        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(
            args, valLoader, model, criteria)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lossTr': lossTr,
                'lossVal': lossVal,
                'iouTr': mIOU_tr,
                'iouVal': mIOU_val,
            }, args.savedir + '/checkpoint.pth.tar')

        # save the model also
        model_file_name = args.savedir + '/model_' + str(epoch + 1) + '.pth'
        torch.save(model.state_dict(), model_file_name)

        with open(args.savedir + 'acc_' + str(epoch) + '.txt', 'w') as log:
            log.write(
                "\nEpoch: %d\t Overall Acc (Tr): %.4f\t Overall Acc (Val): %.4f\t mIOU (Tr): %.4f\t mIOU (Val): %.4f"
                % (epoch, overall_acc_tr, overall_acc_val, mIOU_tr, mIOU_val))
            log.write('\n')
            log.write('Per Class Training Acc: ' + str(per_class_acc_tr))
            log.write('\n')
            log.write('Per Class Validation Acc: ' + str(per_class_acc_val))
            log.write('\n')
            log.write('Per Class Training mIOU: ' + str(per_class_iu_tr))
            log.write('\n')
            log.write('Per Class Validation mIOU: ' + str(per_class_iu_val))

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                     (epoch, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print(
            "\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f"
            % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val))

    logger.close()
def cross_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]

    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.visualizeNet == True:
        x = Variable(torch.randn(1, 5, 174, 18))
        model = net.ResNetC1()

        if args.onGPU == True:
            x = x.cuda()
            model = model.cuda()

        y = model.forward(x)
        #g = viz.make_dot(y)
        #g.render(args.savedir + '/model.png', view=False)

        total_paramters = 0
        for parameter in model.parameters():
            i = len(parameter.size())
            p = 1
            for j in range(i):
                p *= parameter.size(j)
            total_paramters += p

        print('Parameters: ' + str(total_paramters))

    logFileLoc = args.savedir + os.sep + args.crossValFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()

    for length in [9, 10]:

        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)

        for allele in allele_list:  #[9,10]

            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

            data_dict = pickle.load(
                open(
                    args.data_dir + os.sep + 'pickle_' + str(length) + os.sep +
                    allele.replace('*', '.').replace(':', '_') + '.p', 'rb'))

            print('train on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()

            encode_channel = data_dict['channel_encode']
            meas = data_dict['label']
            bind = []
            for i in meas:
                i = (-1) * math.log10(i)
                bind.append(i)
            sequ, label = encode_channel, bind

            if (len(sequ) > 0):
                sequ_ori, label_ori = sequ, label

                sequ_ori, test_sequ_ori, label_ori, test_label_ori = train_test_split(
                    sequ_ori,
                    label_ori,
                    test_size=0.1,
                    random_state=42,
                    shuffle=True)
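
                # Set aside 10% of the data, then run 5-fold cross-validation on the remaining
                # 90%: each fold trains a model, keeps the checkpoint with the lowest validation
                # loss, and evaluates that checkpoint on the fold's test split.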

                output_list = []
                label_list = []

                fold_num = 0

                kf = KFold(n_splits=5, shuffle=True, random_state=42)

                for train_set, test_set in kf.split(sequ_ori, label_ori):

                    fold_num += 1

                    alleleLoc = args.savedir + os.sep + allele + '.txt'

                    if os.path.isfile(alleleLoc):
                        log = open(alleleLoc, 'a')
                        log.write("\n")
                        log.write("%s\t\t\t%s\n" % ('Length: ', length))
                        log.write(
                            "%s\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                            ('Epoch', 'tr_loss', 'val_loss', 'val_Pearson'))
                        log.flush()
                    else:
                        log = open(alleleLoc, 'w')
                        log.write("%s\t\t\t%s\n" % ('Allele', allele))
                        log.write("\n")
                        log.write("%s\t\t\t%s\n" % ('Length: ', length))
                        log.write(
                            "%s\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                            ('Epoch', 'tr_loss', 'val_loss', 'val_Pearson'))
                        log.flush()


                    train_sequ, test_sequ, train_label, test_label = [sequ_ori[i] for i in train_set], [sequ_ori[i] for i in test_set],\
                                                                [label_ori[i] for i in train_set], [label_ori[i] for i in test_set]

                    train_sequ, val_sequ, train_label, val_label = train_test_split(
                        train_sequ,
                        train_label,
                        test_size=0.1,
                        random_state=42,
                        shuffle=True)

                    train_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(train_sequ, train_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    val_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(val_sequ, val_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)

                    model = net.ResNetC1()

                    if args.onGPU == True:
                        #model = torch.nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
                        model = model.cuda()


#                    pretrain = torch.load('pretrain/best_model/' + allele + '/' + allele + '_' + str(length) + '.pth')
#                    model_dict = model.state_dict()
#                    pretrained_dict = {k: v for k, v in pretrain.items() if k in model_dict}
#                    model_dict.update(pretrained_dict)
#                    model.load_state_dict(model_dict)

                    criteria = MSELoss()

                    if args.onGPU == True:
                        criteria = criteria.cuda()

                # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
                    optimizer = torch.optim.Adam(
                        model.parameters(),
                        args.lr,
                        weight_decay=args.weight_decay)

                    if args.onGPU == True:
                        cudnn.benchmark = True

                    scheduler = torch.optim.lr_scheduler.StepLR(
                        optimizer, step_size=args.step_loss, gamma=0.1)

                    start_epoch = 0

                    min_val_loss = 100
                    loss_not_decay = 0
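                    # Early stopping: break this fold's training if validation loss has not improved for 40 epochs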

                    for epoch in range(start_epoch, args.max_epochs):
                        tr_epoch_loss, tr_mean_squared_error = train(
                            args, train_data_load, model, criteria, optimizer)
                        val_epoch_loss, val_mean_squared_error, val_output, val_label = val(
                            args, val_data_load, model, criteria)

                        val_Pearson = pearsonr(val_output, val_label)

                        log = open(alleleLoc, 'a')
                        log.write("%s\t\t\t\t%.4f\t\t\t%.4f\t\t\t\t%.4f\n" %
                                  (epoch, tr_epoch_loss, val_epoch_loss,
                                   val_Pearson[0]))

                        if val_epoch_loss < min_val_loss:
                            if args.save_model == True:
                                model_file_name = model_dir + os.sep + allele + '_' + str(
                                    length) + '_' + str(fold_num) + '.pth'
                                print('==> Saving the best model')
                                torch.save(model.state_dict(), model_file_name)
                            min_val_loss = val_epoch_loss
                            loss_not_decay = 0
                        else:
                            loss_not_decay += 1

                        if loss_not_decay >= 40:
                            break

                        scheduler.step(epoch)

                    best_model_dict = torch.load(model_dir + os.sep + allele +
                                                 '_' + str(length) + '_' +
                                                 str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)

                    output_list.extend(output)
                    label_list.extend(label)

                pearson = pearsonr(output_list, label_list)
                r2 = r2_score(label_list, output_list)

                logger.write("%s\t%s\t\t\t\t%.4f\n" %
                             (length, allele, pearson[0]))
                logger.flush()

    logger.close()
def trainValidateSegmentation(args):

    print('Data file: ' + str(args.cached_data_file))
    print(args)

    # check if processed data file exists or not
    if not os.path.isfile(args.cached_data_file):
        dataLoader = ld.LoadData(args.data_dir, args.data_dir_val,
                                 args.classes, args.cached_data_file)
        data = dataLoader.processData()
        if data is None:
            print('Error while pickling data. Please check.')
            exit(-1)
    else:
        data = pickle.load(open(args.cached_data_file, "rb"))
    print('=> Loading the model')
    model = net.ESPNet(classes=args.classes, channels=args.channels)
    args.savedir = args.savedir + os.sep

    if args.onGPU:
        model = model.cuda()

    # create the directory if not exist
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.visualizeNet:
        import VisualizeGraph as viz
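        # push a random dummy volume through the network and render its
        # computation graph to <savedir>/model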
        x = Variable(
            torch.randn(1, args.channels, args.inDepth, args.inWidth,
                        args.inHeight))

        if args.onGPU:
            x = x.cuda()

        y = model(x, (128, 128, 128))  #, _, _
        g = viz.make_dot(y)
        g.render(args.savedir + os.sep + 'model', view=False)

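    # count the total number of parameter values in the model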
    total_paramters = 0
    for parameter in model.parameters():
        i = len(parameter.size())
        p = 1
        for j in range(i):
            p *= parameter.size(j)
        total_paramters += p

    print('Parameters: ' + str(total_paramters))

    # define optimization criteria
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch <- Sachin
    print('Class Imbalance Weights')
    print(weight)
    criteria = torch.nn.CrossEntropyLoss(weight)
    if args.onGPU:
        criteria = criteria.cuda()

    # We train at three different resolutions (144x144x144, 96x96x96 and 128x128x128)
    # and validate at one resolution (128x128x128)
    trainDatasetA = myTransforms.Compose([
        myTransforms.MinMaxNormalize(),
        myTransforms.ScaleToFixed(dimA=144, dimB=144, dimC=144),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDatasetB = myTransforms.Compose([
        myTransforms.MinMaxNormalize(),
        myTransforms.ScaleToFixed(dimA=96, dimB=96, dimC=96),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDatasetC = myTransforms.Compose([
        myTransforms.MinMaxNormalize(),
        myTransforms.ScaleToFixed(dimA=args.inWidth,
                                  dimB=args.inHeight,
                                  dimC=args.inDepth),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    valDataset = myTransforms.Compose([
        myTransforms.MinMaxNormalize(),
        myTransforms.ScaleToFixed(dimA=args.inWidth,
                                  dimB=args.inHeight,
                                  dimC=args.inDepth),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainLoaderA = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDatasetA),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=False)  #disabling pin memory because swap usage is high
    trainLoaderB = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['trainIm'], data['trainAnnot'], transform=trainDatasetB),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=False)
    trainLoaderC = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['trainIm'], data['trainAnnot'], transform=trainDatasetC),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=False)

    valLoader = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['valIm'], data['valAnnot'], transform=valDataset),
                                            batch_size=1,
                                            shuffle=False,
                                            num_workers=args.num_workers,
                                            pin_memory=False)

    # define the optimizer
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 args.lr, (0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=2e-4)

    if args.onGPU == True:
        cudnn.benchmark = True

    start_epoch = 0
    stored_loss = 100000000.0
    if args.resume:
        if os.path.isfile(args.resumeLoc):
            print("=> loading checkpoint '{}'".format(args.resumeLoc))
            checkpoint = torch.load(args.resumeLoc)
            start_epoch = checkpoint['epoch']
            stored_loss = checkpoint['stored_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    logFileLoc = args.savedir + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val)', 'lr'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Arguments: %s" % (str(args)))
        logger.write("\n Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val)', 'lr'))
        logger.flush()

    # reduce the learning rate by 0.5 after every step_loss epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_loss,
                                                gamma=0.5)
    best_val_acc = 0

    loader_idxs = [
        0, 1, 2
    ]  # Three loaders at different resolutions are mapped to three indexes
    for epoch in range(start_epoch, args.max_epochs):
        # step the learning rate
        scheduler.step(epoch)
        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print('Running epoch {} with learning rate {:.5f}'.format(epoch, lr))

        if epoch > 0:
            # shuffle the loaders
            np.random.shuffle(loader_idxs)

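        # only the pass over trainLoaderC (l_id == 2, the validation
        # resolution) returns the training metrics that are logged below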
        for l_id in loader_idxs:
            if l_id == 0:
                train(args, trainLoaderA, model, criteria, optimizer, epoch)
            elif l_id == 1:
                train(args, trainLoaderB, model, criteria, optimizer, epoch)
            else:
                lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr = \
                    train(args, trainLoaderC, model, criteria, optimizer, epoch)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(
            args, valLoader, model, criteria)

        print('saving checkpoint')  ## added
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lossTr': lossTr,
                'lossVal': lossVal,
                'iouTr': mIOU_tr,
                'iouVal': mIOU_val,
                'stored_loss': stored_loss,
            }, args.savedir + '/checkpoint.pth.tar')

        # also save the model weights whenever validation mIOU reaches a new best
        if mIOU_val >= best_val_acc:
            best_val_acc = mIOU_val
            torch.save(model.state_dict(), args.savedir + '/best_model.pth')

        with open(args.savedir + 'acc_' + str(epoch) + '.txt', 'w') as log:
            log.write(
                "\nEpoch: %d\t Overall Acc (Tr): %.4f\t Overall Acc (Val): %.4f\t mIOU (Tr): %.4f\t mIOU (Val): %.4f"
                % (epoch, overall_acc_tr, overall_acc_val, mIOU_tr, mIOU_val))
            log.write('\n')
            log.write('Per Class Training Acc: ' + str(per_class_acc_tr))
            log.write('\n')
            log.write('Per Class Validation Acc: ' + str(per_class_acc_val))
            log.write('\n')
            log.write('Per Class Training mIOU: ' + str(per_class_iu_tr))
            log.write('\n')
            log.write('Per Class Validation mIOU: ' + str(per_class_iu_val))

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.6f" %
                     (epoch, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print(
            "\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f"
            % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val))

    logger.close()
Example #10
def train_val(args):

    torch.set_default_tensor_type('torch.DoubleTensor')

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    if args.visualizeNet == True:
        x = Variable(torch.randn(1, 51, 61, 23))

        if args.onGPU == True:
            x = x.cuda()

        model = net.ResNetC1()

        total_paramters = 0
        for parameter in model.parameters():
            i = len(parameter.size())
            p = 1
            for j in range(i):
                p *= parameter.size(j)
            total_paramters += p

        print('Parameters: ' + str(total_paramters))

    logFileLoc = args.savedir + os.sep + args.trainValFile

    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Epoch', 'tr_loss', 'val_loss', 'tr_acc', 'val_acc'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                     ('Epoch', 'tr_loss', 'val_loss', 'tr_acc', 'val_acc'))
        logger.flush()

    image, label = loadData()

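    # hold out 10% of the samples for testing, then split a further 10% of the
    # remaining data off as a validation set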
    train_image, test_image, train_label, test_label = train_test_split(
        image, label, test_size=0.1, random_state=42, shuffle=True)
    train_image, val_image, train_label, val_label = train_test_split(
        train_image, train_label, test_size=0.1, random_state=42, shuffle=True)

    train_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        train_image, train_label),
                                                  batch_size=args.batch_size,
                                                  shuffle=True,
                                                  num_workers=args.num_workers,
                                                  pin_memory=True)
    val_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        val_image, val_label),
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                num_workers=args.num_workers,
                                                pin_memory=True)
    test_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        test_image, test_label),
                                                 batch_size=args.batch_size,
                                                 shuffle=True,
                                                 num_workers=args.num_workers,
                                                 pin_memory=True)

    model = net.ResNetC1()

    if args.onGPU == True:
        model = model.cuda()

    criteria = torch.nn.CrossEntropyLoss()

    if args.onGPU == True:
        criteria = criteria.cuda()

    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    # optimizer = torch.optim.Adam(model.parameters(), args.lr, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=5e-4)

    if args.onGPU == True:
        cudnn.benchmark = True

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_loss,
                                                gamma=0.1)

    start_epoch = 0

    min_val_loss = 100

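    # train for max_epochs, writing a per-epoch classification report and
    # keeping the checkpoint with the lowest validation loss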
    for epoch in range(start_epoch, args.max_epochs):
        loss_train, accuracy_train, report_train = train(
            args, train_data_load, model, criteria, optimizer)
        loss_val, accuracy_val, report_val = val(args, val_data_load, model,
                                                 criteria)

        logger.write(
            "%s\t%s\t\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
            (epoch, loss_train, loss_val, accuracy_train, accuracy_val))

        alleleLoc = args.savedir + os.sep + 'acc_' + str(epoch) + '.txt'
        log = open(alleleLoc, 'a')
        log.write("train classification report")
        log.write("\n")
        log.write(report_train)
        log.write("\n")
        log.write("validation classification report")
        log.write("\n")
        log.write(report_val)
        log.flush()
        log.close()

        if loss_val < min_val_loss:
            if args.save_model == True:
                model_file_name = args.savedir + os.sep + 'best_model' + '.pth'
                print('==> Saving the best model')
                torch.save(model.state_dict(), model_file_name)
            min_val_loss = loss_val

    logger.close()
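
# A minimal invocation sketch (not part of the original snippet). The attribute
# names below are the ones train_val() reads from `args`; the values are
# illustrative assumptions, not the original settings.
if __name__ == '__main__':
    from argparse import Namespace

    example_args = Namespace(
        savedir='./results',            # output directory, created if missing
        trainValFile='trainVal.txt',    # per-epoch loss/accuracy log file
        visualizeNet=False,             # True also prints the parameter count
        onGPU=torch.cuda.is_available(),
        batch_size=32,                  # assumed value
        num_workers=4,                  # assumed value
        lr=1e-3,                        # assumed value
        step_loss=30,                   # StepLR interval, assumed value
        max_epochs=100,                 # assumed value
        save_model=True)
    train_val(example_args)
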
Example #11
def trainRegression(args):

    sequ, label = load_data(args.data_dir)

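    # 67/33 train/test split, then a further 67/33 split of the training
    # portion into train/validation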
    train_sequ, test_sequ, train_label, test_label = train_test_split(
        sequ, label, test_size=0.33, random_state=42)
    train_sequ, val_sequ, train_label, val_label = train_test_split(
        train_sequ, train_label, test_size=0.33, random_state=42)

    print(train_sequ[0].shape)

    train_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        train_sequ, train_label),
                                                  batch_size=args.batch_size,
                                                  shuffle=True,
                                                  num_workers=args.num_workers,
                                                  pin_memory=True)

    val_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        val_sequ, val_label),
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                num_workers=args.num_workers,
                                                pin_memory=True)

    test_data_load = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        test_sequ, test_label),
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.num_workers,
                                                 pin_memory=True)

    print("DataSet prepared")

    args.savedir = args.savedir + os.sep

    # create the directory if not exist
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    model = net.shallow_net()

    if args.onGPU == True:
        model = model.cuda()

    total_paramters = 0
    if args.visualizeNet == True:
        x = Variable(torch.randn(1, 20, 30, 1))

        if args.onGPU == True:
            x = x.cuda()

        y = model(x)
        g = make_dot(y)
        g.render(args.savedir + '/model.png', view=False)

        total_paramters = 0
        for parameter in model.parameters():
            i = len(parameter.size())
            p = 1
            for j in range(i):
                p *= parameter.size(j)
            total_paramters += p

        print('Parameters: ' + str(total_paramters))

    criteria = CrossEntropyLoss2d()

    if args.onGPU == True:
        criteria = criteria.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)

    if args.onGPU == True:
        cudnn.benchmark = True

    logFileLoc = args.savedir + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'MSE (tr)', 'MSE (val)', 'lr'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t%s" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'MSE (tr)', 'MSE (val)', 'lr'))
        logger.flush()

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.step_loss,
        gamma=0.1)  #.ReduceLROnPlateau(optimizer, 'min', patience=5)

    start_epoch = 0
    best_MSE = 10000

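    # each epoch: step the scheduler, train, validate, write a rolling
    # checkpoint, and keep the weights with the lowest validation MSE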
    for epoch in range(start_epoch, args.max_epochs):
        scheduler.step(epoch)

        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']

        tr_epoch_loss, tr_mean_squared_error = train(args, train_data_load,
                                                     model, criteria,
                                                     optimizer, epoch)
        val_epoch_loss, val_mean_squared_error = val(args, val_data_load,
                                                     model, criteria,
                                                     optimizer, epoch)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lossTr': tr_epoch_loss,
                'lossVal': val_epoch_loss,
                'MSETr': tr_mean_squared_error,
                'MSEVal': val_mean_squared_error,
            }, args.savedir + '/checkpoint.pth.tar')

        # save the best model
        if val_mean_squared_error < best_MSE:
            model_file_name = args.savedir + '/best_peptide_model.pth'
            print('==> Saving the best model')
            torch.save(model.state_dict(), model_file_name)
            best_MSE = val_mean_squared_error

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                     (epoch, tr_epoch_loss, val_epoch_loss,
                      tr_mean_squared_error, val_mean_squared_error, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print(
            "\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t MSE(tr) = %.4f\t MSE(val) = %.4f"
            % (epoch, tr_epoch_loss, val_epoch_loss, tr_mean_squared_error,
               val_mean_squared_error))

    logger.close()