Example #1
def main():
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep + str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")
    
    # 4.2 get model
    model = get_net()
    model.cuda()

    # criterion
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss().cuda()
    start_epoch = 0
    best_loss = 999
    best_f1 = 0
    best_results = [np.inf,0]
    val_metrics = [np.inf,0]
    resume = False
    all_files = pd.read_csv("./train.csv")
    #test_files = pd.read_csv("./sample_submission.csv")
    train_data_list, val_data_list = train_test_split(all_files, test_size=0.13, random_state=2050)
    # load dataset
    train_gen = HumanDataset(train_data_list, config.train_data, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=4)

    val_gen = HumanDataset(val_data_list, config.train_data, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size, shuffle=False, pin_memory=True, num_workers=4)

    #test_gen = HumanDataset(test_files,config.test_data,augument=False,mode="test")
    #test_loader = DataLoader(test_gen,1,shuffle=False,pin_memory=True,num_workers=4)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    start = timer()
    
    #train
    for epoch in range(0,config.epochs):
        scheduler.step(epoch)
        # train
        lr = get_learning_rate(optimizer)
        train_metrics = train(train_loader, model, criterion, optimizer, epoch, val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics, best_results, start)
        # check results 
        is_best_loss = val_metrics[0] < best_results[0]
        best_results[0] = min(val_metrics[0],best_results[0])
        is_best_f1 = val_metrics[1] > best_results[1]
        best_results[1] = max(val_metrics[1],best_results[1])   
        # save model
        save_checkpoint({
                    "epoch":epoch + 1,
                    "model_name":config.model_name,
                    "state_dict":model.state_dict(),
                    "best_loss":best_results[0],
                    "optimizer":optimizer.state_dict(),
                    "fold":fold,
                    "best_f1":best_results[1],
        },is_best_loss,is_best_f1,fold)
        print('\r',end='',flush=True)
        log.write('%s  %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                "best", epoch, epoch,                    
                train_metrics[0], train_metrics[1], 
                val_metrics[0], val_metrics[1],
                str(best_results[0])[:8],str(best_results[1])[:8],
                time_to_str((timer() - start),'min'))
            )
        log.write("\n")
        time.sleep(0.01)
def training(model, fold, log, train_image_names, train_image_labels, val_image_names, val_image_labels):
    # logging issues
    log.write(
        "\n---------------------------- [START %s] %s\n\n" % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 20))

    log.write(
        '----------------------|--------- Train ---------|-------- Valid ---------|-------Best '
        'Results-------|----------|\n')
    log.write(
        'mode   iter   epoch   |      loss   f1_macro    |      loss   f1_macro   |       loss   f1_macro    | time   '
        '  |\n')
    log.write(
        '----------------------------------------------------------------------------------------------------------'
        '----\n')

    # training params
    optimizer = optim.SGD(model.parameters(),
                          lr=config.learning_rate_start,
                          momentum=0.9,
                          weight_decay=config.weight_decay)
    if config.loss_name == 'ce':
        criterion = nn.BCEWithLogitsLoss().cuda()
    elif config.loss_name == 'focal':
        criterion = FocalLoss().cuda()
    elif config.loss_name == 'f1':
        criterion = F1Loss().cuda()
    else:
        raise ValueError('unknown loss name {}'.format(config.loss_name))
    best_results = [np.inf, 0]
    val_metrics = [np.inf, 0]
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=config.learning_rate_decay_epochs,
                                    gamma=config.learning_rate_decay_rate)
    start = timer()

    train_gen = HumanDataset(train_image_names, train_image_labels, config.train_dir, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=4)
    val_gen = HumanDataset(val_image_names, val_image_labels, config.train_dir, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size, shuffle=False, pin_memory=True, num_workers=4)

    # train
    for epoch in range(0, config.epochs):
        # training & evaluating
        scheduler.step(epoch)
        get_learning_rate(optimizer)
        train_metrics = train(train_loader, model, criterion, optimizer, epoch, val_metrics, best_results, start)
        val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics, best_results, start)

        # check results
        is_best_loss = val_metrics[0] < best_results[0]
        best_results[0] = min(val_metrics[0], best_results[0])
        is_best_f1 = val_metrics[1] > best_results[1]
        best_results[1] = max(val_metrics[1], best_results[1])

        # save model
        save_checkpoint({
            "epoch": epoch + 1,
            "model_name": config.model_name,
            "state_dict": model.state_dict(),
            "best_loss": best_results[0],
            "optimizer": optimizer.state_dict(),
            "fold": fold,
            "best_f1": best_results[1],
        }, is_best_loss, is_best_f1, fold)

        # print logs
        print('\r', end='', flush=True)
        log.write(
            logging_pattern % (
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],
                val_metrics[0], val_metrics[1],
                str(best_results[0])[:8], str(best_results[1])[:8],
                time_to_str((timer() - start), 'min')
            )
        )
        log.write("\n")
        time.sleep(0.01)
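
The snippets above and below call a few helpers that are defined elsewhere in their repositories: get_learning_rate, time_to_str and save_checkpoint. The following is only a rough sketch of what they might look like, assuming the checkpoint layout implied by the load paths used in the later examples (config here is the same global configuration object the snippets rely on):

import os
import shutil
import torch

def get_learning_rate(optimizer):
    # current learning rate of the first (and usually only) parameter group
    return optimizer.param_groups[0]['lr']

def time_to_str(t, mode='min'):
    # format an elapsed time in seconds as hours/minutes
    if mode == 'min':
        minutes = int(t) // 60
        return '%2d hr %02d min' % (minutes // 60, minutes % 60)
    return '%d sec' % int(t)

def save_checkpoint(state, is_best_loss, is_best_f1, fold):
    # always keep the latest checkpoint; copy it aside whenever a best score is reached
    out_dir = os.path.join(config.weights, config.model_name, str(fold))
    filename = os.path.join(out_dir, 'checkpoint.pth.tar')
    torch.save(state, filename)
    if is_best_loss:
        shutil.copyfile(filename, os.path.join(
            config.best_models,
            '%s_fold_%s_model_best_loss.pth.tar' % (config.model_name, fold)))
    if is_best_f1:
        shutil.copyfile(filename, os.path.join(
            config.best_models,
            '%s_fold_%s_model_best_f1.pth.tar' % (config.model_name, fold)))
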
Example #3
def find_lr(init_value=1e-8, final_value=10., beta=0.98):
    # 1. load dataset
    all_files = pd.read_csv(config.CSV_TRAIN)
    train_data_list, _  = multilabel_stratification(all_files, test_size=0.2, random_state=42)
    train_gen = HumanDataset(train_data_list, config.train_data, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=8)

    # 2. get the model, and set the optimizer and criterion
    model = get_net()
    model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=init_value, momentum=0.9, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss(opt_class_weight).cuda()

    # 3.set init value
    num = len(train_loader) - 1                             # num = samples_per_epoch / batch_size
    mult = (final_value / init_value) ** (1/num)            # init_value * (mult)**num ==> final_value

    lr = init_value
    optimizer.param_groups[0]['lr'] = lr
    avg_loss = 0.
    best_loss = 0.
    batch_num = 0
    losses = []
    log_lrs = []

    best_lr = 111  # placeholder; overwritten on the first batch

    model.train()
    model.zero_grad()
    
    for i,(images,target) in enumerate(train_loader):
        batch_num += 1

        # 0. get the loss of this batch
        images = images.cuda(non_blocking=True)
        target = torch.from_numpy(np.array(target)).float().cuda(non_blocking=True)
        output = model(images)
        loss = criterion(output,target)

        # 1. Compute the smoothed loss
        avg_loss = beta * avg_loss + (1-beta) *loss.item()
        smoothed_loss = avg_loss / (1 - beta**batch_num)

        # 2. Stop if the loss is exploding
        if batch_num > 1 and smoothed_loss > 4 * best_loss:
            return log_lrs, losses
        # 3. Record the best loss
        if smoothed_loss < best_loss or batch_num==1:
            best_loss = smoothed_loss
            best_lr = lr
        # 4. Store the values
        losses.append(smoothed_loss)
        log_lrs.append(math.log10(lr))


        # 5. Do the SGD step
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # 6. Update the lr for the next step
        lr *= mult
        optimizer.param_groups[0]['lr'] = lr

        print('%d:  factor:%.3f  smoothed_loss:%f best_loss:%f lr:%f best_lr: %f'%(i,smoothed_loss/best_loss, smoothed_loss, best_loss, lr, best_lr))
    return log_lrs, losses
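
# A sketch of how find_lr might be driven (matplotlib is assumed to be available);
# the usual heuristic is to pick a learning rate roughly an order of magnitude
# below the point where the smoothed loss bottoms out.
def plot_lr_finder():
    import matplotlib.pyplot as plt
    log_lrs, losses = find_lr(init_value=1e-8, final_value=10., beta=0.98)
    # drop the first and last few batches, which tend to be noisy
    plt.plot(log_lrs[10:-5], losses[10:-5])
    plt.xlabel('log10(learning rate)')
    plt.ylabel('smoothed loss')
    plt.savefig('lr_finder.png')
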
def main():
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          'fold_' + str(config.fold)):
        os.makedirs(config.weights + config.model_name + os.sep + 'fold_' +
                    str(config.fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists(config.logs):
        os.mkdir(config.logs)

    all_files = pd.read_csv("./input/train.csv")

    # -------------------------------------------------------
    # training
    # -------------------------------------------------------
    if config.mode == 'train':

        for fold in range(config.fold):

            # 4.2 get model
            model = get_net()
            model.cuda()

            optimizer = optim.Adam(model.parameters(), lr=config.lr)

            # ================================================================== #
            #                        Loss criterion                              #
            # ================================================================== #
            # criterion
            # optimizer = optim.SGD(model.parameters(),lr = config.lr,momentum=0.9,weight_decay=1e-4)

            # Use the optim package to define an Optimizer that will update the weights of
            # the model for us. Here we will use Adam; the optim package contains many other
            # optimization algorithms. The first argument to the Adam constructor tells the
            # optimizer which Tensors it should update.
            assert config.loss in ['bcelog', 'f1_loss', 'focal_loss'], \
                "Loss type {0} is unknown".format(config.loss)
            if config.loss == 'bcelog':
                criterion = nn.BCEWithLogitsLoss().cuda()
            elif config.loss == 'f1_loss':
                criterion = F1_loss().cuda()
            elif config.loss == 'focal_loss':
                criterion = FocalLoss().cuda()

            # best_loss = 999
            # best_f1 = 0
            best_results = [np.inf, 0]
            val_metrics = [np.inf, 0]

            ## k-fold--------------------------------

            # tflogger
            tflogger = TFLogger(
                os.path.join(
                    'results', 'TFlogs', config.model_name +
                    "_fold{0}_{1}".format(config.fold, fold)))

            with open(
                    os.path.join(
                        "./input/fold_{0}".format(config.fold),
                        'train_fold{0}_{1}.txt'.format(config.fold, fold)),
                    'r') as text_file:
                train_names = text_file.read().split('\n')
                # # oversample
                # s = Oversampling("./input/train.csv")
                # train_names = [idx for idx in train_names for _ in range(s.get(idx))]
                train_data_list = all_files[all_files['Id'].isin(train_names)]
                # train_data_list = all_files.copy().set_index('Id')
                # train_data_list
                # train_data_list = train_data_list.reindex(train_names)
                # 57150 -> 29016
                # reset index
                # train_data_list = train_data_list.rename_axis('Id').reset_index()
            with open(
                    os.path.join(
                        "./input/fold_{0}".format(config.fold),
                        'test_fold{0}_{1}.txt'.format(config.fold, fold)),
                    'r') as text_file:
                val_names = text_file.read().split('\n')
                val_data_list = all_files[all_files['Id'].isin(val_names)]

            # load dataset
            train_gen = HumanDataset(train_data_list,
                                     config.train_data,
                                     mode="train")
            train_loader = DataLoader(train_gen,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      pin_memory=True,
                                      num_workers=4)

            val_gen = HumanDataset(val_data_list,
                                   config.train_data,
                                   augument=False,
                                   mode="train")
            val_loader = DataLoader(val_gen,
                                    batch_size=config.batch_size,
                                    shuffle=False,
                                    pin_memory=True,
                                    num_workers=4)

            # initialize the early_stopping object
            early_stopping = EarlyStopping(patience=7, verbose=True)

            if config.resume:
                log.write('\tinitial_checkpoint = %s\n' %
                          config.initial_checkpoint)
                checkpoint_path = os.path.join(config.weights,
                                               config.model_name,
                                               str(config.fold),
                                               config.initial_checkpoint,
                                               'checkpoint.pth.tar')
                loaded_model = torch.load(checkpoint_path)
                model.load_state_dict(loaded_model["state_dict"])
                start_epoch = loaded_model["epoch"]
            else:
                start_epoch = 0

            scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
            start = timer()

            # train
            for epoch in range(start_epoch, config.epochs):
                scheduler.step(epoch)
                # train
                lr = get_learning_rate(optimizer)
                train_metrics = train(train_loader, model, criterion,
                                      optimizer, epoch, val_metrics,
                                      best_results, start, config.threshold)
                # val
                val_metrics = evaluate(val_loader, model, criterion, epoch,
                                       train_metrics, best_results, start,
                                       config.threshold)
                # check results
                is_best_loss = val_metrics[0] < best_results[0]
                best_results[0] = min(val_metrics[0], best_results[0])
                is_best_f1 = val_metrics[1] > best_results[1]
                best_results[1] = max(val_metrics[1], best_results[1])
                # save model
                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "model_name": config.model_name,
                        "state_dict": model.state_dict(),
                        "best_loss": best_results[0],
                        "optimizer": optimizer.state_dict(),
                        "fold": config.fold,
                        "kfold": fold,
                        "best_f1": best_results[1],
                    }, is_best_loss, is_best_f1, config.fold, fold)
                # print logs
                print('\r', end='', flush=True)

                log.write(
                    '%s  %5.1f %6.1f  %.2E|  %0.3f   %0.3f    |   %0.3f    %0.4f   |  %s      %s       | %s      |%s ' % ( \
                        "best", epoch, epoch, Decimal(lr),
                        train_metrics[0], train_metrics[1],
                        val_metrics[0], val_metrics[1],
                        str(best_results[0])[:8], str(best_results[1])[:8],
                        time_to_str((timer() - start), 'min'),
                        fold),
                )
                log.write("\n")
                time.sleep(0.01)

                # ================================================================== #
                #                        Tensorboard Logging                         #
                # ================================================================== #

                # 1. Log scalar values (scalar summary)
                info = {
                    'Train_loss': train_metrics[0],
                    'Train_F1_macro': train_metrics[1],
                    'Valid_loss': val_metrics[0],
                    'Valid_F1_macro': val_metrics[1],
                    'Learning_rate': lr
                }

                for tag, value in info.items():
                    tflogger.scalar_summary(tag, value, epoch)

                # 2. Log values and gradients of the parameters (histogram summary)
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    tflogger.histo_summary(tag,
                                           value.data.cpu().numpy(), epoch)
                    tflogger.histo_summary(tag + '/grad',
                                           value.grad.data.cpu().numpy(),
                                           epoch)
                # -------------------------------------
                # end tflogger

                # ================================================================== #
                #                        Early stopping                         #
                # ================================================================== #
                # early_stopping monitors the validation macro F1 passed below; it
                # checkpoints the model when the score improves and stops training
                # for this fold after `patience` epochs without improvement
                early_stopping(val_metrics[1], model)

                if early_stopping.early_stop:
                    print("Early stopping")
                    break
        #==========================================================#
        #End of k-fold
        # ==========================================================#

    # -------------------------------------------------------
    # testing
    # -------------------------------------------------------
    elif config.mode == 'test':
        test_files = pd.read_csv("./input/sample_submission.csv")
        test_gen = HumanDataset(test_files,
                                config.test_data,
                                augument=False,
                                mode="test")
        test_loader = DataLoader(test_gen,
                                 1,
                                 shuffle=False,
                                 pin_memory=True,
                                 num_workers=4)

        # the test branch needs its own model instance; `fold` selects which
        # checkpoint to load (config.fold is assumed here, matching the
        # 'fold_' + str(config.fold) directory created above)
        model = get_net()
        model.cuda()
        fold = config.fold

        # checkpoint_path = os.path.join(config.best_models,'{0}_fold_{1}_model_best_loss.pth.tar'.format(config.model_name, fold))
        checkpoint_path = os.path.join(
            config.weights, config.model_name, 'fold_{0}'.format(fold),
            'checkpoint_{}.pth.tar'.format(config.checkpoint))
        best_model = torch.load(checkpoint_path)
        # best_model = torch.load("checkpoints/bninception_bcelog/0/checkpoint.pth.tar")
        model.load_state_dict(best_model["state_dict"])
        thresholds = [
            -0.13432257, -0.4642075, -0.50726506, -0.49715518, -0.41125674,
            0.11581507, -1.0143597, -0.18461785, -0.61600877, -0.47275479,
            -0.9142859, -0.44323673, -0.58404387, -0.22959213, -0.26110631,
            -0.43723898, -0.97624685, -0.44612319, -0.4492785, -0.56681327,
            -0.16156543, -0.12577745, -0.75476121, -0.91473052, -0.53361931,
            -0.19337344, -0.0857145, -0.45739976
        ]

        # thresholds = [-0.27631527, -0.31156957, -0.61893745, -1.01863398, -0.3141709,  -0.14000374,
        #               -0.6285302,  -0.43241383, -1.60594984, -0.14425374, -0.03979607, -0.25717957,
        #               -0.84905692, -0.37668712,  1.3710663,  -0.11193908, -0.81109447,  0.72506607,
        #               -0.05454339, -0.47056617, -0.16024197, -0.44002794, -0.65929407, -1.00900269,
        #               -0.86197429, -0.12346229, -0.4946575,  -0.52420557]
        test(test_loader, model, thresholds)
        print('Test successful!')
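
The EarlyStopping object used in the training loop above is not shown. Below is a minimal sketch that is compatible with the way it is called here (a validation score where higher is better, patience counted in epochs); the class name matches the call site, everything else is an assumption:

import torch

class EarlyStopping:
    """Stop training when the monitored validation score stops improving."""

    def __init__(self, patience=7, verbose=False, path='early_stop_checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model):
        if self.best_score is None or score > self.best_score:
            # the score improved: reset the counter and checkpoint the model
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), self.path)
            if self.verbose:
                print('Validation score improved to %.4f' % score)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
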
Example #5
def main():
    fold = 8
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")

    # 4.2 get model
    model = get_net()
    model.cuda()

    # criterion
    optimizer = optim.SGD(model.parameters(),
                          lr=config.lr,
                          momentum=0.9,
                          weight_decay=1e-4)  #,nesterov=True)
    criterion = nn.BCEWithLogitsLoss().cuda()
    #criterion = FocalLoss().cuda()
    #criterion = F1Loss().cuda()
    start_epoch = 0
    best_loss = 999
    best_f1 = 0
    best_results = [np.inf, 0]
    val_metrics = [np.inf, 0]
    resume = False
    #all_files = pd.read_csv("./train.csv")
    train_df = pd.read_csv("./train_appended2.csv")
    train_df_orig = pd.read_csv("./total_train.csv")
    """print (type(train_df_orig))
    lows = [15,15,15,8,9,10,8,9,10,8,9,10,17,20,24,26,15,27,15,20,24,17,8,15,27,27,27]
    for i in lows:
        target = str(i)
        indicies = train_df_orig.loc[train_df_orig['Target'] == target].index
        train_df = pd.concat([train_df,train_df_orig.loc[indicies]], ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.startswith(target+" ")].index
        train_df = pd.concat([train_df,train_df_orig.loc[indicies]], ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.endswith(" "+target)].index
        train_df = pd.concat([train_df,train_df_orig.loc[indicies]], ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.contains(" "+target+" ")].index
        train_df = pd.concat([train_df,train_df_orig.loc[indicies]], ignore_index=True)
    #print(train_df)
    #input()"""
    test_files = pd.read_csv("./sample_submission.csv")
    train_data_list, val_data_list = train_test_split(train_df,
                                                      test_size=0.13,
                                                      random_state=2050)
    train_data_list_fake, val_data_list_fake = train_test_split(
        train_df_orig, test_size=0.01, random_state=2050)

    # load dataset
    train_gen = HumanDataset(train_data_list, config.train_data, mode="train")
    train_loader = DataLoader(train_gen,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=4)

    val_gen = HumanDataset(val_data_list,
                           config.train_data,
                           augument=False,
                           mode="train")
    val_loader = DataLoader(val_gen,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=4)

    test_gen = HumanDataset(test_files,
                            config.test_data,
                            augument=False,
                            mode="test")
    test_loader = DataLoader(test_gen,
                             1,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=4)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
    start = timer()

    #train
    for epoch in range(0, config.epochs):
        scheduler.step(epoch)
        # train
        lr = get_learning_rate(optimizer)
        train_metrics = train(train_loader, model, criterion, optimizer, epoch,
                              val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch,
                               train_metrics, best_results, start)
        # check results
        is_best_loss = val_metrics[0] < best_results[0]
        best_results[0] = min(val_metrics[0], best_results[0])
        is_best_f1 = val_metrics[1] > best_results[1]
        best_results[1] = max(val_metrics[1], best_results[1])
        # save model
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_loss": best_results[0],
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "best_f1": best_results[1],
            }, is_best_loss, is_best_f1, fold)
        # print logs
        print('\r', end='', flush=True)
        log.write('%s  %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],
                val_metrics[0], val_metrics[1],
                str(best_results[0])[:8],str(best_results[1])[:8],
                time_to_str((timer() - start),'min'))
            )
        log.write("\n")
        time.sleep(0.01)
        model.load_state_dict(
            torch.load(
                'checkpoints/best_models/%s_fold_%d_model_best_f1.pth.tar' %
                (config.model_name, fold))['state_dict'])

        model.cuda()

        criterion = nn.BCEWithLogitsLoss().cuda()
        optimizer = optim.Adam(model.parameters(), lr=1e-3)
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=0.5,
                                      patience=2,
                                      min_lr=1e-5)

        train_gen = HumanDataset(X_train, y_train, augment=True)
        val_gen = HumanDataset(X_val, y_val, augment=False)

        train_loader = torch.utils.data.DataLoader(
            train_gen,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=6,
            pin_memory=True)
        val_loader = torch.utils.data.DataLoader(val_gen,
                                                 batch_size=config.batch_size,
                                                 num_workers=6,
                                                 pin_memory=True)
        # run feature extraction on the validation loader with the reloaded weights, then stop
        allPred = featExt(val_loader, model)
        break
Example #7
        # train_data_list
        train_data_list = train_data_list.reindex(train_names)
        # 57150 -> 29016
        # reset index
        train_data_list = train_data_list.rename_axis('Id').reset_index()
    with open(os.path.join("./input/protein-trainval-split", 'val_names.txt'), 'r') as text_file:
        val_names = text_file.read().split(',')
        val_data_list = all_files[all_files['Id'].isin(val_names)]

    # 4.2 get model
    model = get_net()
    model.cuda()
    fold = 0

    # load dataset
    train_gen = HumanDataset(train_data_list, config.train_data, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=4)

    val_gen = HumanDataset(val_data_list, config.train_data, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size, shuffle=False, pin_memory=True, num_workers=4)


    # checkpoint_path = os.path.join(config.best_models,'{0}_fold_{1}_model_best_loss.pth.tar'.format(config.model_name, fold))
    checkpoint_path = os.path.join(config.weights, config.model_name, 'fold_{0}'.format(fold),
                                   'checkpoint_{}.pth.tar'.format(config.checkpoint))
    best_model = torch.load(checkpoint_path)
    #best_model = torch.load("checkpoints/bninception_bcelog/0/checkpoint.pth.tar")
    model.load_state_dict(best_model["state_dict"])


    preds, y = validate(val_loader, model)
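
The hard-coded per-class thresholds in the earlier test branch were presumably tuned on a validation set. With the raw predictions and labels returned by validate() above, a simple per-class search could look like the sketch below, assuming preds are pre-sigmoid logits of shape (N, num_classes) and y is the matching binary label matrix:

import numpy as np
from sklearn.metrics import f1_score

def search_per_class_thresholds(preds, y, candidates=np.arange(-2.0, 2.0, 0.05)):
    # for each class independently, keep the logit threshold that maximises its F1
    preds, y = np.asarray(preds), np.asarray(y)
    thresholds = np.zeros(preds.shape[1])
    for c in range(preds.shape[1]):
        scores = [f1_score(y[:, c], (preds[:, c] > t).astype(int)) for t in candidates]
        thresholds[c] = candidates[int(np.argmax(scores))]
    return thresholds

thresholds = search_per_class_thresholds(preds, y)
print(thresholds.tolist())
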
Example #8
def main():
    fold = config.fold
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")

    # 4.2 get model
    model = get_net()
    model.cuda()
    if config.is_train_after_crash:
        best_model_name = config.weights + config.model_name + os.sep + str(
            fold - 10) + os.sep + "checkpoint.pth.tar"
        best_model = torch.load(best_model_name)
        print(best_model_name)
        model.load_state_dict(best_model["state_dict"])
        best_results = [np.inf, 0]
        val_metrics = [np.inf, 0]
        best_results[0] = best_model["best_loss"]
        best_results[1] = best_model["best_f1"]
    else:
        best_results = [np.inf, 0]
        val_metrics = [np.inf, 0]
    print(best_results)
    train_files = pd.read_csv(config.train_csv)
    external_files = pd.read_csv(config.external_csv)
    test_files = pd.read_csv(config.test_csv)
    all_files, test_files, weight_log = process_df(train_files, external_files,
                                                   test_files)
    # train_data_list,val_data_list = train_test_split(all_files,test_size = 0.13,random_state = 2050)
    train_data_list, val_data_list = tra_val_split(all_files)
    print(len(all_files))
    print(len(train_data_list))
    print(len(val_data_list))
    # train_data_list = train_data_list.iloc[np.arange(10000)]
    # val_data_list = val_data_list.iloc[np.arange(1000)]

    # load dataset
    train_gen = HumanDataset(train_data_list, mode="train")
    sampler = WeightedRandomSampler(
        train_data_list['freq'].values,
        num_samples=int(len(train_data_list) * config.multiply),
        replacement=True)
    train_loader = DataLoader(train_gen,
                              batch_size=config.batch_size,
                              drop_last=True,
                              sampler=sampler,
                              pin_memory=True,
                              num_workers=6)
    # train_loader = DataLoader(train_gen,batch_size=config.batch_size,shuffle=True,pin_memory=True,num_workers=6)

    val_gen = HumanDataset(val_data_list, augument=False, mode="train")
    val_loader = DataLoader(val_gen,
                            batch_size=config.batch_size,
                            drop_last=True,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=6)

    test_gen = HumanDataset(test_files, augument=False, mode="test")
    test_loader = DataLoader(test_gen,
                             1,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=6)

    search_gen = HumanDataset(val_data_list, augument=False, mode="train")
    search_loader = DataLoader(search_gen,
                               batch_size=config.batch_size * 4,
                               drop_last=False,
                               shuffle=False,
                               pin_memory=True,
                               num_workers=6)

    # optimizer = optim.Adam(model.parameters(), lr=config.lr, weight_decay=1e-4, amsgrad=True)
    optimizer = optim.SGD(model.parameters(),
                          lr=config.lr,
                          momentum=0.9,
                          weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss().cuda()
    # criterion = nn.BCEWithLogitsLoss(torch.from_numpy(process_loss_weight(weight_log)).float()).cuda()
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs, eta_min=4e-8)
    # scheduler = lr_scheduler.StepLR(optimizer,step_size=6,gamma=0.1)
    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=0, threshold=1e-3)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[6, 13, 20],
                                         gamma=0.1)
    start = timer()

    # train
    if config.is_train:
        for epoch in range(0, config.epochs):
            scheduler.step(epoch)
            # train
            lr = get_learning_rate(optimizer)
            train_metrics = train(train_loader, model, criterion, optimizer,
                                  epoch, val_metrics, best_results, start, lr)
            # val
            val_metrics = evaluate(val_loader, model, criterion, epoch,
                                   train_metrics, best_results, start)
            # check results
            is_best_loss = val_metrics[0] < best_results[0]
            best_results[0] = min(val_metrics[0], best_results[0])
            is_best_f1 = val_metrics[1] > best_results[1]
            best_results[1] = max(val_metrics[1], best_results[1])
            # scheduler.step(val_metrics[0])
            # save model
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "model_name": config.model_name,
                    "state_dict": model.state_dict(),
                    "best_loss": best_results[0],
                    "optimizer": optimizer.state_dict(),
                    "fold": fold,
                    "best_f1": best_results[1],
                }, is_best_loss, is_best_f1, fold)
            # print logs
            print('\r', end='', flush=True)
            log.write('%s  %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                    "best", epoch + 1, epoch + 1,
                    train_metrics[0], train_metrics[1],
                    val_metrics[0], val_metrics[1],
                    str(best_results[0])[:8],str(best_results[1])[:8],
                    time_to_str((timer() - start),'min'))
                )
            log.write("\n")
            time.sleep(0.01)

    if config.is_search_thres:
        best_model_name = "%s/%s_fold_%s_model_best_%s.pth.tar" % (
            config.best_models, config.model_name, str(fold), config.best)
        # best_model_name = config.weights + config.model_name + os.sep +str(fold) + os.sep + "checkpoint.pth.tar"
        print(best_model_name)
        best_model = torch.load(best_model_name)
        model.load_state_dict(best_model["state_dict"])
        search_thresholds(search_loader, model)

    if config.is_test:
        knums = config.threshold_factor
        for knum in knums:
            for f in range(5):
                best_model_name = "%s/%s_fold_%s_model_best_%s.pth.tar" % (
                    config.best_models, config.model_name, str(fold + f),
                    config.best)
                # best_model_name = config.weights + config.model_name + os.sep +str(fold) + os.sep + "checkpoint.pth.tar"
                print(best_model_name)
                best_model = torch.load(best_model_name)
                model.load_state_dict(best_model["state_dict"])
                test(test_loader, model, (fold + f), knum)
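
Example #8 feeds WeightedRandomSampler a per-row 'freq' column that process_df / tra_val_split are expected to have added. A minimal sketch of computing such inverse-frequency sampling weights from the space-separated Target column (the function name and the exact scaling are assumptions, not the repository's code):

import pandas as pd

def add_sampling_freq(df):
    # count how often each class occurs, then weight every row by its rarest class
    counts = {}
    for labels in df['Target'].str.split(' '):
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
    df = df.copy()
    df['freq'] = [max(1.0 / counts[label] for label in labels)
                  for labels in df['Target'].str.split(' ')]
    return df
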
Example #9
def main():
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")

    # 4.2 get model
    model = get_net()
    model.cuda()
    # load old weight trained model
    #model.load_state_dict(torch.load("{}/{}_fold_{}_model_best_loss.pth.tar".format(config.best_models,config.model_name,str(fold)))["state_dict"])

    start_epoch = 0
    best_loss = 999
    best_f1 = 0
    best_results = [np.inf, 0]
    val_metrics = [np.inf, 0]
    resume = False
    # get train
    # train data, this data include external data
    df1 = pd.read_csv(config.train_kaggle_csv)
    df2 = pd.read_csv(config.train_external_csv)
    all_files = pd.concat([df1, df2])

    # create duplicate for low data
    # https://www.kaggle.com/c/human-protein-atlas-image-classification/discussion/74374#437548
    train_df_orig = all_files.copy()
    lows = [
        15, 15, 15, 8, 9, 10, 8, 9, 10, 8, 9, 10, 17, 20, 24, 26, 15, 27, 15,
        20, 24, 17, 8, 15, 27, 27, 27
    ]
    for i in lows:
        target = str(i)
        indicies = train_df_orig.loc[train_df_orig['Target'] == target].index
        all_files = pd.concat([all_files, train_df_orig.loc[indicies]],
                              ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.startswith(
            target + " ")].index
        all_files = pd.concat([all_files, train_df_orig.loc[indicies]],
                              ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.endswith(
            " " + target)].index
        all_files = pd.concat([all_files, train_df_orig.loc[indicies]],
                              ignore_index=True)
        indicies = train_df_orig.loc[train_df_orig['Target'].str.contains(
            " " + target + " ")].index
        all_files = pd.concat([all_files, train_df_orig.loc[indicies]],
                              ignore_index=True)

    del df1, df2, train_df_orig
    gc.collect()

    # compute class weight
    target = all_files.apply(lambda x: x['Target'].split(' '), axis=1)
    y = target.tolist()
    y = MultiLabelBinarizer().fit_transform(y)
    labels_dict = dict()
    count_classes = np.sum(y, axis=0)
    for i, count in enumerate(count_classes):
        labels_dict[i] = count

    del target, y
    gc.collect()

    dampened_cw = create_class_weight(labels_dict)[1]
    tmp = list(dampened_cw.values())
    class_weight = torch.FloatTensor(tmp).cuda()

    # criterion
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config.lr,
                                 weight_decay=config.weight_decay)
    criterion = nn.BCEWithLogitsLoss(weight=class_weight).cuda()

    #print(all_files)
    test_files = pd.read_csv(config.sample_submission)
    train_data_list, val_data_list = train_test_split(all_files,
                                                      test_size=0.13,
                                                      random_state=2050)

    # load dataset
    train_gen = HumanDataset(train_data_list, config.train_data, mode="train")
    train_loader = DataLoader(train_gen,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=4)

    val_gen = HumanDataset(val_data_list,
                           config.train_data,
                           augument=False,
                           mode="train")
    val_loader = DataLoader(val_gen,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=4)

    test_gen = HumanDataset(test_files,
                            config.test_data,
                            augument=False,
                            mode="test")
    test_loader = DataLoader(test_gen,
                             1,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=4)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
    start = timer()

    #train
    for epoch in range(0, config.epochs):
        scheduler.step(epoch)
        # train
        lr = get_learning_rate(optimizer)
        train_metrics = train(train_loader, model, criterion, optimizer, epoch,
                              val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch,
                               train_metrics, best_results, start)
        # check results
        is_best_loss = val_metrics[0] < best_results[0]
        best_results[0] = min(val_metrics[0], best_results[0])
        is_best_f1 = val_metrics[1] > best_results[1]
        best_results[1] = max(val_metrics[1], best_results[1])
        # save model
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_loss": best_results[0],
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "best_f1": best_results[1],
            }, is_best_loss, is_best_f1, fold)
        # print logs
        print('\r', end='', flush=True)
        log.write('%s  %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],
                val_metrics[0], val_metrics[1],
                str(best_results[0])[:8], str(best_results[1])[:8],
                time_to_str((timer() - start), 'min'))
            )
        log.write("\n")
        time.sleep(0.01)

    best_model = torch.load("{}/{}_fold_{}_model_best_loss.pth.tar".format(
        config.best_models, config.model_name, str(fold)))
    #best_model = torch.load("checkpoints/bninception_bcelog/0/checkpoint.pth.tar")
    model.load_state_dict(best_model["state_dict"])
    test(test_loader, model, fold)
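
create_class_weight above is expected to return, at index 1, a dict of dampened per-class weights keyed by class index. A common recipe for this, sketched here on the assumption that the repository follows it, is log-dampening of the inverse class frequency, floored at 1.0:

import math

def create_class_weight(labels_dict, mu=0.15):
    # raw inverse-frequency weights plus a log-dampened version (never below 1.0)
    total = sum(labels_dict.values())
    raw = {cls: total / float(count) for cls, count in labels_dict.items()}
    dampened = {cls: max(1.0, math.log(mu * total / float(count)))
                for cls, count in labels_dict.items()}
    return raw, dampened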