Example #1
def main():
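    """Parse CLI args, configure logging, load a pretrained tokenizer/model (optionally
    restoring saved weights), move it to the chosen device, then test or train on the
    LIAR dataset directory."""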
    args = parse()

    if args.logger_name is not None:
        logger_name = args.logger_name
    else:
        logger_name = datetime.datetime.now().isoformat()

    if not args.dont_log:
        logging.getLogger(logger_name)
        logger_filename = re.subn('\\D+', '', logger_name)[0] + '.log'
        logging.basicConfig(filename=logger_filename, level=logging.DEBUG)

    save_file = None if args.dont_save else args.save_file

    root = args.liar_dataset_dir

    tokenizer, model = pretrained(model=args.pretrained_model, weights=args.pretrained_weights,
                                  freeze=not args.dont_freeze)
    if not args.dont_load_model_from_file and os.path.exists(args.save_file):
        model.load_state_dict(torch.load(args.save_file))

    device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_gpu else "cpu")
    print('using', device)
    model = model.to(device)
    if torch.cuda.device_count() > 1 and args.model_parallel:
        model = nn.DataParallel(model)

    if args.test:
        test(root, model, tokenizer, batch_size=args.batch_size, device=device)
    else:
        train(root, model, tokenizer, epochs=args.epochs, batch_size=args.batch_size, save_file=save_file, device=device)
Example #2
def demo_test(args):
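    """Build the model and criterion from the parsed (and optionally config-file-merged)
    arguments, set the CUDA device, wrap the test set in a DataLoader, and run test()."""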
    if args.doc:
        args = config_loader(args.doc, args)
    # config
    # model_config(args, save=False)     # print model configuration of evaluation

    # set cuda
    torch.cuda.set_device(args.gpu_id)

    # model
    model = model_builder(args.model_name, args.scale, **args.model_args).cuda()

    # criterion
    criterion = criterion_builder(args.criterion)

    # dataset
    test_set = AxisDataSet(args.test_path, args.target_path)

    test_loader = DataLoader(test_set,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                            #  pin_memory=True,
                             pin_memory=False,
                             )

    # test
    test(model, test_loader, criterion, args)
Example #3
def run_test(type):
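    """Run test() either with weights restored from a fixed checkpoint (type == 1)
    or with no preloaded model."""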
    params = Params()
    #'./weight_one_hot/model3.ckpt' is best
    if type == 1:
        load_model = torch.load('./weight_0810/model46301.ckpt',
                                map_location=lambda storage, loc: storage.cuda(
                                    params.gpu_ids_test[0]))
    else:
        load_model = None
    test(0, params, load_model, None, None, None, evaluation=False)
Example #4
def eval(*args, **kwargs):
    print("=" * 80)
    print("Eval model on [weekly_data_all_rm_duplicate.txt]")
    print("Model to eval: best_model.keras")
    print("=" * 80)
    print("\n")

    test()

    print("=" * 80)
    print("Finish Eval.")
    print("Check results in folder [weekly_result_nonredundant_sep_iedbid].")
    print("=" * 80)
    print("\n")
Example #5
def test_ckp(ckp_name, setting):
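    """Evaluate a checkpoint over `settings.test_times` randomized SYSU test splits and
    return the checkpoint name with averaged mAP and rank-1/5/10/20 accuracies."""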
    sess = Session()
    sess.load_checkpoints(ckp_name)

    search_mode = setting.split('_')[0]  # 'all' or 'indoor'
    search_setting = setting.split('_')[1]  # 'single' or 'multi'

    transform_test = settings.test_transforms_list

    results_ranks = np.zeros(50)
    results_map = np.zeros(1)

    for i in range(settings.test_times):
        eval_test = SYSU_eval_datasets(data_folder=settings.data_folder,
                                       data_split='test',
                                       search_mode=search_mode,
                                       search_setting=search_setting,
                                       use_random=True)

        test_queryloader = DataLoader(
            Image_dataset(eval_test.query, transform=transform_test),
            batch_size=settings.val_batch_size,
            shuffle=False,
            num_workers=0,
            drop_last=False,
        )

        test_galleryloader = DataLoader(
            Image_dataset(eval_test.gallery, transform=transform_test),
            batch_size=settings.val_batch_size,
            shuffle=False,
            num_workers=0,
            drop_last=False,
        )

        test_ranks, test_mAP = test([
            nn.Sequential(sess.feature_generator, sess.feature_embedder_rgb),
            nn.Sequential(sess.feature_generator, sess.feature_embedder_ir)
        ], test_queryloader, test_galleryloader)
        results_ranks += test_ranks
        results_map += test_mAP

        logger.info(
            'Test no.{} for model {} in setting {}, Test mAP: {}, R1: {}, R5: {}, R10: {}, R20: {}'
            .format(i, ckp_name, setting, test_mAP * 100.0,
                    test_ranks[0] * 100.0, test_ranks[4] * 100.0,
                    test_ranks[9] * 100.0, test_ranks[19] * 100.0))

    test_mAP = results_map / settings.test_times
    test_ranks = results_ranks / settings.test_times
    logger.info(
        'For model {} in setting {}, AVG test mAP: {}, R1: {}, R5: {}, R10: {}, R20: {}'
        .format(ckp_name, setting, test_mAP * 100.0, test_ranks[0] * 100.0,
                test_ranks[4] * 100.0, test_ranks[9] * 100.0,
                test_ranks[19] * 100.0))

    return [
        ckp_name, test_mAP * 100.0, test_ranks[0] * 100.0,
        test_ranks[4] * 100.0, test_ranks[9] * 100.0, test_ranks[19] * 100.0
    ]
Example #6
def main():
    global opt
    train_dataset = mnist_Dataset(num_of_cross=0, cross=1)
    if opt.manualSeed is None:
        opt.manualSeed = random.randint(1, 10000)
    if torch.cuda.is_available() and not opt.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with \"cuda: True\"")
        torch.manual_seed(opt.manualSeed)
    else:
        if int(opt.ngpu) == 1:
            print('using 1 GPU for training')
            print('setting gpu on gpuid {0}'.format(opt.gpu_id))

            if opt.cuda:
                os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
                torch.cuda.manual_seed(opt.manualSeed)
                cudnn.benchmark = True
    #loss_rec = np.load('acc_train.npy')
    #acc_rec = np.load('acc.npy')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batchSize,
                                                   shuffle=True, num_workers=int(opt.workers))

    # create model
    model = mnist_model.cat_and_dog_resnet()

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))
    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        # criterion = criterion.cuda()
    acc = test(model, opt, 0, Training=False, cross=1)
Example #7
def main():
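    """Cross-validation driver: for each fold, build the dataset, model, loss, and
    optimizer, record test/train accuracy via test() every epoch, and save periodic
    checkpoints."""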
    global opt
    loss_rec = np.zeros((opt.folds, 100))
    acc_rec = np.zeros((opt.folds, 100))
    #loss_rec = np.load('acc_train.npy')
    #acc_rec = np.load('acc.npy')
    for iteration in range(opt.folds):
        train_dataset = mnist_Dataset(num_of_cross=iteration)

        print('number of train samples is: {0}'.format(len(train_dataset)))
        print('finished loading data')

        if opt.manualSeed is None:
            opt.manualSeed = random.randint(1, 10000)

        if torch.cuda.is_available() and not opt.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with \"cuda: True\""
            )
            torch.manual_seed(opt.manualSeed)
        else:
            if int(opt.ngpu) == 1:
                print('using 1 GPU for training')
                print('setting gpu on gpuid {0}'.format(opt.gpu_id))

                if opt.cuda:
                    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
                    torch.cuda.manual_seed(opt.manualSeed)
                    cudnn.benchmark = True
        print('Random Seed: {0}'.format(opt.manualSeed))
        # train data loader
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=opt.batchSize,
                                                   shuffle=True,
                                                   num_workers=int(
                                                       opt.workers))

        # create model
        model = mnist_model.cat_and_dog_resnet()

        if opt.init_model != '':
            print('loading pretrained model from {0}'.format(opt.init_model))
            model.load_state_dict(torch.load(opt.init_model))

        # Contrastive Loss
        #criterion = mnist_model.StableBCELoss()
        criterion = nn.CrossEntropyLoss()

        if opt.cuda:
            print('shift model and criterion to GPU .. ')
            model = model.cuda()
            criterion = criterion.cuda()

        # optimizer
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
        #                      momentum=opt.momentum,
        #                      weight_decay=opt.weight_decay)

        optimizer = optim.Adam(model.parameters(), lr=opt.lr)
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
        # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)
        # adjust learning rate every lr_decay_epoch
        lambda_lr = lambda epoch: opt.lr_decay**(
            (epoch + 1) // opt.lr_decay_epoch)  # poly policy
        scheduler = LR_Policy(optimizer, lambda_lr)

        resume_epoch = 0
        acc = test(model, opt, iteration)
        acc_rec[iteration][0] = acc
        acc = test(model, opt, iteration, Training=True)
        loss_rec[iteration][0] = acc
        for epoch in range(resume_epoch, opt.max_epochs):
            #################################
            # train for one epoch
            #################################
            #accuracy = test(model, opt, epoch)
            train(train_loader, model, criterion, optimizer, iteration, opt,
                  epoch)
            scheduler.step()

            ##################################
            # save checkpoints
            ##################################

            # save model every 10 epochs
            accuracy = test(model, opt, iteration)
            acc_rec[iteration][epoch + 1] = accuracy
            np.save('acc.npy', acc_rec)
            accuracy = test(model, opt, iteration, Training=True)
            loss_rec[iteration][epoch + 1] = accuracy
            np.save('acc_train.npy', loss_rec)

            if ((epoch + 1) % opt.epoch_save) == 0:
                path_checkpoint = '{0}/{1}_{3}_epoch{2}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, iteration)
                utils.save_checkpoint(model.state_dict(), path_checkpoint)
Example #8
    parser.add_argument('--eval', type=bool,  default=False,
                        help='evaluate the model')   
    parser.add_argument('--mc_level', type=int, default=-1, help='label level to use; -1 means all')      
    args = parser.parse_args()

    abs_cfg_dir = os.path.abspath(os.path.join(__file__, "../configs"))
    config.merge_cfg_from_dir(abs_cfg_dir)
    cfg = config.CONFIG
    
    HM = read_h_matrix_file_list(cfg.DATASET.DATA.H_MATRIX_LIST_FILE)
    _init_()
    name_dict = {True:"eval", False:""}
    io = IOStream('checkpoints/' + args.exp_name + '/{}run.log'.format(name_dict[args.eval]))

    args.cuda = torch.cuda.is_available()
    torch.manual_seed(cfg.DEVICES.SEED)

    if args.cuda:
        if len(cfg.DEVICES.GPU_ID) == 1:
            torch.cuda.set_device(cfg.DEVICES.GPU_ID[0])
        io.cprint(
            'Using GPU : ' + str(torch.cuda.current_device()) + ' from ' + str(torch.cuda.device_count()) + ' devices')
        torch.cuda.manual_seed(cfg.DEVICES.SEED)
    else:
        io.cprint('Using CPU')

    if not args.eval:
        train(args, io, cfg, HM)
    else:
        test(args, io, cfg, HM)
def main(args):
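    """Prepare parallel data and (optionally pretrained) embeddings, build the chosen
    encoder/decoder pair, then either train with trainIters() or load saved checkpoints
    and report BLEU plus decoded examples on the dev and train sets."""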
    if args.decoder_type == "attn":
        args.use_bi = True

    if (args.test_only == True) and (args.decode_method == "beam"):
        args.batch_size = 1

    if args.self_attn == True:
        args.encoder_hidden_size = 300
        args.decoder_hidden_size = 300

    source_words_to_load = 1000000
    target_words_to_load = 1000000
    input_lang, output_lang, train_pairs, train_max_length = prepareData(
        "train",
        args.language,
        "en",
        args.data_path,
        max_len_ratio=args.max_len_ratio,
        char=args.char_chinese)
    input_lang_dev, output_lang_dev, dev_pairs, _ = prepareData(
        'dev',
        args.language,
        'en',
        path=args.data_path,
        max_len_ratio=1,
        char=args.char_chinese)
    # _, _, test_pairs, _ = prepareData('test', args.language, 'en', path=args.data_path)

    if args.use_pretrain_emb:
        if args.language == "zh":
            if args.char_chinese:
                source_embedding, source_notPretrained = load_char_embd(
                    args.emb_path + "sgns.literature.char",
                    input_lang,
                    reload=args.reload_emb)
            else:
                file_check(args.emb_path + 'chinese_ft_300.txt')
                source_embedding, source_notPretrained = load_fasttext_embd(
                    args.emb_path + 'chinese_ft_300.txt',
                    input_lang,
                    input_lang,
                    source_words_to_load,
                    reload=args.reload_emb)
        else:
            file_check(args.emb_path + 'vietnamese_ft_300.txt')
            source_embedding, source_notPretrained = load_fasttext_embd(
                args.emb_path + 'vietnamese_ft_300.txt',
                input_lang,
                input_lang,
                source_words_to_load,
                reload=args.reload_emb)

        file_check(args.emb_path + 'english_ft_300.txt')
        target_embedding, target_notPretrained = load_fasttext_embd(
            args.emb_path + 'english_ft_300.txt',
            output_lang,
            input_lang,
            target_words_to_load,
            reload=args.reload_emb)
        if args.tune_pretrain_emb:
            source_notPretrained[:] = 1
            target_notPretrained[:] = 1
    else:
        source_embedding = source_notPretrained = target_embedding = target_notPretrained = None

#     target_embedding = target_notPretrained = None

    params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'collate_fn': vocab_collate_func,
        'num_workers': 20
    }
    params2 = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'collate_fn': vocab_collate_func,
        'num_workers': 20
    }

    train_set, dev_set = Dataset(train_pairs, input_lang,
                                 output_lang), Dataset(dev_pairs, input_lang,
                                                       output_lang_dev)
    train_loader = torch.utils.data.DataLoader(train_set, **params)
    dev_loader = torch.utils.data.DataLoader(dev_set, **params2)

    print(len(train_loader), len(dev_loader))

    if args.self_attn:
        encoder = Encoder_SelfAttn(input_lang.n_words, EMB_DIM, args.dim_ff,
                                   args.selfattn_en_num, args.decoder_layers,
                                   args.decoder_hidden_size, source_embedding,
                                   source_notPretrained, args.device,
                                   args.attn_head).to(args.device)
    else:
        encoder = EncoderRNN(input_lang.n_words, EMB_DIM,
                             args.encoder_hidden_size, args.encoder_layers,
                             args.decoder_layers, args.decoder_hidden_size,
                             source_embedding, source_notPretrained,
                             args.rnn_type, args.use_bi, args.device, False,
                             args.attn_head).to(args.device)

    if args.transformer:
        decoder = Decoder_SelfAttn(output_lang.n_words, EMB_DIM, args.dim_ff,
                                   args.selfattn_de_num, target_embedding,
                                   target_notPretrained, args.device,
                                   args.attn_head).to(args.device)
    elif args.decoder_type == "basic":
        decoder = DecoderRNN(output_lang.n_words,
                             EMB_DIM,
                             args.decoder_hidden_size,
                             args.decoder_layers,
                             target_embedding,
                             target_notPretrained,
                             args.rnn_type,
                             dropout_p=args.decoder_emb_dropout,
                             device=args.device).to(args.device)
    elif args.decoder_type == "attn":
        decoder = DecoderRNN_Attention(output_lang.n_words,
                                       EMB_DIM,
                                       args.decoder_hidden_size,
                                       args.decoder_layers,
                                       target_embedding,
                                       target_notPretrained,
                                       args.rnn_type,
                                       dropout_p=args.decoder_emb_dropout,
                                       device=args.device,
                                       method=args.attn_method).to(args.device)
    else:
        raise ValueError

    print(encoder, decoder)
    if not args.test_only:
        trainIters(encoder, decoder, train_loader, dev_loader, \
                   input_lang, output_lang, input_lang_dev, output_lang_dev,
                   train_max_length, args.epoch,
                   plot_every=args.plot_every, print_every=args.print_every,
                   weight_decay=args.weight_decay, learning_rate=args.learning_rate,
                   device=args.device, teacher_forcing_ratio=args.teacher_forcing_ratio,
                   label=args.save_model_name,
                   use_lr_scheduler = True, gamma_en = 0.99, gamma_de = 0.99,
                   beam_width=args.beam_width, min_len=args.min_len, n_best=args.n_best,
                   decode_method=args.decode_method,
                   save_result_path = args.save_result_path, save_model=args.save_model)
    else:
        encoder.load_state_dict(
            torch.load('encoder' + "-" + args.save_model_name + '.ckpt',
                       map_location=lambda storage, location: storage))
        decoder.load_state_dict(
            torch.load('decoder' + "-" + args.save_model_name + '.ckpt',
                       map_location=lambda storage, location: storage))

        bleu_score, decoded_list, target_list, attn_weight = test(
            encoder, decoder, dev_loader, input_lang, output_lang, input_lang,
            output_lang_dev, args.beam_width, args.min_len, args.n_best,
            train_max_length, args.decode_method, args.device)
        print("dev bleu: ", bleu_score)
        i = 0
        with open("results/dev_examples_{}.txt".format(args.save_result_label),
                  "w+") as f:
            f.write("bleu: {}\n".format(bleu_score))
            for (source, target, source_len, target_len) in (dev_loader):
                source_list = [[
                    input_lang.index2word[k.item()] for k in source[i]
                ][:source_len[i] - 1] for i in range(len(source))]
                for s in source_list:
                    f.write("S: {}\n".format(" ".join(s)))
                    f.write("T: {}\n".format(decoded_list[i]))
                    f.write("H: {}\n".format(target_list[i]))
                    f.write("\n")
                    i += 1

        # ===================================================== #
        bleu_score, decoded_list, target_list, attn_weight = test(
            encoder, decoder, train_loader, input_lang, output_lang,
            input_lang, output_lang, args.beam_width, args.min_len,
            args.n_best, train_max_length, args.decode_method, args.device)
        print("train bleu: ", bleu_score)
        i = 0
        with open(
                "results/train_examples_{}.txt".format(args.save_result_label),
                "w+") as f:
            f.write("bleu: {}\n".format(bleu_score))
            for (source, target, source_len, target_len) in (train_loader):
                source_list = [[
                    input_lang.index2word[k.item()] for k in source[i]
                ][:source_len[i] - 1] for i in range(len(source))]
                for s in source_list:
                    f.write("S: {}\n".format(" ".join(s)))
                    f.write("T: {}\n".format(decoded_list[i]))
                    f.write("H: {}\n".format(target_list[i]))
                    f.write("\n")
                    i += 1

    return 0
Example #10
def train():
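    """Domain-adaptation loop: supervised loss on source batches, a linear classifier
    (C2) on labeled target features, and a neighborhood/memory-bank loss on unlabeled
    target features; runs test() and test_class_inc() every conf.test.test_interval steps."""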
    criterion = nn.CrossEntropyLoss().cuda()
    print('train start!')
    data_iter_s = iter(source_loader)
    data_iter_t = iter(target_loader)
    data_iter_t_l = iter(target_labeled_loader)
    len_train_source = len(source_loader)
    len_train_target = len(target_loader)
    len_train_target_l = len(target_labeled_loader)
    for step in range(conf.train.min_step + 1):
        G.train()
        C1.train()
        C2.train()
        if step % len_train_target == 0:
            data_iter_t = iter(target_loader)
        if step % len_train_target_l == 0:
            data_iter_t_l = iter(target_labeled_loader)
        if step % len_train_source == 0:
            data_iter_s = iter(source_loader)
        data_t = next(data_iter_t)
        data_t_l = next(data_iter_t_l)
        data_s = next(data_iter_s)
        inv_lr_scheduler(param_lr_g, opt_g, step,
                         init_lr=conf.train.lr,
                         max_iter=conf.train.min_step)
        inv_lr_scheduler(param_lr_f, opt_c1, step,
                         init_lr=conf.train.lr,
                         max_iter=conf.train.min_step)
        img_s = data_s[0]
        label_s = data_s[1]
        img_t = data_t[0]
        index_t = data_t[2]
        img_s, label_s = Variable(img_s.cuda()), \
                         Variable(label_s.cuda())
        img_t = Variable(img_t.cuda())
        index_t = Variable(index_t.cuda())
        img_t_l = data_t_l[0].cuda()
        label_t_l = data_t_l[1].cuda()

        if len(img_t) < batch_size:
            break
        if len(img_s) < batch_size:
            break
        opt_g.zero_grad()
        opt_c1.zero_grad()
        ## Weight normalization
        C1.module.weight_norm()
        ## Source loss calculation
        feat = G(img_s)
        out_s = C1(feat)
        loss_s = criterion(out_s, label_s)
        #loss_s += criterion(C2(feat.detach()), label_s)

        feat_t = G(img_t)
        out_t = C1(feat_t)
        feat_t = F.normalize(feat_t)
        ## Train a linear classifier on top of feature extractor.
        ## We should not update feature extractor.
        G.eval()
        feat_t_l = G(img_t_l)
        G.train()
        out_t_l = C2(feat_t_l.detach())
        loss_t_l = criterion(out_t_l, label_t_l)

        ### Calculate mini-batch x memory similarity
        feat_mat = lemniscate(feat_t, index_t)
        ### We do not use memory features present in mini-batch
        feat_mat[:, index_t] = -1 / conf.model.temp
        ### Calculate mini-batch x mini-batch similarity
        feat_mat2 = torch.matmul(feat_t,
                                 feat_t.t()) / conf.model.temp
        mask = torch.eye(feat_mat2.size(0),
                         feat_mat2.size(0)).bool().cuda()
        feat_mat2.masked_fill_(mask, -1 / conf.model.temp)
        loss_nc = conf.train.eta * entropy(torch.cat([feat_mat,
                                                      feat_mat2], 1))
        loss_ent = conf.train.eta * entropy_margin(out_t, conf.train.thr,
                                                   conf.train.margin)
        total_loss = loss_nc + loss_s + loss_t_l
        with amp.scale_loss(total_loss, [opt_g, opt_c1]) as scaled_loss:
            scaled_loss.backward()
        opt_g.step()
        opt_c1.step()
        opt_g.zero_grad()
        opt_c1.zero_grad()
        lemniscate.update_weight(feat_t, index_t)
        if step % conf.train.log_interval == 0:
            print('Train [{}/{} ({:.2f}%)]\tLoss Source: {:.6f} '
                  'Loss NC: {:.6f} Loss LT: {:.6f}\t'.format(
                step, conf.train.min_step,
                100 * float(step / conf.train.min_step),
                loss_s.item(), loss_nc.item(), loss_t_l.item()))
        if step > 0 and step % conf.test.test_interval == 0:
            test(step, dataset_test, filename, n_share, num_class, G, C1,
                 conf.train.thr)
            test_class_inc(step, dataset_test, filename, n_target, G, C2,
                           n_share)
            G.train()
            C1.train()
            C2.train()
Example #11
#    lr = 0.0001, betas=(0.5, 0.999))

############################# Hyper-parameters ################################

alpha = 1.0
beta = 1.0
gamma = 0.05
K = 5
nu = 1

################################  Train  ######################################

# Loss plot
logger = Logger(2000, len(train_dataloader))

test_ranks, test_mAP = test(feature_generator, queryloader, galleryloader)
train_ranks, train_mAP = test(feature_generator, queryloader_train,
                              galleryloader_train)

for epoch in range(0, 2000):

    print("Epoch ---------------", epoch + 1)
    for i, batch in enumerate(train_dataloader):

        #print("Batch number ",i)

        anchor_rgb, positive_rgb, negative_rgb, anchor_ir, positive_ir, \
        negative_ir, anchor_label, modality_rgb, modality_ir = batch

        if torch.cuda.is_available():
            anchor_rgb = anchor_rgb.cuda()
Example #12
def train(config):
    gpu_manage(config)

    ### DATASET LOAD ###
    print('===> Loading datasets')

    dataset = Dataset(config)
    train_size = int(0.9 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size])
    training_data_loader = DataLoader(dataset=train_dataset,
                                      num_workers=config.threads,
                                      batch_size=config.batchsize,
                                      shuffle=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  num_workers=config.threads,
                                  batch_size=config.test_batchsize,
                                  shuffle=False)

    ### MODELS LOAD ###
    print('===> Loading models')

    if config.gen_model == 'unet':
        gen = UNet(in_ch=config.in_ch,
                   out_ch=config.out_ch,
                   gpu_ids=config.gpu_ids)
    else:
        print('The generator model does not exist')

    if config.gen_init is not None:
        param = torch.load(config.gen_init)
        gen.load_state_dict(param)
        print('load {} as pretrained model'.format(config.gen_init))
    dis = Discriminator(in_ch=config.in_ch,
                        out_ch=config.out_ch,
                        gpu_ids=config.gpu_ids)
    if config.dis_init is not None:
        param = torch.load(config.dis_init)
        dis.load_state_dict(param)
        print('load {} as pretrained model'.format(config.dis_init))

    # setup optimizer
    opt_gen = optim.Adam(gen.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)
    opt_dis = optim.Adam(dis.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)

    real_a = torch.FloatTensor(config.batchsize, config.in_ch, 256, 256)
    real_b = torch.FloatTensor(config.batchsize, config.out_ch, 256, 256)

    criterionL1 = nn.L1Loss()
    criterionMSE = nn.MSELoss()
    criterionSoftplus = nn.Softplus()

    if config.cuda:
        gen = gen.cuda(0)
        dis = dis.cuda(0)
        criterionL1 = criterionL1.cuda(0)
        criterionMSE = criterionMSE.cuda(0)
        criterionSoftplus = criterionSoftplus.cuda(0)
        real_a = real_a.cuda(0)
        real_b = real_b.cuda(0)

    real_a = Variable(real_a)
    real_b = Variable(real_b)

    logreport = LogReport(log_dir=config.out_dir)
    testreport = TestReport(log_dir=config.out_dir)

    # main
    for epoch in range(1, config.epoch + 1):
        for iteration, batch in enumerate(training_data_loader, 1):
            real_a_cpu, real_b_cpu = batch[0], batch[1]
            real_a.data.resize_(real_a_cpu.size()).copy_(real_a_cpu)
            real_b.data.resize_(real_b_cpu.size()).copy_(real_b_cpu)
            fake_b = gen.forward(real_a)

            ################
            ### Update D ###
            ################

            opt_dis.zero_grad()

            # train with fake
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab.detach())
            batchsize, _, w, h = pred_fake.size()

            loss_d_fake = torch.sum(
                criterionSoftplus(pred_fake)) / batchsize / w / h

            # train with real
            real_ab = torch.cat((real_a, real_b), 1)
            pred_real = dis.forward(real_ab)
            loss_d_real = torch.sum(
                criterionSoftplus(-pred_real)) / batchsize / w / h

            # Combined loss
            loss_d = loss_d_fake + loss_d_real

            loss_d.backward()

            if epoch % config.minimax == 0:
                opt_dis.step()

            ################
            ### Update G ###
            ################

            opt_gen.zero_grad()

            # First, G(A) should fake the discriminator
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab)
            loss_g_gan = torch.sum(
                criterionSoftplus(-pred_fake)) / batchsize / w / h

            # Second, G(A) = B
            loss_g_l1 = criterionL1(fake_b, real_b) * config.lamb

            loss_g = loss_g_gan + loss_g_l1

            loss_g.backward()

            opt_gen.step()

            # log
            if iteration % 100 == 0:
                print(
                    "===> Epoch[{}]({}/{}): loss_d_fake: {:.4f} loss_d_real: {:.4f} loss_g_gan: {:.4f} loss_g_l1: {:.4f}"
                    .format(epoch, iteration, len(training_data_loader),
                            loss_d_fake.item(), loss_d_real.item(),
                            loss_g_gan.item(), loss_g_l1.item()))

                log = {}
                log['epoch'] = epoch
                log['iteration'] = len(training_data_loader) * (epoch -
                                                                1) + iteration
                log['gen/loss'] = loss_g.item()
                log['dis/loss'] = loss_d.item()

                logreport(log)

        with torch.no_grad():
            log_test = test(config, test_data_loader, gen, criterionMSE, epoch)
            testreport(log_test)

        if epoch % config.snapshot_interval == 0:
            checkpoint(config, epoch, gen, dis)

        logreport.save_lossgraph()
        testreport.save_lossgraph()
Example #13
def cluster(approach, datapath):
    """
    Run a clustering approach on unlabeled data.
    """
    report_path = test(datapath, approach, params[approach])
    c.echo('Report compiled at {0}.'.format(report_path))
Example #14
def train(g, d, train_loader, neg_loader, epoches, g_optim, d_optim, neg_lens):
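    """Adversarial training: each epoch first updates the discriminator d on real,
    generated, and negative (attribute, user-embedding) pairs, then updates the
    generator g, and finally evaluates generated embeddings with eval.test()."""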
    g = g.to(device)
    d = d.to(device)
    time.sleep(0.1)
    print("start training on {}".format(device))
    time.sleep(0.1)
    bce_loss = torch.nn.BCELoss()
    # train the discriminator D
    for e in tqdm(range(epoches)):
        start_time = time.time()
        idx = 0
        d_loss = 0.0
        neg_iter = iter(neg_loader)
        # train the discriminator d
        for _, _, real_attr, real_user_emb in train_loader:
            if idx > neg_lens:
                break
            _, _, neg_attr, neg_user_emb = next(neg_iter)
            # attributes and user embeddings of positive samples
            real_attr = real_attr.to(device)
            real_user_emb = real_user_emb.to(device)
            # attributes and user embeddings of negative samples
            neg_attr = neg_attr.to(device)
            neg_user_emb = neg_user_emb.to(device)
            # the generator produces fake user embeddings
            fake_user_emb = g(real_attr)
            fake_user_emb = fake_user_emb.to(device)
            # discriminator scores for real, fake, and negative pairs
            d_real, d_logit_real = d(real_attr, real_user_emb)
            d_fake, d_logit_fake = d(real_attr, fake_user_emb)
            d_neg, d_logit_neg = d(neg_attr, neg_user_emb)
            # compute d_loss
            d_optim.zero_grad()
            d_loss_real = bce_loss(d_real, torch.ones_like(d_real))
            d_loss_fake = bce_loss(d_fake, torch.zeros_like(d_fake))
            d_loss_neg = bce_loss(d_neg, torch.zeros_like(d_neg))
            d_loss = torch.mean(d_loss_real + d_loss_fake + d_loss_neg)
            d_loss.backward()
            d_optim.step()
            idx += batch_size
        # train the generator g
        g_loss = 0.0
        for uid, mid, attr, user_emb in train_loader:
            g_optim.zero_grad()
            attr = attr.to(device)
            # generate fake user embeddings
            fake_user_emb = g(attr)
            fake_user_emb = fake_user_emb.to(device)
            # compute the generator loss
            d_fake, d_logit_fake = d(attr, fake_user_emb)
            g_loss = bce_loss(d_fake, torch.ones_like(d_fake))
            g_loss.backward()
            g_optim.step()
        end_time = time.time()
        print("\nepoch:{}: time:{:.2f}, d_loss:{:.3f}, g_loss:{:.3f}".format(
            e + 1, end_time - start_time, d_loss, g_loss))
        # test
        test_item, test_attribute = data_loader.load_test_data()
        test_item = torch.tensor(test_item).to(device)
        test_attribute = torch.tensor(test_attribute,
                                      dtype=torch.long).to(device)
        fake_user = g(test_attribute)
        eval.test(fake_user.cpu().detach().numpy())
        time.sleep(0.1)
Example #15
target = parser.parse_args().target

if __name__ == '__main__':
    config = Config('config.yaml')
    if not os.path.exists(config.checkpoint_dir):
        os.makedirs(config.checkpoint_dir)

    word2idx, train_data, valid_data, test_data = load_data(config)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    config.nwords = len(word2idx)
    print("vacab size is %d" % config.nwords)

    while True:
        random.seed(time.time())
        config.srand = random.randint(0, 100000)

        np.random.seed(config.srand)
        random.seed(config.srand)
        paddle.seed(config.srand)

        model = MemN2N(config)
        train(model, train_data, valid_data, config)

        test_ppl = test(model, test_data, config)
        if test_ppl < target:
            model_path = os.path.join(
                config.checkpoint_dir,
                config.model_name + "_" + str(config.srand) + "_good")
            paddle.save(model.state_dict(), model_path)
            break
Example #16
def run_train_val(ckp_name='ckp_latest'):
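    """Resume training from a checkpoint: iterate triplet batches from the SYSU training
    set, periodically run test() on the validation query/gallery loaders, and save
    checkpoints at the configured step intervals."""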
    sess = Session()
    sess.load_checkpoints(ckp_name)

    sess.tensorboard('train_stats')
    sess.tensorboard('val_stats')

    ######################## Get Datasets & Dataloaders ###########################

    train_dataset = SYSU_triplet_dataset(
        data_folder=settings.data_folder,
        transforms_list=settings.transforms_list)

    def get_train_dataloader():
        return iter(
            DataLoader(SYSU_triplet_dataset(
                data_folder=settings.data_folder,
                transforms_list=settings.transforms_list),
                       batch_size=settings.train_batch_size,
                       shuffle=True,
                       num_workers=settings.num_workers,
                       drop_last=True))

    train_dataloader = get_train_dataloader()

    eval_val = SYSU_eval_datasets(data_folder=settings.data_folder,
                                  data_split='val')

    transform_test = settings.test_transforms_list

    val_queryloader = DataLoader(
        Image_dataset(eval_val.query, transform=transform_test),
        batch_size=settings.val_batch_size,
        shuffle=False,
        num_workers=0,
        drop_last=False,
    )

    val_galleryloader = DataLoader(
        Image_dataset(eval_val.gallery, transform=transform_test),
        batch_size=settings.val_batch_size,
        shuffle=False,
        num_workers=0,
        drop_last=False,
    )

    while sess.step < settings.iter_sche[-1]:
        sess.sche_G.step()
        sess.feature_generator.train()
        sess.feature_embedder_rgb.train()
        sess.feature_embedder_ir.train()

        sess.id_classifier.train()

        try:
            batch_t = next(train_dataloader)
        except StopIteration:
            train_dataloader = get_train_dataloader()
            batch_t = next(train_dataloader)
            sess.epoch_count += 1

        sess.inf_batch(batch_t)

        if sess.step % int(settings.latest_steps) == 0:
            sess.save_checkpoints('ckp_latest')
            sess.save_checkpoints('ckp_latest_backup')

        if sess.step % settings.val_step == 0:
            sess.feature_generator.eval()
            sess.feature_embedder_rgb.eval()
            sess.feature_embedder_ir.eval()
            sess.id_classifier.eval()
            test_ranks, test_mAP = test([
                nn.Sequential(sess.feature_generator,
                              sess.feature_embedder_rgb),
                nn.Sequential(sess.feature_generator, sess.feature_embedder_ir)
            ], val_queryloader, val_galleryloader)

            sess.write('val_stats', {'test_mAP_percentage': test_mAP*100.0, \
                                     'test_rank-1_accuracy_percentage':test_ranks[0]*100.0,\
                                     'test_rank-5_accuracy_percentage':test_ranks[4]*100.0,\
                                     'test_rank-10_accuracy_percentage':test_ranks[9]*100.0,\
                                     'test_rank-20_accuracy_percentage':test_ranks[19]*100.0
            })

        if sess.step % sess.save_steps == 0:
            sess.save_checkpoints('ckp_step_%d' % sess.step)
            logger.info('save model as ckp_step_%d' % sess.step)
        sess.step += 1
Example #17
def train(hyp, opt, device, tb_writer=None):
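    """Detection training loop (YOLOv5-style): build the model, optimizer, scheduler, and
    dataloaders, train with warmup, EMA, and mixed precision, call eval.test() after each
    epoch to compute mAP, and keep last/best checkpoints by fitness."""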
    logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(
        opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(
                hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [
        ]  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            'Transferred %g/%g items from %s' %
            (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['model.%s.' % x
              for x in range(5)]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        'lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (
                weights, epochs)
            shutil.copytree(wdir, wdir.parent /
                            f'weights_backup_epoch{start_epoch - 1}'
                            )  # save previous weights
        if epochs < start_epoch:
            logger.info(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            rank=rank,
                                            world_size=opt.world_size,
                                            workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images,
                                       rect=True,
                                       rank=-1,
                                       world_size=opt.world_size,
                                       workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp['anchor_t'],
                              imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info(
        'Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
        'Starting training for %g epochs...' %
        (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                iw = labels_to_image_weights(dataset.labels,
                                             nc=nc,
                                             class_weights=cw)  # image weights
                dataset.indices = random.choices(
                    range(dataset.n), weights=iw,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices)
                           if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                   'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.8,
                                      imgsz * 1.2 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device),
                    model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f,
                                            result,
                                            dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [
                        os.remove(x) for x in glob.glob(
                            str(log_dir / 'test_batch*_pred.jpg'))
                        if os.path.exists(x)
                    ]
                results, maps, times = eval.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    'train/giou_loss',
                    'train/obj_loss',
                    'train/cls_loss',  # train loss
                    'metrics/precision',
                    'metrics/recall',
                    'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95',
                    'val/giou_loss',
                    'val/obj_loss',
                    'val/cls_loss',  # val loss
                    'x/lr0',
                    'x/lr1',
                    'x/lr2'
                ]  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch':
                        epoch,
                        'best_fitness':
                        best_fitness,
                        'training_results':
                        f.read(),
                        'model':
                        ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system(
                        'gsutil cp %s gs://%s/weights' %
                        (f2, opt.bucket)) if opt.bucket else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' %
                    (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
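# The fitness() call above reduces [P, R, mAP, F1] to a single scalar so that best.pt
# tracks the best-performing epoch. Below is a minimal sketch of such a function, with
# hypothetical weights that emphasise mAP (not necessarily the weights this code uses).
import numpy as np

def fitness_sketch(x, w=(0.0, 0.0, 0.9, 0.1)):
    # x: array of shape (n, 4) with columns [P, R, mAP, F1]; w: hypothetical weights
    return (x[:, :4] * np.array(w)).sum(1)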
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        metavar="N",
        help="input batch size for training (default: 1)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1,
        metavar="N",
        help="input batch size for testing (default: 1)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        metavar="N",
        help="number of epochs to train (default: 14)",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=1.0,
        metavar="LR",
        help="learning rate (default: 1.0)",
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument("--no-cuda",
                        action="store_true",
                        default=False,
                        help="disables CUDA training")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="quickly check a single pass",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=100,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="For Saving the current Model",
    )
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"batch_size": args.batch_size}
    if use_cuda:
        kwargs.update({
            "num_workers": 1,
            "pin_memory": True,
            "shuffle": True
        }, )

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
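    # 0.1307 and 0.3081 are the MNIST training-set mean and std used for normalization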
    dataset1 = datasets.MNIST("../data",
                              train=True,
                              download=True,
                              transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, epoch)
        test(model, device, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Beispiel #19
0
def cross_validate(model, crit, opt):
    dataset = VideoDataset(opt, 'val')
    print(len(dataset))
    _, _, seq_probs, seq_preds, labels, masks = test(model, crit, dataset,
                                                     dataset.get_vocab(), opt)
Beispiel #20
0
def main(seed, k_fold, batch_size, num_epoch, continue_my_model,
         continue_my_model_train_path, learning_rate, num_instance, delta_v,
         delta_d, p_var, p_dist, p_reg, p_seg, p_disc, p_cla, is_pseudo_mask,
         is_pre, is_pre_path):
    '''1. Prepare the datasets'''
    transform = ImgMaskTransform(img_size=(128, 256))
    train_dataset = Metric_Learning_ImageFolder(root=image_folder +
                                                '/data_train',
                                                transform=transform)
    val_dataset = Metric_Learning_ImageFolder(root=image_folder +
                                              '/data_train',
                                              transform=transform)

    # {'narrow1':0, 'narrow2':1, 'narrow3':2, 'narrow4':3, 'wide':4}
    narrow_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in narrow_list.items())
    cla = []
    for key, val in narrow_list.items():
        cla.append(key)
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)
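    # class_indices.json then contains the inverse mapping, e.g. {"0": "narrow1", ..., "4": "wide"}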

    indices = list(range(len(train_dataset)))
    print(len(train_dataset))
    # shuffle the data
    np.random.seed(seed)
    np.random.shuffle(indices)
    # print(indices)

    for i in range(k_fold):
        print('\n', '*' + '-' * 10, 'F{}'.format(i + 1), '-' * 10 + '*')
        '''2. Set up paths for saving experiment results'''
        train_result = pd.DataFrame(columns=('loss', 'accurate'))
        val_result = pd.DataFrame(columns=('loss', 'accurate', 'recall',
                                           'precision', 'AUC', 'F1'))
        test_result = pd.DataFrame(columns=('loss', 'accurate', 'recall',
                                            'precision', 'AUC', 'F1'))

        train_len, train_loader, validation_loader = k_fold_loader(
            i, int(len(train_dataset) * 1 / k_fold), indices, train_dataset,
            val_dataset, batch_size)
        test_data = StandarImageFolder(
            root=os.path.join(image_folder, 'data_test'),
            transform=augumentation.liner_classifier_test_transform)
        test_loader = DataLoader(test_data,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=16)
        '''3. Initialize the model'''

        net = MyNetworks3(n_instance=5,
                          n_classes=5,
                          embed_dim=2,
                          branch_size=1024,
                          deep_features_size=2048,
                          backend='resnet50',
                          pretrained=is_pre,
                          model_path=is_pre_path).cuda()
        for param in net.extractors.conv1.parameters():
            param.requires_grad = False
        for param in net.extractors.bn1.parameters():
            param.requires_grad = False
        for param in net.extractors.layer1.parameters():
            param.requires_grad = False
        for p in net.extractors.layer2.parameters():
            p.requires_grad = False
        if continue_my_model:
            print('continue my model.')
            missing_keys, unexpected_keys = net.load_state_dict(
                torch.load(continue_my_model_train_path), strict=False)

        # ########## inspect the trainable parameters ##########
        # use this to check whether the pretrained weights were actually loaded
        # parm = {}
        # for name, parameters in net.named_parameters():
        #     if name == 'extractors.conv1.weight':
        #         print(name, ':', parameters.size())
        #         print(name, ':', parameters)
        #         parm[name] = parameters.cpu().detach().numpy()

        for name, param in net.named_parameters():
            if param.requires_grad:
                print(name)
        # ############### inspect the trainable parameters ################
        # pspnet: 53.86M FLOPs: 997.02M / lanenet: 34.71M FLOPs: 561.82M
        # flops, params = profile(net, inputs=(torch.randn(1, 3, 32, 32).cuda(),))
        # flops, params = clever_format([flops, params])
        # print('# Model Params: {} FLOPs: {}'.format(params, flops))
        '''4. Set up the optimizer'''
        optimizer = optim.SGD(
            net.parameters(),
            lr=learning_rate,
            momentum=0.9,
            dampening=0,  # dampening for momentum, default 0
            weight_decay=0.0005,  # default 0; a non-zero value acts as regularization
            nesterov=True,
        )  # use Nesterov momentum, default False
        '''5. Initialize the loss functions'''
        disc_criterion = DiscriminativeLoss_wizaron(num_instance=num_instance,
                                                    delta_v=delta_v,
                                                    delta_d=delta_d,
                                                    norm=2,
                                                    scale_var=p_var,
                                                    scale_dist=p_dist,
                                                    scale_reg=p_reg,
                                                    usegpu=True).cuda()
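        # The three weighted terms follow the standard discriminative instance-embedding
        # loss: a variance term pulling embeddings within delta_v of their instance mean,
        # a distance term pushing instance means at least delta_d apart, and a small
        # regularizer on the mean norms (weighted by scale_var / scale_dist / scale_reg)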
        cla_criterion = nn.CrossEntropyLoss().cuda()
        # seg_criterion = nn.CrossEntropyLoss().cuda()
        seg_criterion = SoftDiceLoss().cuda()
        # seg_criterion = MyLovaszLoss().cuda()

        results = {
            'train_loss': [],
            'train_acc@1': [],
            'train_acc@2': [],
            'train_seg_loss': [],
            'train_var_loss': [],
            'train_dis_loss': [],
            'train_reg_loss': [],
            'val_loss': [],
            'val_acc@1': [],
            'val_acc@2': [],
            # 'val_mask_loss': [], 'val_var_loss': [], 'val_dis_loss': [], 'val_reg_loss': [],
            'test_loss': [],
            'test_acc@1': [],
            'test_acc@2': []
        }
        '''6. Start training'''
        best_acc, best_recall, best_precision, best_auc, best_f1 = 0.0, 0.0, 0.0, 0.0, 0.0
        lr_epoch = []
        for epoch in range(1, num_epoch + 1):
            print('\nF{} | Epoch [{}/{}]'.format(i + 1, epoch, num_epoch))

            # 1 train; loss_list holds the lovasz, var, dist and reg losses
            lr, train_loss, train_acc_1, train_acc_2, train_part_loss_list = train(
                num_epoch=num_epoch,
                per_epoch=epoch - 1,
                is_pseudo_mask=is_pseudo_mask,
                net=net,
                train_dataset=train_dataset,
                data_loader=train_loader,
                train_optimizer=optimizer,
                lr=learning_rate,
                disc_loss=disc_criterion,
                seg_loss=seg_criterion,
                cla_loss=cla_criterion,
                p_seg=p_seg,
                p_discriminative=p_disc,
                p_cla=p_cla,
                save_pre=save_pre_img)
            lr_epoch += lr
            # print('lr:', lr)

            results['train_loss'].append(train_loss)
            results['train_acc@1'].append(train_acc_1)
            results['train_acc@2'].append(train_acc_2)
            results['train_seg_loss'].append(train_part_loss_list[0])
            results['train_var_loss'].append(train_part_loss_list[1])
            results['train_dis_loss'].append(train_part_loss_list[2])
            results['train_reg_loss'].append(train_part_loss_list[3])

            train_result = train_result.append(pd.DataFrame({
                'loss': [train_loss],
                'accurate': [train_acc_1]
            }),
                                               ignore_index=True)

            # 2 val
            val_loss, val_acc_1, val_acc_2, val_pred_probs, val_pred_labels, val_gt_labels = val_test(
                per_epoch=epoch,
                is_pseudo_mask=is_pseudo_mask,
                val_dataset=val_dataset,
                net=net,
                data_loader=validation_loader,
                disc_loss=disc_criterion,
                seg_loss=seg_criterion,
                cla_loss=cla_criterion,
                p_seg=p_seg,
                p_discriminative=p_disc,
                p_cla=p_cla,
                is_val=True)

            results['val_loss'].append(val_loss)
            results['val_acc@1'].append(val_acc_1)
            results['val_acc@2'].append(val_acc_2)

            val_acc, val_recall, val_precision, val_auc, val_f1 = metrics_score(
                val_gt_labels, val_pred_labels)
            val_result = val_result.append(pd.DataFrame({
                'loss': [val_loss],
                'accurate': [val_acc],
                'recall': [val_recall],
                'precision': [val_precision],
                'AUC': [val_auc],
                'F1': [val_f1]
            }),
                                           ignore_index=True)
            # 3 test
            test_loss, test_acc_1, test_acc_2, test_pred_probs, test_pred_labels, test_gt_labels = test(
                net=net, data_loader=test_loader, criterion=cla_criterion)

            results['test_loss'].append(test_loss)
            results['test_acc@1'].append(test_acc_1)
            results['test_acc@2'].append(test_acc_2)
            test_acc, test_recall, test_precision, test_auc, test_f1 = metrics_score(
                test_gt_labels, test_pred_labels)
            test_result = test_result.append(pd.DataFrame({
                'loss': [test_loss],
                'accurate': [test_acc],
                'recall': [test_recall],
                'precision': [test_precision],
                'AUC': [test_auc],
                'F1': [test_f1]
            }),
                                             ignore_index=True)
            '''save statistics'''
            data_frame = pd.DataFrame(data=results, index=range(1, epoch + 1))
            data_frame.to_csv(os.path.join(
                save_dir,
                'final_linear_statistics_' + 'K' + str(i + 1) + '.csv'),
                              index_label='epoch')
            total_curve = plot_loss_acc(data_frame)
            plt.savefig(
                os.path.join(
                    save_dir,
                    'final_linear_statistics_' + 'K' + str(i + 1) + '.png'))

            # print('[Per_epoch]Val  acc:{} | auc:{} | f1:{}'.format(val_acc, val_auc, val_f1))
            # print('[Per_epoch]Test acc:{} | auc:{} | f1:{}'.format(test_acc, test_auc, test_f1))

            if val_acc_1 > best_acc:
                best_acc = val_acc_1
                # save the test-set results whenever validation accuracy reaches a new best
                save_best(i, net, val_gt_labels, val_pred_labels,
                          val_pred_probs, test_gt_labels, test_pred_labels,
                          cla, test_pred_probs, save_best_dir, save_dir)
                torch.save(
                    net.state_dict(),
                    os.path.join(
                        save_best_dir,
                        'model/K' + str(i + 1) + 'EP' + str(epoch) + '.pth'))
                print(
                    '[Best]\nVal:  acc:{} | recall:{} | precision:{} | auc:{} | f1:{}'
                    .format(val_acc, val_recall, val_precision, val_auc,
                            val_f1))
                print(
                    'Test: acc:{} | recall:{} | precision:{} | auc:{} | f1:{}'
                    .format(test_acc, test_recall, test_precision, test_auc,
                            test_f1))

            if epoch % 100 == 0:
                # save intermediate results
                save_intermediate(net, save_intermediate_dir, epoch, i,
                                  val_gt_labels, val_pred_labels,
                                  val_pred_probs, cla, test_pred_probs,
                                  test_gt_labels, test_pred_labels,
                                  train_result, val_result, test_result)
                print('save epoch {}!'.format(epoch))
            '''save final epoch results'''
            train_result.to_csv(
                os.path.join(save_csv_dir,
                             'Train' + 'K' + str(i + 1) + '.csv'))
            val_result.to_csv(
                os.path.join(save_csv_dir, 'Val' + 'K' + str(i + 1) + '.csv'))
            test_result.to_csv(
                os.path.join(save_csv_dir, 'Test' + 'K' + str(i + 1) + '.csv'))
        save_k_final_results(net, save_dir, save_display_dir, i, lr_epoch,
                             val_gt_labels, val_pred_labels, val_pred_probs,
                             cla, test_pred_probs, test_gt_labels,
                             test_pred_labels)

        plot_part_loss_AucF1(data_frame, val_result, test_result)
        plt.savefig(
            os.path.join(save_dir,
                         'PartLoss_AUCF1' + 'K' + str(i + 1) + '.png'))
        break
def eval_reward(args, shared_model, writer_dir=None):
    """
	For evaluation

	Arguments:
	- writer: the tensorboard summary writer directory (note: can't get it working directly with the SummaryWriter object)
	"""
    writer = SummaryWriter(log_dir=os.path.join(
        writer_dir, 'eval')) if writer_dir is not None else None

    # current episode stats
    episode_reward = episode_value_mse = episode_td_error = episode_pg_loss = episode_length = 0

    # global stats
    i_episode = 0
    total_episode = total_steps = 0
    num_goals_achieved = 0

    # intilialize the env and models
    torch.manual_seed(args.seed)
    env = create_env(args.env_name, framework=args.framework, args=args)
    set_seed(args.seed, env, args.framework)

    shared_enc, shared_dec, shared_d_module, shared_r_module = shared_model

    enc = Encoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    r_module = R_Module(env.action_space.shape[0],
                        args.dim,
                        discrete=args.discrete,
                        baseline=False,
                        state_space=env.observation_space.shape[0])

    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters(), r_module.parameters())

    if args.from_checkpoint is not None:
        model_state, _ = torch.load(args.from_checkpoint)
        model.load_state_dict(model_state)

    # set the model to evaluation mode
    enc.eval()
    dec.eval()
    d_module.eval()
    r_module.eval()

    # reset the state
    state = env.reset()
    state = Variable(torch.from_numpy(state).float())

    start = time.time()

    while total_episode < args.num_episodes:

        # Sync with the shared model
        r_module.load_state_dict(shared_r_module.state_dict())
        d_module.load_state_dict(shared_d_module.state_dict())
        enc.load_state_dict(shared_enc.state_dict())
        dec.load_state_dict(shared_dec.state_dict())

        # reset stuff
        cd_p = Variable(torch.zeros(1, args.lstm_dim))
        hd_p = Variable(torch.zeros(1, args.lstm_dim))

        # for the reward
        cr_p = Variable(torch.zeros(1, args.lstm_dim))
        hr_p = Variable(torch.zeros(1, args.lstm_dim))

        i_episode += 1
        episode_length = 0
        episode_reward = 0
        args.local = True
        args.d = 0
        succ, _, episode_reward, episode_length = test(1, args, args, args,
                                                       d_module, r_module, enc)
        log("Eval: succ {:.2f}, reward {:.2f}, length {:.2f}".format(
            succ, episode_reward, episode_length))
        # Episode has ended, write the summaries here
        if writer_dir is not None:
            # current episode stats
            writer.add_scalar('eval/episode_reward', episode_reward, i_episode)
            writer.add_scalar('eval/episode_length', episode_length, i_episode)
            writer.add_scalar('eval/success', succ, i_episode)

        time.sleep(args.eval_every)
        print("sleep")
Beispiel #22
0
def test_ap(self, net, epoch):
    for dataset in self.test_datasets:
        ap, _ = test(net, dataset, batch_size=self.batch_size)
        self.writer.log_ap(epoch, ap, dataset.name())
Beispiel #23
0
def train(hyp, tb_writer, opt, device):
    print(f'Hyperparameters {hyp}')
    log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution'  # run directory
    wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = log_dir + os.sep + 'results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank
    # TODO: Init DDP logging. Only the first process is allowed to log.
    # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs.

    # Save run settings
    with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(Path(log_dir) / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Remove previous results
    if rank in [-1, 0]:
        for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
            os.remove(f)

    # Create model
    model = Model(opt.cfg, nc=nc).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html
    # all-reduce operation is carried out during loss.backward().
    # Thus, there would be redundant all-reduce communications in a accumulation procedure,
    # which means, the result is still right but the training speed gets slower.
    # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation
    # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
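    # e.g. with total_batch_size=16: accumulate = round(64 / 16) = 4, and weight_decay
    # is scaled by 16 * 4 / 64 = 1.0, i.e. left unchanged at the nominal batch size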

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    if hyp['optimizer'] == 'adam':  # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    with torch_distributed_zero_first(rank):
        google_utils.attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model
        try:
            exclude = ['anchor']  # exclude keys
            ckpt['model'] = {
                k: v
                for k, v in ckpt['model'].float().state_dict().items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(ckpt['model'], strict=False)
            print('Transferred %g/%g items from %s' %
                  (len(ckpt['model']), len(model.state_dict()), weights))
        except KeyError as e:
            s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \
                "Please delete or update %s and try again, or use --weights '' to train from scratch." \
                % (weights, opt.cfg, weights, weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
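    # lf decays the LR multiplier from 1.0 at epoch 0 to 0.2 at the final epoch along a cosine curve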
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # DP mode
    if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and device.type != 'cpu' and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if device.type != 'cpu' and rank != -1:
        model = DDP(model, device_ids=[rank], output_device=rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images,
                                       rect=True,
                                       local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        # if tb_writer:
        # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
        # tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset,
                          model=model,
                          thr=hyp['anchor_t'],
                          imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        # When in DDP mode, the generated indices will be broadcasted to synchronize dataset.
        if dataset.image_weights:
            # Generate indices.
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                image_weights = labels_to_image_weights(dataset.labels,
                                                        nc=nc,
                                                        class_weights=w)
                dataset.indices = random.choices(
                    range(dataset.n), weights=image_weights,
                    k=dataset.n)  # rand weighted idx
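                # classes with low mAP get larger weights, so images containing them
                # are drawn more often in the next epoch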
            # Broadcast.
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(
                ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj',
                                       'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
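                    # at ni=0 the bias group (j == 2) starts at lr 0.1 and the others at 0.0;
                    # by ni=nw every group has warmed up to initial_lr * lf(epoch)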
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
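                # e.g. with imgsz=640 and gs=32 this draws a size in [320, 992) and snaps it
                # down to a multiple of the grid size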
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device),
                                            model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) /
                            ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f,
                                            result,
                                            dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # Only the first process in DDP mode is allowed to log or save checkpoints.
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = eval.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    save_json=final_epoch
                    and opt.data.endswith(os.sep + 'coco.yaml'),
                    model=ema.ema.module
                    if hasattr(ema.ema, 'module') else ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=log_dir)

                # Write
                with open(results_file, 'a') as f:
                    f.write(
                        s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                if len(opt.name) and opt.bucket:
                    os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                              (results_file, opt.bucket, opt.name))

                # Tensorboard
                if tb_writer:
                    tags = [
                        'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall',
                        'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss'
                    ]
                    for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                        tb_writer.add_scalar(tag, x, epoch)

                # Update best mAP
                fi = fitness(np.array(results).reshape(
                    1,
                    -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
                if fi > best_fitness:
                    best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch':
                        epoch,
                        'best_fitness':
                        best_fitness,
                        'training_results':
                        f.read(),
                        'model':
                        ema.ema.module if hasattr(ema, 'module') else ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_'
             if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2, opt.bucket)) if opt.bucket and ispt else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' %
              (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
Beispiel #24
0
def main():

    print(pprint.pformat(vars(args)))

    ###### Bookkeeping
    if os.path.exists(test_results_fname(args)):
        resp = None
        while resp not in {"yes", "no", "y", "n"}:
            resp = input(
                f"{args.save} already exists. Overwrite contents? [y/n]: ")
            if resp == "yes" or resp == "y":
                break
            elif resp == "no" or resp == "n":
                print("Exiting")
                exit()
    else:
        os.makedirs(args.save, exist_ok=True)

    # Save command to file
    with open(command_fname(args), 'w') as f:
        f.write(pprint.pformat(vars(args)))

    ###### Dataloading

    dset_train = DeepChromeDataset(dataroot=args.globstr_train,
                                   num_procs=args.dset_workers)
    print(f"Training set has {len(dset_train)} samples.")

    dset_val = DeepChromeDataset(dataroot=args.globstr_val,
                                 num_procs=args.dset_workers)
    print(f"Validation set has {len(dset_val)} samples.")

    dset_test = DeepChromeDataset(dataroot=args.globstr_test,
                                  num_procs=args.dset_workers)
    print(f"Test set has {len(dset_test)} samples.")

    train_loader = torch.utils.data.DataLoader(
        dset_train,
        batch_size=args.batch_size,
        num_workers=args.dloader_workers,
        shuffle=True,
        pin_memory=True,
    )

    val_loader = torch.utils.data.DataLoader(
        dset_val,
        batch_size=args.batch_size,
        num_workers=args.dloader_workers,
        shuffle=True,
        pin_memory=True,
    )

    test_loader = torch.utils.data.DataLoader(
        dset_test,
        batch_size=args.batch_size,
        num_workers=args.dloader_workers,
        shuffle=True,
        pin_memory=True,
    )

    ###### Setup Model
    if args.arch == 'DeepChrome':
        model = DeepChromeModel()
    elif args.arch == 'DeepChromeFC':
        model = DeepChromeFCModel()
    else:
        raise NotImplementedError()

    if not args.no_gpu:
        model = model.cuda()

    ###### Optimization
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=args.wd,
                                momentum=args.momentum)

    def cosine_annealing(step, total_steps, lr_max, lr_min):
        return lr_min + (lr_max - lr_min) * 0.5 * (
            1 + np.cos(step / total_steps * np.pi))
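    # cosine_annealing(0, T, lr_max, lr_min) == lr_max and cosine_annealing(T, T, lr_max, lr_min) == lr_min;
    # with lr_max=1 and lr_min=1e-6/args.lr the LambdaLR factor takes the effective lr from args.lr down to ~1e-6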

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: cosine_annealing(
            step,
            args.epochs * len(train_loader),
            1,  # since lr_lambda computes multiplicative factor
            1e-6 / args.lr))

    ###### Logging
    with open(train_log_fname(args), 'w') as f:
        f.write("epoch,train_loss,val_loss,val_acc,val_auroc\n")

    ###### Train!
    print("Beginning training...")
    best_epoch_auroc = 0
    best_epoch = None
    num_without_changing_best_val_auroc = 0
    for epoch in range(args.epochs):

        train_loss = train_one_epoch(epoch, model, train_loader, optimizer,
                                     scheduler)

        val_auroc, val_acc, val_loss = test(model, val_loader, args.no_gpu)

        ###### Logging

        print(
            'Epoch {0:3d} | Train Loss {1:.6f} | Val Loss {2:.6f} | Val AUROC {3:.6f} | Val Accuracy {4:.6f}'
            .format(
                (epoch + 1),
                train_loss,
                val_loss,
                val_auroc,
                val_acc,
            ))

        with open(train_log_fname(args), 'a') as f:
            f.write(f"{epoch},{train_loss},{val_loss},{val_acc},{val_auroc}\n")

        if val_auroc > best_epoch_auroc:
            best_epoch = epoch
            best_epoch_auroc = val_auroc
            num_without_changing_best_val_auroc = 0
            # Save the model iff this is the best epoch so far
            _dict = {
                "model.state_dict": model.state_dict(),
                "optimizer.state_dict": optimizer.state_dict(),
                "epoch": epoch,
            }
            torch.save(_dict, checkpoint_fname(args, epoch))
        else:
            num_without_changing_best_val_auroc += 1

        if num_without_changing_best_val_auroc > args.patience:
            print("Early stopping")
            break

    print(f"Doing final testing")

    print("Loading {0}".format(checkpoint_fname(args, best_epoch)))
    model.load_state_dict(
        torch.load(checkpoint_fname(args, best_epoch))['model.state_dict'])

    # Do final testing
    test_auroc, test_acc, test_loss = test(model, test_loader, args.no_gpu)
    with open(test_results_fname(args), 'w', encoding='utf-8') as f:
        data = {
            "test_auroc": test_auroc,
            "test_acc": test_acc,
            "test_loss": test_loss
        }
        print(pprint.pformat(data))
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Finished successfully. See {args.save}")
def train(config):
    gpu_manage(config)

    ### DATASET LOAD ###
    print('===> Loading datasets')

    dataset = TrainDataset(config)
    print('dataset:', len(dataset))
    train_size = int((1 - config.validation_size) * len(dataset))
    validation_size = len(dataset) - train_size
    train_dataset, validation_dataset = torch.utils.data.random_split(
        dataset, [train_size, validation_size])
    print('train dataset:', len(train_dataset))
    print('validation dataset:', len(validation_dataset))
    training_data_loader = DataLoader(dataset=train_dataset,
                                      num_workers=config.threads,
                                      batch_size=config.batchsize,
                                      shuffle=True)
    validation_data_loader = DataLoader(dataset=validation_dataset,
                                        num_workers=config.threads,
                                        batch_size=config.validation_batchsize,
                                        shuffle=False)

    ### MODELS LOAD ###
    print('===> Loading models')

    gen = Generator(gpu_ids=config.gpu_ids)

    if config.gen_init is not None:
        param = torch.load(config.gen_init)
        gen.load_state_dict(param)
        print('load {} as pretrained model'.format(config.gen_init))

    dis = Discriminator(in_ch=config.in_ch,
                        out_ch=config.out_ch,
                        gpu_ids=config.gpu_ids)

    if config.dis_init is not None:
        param = torch.load(config.dis_init)
        dis.load_state_dict(param)
        print('load {} as pretrained model'.format(config.dis_init))

    # setup optimizer
    opt_gen = optim.Adam(gen.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)
    opt_dis = optim.Adam(dis.parameters(),
                         lr=config.lr,
                         betas=(config.beta1, 0.999),
                         weight_decay=0.00001)

    real_a = torch.FloatTensor(config.batchsize, config.in_ch, config.width,
                               config.height)
    real_b = torch.FloatTensor(config.batchsize, config.out_ch, config.width,
                               config.height)
    M = torch.FloatTensor(config.batchsize, config.width, config.height)

    criterionL1 = nn.L1Loss()
    criterionMSE = nn.MSELoss()
    criterionSoftplus = nn.Softplus()

    if config.cuda:
        gen = gen.cuda()
        dis = dis.cuda()
        criterionL1 = criterionL1.cuda()
        criterionMSE = criterionMSE.cuda()
        criterionSoftplus = criterionSoftplus.cuda()
        real_a = real_a.cuda()
        real_b = real_b.cuda()
        M = M.cuda()

    real_a = Variable(real_a)
    real_b = Variable(real_b)

    logreport = LogReport(log_dir=config.out_dir)
    validationreport = TestReport(log_dir=config.out_dir)

    print('===> begin')
    start_time = time.time()
    # main
    for epoch in range(1, config.epoch + 1):
        epoch_start_time = time.time()
        for iteration, batch in enumerate(training_data_loader, 1):
            real_a_cpu, real_b_cpu, M_cpu = batch[0], batch[1], batch[2]
            real_a.data.resize_(real_a_cpu.size()).copy_(real_a_cpu)
            real_b.data.resize_(real_b_cpu.size()).copy_(real_b_cpu)
            M.data.resize_(M_cpu.size()).copy_(M_cpu)
            att, fake_b = gen.forward(real_a)

            ################
            ### Update D ###
            ################

            opt_dis.zero_grad()

            # train with fake
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab.detach())
            batchsize, _, w, h = pred_fake.size()

            loss_d_fake = torch.sum(
                criterionSoftplus(pred_fake)) / batchsize / w / h

            # train with real
            real_ab = torch.cat((real_a, real_b), 1)
            pred_real = dis.forward(real_ab)
            loss_d_real = torch.sum(
                criterionSoftplus(-pred_real)) / batchsize / w / h
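            # softplus(x) = log(1 + e^x), so loss_d_fake + loss_d_real is the logistic
            # GAN discriminator loss, averaged over the batch and spatial dimensions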

            # Combined loss
            loss_d = loss_d_fake + loss_d_real

            loss_d.backward()

            if epoch % config.minimax == 0:
                opt_dis.step()
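            # the discriminator is only stepped every config.minimax epochs, giving the
            # generator extra updates in between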

            ################
            ### Update G ###
            ################

            opt_gen.zero_grad()

            # First, G(A) should fake the discriminator
            fake_ab = torch.cat((real_a, fake_b), 1)
            pred_fake = dis.forward(fake_ab)
            loss_g_gan = torch.sum(
                criterionSoftplus(-pred_fake)) / batchsize / w / h

            # Second, G(A) = B
            loss_g_l1 = criterionL1(fake_b, real_b) * config.lamb
            loss_g_att = criterionMSE(att[:, 0, :, :], M)
            loss_g = loss_g_gan + loss_g_l1 + loss_g_att

            loss_g.backward()

            opt_gen.step()

            # log
            if iteration % 10 == 0:
                print(
                    "===> Epoch[{}]({}/{}): loss_d_fake: {:.4f} loss_d_real: {:.4f} loss_g_gan: {:.4f} loss_g_l1: {:.4f}"
                    .format(epoch, iteration, len(training_data_loader),
                            loss_d_fake.item(), loss_d_real.item(),
                            loss_g_gan.item(), loss_g_l1.item()))

                log = {}
                log['epoch'] = epoch
                log['iteration'] = len(training_data_loader) * (epoch -
                                                                1) + iteration
                log['gen/loss'] = loss_g.item()
                log['dis/loss'] = loss_d.item()

                logreport(log)

        print('epoch', epoch, 'finished, use time',
              time.time() - epoch_start_time)
        with torch.no_grad():
            log_validation = test(config, validation_data_loader, gen,
                                  criterionMSE, epoch)
            validationreport(log_validation)
        print('validation finished')
        if epoch % config.snapshot_interval == 0:
            checkpoint(config, epoch, gen, dis)

        logreport.save_lossgraph()
        validationreport.save_lossgraph()
    print('training time:', time.time() - start_time)
Beispiel #26
0
def main():

    print(pprint.pformat(vars(args)))

    ###### Bookkeeping
    if os.path.exists(test_results_fname(args)):
        resp = None
        while resp not in {"yes", "no", "y", "n"}:
            resp = input(
                f"{args.save} already exists. Overwrite contents? [y/n]: ")
            if resp == "yes" or resp == "y":
                break
            elif resp == "no" or resp == "n":
                print("Exiting")
                exit()
    else:
        os.makedirs(args.save, exist_ok=True)

    # Save command to file
    with open(command_fname(args), 'w') as f:
        f.write(pprint.pformat(vars(args)))

    ###### Dataloading

    dset_train = DeepChromeDataset(dataroot=args.globstr_train,
                                   num_procs=args.dset_workers)
    print(f"Training set has {len(dset_train)} samples.")

    train_loader = torch.utils.data.DataLoader(
        dset_train,
        batch_size=args.batch_size,
        num_workers=args.dloader_workers,
        shuffle=True,
        pin_memory=True,
    )

    ###### Setup Model
    if args.arch == 'DeepChrome':
        model = DeepChromeModel()
    elif args.arch == 'DeepChromeFC':
        model = DeepChromeFCModel()
    else:
        raise NotImplementedError()

    if not args.no_gpu:
        model = model.cuda()

    ###### Optimization
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=args.wd,
                                momentum=args.momentum)

    def cosine_annealing(step, total_steps, lr_max, lr_min):
        return lr_min + (lr_max - lr_min) * 0.5 * (
            1 + np.cos(step / total_steps * np.pi))

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: cosine_annealing(
            step,
            args.epochs * len(train_loader),
            1,  # since lr_lambda computes multiplicative factor
            1e-6 / args.lr))

    ###### Logging
    with open(train_log_fname(args), 'w') as f:
        f.write("epoch,train_loss,val_loss,val_acc,val_auroc\n")

    ###### Train!
    print("Beginning training...")
    best_epoch_auroc = 0
    best_epoch = None
    num_without_changing_best_val_auroc = 0

    TestCells = []
    for cell in args.globstr_val_cell_ids:
        TestCells.append(
            CellDataSet(cell, checkpoint_fname(args, cell), args.batch_size,
                        args.dset_workers, args.dloader_workers))

    for epoch in range(args.epochs):
        if all(map(lambda cell: cell.is_done, TestCells)):
            break

        # Train 1 epoch
        train_loss = train_one_epoch(epoch, model, train_loader, optimizer,
                                     scheduler)

        # Validate
        # val_auroc, val_acc, val_loss = test(model, val_loader, args.no_gpu)
        total_val_auroc = 0
        total_val_acc = 0
        total_val_loss = 0
        num_cells = 0
        for cell in TestCells:
            if cell.is_done:
                continue

            num_cells += 1
            val_auroc, val_acc, val_loss = test(model, cell.val_loader,
                                                args.no_gpu)

            total_val_auroc += val_auroc
            total_val_acc += val_acc
            total_val_loss += val_loss

            cell.add_valid_auroc(val_auroc, epoch, model.state_dict(),
                                 optimizer.state_dict(), args.patience)

        ###### Logging
        print(
            'Epoch {0:3d} | Train Loss {1:.6f} | Val Loss {2:.6f} | Val AUROC {3:.6f} | Val Accuracy {4:.6f}'
            .format(
                epoch,
                train_loss,
                total_val_loss / num_cells,
                total_val_auroc / num_cells,
                total_val_acc / num_cells,
            ))

        with open(train_log_fname(args), 'a') as f:
            f.write(
                f"{epoch},{train_loss},{total_val_loss / num_cells},{total_val_acc / num_cells},{total_val_auroc / num_cells}\n"
            )

    # Save the stragglers
    for cell in TestCells:
        if not cell.is_done:
            print(f"{cell.cell_id} was a straggler :(")
            cell._save_model_to_disk()

    # Test on all cells
    all_save_data = dict()
    for cell in TestCells:
        model.load_state_dict(cell.best_model)

        print(f"Doing final testing on cell {cell.cell_id}")

        # Do final testing
        test_auroc, test_acc, test_loss = test(model, cell.test_loader,
                                               args.no_gpu)
        data = {
            "test_auroc": test_auroc,
            "test_acc": test_acc,
            "test_loss": test_loss
        }
        all_save_data[cell.cell_id] = data

    with open(test_results_fname(args), 'w', encoding='utf-8') as f:
        print(pprint.pformat(all_save_data))
        json.dump(all_save_data, f, ensure_ascii=False, indent=4)

    print(f"Finished successfully. See {args.save}")
def trainIters(encoder, decoder, train_loader, dev_loader,
               input_lang, output_lang,
               input_lang_dev, output_lang_dev,
               max_word_len, n_iters,
               plot_every=100, print_every=1, weight_decay=0,
               learning_rate=0.01, device=DEVICE,
               teacher_forcing_ratio=0.5, label="",
               use_lr_scheduler=True, gamma_en=0.9, gamma_de=0.9,
               beam_width=3, min_len=1, n_best=1, decode_method="beam",
               save_result_path='', save_model=False):
    start = time.time()
    num_steps = len(train_loader)
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    cur_best = 0
    fail_cnt = 0  # consecutive evaluation rounds without a BLEU improvement

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler_encoder = ExponentialLR(encoder_optimizer, gamma_en, last_epoch=-1) 
    scheduler_decoder = ExponentialLR(decoder_optimizer, gamma_de, last_epoch=-1) 
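    # A brief note on the decay (a sketch of the intended behaviour): each call to
    # scheduler.step() multiplies the current lr by gamma, so with one step per
    # epoch the encoder lr after k epochs is learning_rate * gamma_en**k
    # (e.g. 0.01 * 0.9**10 ≈ 0.0035 after 10 epochs), and likewise for the decoder
    # with gamma_de.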
    criterion = nn.NLLLoss()
 
    loss_file = open(save_result_path +'/%s-loss.txt'%label, 'w+')
    bleu_file = open(save_result_path +'/%s-bleu.txt'%label, 'w+')
    for epoch in range(1, n_iters + 1):
        for i, (data1, data2, len1, len2) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            source, target, source_len, target_len = data1.to(device), data2.to(device), len1.to(device), len2.to(device)

            loss = train(source, target, source_len, target_len, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion,
                         device=device, teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            plot_loss_total += loss

            if i != 0 and (i % plot_every == 0):
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
        if epoch != 0 and (epoch % print_every == 0):        
            print_loss_avg = print_loss_total / len(train_loader)
            print_loss_total = 0
            print("testing..")
            bleu_score, _ , _, _ = test(encoder, decoder, dev_loader, 
                                    input_lang, output_lang,
                                    input_lang_dev, output_lang_dev,
                                    beam_width, min_len, n_best, 
                                    max_word_len, decode_method, device)
            print('%s epoch:(%d %d%%) step[%d %d] Average_Loss %.4f, Bleu Score %.3f' % (timeSince(start, epoch / n_iters),
                                        epoch, epoch / n_iters * 100, i, num_steps, print_loss_avg, bleu_score))
            loss_file.write("%s\n" % print_loss_avg)    
            bleu_file.write("%s\n" % bleu_score)
            if (bleu_score > cur_best):
                print("found best! save model...")
                fail_cnt = 0
                if save_model:
                    torch.save(encoder.state_dict(), 'encoder' + "-" + label + '.ckpt')
                    torch.save(decoder.state_dict(), 'decoder' + "-" + label + '.ckpt')
                    print("model saved")
                cur_best = bleu_score
            else:
                fail_cnt += 1
            if fail_cnt == 15:
                print("No improvement for 15 epochs. Halt!")
                loss_file.close()
                bleu_file.close()
                return 0
        
        # Decay the learning rates once per epoch, after the optimizer updates.
        if use_lr_scheduler:
            scheduler_encoder.step()
            scheduler_decoder.step()
        torch.cuda.empty_cache()
    loss_file.close()
    bleu_file.close()
Beispiel #28
0
def cluster(approach, datapath):
    """
    Run a clustering approach on unlabeled data.
    """
    report_path = test(datapath, approach, params[approach])
    c.echo('Report compiled at {0}.'.format(report_path))
Beispiel #29
0
def train(loader,
          model,
          crit,
          optimizer,
          lr_scheduler,
          opt,
          rl_crit=None,
          opt_test=None,
          test_dataset=None):
    model.train()
    loss_avg = averager()
    #model = nn.DataParallel(model)
    writer = SummaryWriter()
    for epoch in range(opt["epochs"]):
        lr_scheduler.step()

        iteration = 0
        # If start self crit training
        if opt["self_crit_after"] != -1 and epoch >= opt["self_crit_after"]:
            sc_flag = True
            init_cider_scorer(opt["cached_tokens"])
        else:
            sc_flag = False

        for data in loader:
            torch.cuda.synchronize()
            fc_feats = data['fc_feats'].cuda()
            labels = data['labels'].cuda()
            masks = data['masks'].cuda()

            # clip_nums = data['clip_num']
            # sorted_clip_nums, indices = torch.sort(clip_nums, descending=True)
            # _, desorted_indices = torch.sort(indices, descending=False)
            # fc_feats = fc_feats[indices]
            # pack = rnn.pack_padded_sequence(fc_feats, sorted_clip_nums, batch_first=True)
            optimizer.zero_grad()
            if not sc_flag:
                seq_probs, _ = model(fc_feats, labels, 'train')
                loss = crit(seq_probs, labels[:, 1:], masks[:, 1:])
            else:
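                # Self-critical phase (a sketch of the usual setup, inferred from the
                # calls below): the model samples captions in inference mode,
                # get_self_critical_reward presumably scores them with CIDEr
                # (initialised from the cached tokens above) relative to a baseline,
                # and that reward drives the REINFORCE-style rl_crit loss instead of
                # the cross-entropy criterion.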
                seq_probs, seq_preds = model(fc_feats,
                                             mode='inference',
                                             opt=opt)
                reward = get_self_critical_reward(model, fc_feats, data,
                                                  seq_preds)
                print(reward.shape)
                loss = rl_crit(seq_probs, seq_preds,
                               torch.from_numpy(reward).float().cuda())

            loss_avg.add(loss)
            loss.backward()
            clip_grad_value_(model.parameters(), opt['grad_clip'])
            optimizer.step()
            # train_loss = loss.item()
            torch.cuda.synchronize()
            iteration += 1

            # if not sc_flag:
            #     print("iter %d (epoch %d), train_loss = %.6f" %
            #           (iteration, epoch, train_loss))
            # else:
            #     print("iter %d (epoch %d), avg_reward = %.6f" %
            #           (iteration, epoch, np.mean(reward[:, 0])))
        print("[epoch %d]->train_loss = %.6f" % (epoch, loss_avg.val()))
        writer.add_scalar('scalar/train_loss_epoch', loss_avg.val(), epoch)
        if epoch % opt["save_checkpoint_every"] == 0:
            test(model, crit, test_dataset, test_dataset.get_vocab(), opt_test,
                 writer)
            model.train()
            model_path = os.path.join(opt["checkpoint_path"],
                                      'model_%d.pth' % (epoch))
            model_info_path = os.path.join(opt["checkpoint_path"],
                                           'model_score.txt')
            torch.save(model.state_dict(), model_path)
            print("model saved to %s" % (model_path))
            with open(model_info_path, 'a') as f:
                f.write("model_%d, loss: %.6f\n" % (epoch, loss_avg.val()))
        loss_avg.reset()