def train_model(args):
    """Train a ResNet-18 emotion classifier on IEMOCAP mel spectrograms.

    All tunable hyperparameters come from ``args``.  After every epoch the
    training/testing loss and accuracy are printed; after the final epoch the
    best and mean-top-10 test accuracies are appended to the stats file.

    NOTE(review): the cnn/pool kernel and stride settings below are only used
    to label the stats file — they configured the now-disabled
    MultiSpectrogramModel — confirm before dropping them entirely.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device_ids = [0, 1, 2, 3]  # GPUs handed to DataParallel
    batch_size = args.batch_size
    out_channels = [args.out_channels1, args.out_channels2]
    kernel_size_cnn = [[args.kernel_size_cnn1, args.kernel_size_cnn2],
                       [args.kernel_size_cnn2, args.kernel_size_cnn1]]
    stride_size_cnn = [[args.stride_size_cnn1, args.stride_size_cnn2],
                       [args.stride_size_cnn2, args.stride_size_cnn1]]
    kernel_size_pool = [[args.kernel_size_pool1, args.kernel_size_pool2],
                        [args.kernel_size_pool2, args.kernel_size_pool1]]
    stride_size_pool = [[args.stride_size_pool1, args.stride_size_pool2],
                        [args.stride_size_pool2, args.stride_size_pool1]]
    epoch_num = 50
    nfft = [512, 1024]  # FFT sizes of the two precomputed spectrogram sets
    weight = args.weight

    model = resnet18()
    print(
        "============================ Number of parameters ===================================="
    )
    # Count trainable parameters once and reuse for both print and log file.
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(str(num_params))

    # Human-readable header identifying this hyperparameter combination.
    path = "batch_size:{};out_channels:{};kernel_size_cnn:{};stride_size_cnn:{};kernel_size_pool:{};stride_size_pool:{}; weight:{}".format(
        args.batch_size, out_channels, kernel_size_cnn, stride_size_cnn,
        kernel_size_pool, stride_size_pool, weight)
    with open("/scratch/speech/models/classification/resnet_stats.txt",
              "a+") as f:
        f.write("\n" + "============ model starts ===========")
        f.write("\n" + "model_parameters: " + str(num_params) + "\n" +
                path + "\n")

    # Move the model to the detected device.  Only wrap in DataParallel when
    # CUDA is actually available: the original unconditional model.cuda()
    # crashed on CPU-only hosts even though `device` was detected above.
    model.to(device)
    if device.type == 'cuda':
        # NOTE(review): device_ids assumes 4 visible GPUs — confirm on hosts
        # with fewer devices.
        model = DataParallel(model, device_ids=device_ids)
    model.train()

    # Adam with lr=0.001 is the only optimizer actually stepped below; the
    # unused SGD optimizer and never-stepped LR schedulers were removed.
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Load train/test splits of precomputed mel spectrograms.
    training_data = IEMOCAP(name='mel', nfft=nfft, train=True)
    train_loader = DataLoader(dataset=training_data,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=my_collate,
                              num_workers=0,
                              drop_last=True)
    testing_data = IEMOCAP(name='mel', nfft=nfft, train=False)
    test_loader = DataLoader(dataset=testing_data,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=my_collate,
                             num_workers=0,
                             drop_last=True)

    test_acc = []
    train_acc = []
    test_loss = []
    train_loss = []
    for epoch in range(epoch_num):
        print("===================================" + str(epoch + 1) +
              "==============================================")
        losses = 0
        correct = 0
        model.train()
        for j, (input_lstm, input1, input2, target,
                seq_length) in enumerate(train_loader):
            if (j + 1) % 20 == 0:
                print("=================================Train Batch" +
                      str(j + 1) +
                      "===================================================")
            model.zero_grad()
            # DataParallel scatters input1 across its GPUs; the one-hot
            # target only needs to live on the output device.
            x = model(input1)
            target = target.to(device)
            # Convert one-hot targets to class indices for both the accuracy
            # count and cross-entropy (torch.max(target, 1)[1] == argmax).
            target_index = torch.argmax(target, dim=1)
            correct_batch = torch.sum(target_index == torch.argmax(x, dim=1))
            loss = F.cross_entropy(x, target_index)
            losses += loss.item() * batch_size
            loss.backward()
            optimizer.step()
            correct += correct_batch.item()
        # drop_last=True, so (j + 1) * batch_size is the exact sample count.
        accuracy = correct * 1.0 / ((j + 1) * batch_size)
        losses = losses / ((j + 1) * batch_size)

        losses_test = 0
        correct_test = 0
        model.eval()
        with torch.no_grad():
            for j, (input_lstm, input1, input2, target,
                    seq_length) in enumerate(test_loader):
                if (j + 1) % 10 == 0:
                    print(
                        "=================================Test Batch" +
                        str(j + 1) +
                        "===================================================")
                x = model(input1)
                target = target.to(device)
                target_index = torch.argmax(target, dim=1)
                correct_batch = torch.sum(
                    target_index == torch.argmax(x, dim=1))
                loss = F.cross_entropy(x, target_index)
                losses_test += loss.item() * batch_size
                correct_test += correct_batch.item()

        accuracy_test = correct_test * 1.0 / ((j + 1) * batch_size)
        losses_test = losses_test / ((j + 1) * batch_size)

        # Per-epoch statistics for later inspection.
        test_acc.append(accuracy_test)
        train_acc.append(accuracy)
        test_loss.append(losses_test)
        train_loss.append(losses)
        print(
            "Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}"
            .format(epoch + 1, losses, losses_test, accuracy, accuracy_test) +
            "\n")
        with open("/scratch/speech/models/classification/resnet_stats.txt",
                  "a+") as f:
            # Only the summary of the whole run is persisted, after the
            # final epoch.
            if epoch == epoch_num - 1:
                f.write("Best Accuracy:{:06.5f}".format(max(test_acc)) + "\n")
                f.write("Average Top 10 Accuracy:{:06.5f}".format(
                    np.mean(np.sort(np.array(test_acc))[-10:])) + "\n")
                f.write("=============== model ends ===================" +
                        "\n")
    print("success:{}, Best Accuracy:{}".format(path, max(test_acc)))
Exemple #2
0
def main():
    """Training entry point for the GPT-2 dialogue / MMI models.

    Parses arguments, configures logging and seeding, builds the tokenizer
    and model, optionally preprocesses the raw corpus into token ids, then
    trains and evaluates.  Assigns the module-level globals ``logger`` and
    ``pad_id`` as a side effect.
    """
    args = setup_train_args()
    # Log both to a file and to the console.
    global logger
    logger = create_logger(args)
    # Use the GPU only when the user asked for it and one is available.
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the CPU (and GPU) RNGs so that good results are reproducible.
    # NOTE(review): with multiple GPUs, set_random_seed should call
    # torch.cuda.manual_seed_all for full determinism — confirm in its
    # definition.
    if args.seed:
        set_random_seed(args)

    # (Selection of which GPUs to train on would go here.)

    # Initialise the tokenizer from the vocabulary file.
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Size of the tokenizer's vocabulary.
    vocab_size = len(tokenizer)

    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the output directory for the dialogue model.
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)
    # Create the output directory for the MMI model.
    if not os.path.exists(args.mmi_model_output_path):
        os.mkdir(args.mmi_model_output_path)
    # Load the GPT-2 model; n_ctx is its maximum context length.
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw corpus into token ids when requested.

    if args.raw and args.train_mmi:  # training the MMI model
        preprocess_mmi_raw_data(args, tokenizer, n_ctx)
    elif args.raw and not args.train_mmi:  # training the dialogue model
        print("_______________________________________")
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Run on multiple GPUs in parallel when possible.
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Record the number of model parameters.
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Load the tokenized training data (one dialogue per line).
    logger.info("loading traing data")
    if args.train_mmi:  # training the MMI model
        with open(args.train_mmi_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    else:  # training the dialogue model
        with open(args.train_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list,
                                             test_size=0.2,
                                             random_state=1)
    # Train ...
    train(model, device, train_list, multi_gpu, args)
    # ... then evaluate on the held-out 20% split.
    evaluate(model, device, test_list, multi_gpu, args)
Exemple #3
0
def main():
    """Train a GPT-2 language model on a (pre-)tokenized corpus.

    Parses CLI arguments, optionally tokenizes the raw corpus into
    ``num_pieces`` shard files, then trains with sliding-window samples of
    length ``n_ctx`` taken every ``stride`` tokens, supporting gradient
    accumulation, optional apex fp16, and multi-GPU DataParallel.  A
    checkpoint is saved after every epoch and a final model at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='设置使用哪些显卡')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='选择模型参数')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='选择词库')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='原始训练语料')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='训练循环')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='训练batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='学习率')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='warm up步数')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='将训练语料分成多少份')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='最短收录文章长度')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='模型输出路径')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='模型训练起点路径')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='Tensorboard路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level vs. character-level tokenization for Chinese text.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may see

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    # Choose the tokenizer: BPE subwords or a BERT-style vocabulary file.
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    # Effectively disable the tokenizer's length truncation.
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # Loss is reported every log_step optimizer updates, so it must divide
    # evenly into accumulation steps.
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Tokenize the raw corpus into num_pieces shard files if requested.
    if raw:
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    # Start from scratch or fine-tune an existing checkpoint.
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    # Count model parameters for logging.
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    # Total token count across all shards determines the LR schedule length.
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    # Linear warmup then linear decay over the estimated total steps.
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the shards in a fresh random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Cut the shard into n_ctx-length windows, stepping by `stride`,
            # plus one final window covering the tail of the shard.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass (labels == inputs: causal LM objective)
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward (clip grads on the amp master params in fp16)
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step, once every gradient_accumulation batches
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    # tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, piece_num,
                            epoch + 1, running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        # Save a per-epoch checkpoint (unwrap DataParallel first).
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def train(opt):
    """Train a ResNet classifier on CIFAR10/CIFAR100, optionally adversarially.

    ``opt.train_mode`` selects the regime:
      * "at"  — adversarial training on PGD examples,
      * "alp" — adversarial logit pairing (AT plus an MSE term between clean
                and adversarial logits),
      * otherwise — plain training on clean images.
    In every mode the loss is cross-entropy plus lambda-weighted center loss
    and a feature-norm penalty.  The model is saved at the end via save_model.

    NOTE(review): model(images, labels) is unpacked as (features, logits),
    implying a custom forward (fc replaced by Softmax(in_features,
    num_classes)) — confirm against the model definition.
    """
    # set device to cpu/gpu
    if opt.use_gpu:
        device = torch.device("cuda", opt.gpu_id)
    else:
        device = torch.device("cpu")

    # Data transformations for data augmentation (train only; val gets a
    # plain ToTensor).
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.RandomErasing(),
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
    ])

    # get CIFAR10/CIFAR100 train/val set.  Both splits load the training
    # portion (train=True); they are separated below by index samplers.
    if opt.dataset == "CIFAR10":
        alp_lambda = 0.5            # weight of the logit-pairing MSE term
        lambda_loss = [0.005, 0.001]  # [center-loss weight, norm-loss weight]
        train_set = CIFAR10(root="./data", train=True,
                            download=True, transform=transform_train)
        val_set = CIFAR10(root="./data", train=True,
                          download=True, transform=transform_val)
    else:
        alp_lambda = 0.5
        lambda_loss = [0.005, 0.001]
        train_set = CIFAR100(root="./data", train=True,
                             download=True, transform=transform_train)
        val_set = CIFAR100(root="./data", train=True,
                           download=True, transform=transform_val)
    num_classes = np.unique(train_set.targets).shape[0]

    # set stratified train/val split
    idx = list(range(len(train_set.targets)))
    train_idx, val_idx, _, _ = train_test_split(
        idx, train_set.targets, test_size=opt.val_split, random_state=42)

    # get train/val samplers
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    # get train/val dataloaders
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=opt.batch_size,
                              num_workers=opt.num_workers)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=opt.batch_size,
                            num_workers=opt.num_workers)

    data_loaders = {"train": train_loader, "val": val_loader}

    print("Dataset -- {}, Metric -- {}, Train Mode -- {}, Backbone -- {}".format(opt.dataset,
                                                                                 opt.metric, opt.train_mode, opt.backbone))
    print("Train iteration batch size: {}".format(opt.batch_size))
    print("Train iterations per epoch: {}".format(len(train_loader)))

    # get backbone model
    if opt.backbone == "resnet18":
        model = resnet18(pretrained=False)
    else:
        model = resnet34(pretrained=False)

    # set metric loss function: replace the final fc with the project's
    # Softmax head so the forward can expose the feature embedding.
    in_features = model.fc.in_features
    model.fc = Softmax(in_features, num_classes)

    model.to(device)
    if opt.use_gpu:
        model = DataParallel(model).to(device)

    criterion = CrossEntropyLoss()
    mse_criterion = MSELoss()
    cent_criterion = CenterLoss(num_classes, in_features, device)

    # set optimizer and LR scheduler (a separate optimizer updates the
    # learnable class centers of the center loss)
    if opt.optimizer == "sgd":
        optimizer = SGD([{"params": model.parameters()}],
                        lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9)
        cent_optimizer = SGD([{"params": cent_criterion.parameters()}],
                             lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9)
    else:
        optimizer = Adam([{"params": model.parameters()}],
                         lr=opt.lr, weight_decay=opt.weight_decay)
        cent_optimizer = Adam([{"params": cent_criterion.parameters()}],
                              lr=opt.lr, weight_decay=opt.weight_decay)
    if opt.scheduler == "decay":
        scheduler = lr_scheduler.StepLR(
            optimizer, step_size=opt.lr_step, gamma=opt.lr_decay)
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=0.1, patience=10)

    # train/val loop
    # NOTE(review): the "val" phase runs without torch.no_grad(), so
    # gradients are tracked (but never applied) — confirm this is intended.
    for epoch in range(opt.epoch):
        for phase in ["train", "val"]:
            total_examples, total_correct, total_loss = 0, 0, 0

            if phase == "train":
                model.train()
            else:
                model.eval()

            start_time = time.time()
            for ii, data in enumerate(data_loaders[phase]):
                # load data batch to device
                images, labels = data
                images = images.to(device)
                labels = labels.to(device).long()

                # perform adversarial attack update to images
                # (PGD: eps=8/255, step=2/255, 7 iterations)
                if opt.train_mode == "at" or opt.train_mode == "alp":
                    adv_images = pgd(
                        model, images, labels, 8. / 255, 2. / 255, 7)
                else:
                    pass

                 # at train mode
                if opt.train_mode == "at":
                    # get feature embedding and logits from resnet
                    features, predictions = model(images, labels)
                    adv_features, adv_predictions = model(adv_images, labels)

                    # get center loss (clean + adversarial)
                    cent_loss = cent_criterion(features, labels)
                    cent_loss = cent_loss + \
                        cent_criterion(adv_features, labels)

                    # get feature norm loss: mean squared L2 norm of the
                    # clean and adversarial embeddings
                    norm = features.mm(features.t()).diag()
                    adv_norm = adv_features.mm(adv_features.t()).diag()
                    norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \
                        (features.size(0) + adv_features.size(0))

                    # get cross-entropy loss (clean + adversarial)
                    ce_loss = criterion(predictions, labels)
                    ce_loss = ce_loss + criterion(adv_predictions, labels)

                    # combine cross-entropy loss, center loss and feature norm loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss
                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                    # for result accumulation: accuracy is measured on the
                    # adversarial predictions in this mode
                    predictions = adv_predictions

                # alp train mode
                elif opt.train_mode == "alp":
                    # get feature embedding and logits from resnet
                    features, predictions = model(images, labels)
                    adv_features, adv_predictions = model(adv_images, labels)

                    # get center loss (clean + adversarial)
                    cent_loss = cent_criterion(features, labels)
                    cent_loss = cent_loss + \
                        cent_criterion(adv_features, labels)

                    # get feature norm loss
                    norm = features.mm(features.t()).diag()
                    adv_norm = adv_features.mm(adv_features.t()).diag()
                    norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \
                        (features.size(0) + adv_features.size(0))

                    # get cross-entropy loss (clean + adversarial)
                    ce_loss = criterion(predictions, labels)
                    ce_loss = ce_loss + criterion(adv_predictions, labels)

                    # get alp loss: pair adversarial logits to clean logits
                    alp_loss = mse_criterion(adv_predictions, predictions)

                    # combine cross-entropy loss, center loss and feature norm loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss
                    # combine loss with alp loss
                    loss = loss + alp_lambda * alp_loss
                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                    # for result accumulation
                    predictions = adv_predictions

                # clean train mode
                else:
                    # get feature embedding and logits from resnet
                    features, predictions = model(images, labels)

                    # get center loss
                    cent_loss = cent_criterion(features, labels)

                    # get feature norm loss
                    norm = features.mm(features.t()).diag()
                    norm_loss = torch.sum(norm) / features.size(0)

                    # get cross-entropy loss
                    ce_loss = criterion(predictions, labels)

                    # combine cross-entropy loss, center loss and feature norm loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss
                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                # only take step if in train phase
                if phase == "train":
                    loss.backward()
                    optimizer.step()
                    cent_optimizer.step()

                # accumulate train or val results
                predictions = torch.argmax(predictions, 1)
                total_examples += predictions.size(0)
                total_correct += predictions.eq(labels).sum().item()
                total_loss += loss.item()

                # print accumulated train/val results at end of epoch
                if ii == len(data_loaders[phase]) - 1:
                    end_time = time.time()
                    acc = total_correct / total_examples
                    loss = total_loss / len(data_loaders[phase])
                    print("{}: Epoch -- {} Loss -- {:.6f} Acc -- {:.6f} Time -- {:.6f}sec".format(
                        phase, epoch, loss, acc, end_time - start_time))

                    if phase == "train":
                        loss = total_loss / len(data_loaders[phase])
                        # NOTE(review): when opt.scheduler == "decay" this is
                        # StepLR, whose step() interprets the argument as an
                        # epoch index, not a metric — passing the loss here
                        # looks like a bug; confirm intended scheduler usage.
                        scheduler.step(loss)
                    else:
                        print("")

    # save model after training for opt.epoch
    save_model(model, opt.dataset, opt.metric, opt.train_mode, opt.backbone)
Exemple #5
0
class ProGAN:
    """ Wrapper around the Generator and the Discriminator """
    def __init__(self,
                 depth=7,
                 latent_size=512,
                 learning_rate=0.001,
                 beta_1=0,
                 beta_2=0.99,
                 eps=1e-8,
                 drift=0.001,
                 n_critic=1,
                 use_eql=True,
                 loss="wgan-gp",
                 use_ema=True,
                 ema_decay=0.999,
                 device=th.device("cpu")):
        """
        constructor for the class
        :param depth: depth of the GAN (will be used for each generator and discriminator)
        :param latent_size: latent size of the manifold used by the GAN
        :param learning_rate: learning rate for Adam
        :param beta_1: beta_1 for Adam
        :param beta_2: beta_2 for Adam
        :param eps: epsilon for Adam
        :param n_critic: number of times to update discriminator
                         (Used only if loss is wgan or wgan-gp)
        :param drift: drift penalty for the
                      (Used only if loss is wgan or wgan-gp)
        :param use_eql: whether to use equalized learning rate
        :param loss: the loss function to be used
                     Can either be a string =>
                          ["wgan-gp", "wgan", "lsgan", "lsgan-with-sigmoid"]
                     Or an instance of GANLoss
        :param use_ema: boolean for whether to use exponential moving averages
        :param ema_decay: value of mu for ema
        :param device: device to run the GAN on (GPU / CPU)
        """

        # local imports: torch pieces are pulled in lazily at construction time
        from torch.optim import Adam
        from torch.nn import DataParallel

        # Create the Generator and the Discriminator
        self.gen = Generator(depth, latent_size, use_eql=use_eql).to(device)
        self.dis = Discriminator(depth, latent_size,
                                 use_eql=use_eql).to(device)

        # if code is to be run on GPU, we can use DataParallel:
        # NOTE(review): this equality only matches the bare th.device("cuda");
        # an explicit th.device("cuda:0") would skip DataParallel — confirm intended.
        if device == th.device("cuda"):
            self.gen = DataParallel(self.gen)
            self.dis = DataParallel(self.dis)

        # state of the object
        self.latent_size = latent_size
        self.depth = depth
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.n_critic = n_critic
        self.use_eql = use_eql
        self.device = device
        self.drift = drift

        # define the optimizers for the discriminator and generator
        self.gen_optim = Adam(self.gen.parameters(),
                              lr=learning_rate,
                              betas=(beta_1, beta_2),
                              eps=eps)

        self.dis_optim = Adam(self.dis.parameters(),
                              lr=learning_rate,
                              betas=(beta_1, beta_2),
                              eps=eps)

        # define the loss function used for training the GAN
        # (resolves string names to GANLoss instances; see __setup_loss)
        self.loss = self.__setup_loss(loss)

        if self.use_ema:
            from .CustomLayers import update_average

            # create a shadow copy of the generator
            self.gen_shadow = copy.deepcopy(self.gen)

            # updater function:
            self.ema_updater = update_average

            # initialize the gen_shadow weights equal to the
            # weights of gen (beta=0 means a full copy, no averaging)
            self.ema_updater(self.gen_shadow, self.gen, beta=0)

    def __setup_loss(self, loss):
        # Resolve `loss` (string name or GANLoss instance) to a GANLoss object.
        # Raises ValueError for unknown names or wrong types.
        from . import Losses as losses

        if isinstance(loss, str):
            loss = loss.lower()  # lowercase the string
            if loss == "wgan":
                loss = losses.WGAN_GP(self.device,
                                      self.dis,
                                      self.drift,
                                      use_gp=False)
                # note if you use just wgan, you will have to use weight clipping
                # in order to prevent gradient exploding

            elif loss == "wgan-gp":
                loss = losses.WGAN_GP(self.device,
                                      self.dis,
                                      self.drift,
                                      use_gp=True)

            elif loss == "lsgan":
                loss = losses.LSGAN(self.device, self.dis)

            elif loss == "lsgan-with-sigmoid":
                loss = losses.LSGAN_SIGMOID(self.device, self.dis)

            else:
                raise ValueError("Unknown loss function requested")

        elif not isinstance(loss, losses.GANLoss):
            raise ValueError(
                "loss is neither an instance of GANLoss nor a string")

        return loss

    def optimize_discriminator(self, noise, real_batch, depth, alpha):
        """
        performs one step of weight update on discriminator using the batch of data
        :param noise: input noise of sample generation
        :param real_batch: real samples batch
        :param depth: current depth of optimization
        :param alpha: current alpha for fade-in
        :return: current loss (Wasserstein loss)
        """
        from torch.nn import AvgPool2d
        # NOTE(review): F.upsample is deprecated in recent PyTorch in favour of
        # F.interpolate — confirm against the project's pinned torch version.
        from torch.nn.functional import upsample

        # downsample the real_batch for the given depth
        down_sample_factor = int(np.power(2, self.depth - depth - 1))
        # NOTE(review): max(..., 0) is a no-op since a power of 2 is >= 1;
        # presumably a clamp to 1 was intended — verify.
        prior_downsample_factor = max(int(np.power(2, self.depth - depth)), 0)

        ds_real_samples = AvgPool2d(down_sample_factor)(real_batch)

        if depth > 0:
            # real samples at the previous (coarser) resolution, upsampled
            # back so they can be blended with the current resolution
            prior_ds_real_samples = upsample(
                AvgPool2d(prior_downsample_factor)(real_batch), scale_factor=2)
        else:
            prior_ds_real_samples = ds_real_samples

        # real samples are a combination of ds_real_samples and prior_ds_real_samples
        # (progressive-growing fade-in: alpha blends new and old resolutions)
        real_samples = (alpha * ds_real_samples) + (
            (1 - alpha) * prior_ds_real_samples)

        loss_val = 0
        for _ in range(self.n_critic):
            # generate a batch of samples (detached: generator is not updated here)
            fake_samples = self.gen(noise, depth, alpha).detach()

            loss = self.loss.dis_loss(real_samples, fake_samples, depth, alpha)

            # optimize discriminator
            self.dis_optim.zero_grad()
            loss.backward()
            self.dis_optim.step()

            loss_val += loss.item()

        # average loss over the n_critic updates
        return loss_val / self.n_critic

    def optimize_generator(self, noise, depth, alpha):
        """
        performs one step of weight update on generator for the given batch_size
        :param noise: input random noise required for generating samples
        :param depth: depth of the network at which optimization is done
        :param alpha: value of alpha for fade-in effect
        :return: current loss (Wasserstein estimate)
        """

        # generate fake samples:
        fake_samples = self.gen(noise, depth, alpha)

        # TODO: Change this implementation for making it compatible for relativisticGAN
        loss = self.loss.gen_loss(None, fake_samples, depth, alpha)

        # optimize the generator
        self.gen_optim.zero_grad()
        loss.backward()
        self.gen_optim.step()

        # if use_ema is true, apply ema to the generator parameters
        if self.use_ema:
            self.ema_updater(self.gen_shadow, self.gen, self.ema_decay)

        # return the loss value
        return loss.item()
Exemple #6
0
def training(M):
    """Train the cascaded refinement network on the hover dataset.

    :param M: lazy-init flag — when 0, the model and optimizer are built on
              the first batch; M is then set to 1 so they are reused.
    Returns the trained model.

    NOTE(review): relies on module-level globals (gpu_ids, DIMENSION,
    HoverDataset, cascaded_model, Net, DataParallel, ...) — confirm they are
    defined before this is called.
    """
    batch_size = len(gpu_ids) * 1
    data_len = 100000
    hover_loader = DataLoader(dataset=HoverDataset(data_len),
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=8)

    # output resolution passed to the model builder
    res = 256

    # NOTE(review): `l` is only consumed by shuffle(l) below (the loop that
    # used it is commented out) — appears vestigial.
    label_dir = 'crn0/Label256Full'
    l = os.listdir(label_dir)

    for epoch in range(200):
        running_loss = 0
        c_t = 0

        print("New Epoch")

        for data in hover_loader:

            a = time.time()
            label_images, input_images = data
            # NOTE(review): labels go to the default GPU while inputs go to
            # gpu_ids[-1] — presumably intentional for DataParallel; verify.
            label_images = label_images.cuda()
            input_images = input_images.cuda(gpu_ids[-1])

            b = time.time()

            #print(label_images.shape)
            # skip malformed batches (wrong batch size or channel count)
            if label_images.shape[0] != batch_size or label_images.shape[
                    1] != DIMENSION:
                print("skip")
                continue

            c_t += label_images.shape[0]
            # for I in enumerate(l):
            #J = str.replace(I[1], 'gtFine_color.png', 'leftImg8bit.png')

            #     label_images1 = Variable(torch.unsqueeze(torch.from_numpy(helper.get_semantic_map(
            #         'crn0/Label256Full/'+I[1])).float().permute(2, 0, 1), dim=0))  # .cuda()#training label
            #     input_images = Variable(torch.unsqueeze(torch.from_numpy(
            #         io.imread("crn0/RGB256Full/"+J)).float(), dim=0).permute(0, 3, 1, 2))

            # lazy initialization: model/optimizer built only on the very
            # first batch (M == 0); M is flipped to 1 below
            if M == 0:
                model = cascaded_model(label_images, res)
                model = model.cuda()
                model = DataParallel(model, gpu_ids)
                #                 model.load_state_dict(torch.load('mynet_updated.pth')) # if u want to resume training from a pretrained model then add the .pth file here
                optimizer = optim.Adam(model.parameters(),
                                       lr=0.0001 * len(gpu_ids),
                                       betas=(0.9, 0.999),
                                       eps=1e-08,
                                       weight_decay=0)

            optimizer.zero_grad()
            Generator = model(label_images)
            Loss = Net(input_images, Generator, label_images)
            c = time.time()

            print(Loss.data)
            # DataParallel returns one loss per GPU — reduce to a scalar
            if len(gpu_ids) > 1:
                Loss = Loss.mean()
            Loss.backward()
            optimizer.step()
            # prevent re-building the model on subsequent batches
            M = 1
            running_loss += Loss.data.item()
            d = time.time()
            # timing breakdown: data transfer (b-a), forward (c-b), backward (d-c)
            print(epoch, c_t, Loss.data.item(), b - a, c - b, d - c)

            # periodically dump a visualization
            # NOTE(review): fires only when c_t is an exact multiple of 1000,
            # which depends on batch_size dividing 1000 evenly — confirm.
            if c_t % 1000 == 0:
                Generator = Generator.permute(0, 2, 3, 1)
                Generator = Generator.cpu()
                Generator = Generator.data.numpy()
                output = np.minimum(np.maximum(Generator, 0.0), 255.0)
                # NOTE(review): scipy.misc.toimage was removed in SciPy 1.2 —
                # confirm the pinned SciPy version supports it.
                scipy.misc.toimage(output[0, :, :, :], cmin=0, cmax=255).save(
                    "crn0/vis/{}_{}_output_real.jpg".format(epoch, c_t))

        shuffle(l)
        # can replace the 2975 with c_t for generalization
        epoch_loss = running_loss / data_len
        print(epoch, epoch_loss)
        torch.save(model.state_dict(),
                   'crn0/mynet_epoch{}_CRN.pth'.format(epoch))
        #epoch_acc = running_corrects / 2975.0


#     return Loss
    # NOTE(review): saving and immediately reloading the state dict is a
    # round-trip no-op; likely a leftover from a best-model-tracking scheme.
    best_model_wts = model.state_dict()
    model.load_state_dict(best_model_wts)

    return model
    def train(self):
        """Fine-tune (or train from scratch) a GPT-2 language model.

        Builds the model from ``self.model_config`` or loads
        ``self.pretrained_model``, optionally tokenizes the raw corpus,
        then trains for ``self.epochs`` epochs over ``self.split_num``
        tokenized shards, checkpointing after every epoch and saving a
        final model at the end.
        """
        if not self.pretrained_model:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
                config=self.model_config)
        else:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
                self.pretrained_model)
        model.train()
        model.to(self.device)
        # Count trainable model parameters (for logging only)
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('模型参数量 = {}'.format(num_parameters))

        # Optionally tokenize the raw corpus into shard files first
        if self.do_tokenize:
            self.print_and_log("开始加载训练集")
            self.tokenize_and_save()
            self.print_and_log("训练集加载完毕")

        # Count total tokens across all shards to derive step counts
        full_len = 0
        for i in range(self.split_num):
            with open(
                    self.tokenized_data_path +
                    'tokenized_train_{}.txt'.format(i), 'r') as f:
                full_len += len(
                    [int(item) for item in f.read().strip().split()])
        sample_num = int(full_len / self.stride)
        epoch_steps = int(full_len / self.stride / self.batch_size /
                          self.gradient_accumulation)
        total_steps = int(full_len / self.stride * self.epochs /
                          self.batch_size / self.gradient_accumulation)
        self.print_and_log('样本数 = {}'.format(sample_num))
        self.print_and_log('epoch 步数 = {}'.format(epoch_steps))
        self.print_and_log('总步数 = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(),
                                               lr=self.lr,
                                               correct_bias=True)
        scheduler = pytorch_transformers.WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

        # Mixed-precision setup (requires NVIDIA apex)
        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0
        running_loss = 0
        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            # Visit the shards in a fresh random order each epoch
            split_indices = np.linspace(0,
                                        self.split_num - 1,
                                        self.split_num,
                                        dtype=np.int32)
            random.shuffle(split_indices)
            for split_index in split_indices:
                with open(
                        self.tokenized_data_path +
                        'tokenized_train_{}.txt'.format(split_index),
                        'r') as f:
                    line = f.read().strip()
                all_ids = line.split()
                all_ids = [int(x) for x in all_ids]
                # Slice the token stream into overlapping n_ctx-length samples
                start_point = 0
                samples = []
                while start_point < len(all_ids) - self.n_ctx:
                    samples.append(all_ids[start_point:start_point +
                                           self.n_ctx])
                    start_point += self.stride
                random.shuffle(samples)
                for i in range(len(samples) // self.batch_size):  # drop last
                    batch = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]
                    # Language modeling: inputs and labels are the same ids
                    batch_labels = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    batch_inputs = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_labels)
                    loss, logits = outputs[:2]

                    if multi_gpu:
                        loss = loss.mean()

                    # Scale loss so gradients accumulate to the right average
                    if self.gradient_accumulation > 1:
                        loss = loss / self.gradient_accumulation

                    # Backpropagate (with AMP loss scaling when fp16 is on)
                    if self.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer),
                                self.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       self.max_grad_norm)

                    # Take an optimizer step once enough gradients accumulated
                    # NOTE(review): scheduler.step() is called before
                    # optimizer.step(); PyTorch >= 1.1 expects the opposite
                    # order — confirm against the pinned torch version.
                    if (i + 1) % self.gradient_accumulation == 0:
                        running_loss += loss.item()
                        scheduler.step()
                        optimizer.step()
                        optimizer.zero_grad()
                        overall_step += 1

                    if (overall_step +
                            1) % self.log_step == 0 and running_loss != 0:
                        self.print_and_log(
                            'now time: {}:{}. Step {} of epoch {}, loss {}'.
                            format(
                                datetime.now().hour,
                                datetime.now().minute, overall_step + 1,
                                epoch + 1, running_loss *
                                self.gradient_accumulation / self.log_step))
                        running_loss = 0

            # End-of-epoch checkpoint: unwrap DataParallel if needed, then
            # save the underlying transformer weights
            if not os.path.exists(self.output_dir +
                                  'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir +
                            'model_epoch{}'.format(epoch + 1))
            gpt2_model = model.transformer
            model_to_save = gpt2_model.module if hasattr(
                gpt2_model, 'module') else gpt2_model
            model_to_save.save_pretrained(self.output_dir +
                                          'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))

        self.print_and_log('training finished')
        self.f_log.close()
        # Save the final model alongside the per-epoch checkpoints
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        gpt2_model = model.transformer
        model_to_save = gpt2_model.module if hasattr(gpt2_model,
                                                     'module') else gpt2_model
        model_to_save.save_pretrained(self.output_dir + 'final_model')
Exemple #8
0
def main():
    """Train (and optionally test) a 3D lung-nodule detector with k-fold CV.

    Parses CLI args, builds or resumes the network, trains the RPN on the
    selected LUNA fold, saves per-fold loss/TPR curves, and — when
    ``--test 1`` — runs VOI extraction over the test, total-train and
    unlabeled splits.

    Fixes vs. the original:
      * Python-2 ``print`` statements converted to Python-3 calls
        (the rest of this file is Python 3).
      * ``net`` was wrapped in ``DataParallel`` twice; wrap once.
      * ``weights_init`` was applied unconditionally, clobbering weights
        just loaded from a resume checkpoint; now only applied when
        training from scratch.
    """
    global args
    args = parser.parse_args()

    # fixed seeds for reproducibility
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        # resume from a checkpoint: epoch and save_dir default to saved values
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            # default experiment directory named by model + timestamp
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        # tee stdout to the log file and snapshot the source files
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    # wrap exactly once for multi-GPU (the original wrapped twice, nesting
    # DataParallel inside DataParallel)
    net = DataParallel(net)
    datadir = config_detector['preprocess_result_path']
    print('datadir = ', datadir)

    def get_lr(epoch):
        """Piecewise-constant schedule: full LR for the first 50% of epochs,
        LR/10 until 80%, then LR/100."""
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    def weights_init(m):
        """DCGAN-style init: N(0, 0.02) convs, N(1, 0.02) batch norms,
        zero biases."""
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)
        elif classname.find('Linear') != -1:
            m.bias.data.fill_(0)

    # Cross-Validation of 3D-semi, train
    k_fold = args.fold
    print("Authorizing fold: {:d}".format(k_fold))

    # Loading training set
    dataset = data.DataBowl3Detector(
        datadir,
        'detector/luna_file_id/subset_fold{:d}'.format(k_fold) +
        '/file_id_rpn_train.npy',
        config,
        phase='train')
    rpn_train_loader = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    # Training process
    train_loss_l, train_tpr_l = [], []

    # initialize weights only when training from scratch — applying this
    # after a resume would overwrite the loaded checkpoint
    if not args.resume:
        net.apply(weights_init)

    for epoch in range(start_epoch, args.epochs + 1):
        if not os.path.exists(os.path.join(save_dir,
                                           'fold{:d}'.format(k_fold))):
            os.makedirs(os.path.join(save_dir, 'fold{:d}'.format(k_fold)))
        train_loss, train_tpr = train(
            rpn_train_loader, net, loss, epoch, optimizer, get_lr,
            args.save_freq, os.path.join(save_dir, 'fold{:d}'.format(k_fold)))

        # Append loss results
        train_loss_l.append(train_loss)
        train_tpr_l.append(train_tpr)

    # Save Train-Validation results
    if not os.path.exists('./train-vali-results/fold{:d}'.format(k_fold)):
        os.makedirs('./train-vali-results/fold{:d}'.format(k_fold))
    np.save(
        './train-vali-results/fold{:d}'.format(k_fold) + '/rpn-train-loss.npy',
        np.asarray(train_loss_l).astype(np.float64))
    np.save(
        './train-vali-results/fold{:d}'.format(k_fold) + '/rpn-train-tpr.npy',
        np.asarray(train_tpr_l).astype(np.float64))

    # Testing process: extract VOIs for the test / total-train / unlabeled splits
    if args.test == 1:
        margin = 32
        sidelen = 144

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/subset_fold{:d}'.format(k_fold) +
            '/file_id_test.npy',
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/subset_fold{:d}'.format(k_fold) +
            '/file_id_total_train.npy',
            config,
            phase='test',
            split_comber=split_comber)

        train_total_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/file_id_unlabel.npy',
            config,
            phase='test',
            split_comber=split_comber)

        unlabel_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)

        test_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold),
                                'test')
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        find_voi(test_loader, net, get_pbb, test_dir, config)

        total_train_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold),
                                       'total_train')
        if not os.path.exists(total_train_dir):
            os.makedirs(total_train_dir)
        find_voi(train_total_loader, net, get_pbb, total_train_dir, config)

        unlabel_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold),
                                   'unlabel')
        if not os.path.exists(unlabel_dir):
            os.makedirs(unlabel_dir)
        find_voi(unlabel_loader, net, get_pbb, unlabel_dir, config)
Exemple #9
0
# Select the number of classes for the chosen hyperspectral dataset.
if args.dataset in ['PaviaU', 'Pavia']:
    num_cla = 9
elif args.dataset in ['Indian', 'Salinas']:
    num_cla = 16
elif args.dataset == 'KSC':
    num_cla = 13
else:
    print('undefined dataset')

make_if_not_exist(trained_model_dir)
# NOTE: `dict` here is a module-level mapping of model names to constructors
# (it shadows the builtin; defined elsewhere in this file).
model = DataParallel(dict[args.model_name](num_classes=num_cla, dropout_keep_prob=0))
if args.use_cuda:
    model = model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=1e-5)

# Resume from the latest checkpoint if requested and one exists.
start_epoch = 0
if args.restore and len(os.listdir(trained_model_dir)):
    model, start_epoch = model_restore(model, trained_model_dir)

train_info_record = trained_model_dir + 'train_info_' + args.model_name + '.txt'
for epoch in range(start_epoch + 1, args.epochs + 1):
    start = time.time()
    train(epoch, model, train_loader, optimizer, args)
    end = time.time()
    print('epoch: {} , cost {} seconds'.format(epoch, end - start))

    # Checkpoint periodically during the final 10% of training.
    if epoch % args.model_save_interval == 0 and epoch > args.epochs * 0.9:
        model_name = trained_model_dir + '/trained_model{}.pkl'.format(epoch)
        # Save a CPU copy of the weights WITHOUT moving the live model:
        # the original called model.cpu() here, which silently moved the
        # parameters off the GPU for all subsequent training epochs.
        cpu_state = {k: v.cpu() for k, v in model.state_dict().items()}
        torch.save(cpu_state, model_name)
Exemple #10
0
def main():
    """Two-stage lung-cancer pipeline: a nodule detector (``--model1``)
    plus a case classifier (``--model2``) built on top of it.

    Test modes (``--test1/2/3``) export id/cancer predictions to CSV and
    return early; otherwise the detector and classifier are trained on
    an interleaved per-epoch schedule and the classifier is checkpointed
    every ``--save-freq`` epochs.

    Fixes vs. the original: ``df.columns`` was assigned a *set*
    (nondeterministic column order across runs); ``== None`` replaced
    with ``is None``; the three duplicated export blocks are factored
    into one helper.
    """
    global args
    args = parser.parse_args()

    torch.manual_seed(0)

    ##################################
    # stage 1: nodule detector
    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']

    save_dir = args.save_dir
    ##################################

    # stage 2: case classifier wrapping the nodule net
    casemodel = import_module(args.model2)

    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk=topk, nodulenet=nod_net)

    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        args.save_dir = 'debug'

    ###################################

    ################################
    # resume logic: epoch and save_dir default to checkpoint values
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if args.epochs is None:
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs

    ################################
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test1 != 1 and args.test2 != 1:
        # tee stdout to the log file and snapshot the source files
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    ################################

    torch.cuda.set_device(0)
    # nod_net = nod_net.cuda()
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)
    ################################

    def _export_predictions(split_path, csv_path):
        """Run the case net over a test split and write an id/cancer CSV."""
        testsplit = np.load(split_path)
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        # FIX: the original assigned a *set* here, which has nondeterministic
        # iteration order — a list keeps 'id' first on every run.
        df.columns = ['id', 'cancer']
        df.to_csv(csv_path, index=False)

    if args.test1 == 1:
        _export_predictions('full.npy', 'allstage1.csv')
        return

    if args.test2 == 1:
        _export_predictions('test.npy', 'quick')
        return
    if args.test3 == 1:
        _export_predictions('stage2.npy', 'stage2_ans.csv')
        return
    print("save_dir", save_dir)
    print("save_freq", args.save_freq)
    # trainsplit = np.load('kaggleluna_full.npy')
    # deduplicate case ids while preserving first-seen order
    train_list = [f.split('_')[0] for f in os.listdir(config1['datadir'])]
    trainsplit = sorted(set(train_list), key=train_list.index)
    # valsplit = np.load('valsplit.npy')
    # testsplit = np.load('test.npy')

    dataset = DataBowl3Detector(trainsplit, config1, phase='train')
    train_loader_nod = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)

    # dataset = DataBowl3Detector(valsplit,config1,phase = 'val')
    # val_loader_nod = DataLoader(dataset,batch_size = args.batch_size,
    #     shuffle = False,num_workers = args.workers,pin_memory=True)

    optimizer = torch.optim.SGD(nod_net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    # trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit, config2, phase='train')
    train_loader_case = DataLoader(dataset,
                                   batch_size=args.batch_size2,
                                   shuffle=True,
                                   num_workers=args.workers,
                                   pin_memory=True)

    # dataset = DataBowl3Classifier(valsplit,config2,phase = 'val')
    # val_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
    #     shuffle = False,num_workers = args.workers,pin_memory=True)

    # dataset = DataBowl3Classifier(trainsplit,config2,phase = 'val')
    # all_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
    #     shuffle = False,num_workers = args.workers,pin_memory=True)

    optimizer2 = torch.optim.SGD(case_net.parameters(),
                                 args.lr,
                                 momentum=0.9,
                                 weight_decay=args.weight_decay)

    # Training plan (translated from the original Chinese note):
    #   1. load classifier weights / initialize parameters
    #   2. configure the log path
    #   3. deploy case_net on GPU(s), multi-GPU when available
    #   4. test modes: classify the dataset via test_casenet and return
    #   5. training: detector loader + optimizer, classifier loader +
    #      optimizer2; per epoch, train the detector while below its LR
    #      schedule and train the classifier after config2['startepoch'].

    for epoch in range(start_epoch, end_epoch + 1):
        if epoch == start_epoch:
            # warm-up pass over the classifier with lr=0 and debug on,
            # then restore the real settings
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug
        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer,
                            args)
            # validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            # val_casenet(epoch,case_net,val_loader_case,args)
            # val_casenet(epoch,case_net,all_loader_case,args)

        # periodic checkpoint: move the classifier weights to CPU copies
        if epoch % args.save_freq == 0:
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save(
                {
                    'epoch': epoch,
                    'save_dir': save_dir,
                    'state_dict': state_dict,
                    'args': args
                }, os.path.join(save_dir, '%03d.ckpt' % epoch))
0
def main():
    # drive.mount('/content/drive/')
    # path = '/content/drive/My Drive/Colab Notebooks'
    # os.chdir(path)
    '''
    Fine-tune a Chinese GPT-2 language model on a ghost-story corpus (Colab).

    Workflow:
      1. Parse CLI options. NOTE: several of them (data paths, ``raw``,
         ``output_dir``) are overridden by hard-coded Colab paths below.
      2. When ``raw`` is true, tokenize the raw corpus into ``num_pieces``
         shard files via ``build_files``.
      3. Train ``GPT2LMHeadModel`` with AdamW + a linear warmup schedule,
         optional apex fp16 and DataParallel, saving a checkpoint after
         every epoch and a final model under ``output_dir``.

    Earlier ModelArts/OBS bootstrap code, kept for reference:
        import moxing as mox
        mox.file.make_dirs('/cache')
        mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/config/model_config_small.json',
                               '/cache/config/model_config_small.json')
        mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/cache/vocab_small.txt', '/cache/cache/vocab_small.txt')
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file('/cache/config/model_config_small.json')
        mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/ghost.json', '/cache/ghost.json')
        mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/data/tokenization/', '/cache/data/tokenization/')
        mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/model/', '/cache/data/model/')
        mox.file.copy_parallel('obs://ghost-story/ghost/', '/cache/data/model/')
        args = parser.parse_args()
        args, unparsed = parser.parse_known_args()
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='/content/gpt-2-chinese-finetune/nlpdata/config/model_config_small.json', type=str,
                        required=False,
                        help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='/content/gpt-2-chinese-finetune/nlpdata/cache/vocab_small.txt', type=str,
                        required=False,
                        help='选择词库')
    parser.add_argument('--raw_data_path', default='/content/gpt-2-chinese-finetune/nlpdata/ghost.json', type=str,
                        required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='/content/gpt-2-chinese-finetune/nlpdata/',
                        help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=50, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-3, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='将训练语料分成多少份')
    #     parser.add_argument('--output_dir', default='obs://ghost-story/ghost/nlpdata/model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='args = /content/gpt-2-chinese-finetune/nlpdata/cache', type=str, required=False,
                        help='模型训练起点路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')

    args = parser.parse_args()
    #args, unparsed = parser.parse_known_args()
    print('args:\n' + args.__repr__())

    # if args.segment:
    #     from data import tokenization_bert_word_level as tokenization_bert
    # else:
    #     import Tokenization

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may see
    #     model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    #     model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(mox.file.read('/cache/config/model_config_small.json'))
    # Hard-coded Colab path; ignores --model_config.
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file('/content/gpt-2-chinese-finetune/nlpdata/config/model_config_small.json')

    print('config:\n' + model_config.to_json_string())

    # Context window length; each training sample is exactly n_ctx tokens.
    n_ctx = model_config.n_ctx
    #     full_tokenizer = Tokenization.BertTokenizer(vocab_file=args.tokenizer_path)
    #     full_tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer = BertTokenizer(vocab_file='/content/gpt-2-chinese-finetune/nlpdata/cache/vocab_small.txt')
    # NOTE(review): in transformers, max_model_input_sizes is normally a dict;
    # assigning a large int here presumably just silences "sequence too long"
    # warnings — confirm against the tokenizer version in use.
    full_tokenizer.max_model_input_sizes = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # Hard-coded Colab paths; these override the corresponding CLI options.
    raw_data_path = '/content/gpt-2-chinese-finetune/nlpdata/ghost.json'
    tokenized_data_path = '/content/gpt-2-chinese-finetune/nlpdata/cache/'
    # raw = args.raw  # whether to rebuild the tokenized dataset from scratch
    raw = True
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = '/content/gpt-2-chinese-finetune/nlpdata/cache/'

    if raw:
        print('building files')
        # Tokenize the raw corpus into num_pieces shard files
        # named tokenized_train_<i>.txt under tokenized_data_path.
        build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path, full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    #     if not args.pretrained_model:
    # Always trains from scratch; the from_pretrained branch is disabled.
    model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    #     else:
    #         model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    # Count the total number of tokens across all shards so the LR
    # scheduler knows the total number of optimizer steps up front.
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                             num_training_steps=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the shard files in a fresh random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slice the shard into overlapping n_ctx-token windows, advancing
            # by `stride` tokens each time (stride < n_ctx gives overlap).
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            # Keep the tail of the shard as one final right-aligned window.
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            # Any remainder smaller than batch_size is dropped.
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                # For causal LM training, labels are the inputs themselves;
                # the model shifts them internally when computing the loss.
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    # Scale so accumulated gradients match a full-batch step.
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step: only every gradient_accumulation-th batch.
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        # Unwrap DataParallel before saving so the checkpoint has clean keys.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        #mox.file.copy_parallel(output_dir + 'model_epoch{}'.format(epoch + 1), '/content/gpt-2-chinese-finetune/nlpdata/model/')
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    """Train the resnet_lstm phase-recognition model and save the best weights.

    Builds clip samplers so every sample is a run of ``sequence_length``
    frames spaced ``srate`` apart, trains with Adam + CrossEntropyLoss,
    evaluates after each epoch, tracks the best validation accuracy (with
    training accuracy as tie-breaker), and writes per-epoch stats plus the
    best state dict to disk.

    Relies on module-level globals: sequence_length, srate, num_gpu, workers,
    val_batch_size, train_batch_size, use_gpu, learning_rate, epochs,
    crop_type, optimizer_choice, sgd_adjust_lr, multi_optim, use_flip.

    Args:
        train_dataset / val_dataset: frame datasets indexed via SeqSampler.
        train_num_each / val_num_each: per-video frame counts used by
            get_useful_start_idx to find valid clip start positions.
    """
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    # All frame indices at which a full clip of sequence_length frames fits.
    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    # Truncate to a multiple of num_gpu so DataParallel batches split evenly.
    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 8000
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]  # clip start positions for training
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    np.random.seed(0)
    np.random.shuffle(train_we_use_start_idx)
    # Expand each clip start into its sequence_length frame indices
    # (frames spaced srate apart); each frame is one dataset item.
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j * srate)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j * srate)

    num_train_all = float(len(train_idx))
    num_val_all = float(len(val_idx))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(int(num_train_all)))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(int(num_val_all)))

    # Validation order is fixed, so this loader is built once.
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        # sampler=val_idx,
        sampler=SeqSampler(val_dataset, val_idx),
        num_workers=workers,
        pin_memory=False
    )
    model = resnet_lstm()
    if use_gpu:
        model = model.cuda()

    model = DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    '''
    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening,
                                  weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10, momentum=momentum, dampening=dampening,
                weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10)
    '''
    # NOTE(review): with the SGD branches above commented out,
    # exp_lr_scheduler is never defined; the optimizer_choice == 0 branch
    # at the end of each epoch would raise NameError — confirm intended.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    # Per-epoch stats: [train_acc, train_loss, val_acc, val_loss].
    record_np = np.zeros([epochs, 4])

    for epoch in range(epochs):
        # Reshuffle clip order every epoch (seeded for reproducibility)
        # and rebuild the training loader from the new index list.
        np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j * srate)

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=SeqSampler(train_dataset, train_idx),
            num_workers=workers,
            pin_memory=False
        )

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        num = 0
        train_num = 0
        for data in train_loader:
            num = num + 1
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)
            optimizer.zero_grad()
            #outputs = model.forward(inputs, kdatas)
            outputs = model.forward(inputs)
            # NOTE(review): CrossEntropyLoss applies log-softmax internally;
            # feeding softmaxed outputs here effectively double-softmaxes
            # the logits — likely a bug, confirm before changing.
            outputs = F.softmax(outputs, dim=1)
            _, preds = torch.max(outputs.data, 1)
            print(num)
            print(preds)
            print(labels)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data
            train_corrects += torch.sum(preds == labels.data)
            train_num += labels.shape[0]
            print(train_corrects.cpu().numpy() / train_num)
            # Side checkpoint whenever running train accuracy exceeds 75%.
            if train_corrects.cpu().numpy() / train_num > 0.75:
                torch.save(copy.deepcopy(model.state_dict()), 'test.pth')

        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects.cpu().numpy() / train_num
        train_average_loss = train_loss / train_num

        # begin eval
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        val_start_time = time.time()
        for data in val_loader:
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            #labels_phase = labels_phase[(sequence_length - 1)::sequence_length]
            #kdata = kdata[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)

            # crop_type 5/10: multi-crop evaluation — flatten the crops into
            # the batch dimension, then average the per-crop predictions.
            if crop_type == 0 or crop_type == 1:
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 3)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 3)
                outputs = torch.mean(outputs, 0)

            #outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, 1)
            # NOTE(review): `num` is the stale counter from the training loop;
            # it is not incremented here.
            print(num)
            print(preds)
            print(labels)
            loss = criterion(outputs, labels)
            val_loss += loss.data
            val_corrects += torch.sum(preds == labels.data)
            val_num += labels.shape[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects.cpu().numpy() / val_num
        val_average_loss = val_loss / val_num
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_average_loss,
                      train_accuracy,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_average_loss,
                      val_accuracy))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        # Keep the weights with the best validation accuracy; on ties,
        # prefer the epoch with the higher training accuracy.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy
        record_np[epoch, 1] = train_average_loss
        record_np[epoch, 2] = val_accuracy
        record_np[epoch, 3] = val_average_loss
        # Snapshot the (partially filled) stats array after every epoch.
        np.save(str(epoch) + '.npy', record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc))

    # Encode accuracies (x10000) into the artifact file names.
    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "lstm" \
                 + "_epoch_" + str(epochs) \
                 + "_length_" + str(sequence_length) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"

    torch.save(best_model_wts, model_name)

    record_name = "lstm" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".npy"
    np.save(record_name, record_np)
Exemple #13
0
def main():
    """Entry point for training or testing the 3D nodule detector.

    Parses CLI options from the module-level ``parser``, optionally resumes
    from a checkpoint, sets up the results directory and logging, places the
    network on GPU(s), then either runs a single test pass (``args.test == 1``)
    or trains for ``args.epochs`` epochs with validation after each epoch.

    Fix: ``print sys.argv`` was a Python 2 print *statement* and is a
    SyntaxError under Python 3; replaced with the ``print()`` function call.
    """
    global args
    args = parser.parse_args()

    # Fixed seeds and pinned device for reproducible runs.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.set_device(0)

    # The model module provides its own config, network, loss, and the
    # probability-bounding-box decoder used at test time.
    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        # Resume weights; infer start epoch / save dir from the checkpoint
        # unless explicitly given on the command line.
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            # Default run directory: results/<model>-<timestamp>.
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')

    # if training, save files to know how training was done
    if args.test != 1:
        sys.stdout = Logger(logfile)
        # sys.stdout = logging.getLogger(logfile)
        print(sys.argv)  # was Python 2 `print sys.argv`; fixed for Python 3
        # Snapshot every .py file plus the training config into save_dir
        # so the run can be reproduced later.
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
        shutil.copy('config_training.py', os.path.join(save_dir))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training[
        'preprocess_result_path'] if args.data is None else args.data

    if args.test == 1:
        # Test mode: split large volumes into overlapping patches so they
        # fit in GPU memory, run detection, then exit.
        margin = 32
        sidelen = 144

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])

        test_set_file = args.test_filename

        dataset = data.DataBowl3Detector(datadir,
                                         test_set_file,
                                         config,
                                         phase='test',
                                         split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=data.collate,
                                 pin_memory=False)

        test(test_loader, net, get_pbb, save_dir, config, args.test_set)
        return

    #net = DataParallel(net)

    dataset = data.DataBowl3Detector(datadir,
                                     args.train_filename,
                                     config,
                                     phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    dataset = data.DataBowl3Detector(datadir,
                                     args.val_filename,
                                     config,
                                     phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    if args.optim == 'adam':
        optimizer = torch.optim.Adam(net.parameters())
    elif args.optim == 'sgd':
        optimizer = torch.optim.SGD(net.parameters(),
                                    args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)

    def get_lr(epoch):
        """Step-decay schedule: full LR for the first half of training,
        0.1x until 80%, 0.01x afterwards."""
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        validate(val_loader, net, loss)
Exemple #14
0
              ', training network from scratch. Press enter to continue.')

# Create the dataloader
classIdx = classNameToIdx[className]
trainSet = IcaDataset(classIdx, nLeadingZerosFormat, paths['rgbDir'],
                      rgbFormat, paths['segDir'], segFormat,
                      paths['posesPath'], keypoints, K)  # Torch dataset
trainSampler = RandomSampler(trainSet)
trainBatchSampler = BatchSampler(trainSampler, batchSize,
                                 drop_last=True)  # Torch sampler
trainLoader = DataLoader(trainSet,
                         batch_sampler=trainBatchSampler,
                         num_workers=8)

# Initialize the optimizer
optimizer = Adam(network.parameters(), lr=learningRate)

# Train the model
nIterations = len(trainSet) // batchSize
for iEpoch in range(nEpochs):
    print('Starting epoch #' + str(iEpoch + 1) + ' out of ' + str(nEpochs))
    tEpochStart = time.time()
    for idx, data in enumerate(trainLoader):

        # Start training loop iteration timer
        tTrainingLoopStart = time.time()

        # Extract data
        tExtractDataStart = time.time()
        image, maskGT, vertexGT, vertexWeightsGT = [d.cuda() for d in data]
        tExtractDataElapsed = time.time() - tExtractDataStart
Exemple #15
0
class model(base_process):
    """Stage-2 trainer built on top of the (frozen) base detection pipeline.

    ``train_stage_2`` first fine-tunes the ``self.pre`` feature head and the
    ``self.ROI`` classifier on ROI-pooled features from the base network,
    then harvests pooled features for all three splits, rebalances them with
    SMOTE, and retrains a fresh ``self.ROI`` on the resampled features.
    """
    def train_stage_2(self):
        """Run the two-phase stage-2 training described on the class.

        Assumes ``self.flag >= 2`` throughout: the variables produced inside
        the ``torch.no_grad()`` blocks (``feat*``, ``label``, ``loss_box``,
        ``cross_entropy``, ``result``) are otherwise left undefined and the
        statements that use them would raise ``NameError``.
        """

        # Batch size is hard-wired to 240 and reused below when reshaping
        # feature tensors to (240, 1024, 15); the two must stay in sync.
        batch = 240
        # NOTE(review): lr1 is defined but both Adadelta optimizers use the
        # literal 0.15 — confirm which value is intended.
        lr1 = 0.15

        # Train/test/eval splits share data_set.index so they partition the
        # same underlying data.
        data_set = loader(os.path.join(os.getcwd(), 'data_2'), {"mode": "training"})
        data_set_test = loader(os.path.join(os.getcwd(), 'data_2'),{"mode": "test"}, data_set.index)
        data_set_eval = loader(os.path.join(os.getcwd(), 'data_2'),{"mode": "eval"}, data_set.index)

        # Only the training loader shuffles (second positional arg).
        data_loader = DataLoader(data_set, batch, True, collate_fn=call_back.detection_collate_RPN)
        data_loader_test = DataLoader(data_set_test, batch, False, collate_fn=call_back.detection_collate_RPN)
        data_loader_eval = DataLoader(data_set_eval, batch, False, collate_fn=call_back.detection_collate_RPN)

        # optim = Adadelta(self.ROI.parameters(), lr=lr1, weight_decay=1e-5)
        start_time = time.time()
        # Phase 1: jointly optimize the pre head and the ROI classifier.
        optim_a = Adadelta([{'params': self.pre.parameters()},
                            {'params': self.ROI.parameters()}], lr=0.15, weight_decay=1e-5)
        cfg.test = False
        count = 0
        for epoch in range(200):
            runing_losss = 0.0
            cls_loss = 0
            coor_loss = 0
            cls_loss2 = 0
            # coor_loss2 is never accumulated, so the logged "cor2" below is
            # always 0.0.
            coor_loss2 = 0
            count += 1
            # base_time = RPN_time = ROI_time = nms_time = pre_gt = loss_time = linear_time = 0
            for data in data_loader:
                # Collated batch layout: (inputs, targets, peaks, counts).
                y = data[1]
                x = data[0].cuda()
                peak = data[2]
                num = data[3]
                optim_a.zero_grad()

                # The base pipeline is run without gradients: only pre/ROI
                # are trained in this phase.
                with torch.no_grad():
                    if self.flag >= 2:
                        result = self.base_process(x, y, peak)
                        feat1 = result['feat_8']
                        feat2 = result['feat_16']
                        feat3 = result['feat_32']
                        feat4 = result['feat_64']
                        label = result['label']
                        loss_box = result['loss_box']
                        cross_entropy = result['cross_entropy']

                cls_score = self.pre(feat1, feat2, feat3, feat4)
                cls_score = self.ROI(cls_score)

                cross_entropy2 = self.tool2.cal_loss2(cls_score, label)

                # Only the stage-2 classification loss is backpropagated;
                # cross_entropy/loss_box from the base pass are logged only.
                loss_total = cross_entropy2
                loss_total.backward()
                optim_a.step()
                runing_losss += loss_total.item()
                cls_loss2 += cross_entropy2.item()
                cls_loss += cross_entropy.item()
                coor_loss += loss_box.item()
            end_time = time.time()
            torch.cuda.empty_cache()
            print(
                "epoch:{a} time:{ff}: loss:{b:.4f} cls:{d:.4f} cor{e:.4f} cls2:{f:.4f} cor2:{g:.4f} date:{fff}".format(
                    a=epoch,
                    b=runing_losss,
                    d=cls_loss,
                    e=coor_loss,
                    f=cls_loss2,
                    g=coor_loss2, ff=int(end_time - start_time),
                    fff=time.asctime()))
            # if epoch % 10 == 0:
            #     adjust_learning_rate(optim, 0.9, epoch, 50, lr1)
            p = None

            # if epoch % 2 == 0:
            #     print("test result")
            # save(self.RPN.module.state_dict(),
            #      os.path.join(os.getcwd(), str(epoch) + 'rpn_a2.p'))
            # save(self.RPN.module.state_dict(),
            #      os.path.join(os.getcwd(), str(epoch) + 'base_a2.p'))
            start_time = end_time
        # Phase 1 done. Harvest flattened ROI features (and labels) for all
        # three splits with the now-trained pre head, gradient-free.
        all_data = []
        all_label = []
        for data in data_loader:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
                    all_data.extend(data_.cpu())
                    all_label.extend(label.cpu())
        for data in data_loader_eval:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
                    all_data.extend(data_.cpu())
                    all_label.extend(label.cpu())
        for data in data_loader_test:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
                    all_data.extend(data_.cpu())
                    all_label.extend(label.cpu())

        all_data = torch.stack(all_data, 0).numpy()
        all_label = torch.LongTensor(all_label).numpy()
        # Oversample minority classes. NOTE(review): SMOTE runs before the
        # train/test slicing below, so synthetic neighbours of test samples
        # can leak into the training slice — confirm this is acceptable.
        from imblearn.over_sampling import SMOTE
        fun = SMOTE()
        all_data, all_label = fun.fit_resample(all_data, all_label)
        total = len(all_label)
        # First 70% for training, last 20% for test; the middle 10% is unused.
        training_label = all_label[:int(0.7 * total)]
        training_data = all_data[:int(0.7 * total)]

        test_label = all_label[-int(0.2 * total):]
        test_data = all_data[-int(0.2 * total):]
        count = 0
        # Phase 2: discard the phase-1 ROI head and retrain a fresh one on
        # the resampled features.
        self.ROI = roi().cuda()
        self.ROI = DataParallel(self.ROI, device_ids=[0])
        self.ROI.apply(weights_init)

        optim_b = Adadelta(self.ROI.parameters(), lr=0.15, weight_decay=1e-5)
        for epoch in range(1200):
            runing_losss = 0.0
            cls_loss = 0
            coor_loss = 0
            cls_loss2 = 0
            coor_loss2 = 0
            count += 1
            optim_b.zero_grad()
            # NOTE(review): optim_a is never stepped in this phase; this
            # zero_grad looks vestigial.
            optim_a.zero_grad()

            # base_time = RPN_time = ROI_time = nms_time = pre_gt = loss_time = linear_time = 0
            for j in range(int(len(training_label) / 240)):
                # Fixed chunks of 240 samples, reshaped back to the pooled
                # feature layout (240, 1024, 15); any remainder is dropped.
                data_ = torch.Tensor(training_data[j * 240:j * 240 + 240]).view(240, 1024, 15).cuda()
                label_ = torch.LongTensor(training_label[j * 240:j * 240 + 240]).cuda()
                optim_b.zero_grad()

                cls_score = self.ROI(data_)
                cross_entropy2 = self.tool2.cal_loss2(cls_score, label_)

                loss_total = cross_entropy2
                loss_total.backward()
                optim_b.step()
                runing_losss += loss_total.item()
                cls_loss2 += cross_entropy2.item()
                # NOTE(review): cls_loss/coor_loss re-add the *last*
                # cross_entropy/loss_box captured during the feature-harvest
                # pass above — they are stale here. Confirm intended.
                cls_loss += cross_entropy.item()
                coor_loss += loss_box.item()
            end_time = time.time()
            torch.cuda.empty_cache()
            print(
                "epoch:{a} time:{ff}: loss:{b:.4f} cls:{d:.4f} cor{e:.4f} cls2:{f:.4f} cor2:{g:.4f} date:{fff}".format(
                    a=epoch,
                    b=runing_losss,
                    d=cls_loss,
                    e=coor_loss,
                    f=cls_loss2,
                    g=coor_loss2, ff=int(end_time - start_time),
                    fff=time.asctime()))
            # Decay the phase-2 LR every 10 epochs.
            if epoch % 10 == 0 and epoch > 0:
                adjust_learning_rate(optim_b, 0.9, epoch, 50, 0.3)

            p = None
            self.eval_(test_data, test_label)
            # self.ROI_eval(data_loader_eval, {"epoch": epoch})

            start_time = end_time
        print('finish')

    def eval_(self, data, label):
        """Report micro-averaged precision/specificity/recall of self.ROI
        on the given feature/label arrays.

        Evaluates in fixed chunks of 240 samples (any remainder is dropped).
        NOTE(review): switches self.ROI to eval mode and never restores
        train mode — confirm the caller re-enables training if needed.
        """
        self.ROI = self.ROI.eval()
        gt = []
        pre = []
        total = int(len(label) / 240)
        with torch.no_grad():
            for i in range(total):
                a = i * 240
                b = a + 240
                sin_x = torch.Tensor(data[a:b]).cuda()
                sin_x = sin_x.view(240, 1024, 15)
                sin_y = label[a:b]
                predict = self.ROI(sin_x)
                # Take the argmax class per sample.
                predict, index = torch.max(predict, 1)
                pre.extend(index.cpu().tolist())
                gt.extend(sin_y)
        print("ppv:{}".format(metrics.precision_score(gt, pre, average='micro')))
        print("spe:{}".format(specificity_score(gt, pre, average='micro')))
        print("sen:{}".format(metrics.recall_score(gt, pre, average='micro')))


    def base_process_2(self, x, y, peak):
        """Run the frozen base network plus ROI pooling and return flattened
        per-proposal features (no gradients).

        Returns a dict with 'x' (N x 15360 features), 'label', 'class_num'
        and placeholder losses when self.flag is 2 or 3; implicitly returns
        None for any other flag value. NOTE(review): ``proposal`` is only
        assigned in the ``self.flag == 3`` branch — confirm flag is 3
        whenever this is called.
        """
        # Placeholder losses (the base losses are not recomputed here).
        cross_entropy, loss_box = torch.ones(1), torch.ones(1)
        with torch.no_grad():
            x1, x2, x3, x4 = self.features(x)
            if self.flag == 3:
                predict_confidence, box_predict = self.RPN(x1, x2, x3, x4)
                proposal, batch_offset, batch_conf = self.tool.get_proposal(predict_confidence, box_predict,
                                                                            y, test=True)
                # save_proposal = [i.cpu().numpy() for i in proposal]
                # save_data = x.cpu().numpy()
                # save_y = [i.numpy() for i in y]
                # self.save_dict['data'].append(save_data)
                # self.save_dict['label'].append(save_y)
                # self.save_dict['predict'].append(save_proposal)

            proposal, label = self.tool2.pre_gt_match_uniform(proposal, y, training=True, params={'peak': peak})

            if 1:
                # Prepend each proposal row with its batch index, then
                # flatten the per-image proposal lists into one tensor.
                for i in range(len(proposal)):
                    tmp = torch.zeros(proposal[i].size()[0], 1).fill_(
                        i).cuda()
                    proposal[i] = torch.cat([tmp, proposal[i]], 1)
                proposal = torch.cat(proposal, 0)

            # ROI-pool each feature map at its stride.
            # NOTE(review): x3 is pooled with stride=64 (same as x4) while
            # x2/x1 use 32/16 — confirm this is not a typo.
            feat4, label, class_num = self.tool2.roi_pooling_cuda(x4, proposal, label=label, stride=64,
                                                                  pool=self.pool4,
                                                                  batch=True)
            feat3 = \
                self.tool2.roi_pooling_cuda(x3, proposal, stride=64, pool=self.pool3,
                                            batch=True, label=None)[
                    0]
            feat2 = \
                self.tool2.roi_pooling_cuda(x2, proposal, stride=32,
                                            pool=self.pool2,
                                            batch=True, label=None)[0]
            feat1 = \
                self.tool2.roi_pooling_cuda(x1, proposal, stride=16,
                                            pool=self.pool1,
                                            batch=True, label=None, )[0]

            x = self.pre(feat1, feat2, feat3, feat4)
            x = x.view(-1, 1024 * 15)
            if self.flag == 2:
                result = {}
                result['x'] = x
                result['label'] = label
                result['predict_offset'] = 0
                result['class_num'] = class_num
                result['batch_cor_weight'] = 0
                result['cross_entropy'] = cross_entropy
                result['loss_box'] = loss_box
                return result
            elif self.flag == 3:
                result = {}
                result['x'] = x
                result['label'] = label
                result['class_num'] = class_num
                result['cross_entropy'] = cross_entropy
                result['loss_box'] = loss_box

                return result
Exemple #16
0
    def run_once(self,
                 opt,
                 run_engine_opt,
                 log_dir,
                 prev_log_dir=None,
                 fold_idx=0):
        """Simply run the defined run_step of the related method once.

        Builds dataloaders, instantiates networks (optionally restoring
        pretrained weights), wires RunEngine callbacks together, and drives
        the "train" engine for ``opt["nr_epochs"]`` epochs.

        Args:
            opt: experiment description dict (``batch_size``, ``run_info``,
                ``nr_epochs``, ``target_info``, ...).
            run_engine_opt: per-engine config (``nr_procs``, ``run_step``,
                ``callbacks``); must contain a "train" entry.
            log_dir: output directory; wiped and recreated when
                ``self.logging`` is set.
            prev_log_dir: previous phase's directory, used to resolve the
                ``pretrained == -1`` sentinel.
            fold_idx: cross-validation fold index forwarded to the datagen.
        """
        check_manual_seed(self.seed)

        # Logging side-channel shared by all engines; empty when disabled.
        log_info = {}
        if self.logging:
            # check_log_dir(log_dir)
            rm_n_mkdir(log_dir)

            tfwriter = SummaryWriter(log_dir=log_dir)
            json_log_file = log_dir + "/stats.json"
            with open(json_log_file, "w") as json_file:
                json.dump({}, json_file)  # create empty file
            log_info = {
                "json_file": json_log_file,
                "tfwriter": tfwriter,
            }

        ####
        # One dataloader per engine (train / valid / ...), each with its own
        # batch size and worker count.
        loader_dict = {}
        for runner_name, runner_opt in run_engine_opt.items():
            loader_dict[runner_name] = self._get_datagen(
                opt["batch_size"][runner_name],
                runner_name,
                opt["target_info"]["gen"],
                nr_procs=runner_opt["nr_procs"],
                fold_idx=fold_idx,
            )
        ####
        def get_last_chkpt_path(prev_phase_dir, net_name):
            # Resolve the newest checkpoint of the previous phase from its
            # stats.json (keys are epoch numbers).
            stat_file_path = prev_phase_dir + "/stats.json"
            with open(stat_file_path) as stat_file:
                info = json.load(stat_file)
            epoch_list = [int(v) for v in info.keys()]
            last_chkpts_path = "%s/%s_epoch=%d.tar" % (
                prev_phase_dir,
                net_name,
                max(epoch_list),
            )
            return last_chkpts_path

        # TODO: adding way to load pretrained weight or resume the training
        # parsing the network and optimizer information
        net_run_info = {}
        net_info_opt = opt["run_info"]
        for net_name, net_info in net_info_opt.items():
            assert inspect.isclass(net_info["desc"]) or inspect.isfunction(
                net_info["desc"]
            ), "`desc` must be a Class or Function which instantiate NEW objects !!!"
            net_desc = net_info["desc"]()

            # TODO: customize print-out for each run ?
            # summary_string(net_desc, (3, 270, 270), device='cpu')

            pretrained_path = net_info["pretrained"]
            if pretrained_path is not None:
                if pretrained_path == -1:
                    # -1 is a sentinel: resume from the last checkpoint of
                    # the previous phase directory.
                    # * depend on logging format so may be broken if logging format has been changed
                    pretrained_path = get_last_chkpt_path(
                        prev_log_dir, net_name)
                    net_state_dict = torch.load(pretrained_path)["desc"]
                else:
                    # NOTE(review): net_state_dict stays unbound if the
                    # extension is neither "npz" nor "tar" — confirm inputs
                    # are restricted to these two formats.
                    chkpt_ext = os.path.basename(pretrained_path).split(
                        ".")[-1]
                    if chkpt_ext == "npz":
                        net_state_dict = dict(np.load(pretrained_path))
                        net_state_dict = {
                            k: torch.from_numpy(v)
                            for k, v in net_state_dict.items()
                        }
                    elif chkpt_ext == "tar":  # ! assume same saving format we desire
                        net_state_dict = torch.load(pretrained_path)["desc"]

                colored_word = colored(net_name, color="red", attrs=["bold"])
                print("Model `%s` pretrained path: %s" %
                      (colored_word, pretrained_path))

                # load_state_dict returns (missing keys, unexpected keys)
                net_state_dict = convert_pytorch_checkpoint(net_state_dict)
                load_feedback = net_desc.load_state_dict(net_state_dict,
                                                         strict=False)
                # * uncomment for your convenience
                print("Missing Variables: \n", load_feedback[0])
                print("Detected Unknown Variables: \n", load_feedback[1])

            # * extremely slow to pass this on DGX with 1 GPU, why (?)
            net_desc = DataParallel(net_desc)
            net_desc = net_desc.to("cuda")
            # print(net_desc) # * dump network definition or not?
            optimizer, optimizer_args = net_info["optimizer"]
            optimizer = optimizer(net_desc.parameters(), **optimizer_args)
            # TODO: expand for external aug for scheduler
            # NOTE(review): nr_iter is computed but never used — confirm the
            # scheduler was meant to receive it.
            nr_iter = opt["nr_epochs"] * len(loader_dict["train"])
            scheduler = net_info["lr_scheduler"](optimizer)
            net_run_info[net_name] = {
                "desc": net_desc,
                "optimizer": optimizer,
                "lr_scheduler": scheduler,
                # TODO: standardize API for external hooks
                "extra_info": net_info["extra_info"],
            }

        # parsing the running engine configuration
        assert ("train" in run_engine_opt
                ), "No engine for training detected in description file"

        # initialize runner and attach callback afterward
        # * all engine shared the same network info declaration
        runner_dict = {}
        for runner_name, runner_opt in run_engine_opt.items():
            runner_dict[runner_name] = RunEngine(
                dataloader=loader_dict[runner_name],
                engine_name=runner_name,
                run_step=runner_opt["run_step"],
                run_info=net_run_info,
                log_info=log_info,
            )

        # Attach callbacks; a callback may itself trigger another engine
        # (e.g. train triggering a validation pass).
        for runner_name, runner in runner_dict.items():
            callback_info = run_engine_opt[runner_name]["callbacks"]
            for event, callback_list, in callback_info.items():
                for callback in callback_list:
                    if callback.engine_trigger:
                        triggered_runner_name = callback.triggered_engine_name
                        callback.triggered_engine = runner_dict[
                            triggered_runner_name]
                    runner.add_event_handler(event, callback)

        # retrieve main runner
        main_runner = runner_dict["train"]
        main_runner.state.logging = self.logging
        main_runner.state.log_dir = log_dir
        # start the run loop
        main_runner.run(opt["nr_epochs"])

        print("\n")
        print("########################################################")
        print("########################################################")
        print("\n")
        return
Exemple #17
0
def main():
    """Entry point: parse CLI args, build the detector net, then either run
    test-time inference (``--test 1``) or the train/validate loop.

    Relies on module-level helpers defined elsewhere in this file
    (``parser``, ``Logger``, ``setgpu``, ``config_training``, ``data``,
    ``train``, ``validate``, ``test``) and mutates the global ``args``.
    """
    global args
    args = parser.parse_args()

    # Fixed seed and device for reproducibility.
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()

    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        # Resume: restore weights and, unless overridden on the command
        # line, the epoch counter and output directory of the checkpoint.
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if save_dir:
            save_dir = os.path.join('results', save_dir)
        else:
            save_dir = checkpoint['save_dir']
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if save_dir:
            save_dir = os.path.join('results', save_dir)
        else:
            # Default run directory: results/<model>-<timestamp>.
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')

    if args.test != 1:
        # Tee stdout into the log file and snapshot every *.py file next to
        # the results for reproducibility.
        sys.stdout = Logger(logfile)
        for pyfile in [f for f in os.listdir('./') if f.endswith('.py')]:
            shutil.copy(pyfile, os.path.join(save_dir, pyfile))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']

    if args.test == 1:
        # Inference over sliding-window splits of the full volumes.
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin,
                                 config['pad_value'])
        test_loader = DataLoader(
            data.DataBowl3Detector(datadir,
                                   'full.npy',
                                   config,
                                   phase='test',
                                   split_comber=split_comber),
            batch_size=1,
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)
        test(test_loader, net, get_pbb, save_dir, config)
        return

    #net = DataParallel(net)

    train_loader = DataLoader(
        data.DataBowl3Detector(datadir, 'kaggleluna_full.npy', config,
                               phase='train'),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    val_loader = DataLoader(
        data.DataBowl3Detector(datadir, 'valsplit.npy', config,
                               phase='val'),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        """Step schedule: base LR to 50% of epochs, /10 to 80%, /100 after."""
        if epoch <= 0.5 * args.epochs:
            return args.lr
        if epoch <= 0.8 * args.epochs:
            return 0.1 * args.lr
        return 0.01 * args.lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        validate(val_loader, net, loss)
Exemple #18
0
def main():
    """Train EmbeddingNet on the 2017 video dataset, validating and
    checkpointing every 10 epochs.

    Python 2 script (see the bare ``print`` statement in the save branch).
    NOTE(review): the DCGAN setup after the training loop looks like an
    unrelated snippet pasted into this function — it only executes once the
    1000-epoch loop above completes; confirm it belongs here.
    """
    torch.manual_seed(0)
    # torch.cuda.set_device(1)
    setgpu("all")

    epochs = 1000

    def getlr(epoch, epochs):
        # Step schedule around a base LR of 0.01; the first branch is a
        # deliberate no-op that keeps the base value.
        lr = 0.01
        if epoch <= epochs * 0.5:
            lr = lr
        elif epoch <= epochs * 0.8:
            lr = 0.1 * lr
        else:
            lr = 0.01 * lr
        return lr

    # Hard-coded data/output locations for this machine.
    datadir = "/home/user/disk2/video/2017/"
    savedir = "/home/user/disk2/video/saveV2/"
    logfile = os.path.join(savedir, 'log.txt')
    logfileVal = os.path.join(savedir, 'logVal.txt')

    if not os.path.exists(savedir):
        os.makedirs(savedir)
    dataset = dataLoader.DataSet(datadir)
    datasetVal = dataLoader.DataSetVal(datadir)

    net = nets.EmbeddingNet()
    # checkpoint = torch.load(savedir+"428.ckpt")
    # net.load_state_dict(checkpoint)

    net = DataParallel(net)
    net = net.cuda()
    loss = nets.Loss()
    loss = loss.cuda()
    trainLoader = DataLoader(dataset,
                             batch_size=48,
                             shuffle=True,
                             num_workers=12,
                             pin_memory=True)
    valLoader = DataLoader(datasetVal,
                           batch_size=6,
                           shuffle=True,
                           num_workers=18,
                           pin_memory=True)

    cudnn.benchmark = True

    lr = 0.01
    optimizer = torch.optim.SGD(net.parameters(),
                                lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    for epoch in range(epochs):
        train(trainLoader, net, loss, epoch, optimizer, getlr, savedir,
              logfile, epochs)
        if epoch % 10 == 0:
            val(valLoader, net, loss, epoch, getlr, savedir, logfileVal,
                epochs)
            # Save CPU-side weights of the unwrapped module so the
            # checkpoint loads without DataParallel/GPU.
            state_dict = net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save(state_dict, os.path.join(savedir, '%03d.ckpt' % epoch))
            print "save " + str(epoch)
        # NOTE(review): `file` shadows the Python 2 builtin, and the write
        # appends no newline, so log entries run together.
        file = open(logfile, "a")
        file.write("save " + str(epoch))
        file.close()
    transforms = Compose([
        Resize(config.IMAGE_SIZE),
        CenterCrop(config.IMAGE_SIZE),
        ToTensor(),
        Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    dataset = ImageFolder("../ganData/face/", transform=transforms)
    dataLoader = DataLoader(dataset=dataset,
                            batch_size=config.BATCH_SIZE,
                            shuffle=True,
                            num_workers=config.NUM_WORKERS_LOAD_IMAGE,
                            drop_last=True)
    netG, netD = DataParallel(GeneratorNet()), DataParallel(DiscriminatorNet())
    # Map checkpoint storages to CPU when loading.
    map_location = lambda storage, loc: storage

    optimizer_generator = Adam(netG.parameters(),
                               config.LR_GENERATOR,
                               betas=(config.BETA1, 0.999))
    optimizer_discriminator = Adam(netD.parameters(),
                                   config.LR_DISCRIMINATOR,
                                   betas=(config.BETA1, 0.999))

    criterion = BCELoss()

    true_labels = Variable(t.ones(config.BATCH_SIZE))
    fake_labels = Variable(t.zeros(config.BATCH_SIZE))
    fix_noises = Variable(t.randn(config.BATCH_SIZE, config.NOISE_Z, 1, 1))
    noises = Variable(t.randn(config.BATCH_SIZE, config.NOISE_Z, 1, 1))

    # errord_meter = AverageValueMeter()
    # errorg_meter = AverageValueMeter()
Exemple #20
0
def main():
    """Entry point for LUNA-style detector training/testing.

    Python 2 script (bare ``print`` statements below). Builds .mhd file
    lists for train/val/test from the configured data paths, then runs
    either test-time inference (``--test 1``) or the train/validate loop.
    """
    global args
    args = parser.parse_args()
    config_training = import_module(args.config)
    config_training = config_training.config
    # from config_training import config as config_training
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        # Resume restores weights only; epoch/save_dir handling from the
        # checkpoint is intentionally disabled here.
        checkpoint = torch.load(args.resume)
        # if start_epoch == 0:
        #     start_epoch = checkpoint['epoch'] + 1
        # if not save_dir:
        #     save_dir = checkpoint['save_dir']
        # else:
        #     save_dir = os.path.join('results',save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    # else:
    if start_epoch == 0:
        start_epoch = 1
    if not save_dir:
        # Default run directory: results/<model>-<timestamp>.
        exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
        save_dir = os.path.join('results', args.model + '-' + exp_id)
    else:
        save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        # Tee stdout to the log file and snapshot all *.py files for
        # reproducibility.
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = False  # True
    net = DataParallel(net)
    traindatadir = config_training['train_preprocess_result_path']
    valdatadir = config_training['val_preprocess_result_path']
    testdatadir = config_training['test_preprocess_result_path']
    # Collect .mhd series IDs per split, keyed as "<parent-dir>/<basename>",
    # skipping anything on the black list.
    trainfilelist = []
    print config_training['train_data_path']
    for folder in config_training['train_data_path']:
        print folder
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                trainfilelist.append(folder.split('/')[-2]+'/'+f[:-4])
    valfilelist = []
    for folder in config_training['val_data_path']:
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                valfilelist.append(folder.split('/')[-2]+'/'+f[:-4])
    testfilelist = []
    for folder in config_training['test_data_path']:
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                testfilelist.append(folder.split('/')[-2]+'/'+f[:-4])

    if args.test == 1:
        margin = 32
        sidelen = 144
        import data
        split_comber = SplitComb(
            sidelen, config['max_stride'], config['stride'], margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            testdatadir,
            testfilelist,
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)

        # NOTE(review): the loop variable `data` shadows the imported `data`
        # module for the rest of this branch.
        for i, (data, target, coord, nzhw) in enumerate(test_loader):  # check data consistency
            if i >= len(testfilelist)/args.batch_size:
                break

        test(test_loader, net, get_pbb, save_dir, config)
        return
    #net = DataParallel(net)
    import data
    print len(trainfilelist)
    dataset = data.DataBowl3Detector(
        traindatadir,
        trainfilelist,
        config,
        phase='train')
    train_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    dataset = data.DataBowl3Detector(
        valdatadir,
        valfilelist,
        config,
        phase='val')
    val_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Warm-up passes over both loaders before training; the loop variable
    # `data` shadows the imported module from here on.
    for i, (data, target, coord) in enumerate(train_loader):  # check data consistency
        if i >= len(trainfilelist)/args.batch_size:
            break

    for i, (data, target, coord) in enumerate(val_loader):  # check data consistency
        if i >= len(valfilelist)/args.batch_size:
            break

    optimizer = torch.optim.SGD(
        net.parameters(),
        args.lr,
        momentum=0.9,
        weight_decay=args.weight_decay)

    def get_lr(epoch):
        # Tiered schedule: full LR for the first third, 1/10 to two thirds,
        # 1/20 to 80%, then 1/100.
        # NOTE(review): under Python 2, `args.epochs * 1/3` is
        # (epochs*1)/3 with integer (floor) division when epochs is an
        # int — confirm the thresholds are as intended.
        if epoch <= args.epochs * 1/3:  # 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 2/3:  # 0.8:
            lr = 0.1 * args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.05 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    # Runs exactly args.epochs epochs starting at start_epoch (unlike the
    # range(start_epoch, args.epochs + 1) variant elsewhere in this file).
    for epoch in range(start_epoch, start_epoch + args.epochs):
        train(train_loader, net, loss, epoch, optimizer,
              get_lr, args.save_freq, save_dir)
        validate(val_loader, net, loss)
Exemple #21
0
def main():
    """Train a GPT-2 language model on pre-tokenized data.

    Relies on module-level configuration globals (``raw``, ``device``,
    ``num_pieces``, ``tokenized_data_path``, ``n_ctx``, ``stride``,
    ``epochs``, ``batch_size``, ``gradient_accumulation``, ``lr``,
    ``warmup_steps``, ``fp16``, ``fp16_opt_level``, ``max_grad_norm``,
    ``log_step``, ``output_dir``, ``model_config``, ``raw_data_path``).
    Saves a checkpoint directory after every epoch and a final model at
    the end.
    """
    if raw:
        print('building files')
        build_files(data_path=raw_data_path)
        print('files built')

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    model.to(device)
    multi_gpu = False
    full_line = ''
    print('calculating total steps')
    # Concatenate every tokenized shard into one long token stream.
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_line += f.read()
    full_line = full_line.strip()
    full_line = [int(item) for item in full_line.split()]
    len_full_line = len(full_line)
    # Cut the stream into overlapping windows of n_ctx tokens, stride apart.
    samples = []
    start_point = 0
    while start_point + n_ctx < len_full_line:
        samples.append(full_line[start_point:start_point + n_ctx])
        start_point += stride
    total_steps = int(
        len(samples) * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        running_loss = 0
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):

            #  prepare data: for LM training the inputs are their own labels
            batch = samples[step * batch_size:(step + 1) * batch_size]
            batch_labels = []
            batch_inputs = []
            for ids in batch:
                int_ids_for_labels = [int(x) for x in ids]
                int_ids_for_inputs = [int(x) for x in ids]
                batch_labels.append(int_ids_for_labels)
                batch_inputs.append(int_ids_for_inputs)
            batch_labels = torch.tensor(batch_labels).long().to(device)
            batch_inputs = torch.tensor(batch_inputs).long().to(device)

            #  forward pass (call the module, not .forward, so hooks run)
            outputs = model(input_ids=batch_inputs, labels=batch_labels)
            loss, logits = outputs[:2]

            #  get loss
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            #  loss backward
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)

            #  optimizer step; optimizer.step() must precede scheduler.step()
            #  (PyTorch >= 1.1 contract), otherwise the first value of the
            #  warmup schedule is skipped.
            if (step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            if (step + 1) % log_step == 0:
                # NOTE(review): the gradient_accumulation**2 factor rescales
                # the accumulated (already divided) losses to a per-batch
                # average over log_step batches -- assumes log_step is a
                # multiple of gradient_accumulation; confirm.
                print('step {} of epoch {}, loss {}'.format(
                    (step + 1) // gradient_accumulation, epoch + 1,
                    running_loss * gradient_accumulation**2 / log_step))
                running_loss = 0

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        # Unwrap DataParallel before saving so the checkpoint has clean keys.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Exemple #22
0
def run():
    """Fine-tune a GPT-2 dialogue model.

    Loads the raw corpus, splits it 80/20 into train/test sets, trains for
    ``config.EPOCHS`` epochs with a linear-warmup AdamW schedule, and saves
    the model whenever validation loss or accuracy improves. Relies on
    module-level ``config`` and ``logger``.
    """
    logger.info("using device: {}".format(config.DEVICE))
    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # Load the GPT-2 model
    model, n_ctx = create_model(False)
    model.to(config.DEVICE)
    # Optionally parallelise across the GPUs listed in config.DEVICE_NUM
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPUs to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model, device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True

    # Log the number of model parameters
    num_parameters = sum(
        parameter.numel() for parameter in model.parameters())
    logger.info("number of model parameters: {}".format(num_parameters))

    # Build datasets and loaders
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)

    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)

    # Total number of optimisation steps across all epochs.
    # len(train_data_loader) is already the number of batches per epoch, so
    # it must NOT be divided by BATCH_SIZE again (the original code did,
    # which made the warmup schedule finish far too early).
    total_steps = int(
        len(train_data_loader) * config.EPOCHS /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # Optimiser with linear warmup schedule
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_loss = 100
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            # Unwrap DataParallel before saving.
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            best_loss = loss
            best_accuracy = accuracy
Exemple #23
0
def main(args):
    """Train a ResNet-50 person re-identification model on Market-1501.

    Preprocesses the train/gallery/query splits, trains for ``args.epochs``
    epochs with a step-decayed SGD schedule (backbone at 0.1x lr), and after
    every epoch evaluates rank-1 accuracy and writes a checkpoint.
    """
    # Seed all RNGs with a random (but printed, hence reproducible) seed.
    manualSeed = random.randint(1, 100000)
    print("Random Seed: ", manualSeed)
    random.seed(manualSeed)
    torch.manual_seed(manualSeed)
    torch.cuda.manual_seed_all(manualSeed)
    cudnn.benchmark = True
    #cudnn.deterministic = False
    cudnn.enabled = True

    root = ''

    train_source, num_classes = preprocess(root + 'market/bounding_box_train',
                                           relabel=True)
    gallery, _ = preprocess(root + 'market/bounding_box_test', relabel=False)
    query, _ = preprocess(root + 'market/query', relabel=False)

    marketTrain = Market('train', train_source,
                         root + 'market/bounding_box_train/', 'train',
                         args.height, args.width, 'data/pose_train.json')
    galleryds = Market('val', gallery, root + 'market/bounding_box_test/',
                       'gallery', args.height, args.width,
                       'data/pose_gallery.json')
    querds = Market('val', query, root + 'market/query/', 'query', args.height,
                    args.width, 'data/pose_query.json')

    num_epochs = args.epochs
    train_batch_size = 32  #args.batch_size
    test_batch_size = 64
    train_loader = DataLoader(marketTrain,
                              batch_size=train_batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=False)
    # Evaluation loaders use the larger test batch size; in the original code
    # test_batch_size was defined but accidentally unused (batch size only
    # affects evaluation throughput, not the computed metrics).
    query_loader = DataLoader(querds,
                              batch_size=test_batch_size,
                              shuffle=False,
                              num_workers=8,
                              pin_memory=False)
    gallery_loader = DataLoader(galleryds,
                                batch_size=test_batch_size,
                                shuffle=False,
                                num_workers=8,
                                pin_memory=False)

    reidNet = resnet50(pretrained=True, num_classes=num_classes)
    model = DataParallel(reidNet).cuda()

    # Optimizer: the pretrained backbone ('base') trains at a 10x smaller
    # learning rate than the freshly initialised layers.
    if hasattr(model.module, 'base'):
        base_param_ids = set(map(id, model.module.base.parameters()))
        new_params = [
            p for p in model.parameters() if id(p) not in base_param_ids
        ]
        param_groups = [{
            'params': model.module.base.parameters(),
            'lr_mult': 0.1
        }, {
            'params': new_params,
            'lr_mult': 1.0
        }]
        print('Learning rate is set.')
    else:
        param_groups = model.parameters()
    optimiser = torch.optim.SGD(param_groups,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True)

    # Schedule learning rate
    step_size = args.step_size

    def adjust_lr(epoch):
        # Step decay: base_lr * lr_factor^(epoch // step_size), scaled per group.
        _lr = args.lr * (args.lr_factor**(epoch // step_size))
        print(_lr)
        for g in optimiser.param_groups:
            g['lr'] = _lr * g.get('lr_mult', 1)

    # 'mean' is the modern spelling of the deprecated 'elementwise_mean'
    # (both accepted since PyTorch 0.4.1, which introduced `reduction`).
    criterion = torch.nn.CrossEntropyLoss(reduction='mean').cuda()

    start_epoch = 0  #checkpoint['epoch'] + 1

    for epoch in range(start_epoch, num_epochs):
        adjust_lr(epoch)

        print("Starting Epoch [%d]" % (epoch))

        tloss = train(train_loader, model, optimiser, criterion)

        state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimiser.state_dict(),
        }

        evaluator = Evaluator(model)
        # metrics[0] is rank-1 accuracy (renamed from 'all', which shadowed
        # the builtin of the same name).
        metrics = evaluator.evaluate(query_loader, gallery_loader, query,
                                     gallery, args.output_feature,
                                     args.rerank)

        # The context manager closes the file; the original's explicit
        # close() after the with-block was redundant.
        with open('losses/rank1.txt', 'a') as the_file:
            the_file.write(str(metrics[0] * 100) + '\n')

        model_name = 'models_epoch/reidNet_' \
                     + str(epoch) + '_' + str(metrics[0] * 100)[:5] + '.pth'
        torch.save(state, model_name)
Exemple #24
0
class ProGAN(BaseModel):
    """ Wrapper around the Generator and the Discriminator """
    def __init__(self,
                 depth=7,
                 latent_size=256,
                 num_channels=3,
                 learning_rate=1e-3,
                 beta_1=0,
                 beta_2=0.99,
                 eps=1e-8,
                 drift=0.001,
                 use_eql=True,
                 use_ema=True,
                 ema_decay=0.999,
                 checkpoint=None,
                 **kwargs):
        """
        constructor for the class ProGAN, extends BaseModel
        :param depth: depth of the GAN, 2^depth is the final size of generated images
        :param latent_size: latent size of the manifold used by the GAN
        :param num_channels: *NOT YET IMPLEMENTED* will control number of channels of in/outputs
        :param drift: drift penalty for the discriminator
                      (Used only if loss is wgan or wgan-gp)
        :param use_eql: whether to use equalized learning rate
        :param use_ema: boolean for whether to use exponential moving averages
        :param ema_decay: value of mu for ema
        :param checkpoint: generator checkpoint to load for inference
        :param learning_rate: base learning rate for Adam
        :param beta_1: beta_1 parameter for Adam
        :param beta_2: beta_2 parameter for Adam
        :param eps: epsilon parameter for Adam
        """
        super(ProGAN, self).__init__(**kwargs)

        # state of the object
        self.latent_size = latent_size
        self.num_channels = num_channels
        self.depth = depth - 1  # ensures generated images are size 2^depth
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.use_eql = use_eql
        self.drift = drift
        self.dataloader = None  # assigned later in train()

        # Create the Generator and the Discriminator
        self.G = Generator(self.depth, self.latent_size,
                           use_eql=self.use_eql).to(self.device)
        self.D = Discriminator(self.depth,
                               self.latent_size,
                               use_eql=self.use_eql).to(self.device)

        # if code is to be run on GPU, we can use DataParallel:
        if self.device == th.device("cuda"):
            self.G = DataParallel(self.G)
            self.D = DataParallel(self.D)

        # define the optimizers for the discriminator and generator
        self.default_rate = learning_rate
        self.G_optim = Adam(self.G.parameters(),
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            eps=eps)
        self.D_optim = Adam(self.D.parameters(),
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            eps=eps)

        # setup the ema for the generator
        if self.use_ema:
            # create a shadow copy of the generator
            self.G_shadow = copy.deepcopy(self.G)

            # initialize the G_shadow weights equal to the weights of G
            self.update_average(self.G_shadow, self.G, beta=0)

        # inference-only mode: load generator weights and freeze them
        if checkpoint is not None:
            self.model_names = ['G']
            self.load_networks(checkpoint)
            self.set_requires_grad(self.G, requires_grad=False)

    def setup_loss(self, loss):
        """
        resolve *loss* into a GANLoss instance
        :param loss: a GANLoss instance, or one of the string names
                     ["wgan", "wgan-gp", "lsgan", "lsgan-sig", "hinge",
                      "rel-avg", "r1-reg"]
        :return: the GANLoss instance
        :raises ValueError: for unknown names or non-GANLoss objects
        """
        if isinstance(loss, str):
            loss = loss.lower()  # lowercase the string
            if loss == "wgan":
                loss = WGAN_GP(self.device, self.D, self.drift, use_gp=False)
                # note if you use just wgan, you will have to use weight clipping
                # in order to prevent gradient exploding
            elif loss == "wgan-gp":
                loss = WGAN_GP(self.device, self.D, self.drift, use_gp=True)
            elif loss == "lsgan":
                loss = LSGAN(self.D)
            elif loss == "lsgan-sig":
                loss = LSGAN_SIGMOID(self.D)
            elif loss == "hinge":
                loss = HingeLoss(self.D)
            elif loss == "rel-avg":
                loss = RelativisticAverageHinge(self.D)
            elif loss == "r1-reg":
                loss = R1Regularized(self.device, self.D)
            else:
                raise ValueError("Unknown loss function requested")
        elif not isinstance(loss, GANLoss):
            raise ValueError(
                "loss is neither an instance of GANLoss nor a string")
        return loss

    # This function updates the exponential average weights based on the current training
    def update_average(self, model_tgt, model_src, beta):
        """
        update the target model using exponential moving averages
        :param model_tgt: target model
        :param model_src: source model
        :param beta: value of decay beta
        :return: None (updates the target model)
        """
        # turn off gradient calculation
        self.set_requires_grad(model_tgt, False)
        self.set_requires_grad(model_src, False)

        param_dict_src = dict(model_src.named_parameters())

        # in-place copy of beta-blended weights; requires_grad is disabled
        # above, so the copy_ does not record autograd history
        for p_name, p_tgt in model_tgt.named_parameters():
            p_src = param_dict_src[p_name]
            assert (p_src is not p_tgt)
            p_tgt.copy_(beta * p_tgt + (1. - beta) * p_src)

        # turn back on the gradient calculation
        self.set_requires_grad(model_tgt, True)
        self.set_requires_grad(model_src, True)

    def forward(self, real_A):
        """generate images from latent input at full depth with no fade-in"""
        return self.G(real_A, self.depth - 1, alpha=1)

    def optimize_D(self, noise, real_batch, depth, alpha):
        """
        perform n_critic discriminator updates at the given depth
        :param noise: latent input batch for the generator
        :param real_batch: batch of real images
        :param depth: current training depth (resolution level)
        :param alpha: fade-in factor blending the new resolution block
        :return: average discriminator loss over the n_critic updates
        """
        self.set_requires_grad(self.G, False)
        self.set_requires_grad(self.D, True)

        # downsample the real_batch for the given depth
        down_sample_factor = int(
            np.power(2, self.depth - depth -
                     1)) if not self.dataloader.prescaled_data else 1
        prior_downsample_factor = max(int(
            np.power(2, self.depth -
                     depth)), 0) if not self.dataloader.prescaled_data else 2

        ds_real_samples = AvgPool2d(down_sample_factor)(real_batch)

        if depth > 0:
            prior_ds_real_samples = interpolate(
                AvgPool2d(prior_downsample_factor)(real_batch), scale_factor=2)
        else:
            prior_ds_real_samples = ds_real_samples

        # real samples are a combination of ds_real_samples and prior_ds_real_samples
        real_samples = (alpha * ds_real_samples) + (
            (1 - alpha) * prior_ds_real_samples)

        loss_val = 0
        for _ in range(self.n_critic):
            # optimize discriminator
            self.D_optim.zero_grad()

            # generate a batch of samples
            fake_samples = self.G(noise, depth, alpha).detach()

            loss = self.loss.loss_D(real_samples.requires_grad_(),
                                    fake_samples.requires_grad_(),
                                    depth=depth,
                                    alpha=alpha)

            # R1Regularized performs its own backward inside loss_D
            if not isinstance(self.loss, R1Regularized):
                loss.backward()

            self.D_optim.step()

            loss_val += loss.item()

        return loss_val / self.n_critic

    def optimize_G(self, noise, real_batch, depth, alpha):
        """
        perform one generator update at the given depth
        :param noise: latent input batch for the generator
        :param real_batch: batch of real images (passed to the loss)
        :param depth: current training depth (resolution level)
        :param alpha: fade-in factor blending the new resolution block
        :return: the generator loss value
        """
        self.set_requires_grad(self.G, True)
        self.set_requires_grad(self.D, False)

        # optimize the generator
        self.G_optim.zero_grad()

        fake_samples = self.G(noise, depth, alpha)

        loss = self.loss.loss_G(real_batch,
                                fake_samples,
                                depth=depth,
                                alpha=alpha)
        loss.backward()

        self.G_optim.step()

        # if use_ema is true, apply ema to the generator parameters
        if self.use_ema:
            self.update_average(self.G_shadow, self.G, self.ema_decay)

        # return the loss value
        return loss.item()

    # NOTE(review): learning_rates_dict is a mutable default argument; it is
    # never mutated here, but it is the classic Python pitfall -- consider a
    # None default resolved inside the body.
    def train(self,
              continue_train=False,
              data_path='maua/datasets/default_progan',
              dataloader=None,
              start_epoch=1,
              start_depth=1,
              until_depth=None,
              fade_in=0.5,
              save_freq=25,
              log_freq=5,
              num_epochs=50,
              learning_rates_dict={
                  256: 5e-4,
                  512: 2.5e-4,
                  1024: 1e-4
              },
              n_critic=1,
              loss="wgan-gp"):
        """
        Training function for ProGAN object
        :param continue_train: whether to continue training or not
        :param data_path: path to folder containing images to train on
        :param dataloader: custom dataloader to use, otherwise images will only be resized to max resolution
        :param start_epoch: epoch to continue training from (defaults to most recent, if continuing training)
        :param start_depth: depth to continue training from (defaults to most recent, if continuing training)
        :param until_depth: depth to continue training until (defaults to self.depth)
        :param fade_in: fraction of epochs per depth to fade into the new resolution
        :param save_freq: frequency to save checkpoints in number of epochs
        :param log_freq: frequency to log images in number of or fraction of epochs
        :param learning_rates_dict: dictionary of learning rates per resolution (defaults to self.learning_rate)
        :param n_critic: number of times to update discriminator (Used only if loss is wgan or wgan-gp)
        :param loss: the loss function to be used. Can either be a string =>
                        ["wgan-gp", "wgan", "lsgan", "lsgan-sig", "hinge", "rel-avg", "r1-reg"]
                     or an instance of GANLoss
        """
        self.model_names = ["G", "D"]
        self.n_critic = n_critic
        self.loss = self.setup_loss(loss)

        os.makedirs(os.path.join(self.save_dir, "images"), exist_ok=True)

        # NOTE(review): this overwrites the start_epoch argument; the passed
        # value is only honoured via continue_train below -- confirm intended.
        start_epoch = epoch = 1
        total_epochs = num_epochs * self.depth
        if continue_train:
            epoch = self.get_latest_network(start_epoch,
                                            max_epoch=total_epochs)
            start_depth = start_depth if start_depth != 1 else math.ceil(
                epoch / num_epochs)
            start_epoch = epoch - math.floor(epoch / num_epochs) * num_epochs

        # create dataloader
        # NOTE(review): if dataloader is None but self.dataloader is already
        # set, `dataloader` stays None and the call below would fail --
        # presumably callers always pass one in that case; verify.
        if dataloader is None and self.dataloader is None:
            transforms = tv.transforms.Compose(
                [tn.Resize(2**(self.depth + 1)),
                 tn.ToTensor()])
            dataloader = ProGANDataLoader(data_path=data_path,
                                          transforms=transforms)
        dataloader.generate_prescaled_dataset(
            sizes=list(map(lambda x: 2**(x + 3), range(self.depth - 1))))
        self.dataloader = dataloader
        batches_dict = self.dataloader.get_batch_sizes(self)
        dataset_size = len(dataloader)
        print('# training images = %d' % dataset_size)

        # create fixed_input for logging
        fixed_input = th.randn(12, self.latent_size).to(self.device)

        print("Starting training on " + str(self.device))
        global_time = time.time()
        for depth in range(start_depth,
                           self.depth if until_depth is None else until_depth):
            current_res = 2**(depth + 2)
            print("Current resolution: %d x %d" % (current_res, current_res))

            # update batch size and learning rate for scale
            dataloader.set_batch_size(current_res, batches_dict[current_res])
            total_batches = dataloader.batches()
            learning_rate = learning_rates_dict.get(current_res,
                                                    self.default_rate)
            # NOTE(review): assigning to optimizer.lr sets a plain attribute;
            # Adam reads lr from param_groups, so this line likely has no
            # effect on the actual learning rate -- confirm intended.
            self.D_optim.lr = self.G_optim.lr = learning_rate

            for e in range(start_epoch if depth == start_depth else 1,
                           num_epochs + 1):
                start = time.time()

                # calculate the value of alpha for fade-in effect
                alpha = min(e / (num_epochs * fade_in), 1)
                if log_freq < 1:
                    print("Start of epoch: %s / %s \t Fade in: %s" %
                          (epoch, total_epochs, alpha))

                loss_D, loss_G = 0, 0
                for i, batch in enumerate(dataloader, 1):
                    images = batch.to(self.device)
                    noise = th.randn(images.shape[0],
                                     self.latent_size).to(self.device)

                    loss_D += self.optimize_D(noise, images, depth, alpha)
                    loss_G += self.optimize_G(noise, images, depth, alpha)

                    # sub-epoch logging path (log_freq given as a fraction)
                    if i % math.ceil(total_batches * log_freq) == 0 and not (
                            i == 0 or i == total_batches):
                        elapsed = str(
                            datetime.timedelta(seconds=time.time() -
                                               global_time))
                        print(
                            "Elapsed: [%s] Batch: %d / %d d_loss: %f  g_loss: %f"
                            % (elapsed, i, total_batches,
                               loss_D / math.ceil(total_batches * log_freq),
                               loss_G / math.ceil(total_batches * log_freq)))
                        loss_D, loss_G = 0, 0

                        # create a grid of samples and save it
                        gen_img_file = os.path.join(
                            self.save_dir, "images", "sample_res%d_e%d_b%d" %
                            (current_res, epoch, i) + ".png")
                        with th.no_grad():
                            self.create_grid(
                                samples=self.G(fixed_input, depth, alpha),
                                scale_factor=int(
                                    np.power(2, self.depth - depth - 2)),
                                img_file=gen_img_file,
                            )

                if log_freq < 1:
                    print("End of epoch:", epoch, "Took: ",
                          time.time() - start, "sec")

                # whole-epoch logging path (log_freq given in epochs)
                if log_freq >= 1 and epoch % log_freq == 0 or epoch == total_epochs:
                    elapsed = str(
                        datetime.timedelta(seconds=time.time() - global_time))
                    print(
                        "Elapsed: [%s] Epoch: %d / %d Fade in: %.02f d_loss: %f  g_loss: %f"
                        % (elapsed, epoch, num_epochs *
                           (self.depth - 1), alpha, loss_D, loss_G))
                    # create a grid of samples and save it
                    gen_img_file = os.path.join(
                        self.save_dir, "images",
                        "sample_res%d_e%d" % (current_res, epoch) + ".png")
                    with th.no_grad():
                        self.create_grid(
                            samples=self.G(fixed_input, depth, alpha),
                            scale_factor=int(
                                np.power(2, self.depth - depth) / 4),
                            img_file=gen_img_file,
                        )

                if epoch % save_freq == 0 or epoch == total_epochs:
                    self.save_networks(epoch)

                epoch += 1

        print("Training finished, took: ",
              datetime.timedelta(seconds=time.time() - global_time))
        self.save_networks("final")

    # used to create grid of training images for logging
    def create_grid(self, samples, scale_factor, img_file, real_imgs=False):
        """clamp samples to [0, 1], optionally upscale, and save as an image grid"""
        samples = th.clamp(samples, min=0, max=1)
        if scale_factor > 1 and not real_imgs:
            samples = interpolate(samples, scale_factor=scale_factor)
        save_image(samples, img_file, nrow=int(np.sqrt(len(samples)) + 1))
Exemple #25
0
def get_model(dev, z_dim, nc):
    """Build a double-precision, DataParallel-wrapped VAE and its Adam optimizer.

    :param dev: target device for the model
    :param z_dim: latent dimensionality of the VAE
    :param nc: number of input channels
    :return: (model, optimizer) tuple
    """
    model = DataParallel(VAE(dev=dev, z_dim=z_dim, nc=nc)).to(dev).double()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    return model, optimizer
Exemple #26
0
def train_Ours(args, train_loader, val_loader, knownclass, Encoder, Decoder,
               NorClsfier, SSDClsfier, summary_writer, saver):
    """Adversarially train the open-set model.

    Jointly optimises Encoder/Decoder/NorClsfier/SSDClsfier on adversarial
    examples (PGD or FGSM, selected by ``args.adv``) with a weighted sum of
    classification, self-supervised rotation, and reconstruction losses.
    Periodically validates on adversarial examples, logs to TensorBoard via
    ``summary_writer``, checkpoints every ``args.model_save_epoch`` epochs,
    and saves final weights at the end.
    """
    init_random_seed(args.manual_seed)

    criterionCls = nn.CrossEntropyLoss()
    criterionRec = nn.MSELoss()

    if args.parallel_train:
        Encoder = DataParallel(Encoder)
        Decoder = DataParallel(Decoder)
        NorClsfier = DataParallel(NorClsfier)
        SSDClsfier = DataParallel(SSDClsfier)

    optimizer = optim.Adam(
        list(Encoder.parameters()) + list(NorClsfier.parameters()) +
        list(SSDClsfier.parameters()) + list(Decoder.parameters()),
        lr=args.lr)

    # String comparison must use '==', not 'is': identity comparison with a
    # literal is implementation-dependent (the original used 'is').
    if args.adv == 'PGDattack':
        print("**********Defense PGD Attack**********")
    elif args.adv == 'FGSMattack':
        print("**********Defense FGSM Attack**********")

    if args.adv == 'PGDattack':
        from advertorch.attacks import PGDAttack
        nor_adversary = PGDAttack(predict1=Encoder,
                                  predict2=NorClsfier,
                                  nb_iter=args.adv_iter)
        rot_adversary = PGDAttack(predict1=Encoder,
                                  predict2=SSDClsfier,
                                  nb_iter=args.adv_iter)

    elif args.adv == 'FGSMattack':
        from advertorch.attacks import GradientSignAttack
        nor_adversary = GradientSignAttack(predict1=Encoder,
                                           predict2=NorClsfier)
        rot_adversary = GradientSignAttack(predict1=Encoder,
                                           predict2=SSDClsfier)

    # Compute the checkpoint directory up front so the final saves below work
    # even when n_epoch is not a multiple of model_save_epoch (previously
    # model_save_path was only defined inside the periodic-save branch,
    # causing a NameError at the final save in that case).
    model_save_path = os.path.join(args.results_path, args.training_type,
                                   'snapshots',
                                   args.datasetname + '-' + args.split,
                                   args.denoisemean,
                                   args.adv + str(args.adv_iter))
    mkdir(model_save_path)

    # ----------
    #  Training
    # ----------
    for epoch in range(args.n_epoch):

        Encoder.train()
        Decoder.train()
        NorClsfier.train()
        SSDClsfier.train()

        for steps, (orig, label, rot_orig,
                    rot_label) in enumerate(train_loader):

            label = lab_conv(knownclass, label)
            orig, label = orig.cuda(), label.long().cuda()

            rot_orig, rot_label = rot_orig.cuda(), rot_label.long().cuda()

            # Craft adversarial examples without tracking gradients in the
            # attacked networks.
            with ctx_noparamgrad_and_eval(Encoder):
                with ctx_noparamgrad_and_eval(NorClsfier):
                    with ctx_noparamgrad_and_eval(SSDClsfier):
                        adv = nor_adversary.perturb(orig, label)
                        rot_adv = rot_adversary.perturb(rot_orig, rot_label)

            latent_feat = Encoder(adv)
            norpred = NorClsfier(latent_feat)
            norlossCls = criterionCls(norpred, label)

            recon = Decoder(latent_feat)
            lossRec = criterionRec(recon, orig)

            ssdpred = SSDClsfier(Encoder(rot_adv))
            rotlossCls = criterionCls(ssdpred, rot_label)

            # Weighted sum of classification, self-supervised rotation, and
            # reconstruction losses.
            loss = args.norClsWgt * norlossCls + args.rotClsWgt * rotlossCls + args.RecWgt * lossRec

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #============ print the log info ============#
            if (steps + 1) % args.log_step == 0:
                errors = OrderedDict([
                    ('loss', loss.item()),
                    ('norlossCls', norlossCls.item()),
                    ('lossRec', lossRec.item()),
                    ('rotlossCls', rotlossCls.item()),
                ])

                saver.print_current_errors((epoch + 1), (steps + 1), errors)

        # evaluate performance on validation set periodically
        if ((epoch + 1) % args.val_epoch == 0):

            # switch model to evaluation mode
            Encoder.eval()
            NorClsfier.eval()

            running_corrects = 0.0
            epoch_size = 0.0
            val_loss_list = []

            # calculate accuracy on adversarial validation examples
            for steps, (images, label) in enumerate(val_loader):

                label = lab_conv(knownclass, label)
                images, label = images.cuda(), label.long().cuda()

                adv = nor_adversary.perturb(images, label)

                with torch.no_grad():
                    logits = NorClsfier(Encoder(adv))
                    _, preds = torch.max(logits, 1)
                    running_corrects += torch.sum(preds == label.data)
                    epoch_size += images.size(0)

                    val_loss = criterionCls(logits, label)

                    val_loss_list.append(val_loss.item())

            val_loss_mean = sum(val_loss_list) / len(val_loss_list)

            val_acc = running_corrects.double() / epoch_size
            print('Val Acc: {:.4f}, Val Loss: {:.4f}'.format(
                val_acc, val_loss_mean))

            # Log the mean validation loss (the original logged only the
            # last batch's loss, inconsistent with the printout above).
            valinfo = {
                'Val Acc': val_acc.item(),
                'Val Loss': val_loss_mean,
            }
            for tag, value in valinfo.items():
                summary_writer.add_scalar(tag, value, (epoch + 1))

            orig_show = vutils.make_grid(orig, normalize=True, scale_each=True)
            recon_show = vutils.make_grid(recon,
                                          normalize=True,
                                          scale_each=True)

            summary_writer.add_image('Ori_Image', orig_show, (epoch + 1))
            summary_writer.add_image('Rec_Image', recon_show, (epoch + 1))

        if ((epoch + 1) % args.model_save_epoch == 0):
            torch.save(
                Encoder.state_dict(),
                os.path.join(model_save_path,
                             "Encoder-{}.pt".format(epoch + 1)))
            torch.save(
                NorClsfier.state_dict(),
                os.path.join(model_save_path,
                             "NorClsfier-{}.pt".format(epoch + 1)))
            torch.save(
                Decoder.state_dict(),
                os.path.join(model_save_path,
                             "Decoder-{}.pt".format(epoch + 1)))

    torch.save(Encoder.state_dict(),
               os.path.join(model_save_path, "Encoder-final.pt"))
    torch.save(NorClsfier.state_dict(),
               os.path.join(model_save_path, "NorClsfier-final.pt"))
    torch.save(Decoder.state_dict(),
               os.path.join(model_save_path, "Decoder-final.pt"))
def train(args):
    """Train a segmentation model on the CCF dataset.

    Sets up train/val data loaders, visdom windows for live loss/accuracy
    curves, builds (or restores) the model, and runs the SGD training loop,
    saving a snapshot after every epoch.

    Args:
        args: parsed command-line namespace; uses arch, traindir, split,
            img_rows/img_cols, batch_size, gpu, snapshot, l_rate, step,
            n_epoch and clsloss_weight.
    """
    # Setup TrainDataLoader
    trainloader = CCFLoader(args.traindir, split=args.split, is_transform=True,
                            img_size=(args.img_rows, args.img_cols))
    n_classes = trainloader.n_classes
    TrainDataLoader = data.DataLoader(trainloader, batch_size=args.batch_size,
                                      num_workers=8, shuffle=True)

    # Setup for validate
    valloader = CCFLoader(args.traindir, split='val', is_transform=True,
                          img_size=(args.img_rows, args.img_cols))
    VALDataLoader = data.DataLoader(valloader, batch_size=4, num_workers=4,
                                    shuffle=False)

    # Setup visdom for visualization
    vis = visdom.Visdom()
    assert vis.check_connection()

    loss_window = vis.line(X=np.zeros((1,)),
                           Y=np.zeros((1)),
                           opts=dict(xlabel='minibatches',
                                     ylabel='Loss',
                                     title=args.arch + ' Training Loss',
                                     legend=['Loss']))
    valacc_window = vis.line(X=np.zeros((1,)),
                             Y=np.zeros((1)),
                             opts=dict(xlabel='minibatches',
                                       ylabel='ACC',
                                       title='Val ACC',
                                       legend=['ACC']))

    # Setup Model: fresh, or restored from a snapshot whose basename encodes
    # the epoch to resume from (e.g. "snapshot/arch/12.pkl" -> epoch 12).
    start_epoch = 0
    if args.snapshot is None:  # identity check, not "== None"
        model = get_model(args.arch, n_classes)
        model = DataParallel(model.cuda(args.gpu[0]), device_ids=args.gpu)
    else:
        model = get_model(args.arch, n_classes)
        state_dict = torch.load(args.snapshot).state_dict()
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # strip the "module." prefix added by DataParallel
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        model = DataParallel(model.cuda(), device_ids=[i for i in range(len(args.gpu))])
        start_epoch = int(os.path.basename(args.snapshot).split('.')[0])

    optimizer = torch.optim.SGD(model.parameters(), lr=args.l_rate,
                                momentum=0.99, weight_decay=5e-4)

    global_step = 0  # renamed from "iter", which shadowed the builtin
    for epoch in range(args.n_epoch):
        adjust_learning_rate(optimizer, args.l_rate, epoch, args.step)
        if epoch < start_epoch:
            continue
        for i, (images, labels) in enumerate(TrainDataLoader):
            if torch.cuda.is_available():
                images = Variable(images.cuda(args.gpu[0]))
                labels = Variable(labels.cuda(args.gpu[0]))
            else:
                images = Variable(images)
                labels = Variable(labels)

            global_step = len(TrainDataLoader) * epoch + i
            #poly_lr_scheduler(optimizer, args.l_rate, global_step)

            model.train()
            optimizer.zero_grad()
            outputs = model(images)
            # NOTE(review): weights_per_class is assumed to be a module-level
            # global defined elsewhere in this file -- confirm.
            if isinstance(outputs, tuple):
                loss = cross_entropy2d(outputs[0], labels, weights_per_class) \
                    + args.clsloss_weight * bin_clsloss(outputs[1], labels)
            else:
                #loss = cross_entropy2d(outputs, labels)
                loss = cross_entropy2d(outputs, labels, weights_per_class)
                #loss = focal_loss2d(outputs, labels)

            loss.backward()
            optimizer.step()

            # loss.item() replaces the deprecated loss.data[0], which raises
            # an IndexError on PyTorch >= 0.5.
            vis.line(
                X=torch.ones((1, 1)).cpu() * global_step,
                Y=torch.Tensor([loss.item()]).unsqueeze(0).cpu(),
                win=loss_window,
                update='append')

        print("Epoch [%d/%d] iteration: %d with Loss: %.4f"
              % (epoch + 1, args.n_epoch, global_step + 1, loss.item()))

        # validation
        loss, acc = validate(model, VALDataLoader, n_classes)
        vis.line(X=torch.ones((1, 1)).cpu() * (epoch + 1),
                 Y=torch.ones((1, 1)).cpu() * acc,
                 win=valacc_window,
                 update='append')

        # makedirs(exist_ok=True) also creates the "snapshot" parent, which
        # the original bare os.mkdir would fail on.
        os.makedirs("snapshot/{}".format(args.arch), exist_ok=True)
        torch.save(model, "snapshot/{}/{}.pkl".format(args.arch, epoch + 1))
Exemple #28
0
    def train(self):
        """Train the GPT-2 language model.

        Builds the model (fresh config or pretrained weights), an AdamW
        optimizer with a linear-warmup schedule, optional fp16 (apex) and
        multi-GPU wrappers, then runs the epoch loop with gradient
        accumulation, saving a checkpoint after every epoch and a final
        model at the end.
        """
        if not self.pretrained_model:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
                config=self.model_config)
        else:
            self.print_and_log('加载预训练模型')
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
                self.pretrained_model)
        model.train()
        model.to(self.device)
        # Count model parameters for logging.
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('模型参数量: {}'.format(num_parameters))

        self.print_and_log("开始加载训练集")
        train_loader = self.create_dataloader()
        self.print_and_log("训练集加载完毕")

        # NOTE(review): step accounting uses self.accumulation_steps here but
        # self.gradient_accumulation in the loop below -- presumably the same
        # value; confirm they cannot diverge.
        epoch_steps = int(train_loader.sampler.num_samples / self.batch_size /
                          self.accumulation_steps)
        total_steps = epoch_steps * self.epochs
        self.print_and_log('总样本数 = {}'.format(
            train_loader.sampler.num_samples))
        self.print_and_log('epoch 步数 = {}'.format(epoch_steps))
        self.print_and_log('总步数 = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(),
                                               lr=self.lr,
                                               correct_bias=True)
        scheduler = pytorch_transformers.WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0
        running_loss = 0
        model.train()
        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            for i, batch_data in enumerate(train_loader):
                if torch.cuda.is_available():
                    # keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                    passage_ids = batch_data[1].to(self.device,
                                                   non_blocking=True)
                    label_ids = passage_ids.clone().to(self.device,
                                                       non_blocking=True)
                else:
                    # keyword_ids = batch_data[0]
                    passage_ids = batch_data[1]
                    label_ids = passage_ids.clone()
                outputs = model(input_ids=passage_ids, labels=label_ids)
                loss, logits = outputs[:2]
                # Average the per-GPU losses under DataParallel.
                if multi_gpu:
                    loss = loss.mean()
                # Scale loss for gradient accumulation.
                if self.gradient_accumulation > 1:
                    loss = loss / self.gradient_accumulation
                # Mixed-precision or normal backward pass, with grad clipping.
                if self.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   self.max_grad_norm)
                # Weight update every gradient_accumulation micro-batches.
                if (i + 1) % self.gradient_accumulation == 0:
                    running_loss += loss.item()
                    # Bug fix: optimizer.step() must precede scheduler.step();
                    # the original order skipped the first LR value of the
                    # schedule and warns on PyTorch >= 1.1.
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    overall_step += 1
                # Report train loss.
                if (overall_step +
                        1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log(
                        'now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                            datetime.now().hour,
                            datetime.now().minute, overall_step + 1, epoch + 1,
                            running_loss * self.gradient_accumulation /
                            self.log_step))
                    running_loss = 0

            # Save a checkpoint every epoch.
            if (epoch + 1) % 1 == 0:
                if not os.path.exists(self.output_dir +
                                      'model_epoch{}'.format(epoch + 1)):
                    os.makedirs(self.output_dir +
                                'model_epoch{}'.format(epoch + 1))
                # Unwrap DataParallel before saving so weights load cleanly.
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.save_pretrained(self.output_dir +
                                              'model_epoch{}'.format(epoch +
                                                                     1))
                # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
                # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))
            model.train()

        self.print_and_log('training finished')
        self.f_log.close()
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(self.output_dir + 'final_model')
Exemple #29
0
def main():
    """Entry point for joint nodule-detector / case-classifier training.

    Builds the nodule detector (args.model1) and the case classifier
    (args.model2, which wraps the detector and takes its top-k proposals),
    optionally resumes from a checkpoint, handles the three test-only modes
    (which dump a prediction CSV and exit), and otherwise runs the training
    loop, checkpointing every args.save_freq epochs.
    """
    global args
    args = parser.parse_args()

    torch.manual_seed(0)

    ##################################
    # Nodule detector model
    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']

    save_dir = args.save_dir

    ##################################
    # Case classifier model, wrapping the nodule net
    casemodel = import_module(args.model2)

    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk=topk, nodulenet=nod_net)

    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        args.save_dir = 'debug'

    ################################
    # Resume / fresh-start bookkeeping
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if args.epochs is None:  # identity check, not "== None"
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs
    ################################
    # Mirror stdout to a log file and snapshot the .py sources alongside it.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test1 != 1 and args.test2 != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    ################################
    torch.cuda.set_device(0)
    #nod_net = nod_net.cuda()
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)
    ################################

    # Test-only modes: run inference, dump a CSV, exit.
    # Bug fix: df.columns must be a list -- the original assigned a set
    # literal {'id','cancer'}, whose iteration order is unspecified and can
    # swap the column names between runs.
    if args.test1 == 1:
        testsplit = np.load('full.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('allstage1.csv', index=False)
        return

    if args.test2 == 1:
        testsplit = np.load('test.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('quick', index=False)
        return
    if args.test3 == 1:
        testsplit3 = np.load('stage2.npy')
        dataset = DataBowl3Classifier(testsplit3, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit3], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('stage2_ans.csv', index=False)
        return
    print(save_dir)
    print(args.save_freq)
    trainsplit = np.load('kaggleluna_full.npy')
    valsplit = np.load('valsplit.npy')
    testsplit = np.load('test.npy')

    dataset = DataBowl3Detector(trainsplit, config1, phase='train')
    train_loader_nod = DataLoader(dataset, batch_size=args.batch_size,
                                  shuffle=True, num_workers=args.workers,
                                  pin_memory=True)

    dataset = DataBowl3Detector(valsplit, config1, phase='val')
    val_loader_nod = DataLoader(dataset, batch_size=args.batch_size,
                                shuffle=False, num_workers=args.workers,
                                pin_memory=True)

    optimizer = torch.optim.SGD(nod_net.parameters(),
                                args.lr, momentum=0.9,
                                weight_decay=args.weight_decay)

    trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit, config2, phase='train')
    train_loader_case = DataLoader(dataset, batch_size=args.batch_size2,
                                   shuffle=True, num_workers=args.workers,
                                   pin_memory=True)

    dataset = DataBowl3Classifier(valsplit, config2, phase='val')
    val_loader_case = DataLoader(dataset, batch_size=max([args.batch_size2, 1]),
                                 shuffle=False, num_workers=args.workers,
                                 pin_memory=True)

    dataset = DataBowl3Classifier(trainsplit, config2, phase='val')
    all_loader_case = DataLoader(dataset, batch_size=max([args.batch_size2, 1]),
                                 shuffle=False, num_workers=args.workers,
                                 pin_memory=True)

    optimizer2 = torch.optim.SGD(case_net.parameters(),
                                 args.lr, momentum=0.9,
                                 weight_decay=args.weight_decay)

    for epoch in range(start_epoch, end_epoch + 1):
        # Warm-up pass at the first epoch: run the case net once with lr=0 in
        # debug mode (no weight updates) before real training starts.
        if epoch == start_epoch:
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug
        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer, args)
            validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            val_casenet(epoch, case_net, val_loader_case, args)
            val_casenet(epoch, case_net, all_loader_case, args)

        if epoch % args.save_freq == 0:
            # Move weights to CPU so the checkpoint loads without a GPU.
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save({
                'epoch': epoch,
                'save_dir': save_dir,
                'state_dict': state_dict,
                'args': args},
                os.path.join(save_dir, '%03d.ckpt' % epoch))
Exemple #30
0
def main(verbose=1,
         print_freq=100,
         restore=True,
         ckpt_path=None,
         val_freq=1,
         run_id="model",
         dset_mode="grayscale_mask",
         model_type="siamese",
         dataset_name="deepfashion",
         ckpt_type="siamese",
         freeze_encoder_until_it=1000):
    """Train a siamese/dual model with contrastive loss.

    Restores from the run's own checkpoint (restore=True) OR from an
    explicit ckpt_path (mutually exclusive), trains with SGD + StepLR,
    validates every val_freq epochs, and checkpoints after every epoch and
    on KeyboardInterrupt.
    """
    print("TRAINING MODEL {} ON DATASET {}".format(model_type, dataset_name))

    if restore and ckpt_path:
        # Bug fix: message read "0R" (zero-R) instead of "OR".
        raise RuntimeError("Specify restore OR ckpt_path")

    ckpt_savepath = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("Saving ckpts to {}".format(ckpt_savepath))
    logs_savepath = os.path.join(cfg.LOGDIR, run_id)
    print("Saving logs to {}".format(logs_savepath))

    if restore or ckpt_path:
        print("Restoring weights from {}".format(
            ckpt_savepath if restore else ckpt_path))

    if cfg.USE_GPU:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")

    print('DEVICE', device)

    # model
    model = get_model(model_type)
    model = DataParallel(model)

    # must call this before constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)

    # set up training
    # TODO better one?
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    criterion = ContrastiveLoss()

    initial_epoch = 0
    iteration = 0
    unfrozen = False

    if ckpt_path:
        ckpt = torch.load(ckpt_path)
        state_dict = ckpt['model_state_dict']

        if ckpt_type == model_type:
            model.load_state_dict(state_dict)
        elif model_type == 'dual' and ckpt_type == 'siamese':
            model = load_siamese_ckpt_into_dual(model, state_dict)
        else:
            raise NotImplementedError()

    elif restore:
        if os.path.exists(ckpt_savepath):
            print("LOADING MODEL")
            ckpt = torch.load(ckpt_savepath)
            model.load_state_dict(ckpt['model_state_dict'])
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
            dset_mode = ckpt.get('dset_mode', dset_mode)

    else:
        raise RuntimeError("Should not get here! Check for bugs")

    print("Using dset_mode {}".format(dset_mode))

    # dataset
    train_ds, test_ds = get_dataset(dataset_name, dset_mode)
    # train_ds = Subset(train_ds, range(500))
    # test_ds = Subset(test_ds, range(100))
    train_dl = DataLoader(train_ds,
                          batch_size=cfg.BATCH_SIZE,
                          shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds,
                         batch_size=cfg.BATCH_SIZE,
                         shuffle=False,
                         num_workers=cfg.NUM_WORKERS)

    # Number of batches per epoch, rounded up (tqdm total should be an int;
    # the original passed a float from true division).
    train_total = (len(train_ds) + cfg.BATCH_SIZE - 1) // cfg.BATCH_SIZE
    test_total = (len(test_ds) + cfg.BATCH_SIZE - 1) // cfg.BATCH_SIZE

    # Create the writer once; the original constructed a new SummaryWriter
    # every epoch inside the loop, leaking file handles and spawning one
    # event file per epoch.
    logger = SummaryWriter(logs_savepath)

    # training loop
    start = time.time()

    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            # effectively puts the model in train mode.
            # Opposite of model.eval()
            model.train()

            print("Epoch {}".format(epoch))

            for i, (im1, im2, y) in tqdm(enumerate(train_dl),
                                         total=train_total):
                iteration += 1

                if not unfrozen and iteration > freeze_encoder_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True

                    for param in model.parameters():
                        param.requires_grad = True

                logger.add_scalar('DataTime', time.time() - start, iteration)

                im1 = im1.to(device)
                im2 = im2.to(device)
                y = y.to(device)

                enc1, enc2 = model(im1, im2)
                loss = criterion(enc1, enc2, y)

                # zero out gradients accumulated from the previous step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # logging
                logger.add_scalar('TrainLoss', loss.item(), iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()

                # display metrics

            # do some validation

            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # puts model in validation mode

                with torch.no_grad():

                    for i, (im1, im2,
                            y) in tqdm(enumerate(test_dl),
                                       total=test_total):
                        im1 = im1.to(device)
                        im2 = im2.to(device)
                        y = y.to(device)

                        enc1, enc2 = model(im1, im2)
                        loss = criterion(enc1, enc2, y)

                        # log a plain float, not a tensor
                        logger.add_scalar('ValLoss', loss.item(), iteration)

            # end of epoch
            lr_scheduler.step()

            save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                      dset_mode, dataset_name, model_type)

    except KeyboardInterrupt:
        print('Got keyboard interrupt, saving model...')
        save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer, dset_mode,
                  dataset_name, model_type)
Exemple #31
0
class Train():
    """End-to-end trainer for face attribute / recognition models.

    Wires up data loaders (LMDB or list-backed), a ResNet backbone plus an
    attribute- or recognition-specific head, SGD with optional LR-plateau
    scheduling and early stopping, TensorBoard logging, and periodic
    evaluation / checkpointing.
    """

    def __init__(self, config):
        self.config = config

        ATTR_HEAD = {'race': RaceHead, 'gender': GenderHead,
                     'age': AgeHead, 'recognition': self.config.recognition_head}

        self.writer = SummaryWriter(config.log_path)

        # LMDB source if train_source is a file, otherwise a list-backed loader.
        if path.isfile(self.config.train_source):
            self.train_loader = LMDBDataLoader(self.config, self.config.train_source)
        else:
            self.train_loader = CustomDataLoader(self.config, self.config.train_source,
                                                 self.config.train_list)

        class_num = self.train_loader.class_num()
        print(len(self.train_loader.dataset))
        print(f'Classes: {class_num}')

        self.model = ResNet(self.config.depth, self.config.drop_ratio, self.config.net_mode)
        if self.config.attribute == 'recognition':
            self.head = ATTR_HEAD[self.config.attribute](classnum=class_num, m=self.config.margin)
        else:
            self.head = ATTR_HEAD[self.config.attribute](classnum=class_num)

        paras_only_bn, paras_wo_bn = separate_bn_param(self.model)

        dummy_input = torch.zeros(1, 3, 112, 112)
        self.writer.add_graph(self.model, dummy_input)

        if torch.cuda.device_count() > 1:
            print(f"Model will use {torch.cuda.device_count()} GPUs!")
            self.model = DataParallel(self.model)
            self.head = DataParallel(self.head)

        self.model = self.model.to(self.config.device)
        self.head = self.head.to(self.config.device)

        # Inverse-frequency class weights for imbalanced race/gender labels.
        self.weights = None
        if self.config.attribute in ['race', 'gender']:
            _, self.weights = np.unique(self.train_loader.dataset.get_targets(), return_counts=True)
            self.weights = np.max(self.weights) / self.weights
            self.weights = torch.tensor(self.weights, dtype=torch.float, device=self.config.device)
            self.config.weights = self.weights
            print(self.weights)

        if self.config.val_source is not None:
            if self.config.attribute != 'recognition':
                if path.isfile(self.config.val_source):
                    self.val_loader = LMDBDataLoader(self.config, self.config.val_source, False)
                else:
                    self.val_loader = CustomDataLoader(self.config, self.config.val_source,
                                                       self.config.val_list, False)

            else:
                # Recognition validates against verification pair sets (e.g. LFW).
                self.validation_list = []
                for val_name in config.val_list:
                    dataset, issame = get_val_pair(self.config.val_source, val_name)
                    self.validation_list.append([dataset, issame, val_name])

        # Separate param groups: weight decay is not applied to BN params.
        self.optimizer = optim.SGD([{'params': paras_wo_bn,
                                     'weight_decay': self.config.weight_decay},
                                    {'params': self.head.parameters(),
                                     'weight_decay': self.config.weight_decay},
                                    {'params': paras_only_bn}],
                                   lr=self.config.lr, momentum=self.config.momentum)

        if self.config.resume:
            print(f'Resuming training from {self.config.resume}')
            load_state(self.model, self.head, self.optimizer, self.config.resume, False)

        if self.config.pretrained:
            print(f'Loading pretrained weights from {self.config.pretrained}')
            load_state(self.model, self.head, None, self.config.pretrained, True)

        print(self.config)
        self.save_file(self.config, 'config.txt')

        print(self.optimizer)
        self.save_file(self.optimizer, 'optimizer.txt')

        self.tensorboard_loss_every = max(len(self.train_loader) // 100, 1)
        self.evaluate_every = max(len(self.train_loader) // 5, 1)

        if self.config.lr_plateau:
            self.scheduler = ReduceLROnPlateau(self.optimizer, mode=self.config.max_or_min, factor=0.1,
                                               patience=3, verbose=True, threshold=0.001, cooldown=1)
        if self.config.early_stop:
            self.early_stop = EarlyStop(mode=self.config.max_or_min)

    def run(self):
        """Run the full training loop, evaluating and checkpointing as configured."""
        self.model.train()
        self.head.train()
        running_loss = 0.
        step = 0
        val_acc = 0.
        val_loss = 0.

        best_step = 0
        best_acc = float('Inf')
        if self.config.max_or_min == 'max':
            best_acc *= -1

        for epoch in range(self.config.epochs):
            train_logger = TrainLogger(self.config.batch_size, self.config.frequency_log)

            if epoch + 1 in self.config.reduce_lr and not self.config.lr_plateau:
                self.reduce_lr()

            for idx, data in enumerate(self.train_loader):
                imgs, labels = data
                imgs = imgs.to(self.config.device)
                labels = labels.to(self.config.device)

                self.optimizer.zero_grad()

                embeddings = self.model(imgs)

                # Recognition heads (margin-based) need the labels as input.
                if self.config.attribute == 'recognition':
                    outputs = self.head(embeddings, labels)
                else:
                    outputs = self.head(embeddings)

                if self.weights is not None:
                    loss = self.config.loss(outputs, labels, weight=self.weights)
                else:
                    loss = self.config.loss(outputs, labels)

                loss.backward()
                running_loss += loss.item()

                self.optimizer.step()

                if step % self.tensorboard_loss_every == 0:
                    loss_board = running_loss / self.tensorboard_loss_every
                    self.writer.add_scalar('train_loss', loss_board, step)
                    running_loss = 0.

                if step % self.evaluate_every == 0 and step != 0:
                    if self.config.val_source is not None:
                        val_acc, val_loss = self.evaluate(step)
                        self.model.train()
                        self.head.train()
                        best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
                        print(f'Best accuracy: {best_acc:.5f} at step {best_step}')
                    else:
                        save_state(self.model, self.head, self.optimizer, self.config, 0, step)

                train_logger(epoch, self.config.epochs, idx, len(self.train_loader), loss.item())
                step += 1

            if self.config.lr_plateau:
                self.scheduler.step(val_acc)

            if self.config.early_stop:
                self.early_stop(val_acc)
                if self.early_stop.stop:
                    print("Early stopping model...")
                    break

        val_acc, val_loss = self.evaluate(step)
        # Bug fix: save_model returns (best_acc, best_step); the original
        # assigned the whole tuple to best_acc, so the final print showed a
        # tuple and best_step was stale.
        best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
        print(f'Best accuracy: {best_acc} at step {best_step}')

    def save_model(self, val_acc, best_acc, step, best_step):
        """Checkpoint the model if val_acc improves; return updated (best_acc, best_step)."""
        if (self.config.max_or_min == 'max' and val_acc > best_acc) or \
           (self.config.max_or_min == 'min' and val_acc < best_acc):
            best_acc = val_acc
            best_step = step
            save_state(self.model, self.head, self.optimizer, self.config, val_acc, step)

        return best_acc, best_step

    def reduce_lr(self):
        """Divide the learning rate of every param group by 10."""
        for params in self.optimizer.param_groups:
            params['lr'] /= 10

        print(self.optimizer)

    def tensorboard_val(self, accuracy, step, loss=0, dataset=''):
        """Log validation accuracy (and loss, for attribute tasks) to TensorBoard."""
        self.writer.add_scalar('{}val_acc'.format(dataset), accuracy, step)

        if self.config.attribute != 'recognition':
            self.writer.add_scalar('val_loss', loss, step)

    def evaluate(self, step):
        """Evaluate the current model; return (val_acc, val_loss)."""
        if self.config.attribute != 'recognition':
            val_acc, val_loss = self.evaluate_attribute()
            self.tensorboard_val(val_acc, step, val_loss)

        elif self.config.attribute == 'recognition':
            val_loss = 0
            val_acc = 0
            print('Validating...')
            for idx, validation in enumerate(self.validation_list):
                dataset, issame, val_name = validation
                acc, std = self.evaluate_recognition(dataset, issame)
                self.tensorboard_val(acc, step, dataset=f'{val_name}_')
                print(f'{val_name}: {acc:.5f}+-{std:.5f}')
                val_acc += acc

            # Mean accuracy across all verification sets.
            val_acc /= (idx + 1)
            self.tensorboard_val(val_acc, step)
            print(f'Mean accuracy: {val_acc:.5f}')

        return val_acc, val_loss

    def evaluate_attribute(self):
        """Run the attribute head over the val set; return (accuracy, loss)."""
        self.model.eval()
        self.head.eval()

        y_true = torch.tensor([], dtype=self.config.output_type, device=self.config.device)
        all_outputs = torch.tensor([], device=self.config.device)

        with torch.no_grad():
            for imgs, labels in iter(self.val_loader):
                imgs = imgs.to(self.config.device)
                labels = labels.to(self.config.device)

                embeddings = self.model(imgs)
                outputs = self.head(embeddings)

                y_true = torch.cat((y_true, labels), 0)
                all_outputs = torch.cat((all_outputs, outputs), 0)

            if self.weights is not None:
                loss = round(self.config.loss(all_outputs, y_true, weight=self.weights).item(), 4)
            else:
                loss = round(self.config.loss(all_outputs, y_true).item(), 4)

        y_true = y_true.cpu().numpy()

        if self.config.attribute == 'age':
            # Age "accuracy" is mean absolute error over summed bin outputs.
            y_pred = all_outputs.cpu().numpy()
            y_pred = np.round(y_pred, 0)
            y_pred = np.sum(y_pred, axis=1)
            y_true = np.sum(y_true, axis=1)
            accuracy = round(mean_absolute_error(y_true, y_pred), 4)
        else:
            _, y_pred = torch.max(all_outputs, 1)
            y_pred = y_pred.cpu().numpy()

            accuracy = round(np.sum(y_true == y_pred) / len(y_pred), 4)

        return accuracy, loss

    def evaluate_recognition(self, samples, issame, nrof_folds=10, tta=False):
        """Embed verification pairs and score them; return (mean acc, std)."""
        self.model.eval()
        embeddings = np.zeros([len(samples), self.config.embedding_size])

        # Embed in batches. The original also did "idx += batch_size" inside
        # the loop, which was dead code: the for statement rebinds idx each
        # iteration.
        with torch.no_grad():
            for idx in range(0, len(samples), self.config.batch_size):
                batch = torch.tensor(samples[idx:idx + self.config.batch_size])
                embeddings[idx:idx + self.config.batch_size] = self.model(batch.to(self.config.device)).cpu()

        tpr, fpr, accuracy, best_thresholds = verification.evaluate(embeddings, issame, nrof_folds)

        return round(accuracy.mean(), 5), round(accuracy.std(), 5)

    def save_file(self, string, file_name):
        """Write str(string) to file_name inside the work path."""
        # Context manager guarantees the handle is closed even on error.
        with open(path.join(self.config.work_path, file_name), "w") as file:
            file.write(str(string))
Exemple #32
0
def main():
    global args
    args = parser.parse_args()

    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        checkpoint = torch.load(args.resume)
        #if start_epoch == 0:
        #    start_epoch = checkpoint['epoch'] + 1
        #if not save_dir:
        #    save_dir = checkpoint['save_dir']
        #else:
        save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    print("arg", args.gpu)
    print("num_gpu", n_gpu)
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']

    print("datadir", datadir)
    print("pad_val", config['pad_value'])
    print("aug type", config['augtype'])

    dataset = data.DataBowl3Detector(datadir,
                                     'train_luna_9.npy',
                                     config,
                                     phase='train')
    print("len train_dataset", dataset.__len__())
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    dataset = data.DataBowl3Detector(datadir, 'val9.npy', config, phase='val')
    print("len val_dataset", dataset.__len__())

    val_loader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    best_val_loss = 100
    best_mal_loss = 100
    for epoch in range(start_epoch, args.epochs + 1):
        print("epoch", epoch)
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        best_val_loss, best_mal_loss = validate(val_loader, net, loss,
                                                best_val_loss, best_mal_loss,
                                                epoch, save_dir)