def train_model(args):
    """Train a ResNet-18 classifier on IEMOCAP mel-spectrograms and log stats.

    Runs a fixed 50-epoch train/eval loop, prints per-epoch train/test loss
    and accuracy, and appends a summary (including the best and top-10 mean
    test accuracy) to /scratch/speech/models/classification/resnet_stats.txt.

    NOTE(review): `IEMOCAP`, `my_collate` and `resnet18` come from imports
    outside this view; `resnet18()` is called with no `num_classes` argument
    even though `num_labels = 4` — presumably a project-local resnet18, not
    torchvision's — TODO confirm.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device_ids = [0, 1, 2, 3]  # GPUs used by DataParallel below
    batch_size = args.batch_size
    # The CNN/LSTM hyperparameters below are only used by the (commented-out)
    # MultiSpectrogramModel; they are kept to build the `path` log string.
    input_channels = 1
    out_channels = [args.out_channels1, args.out_channels2]
    kernel_size_cnn = [[args.kernel_size_cnn1, args.kernel_size_cnn2],
                       [args.kernel_size_cnn2, args.kernel_size_cnn1]]
    stride_size_cnn = [[args.stride_size_cnn1, args.stride_size_cnn2],
                       [args.stride_size_cnn2, args.stride_size_cnn1]]
    kernel_size_pool = [[args.kernel_size_pool1, args.kernel_size_pool2],
                        [args.kernel_size_pool2, args.kernel_size_pool1]]
    stride_size_pool = [[args.stride_size_pool1, args.stride_size_pool2],
                        [args.stride_size_pool2, args.stride_size_pool1]]
    hidden_dim = 200
    num_layers = 2
    dropout = 0
    num_labels = 4
    hidden_dim_lstm = 200
    epoch_num = 50
    num_layers_lstm = 2
    nfft = [512, 1024]  # FFT sizes passed to the IEMOCAP dataset loader
    weight = args.weight
    # Previous model (kept for reference):
    # model = MultiSpectrogramModel(input_channels, out_channels, kernel_size_cnn,
    #     stride_size_cnn, kernel_size_pool, stride_size_pool, hidden_dim, num_layers,
    #     dropout, num_labels, batch_size, hidden_dim_lstm, num_layers_lstm, device,
    #     nfft, weight, False)
    model = resnet18()
    print(
        "============================ Number of parameters ===================================="
    )
    print(str(sum(p.numel() for p in model.parameters() if p.requires_grad)))
    # Human-readable hyperparameter signature used as a run identifier in the log file.
    path = "batch_size:{};out_channels:{};kernel_size_cnn:{};stride_size_cnn:{};kernel_size_pool:{};stride_size_pool:{}; weight:{}".format(
        args.batch_size, out_channels, kernel_size_cnn, stride_size_cnn,
        kernel_size_pool, stride_size_pool, weight)
    with open("/scratch/speech/models/classification/resnet_stats.txt",
              "a+") as f:
        f.write("\n" + "============ model starts ===========")
        f.write(
            "\n" + "model_parameters: " +
            str(sum(p.numel() for p in model.parameters() if p.requires_grad))
            + "\n" + path + "\n")
    model.cuda()
    model = DataParallel(model, device_ids=device_ids)
    model.train()
    # Adam is the optimizer actually stepped below (lr=0.001); optimizer2 and the
    # schedulers are constructed but never used in this loop.
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer2 = optim.SGD(model.parameters(), lr=0.1)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                  factor=0.5,
                                  patience=2,
                                  threshold=1e-3)
    # scheduler2 = ReduceLROnPlateau(optimizer=optimizer2, factor=0.5, patience=2, threshold=1e-3)
    # scheduler2 = CosineAnnealingLR(optimizer2, T_max=300, eta_min=0.0001)
    scheduler3 = MultiStepLR(optimizer, [5, 10, 15], gamma=0.1)
    # Load the training/testing data; drop_last keeps every batch at batch_size,
    # which the accuracy denominators below rely on.
    training_data = IEMOCAP(name='mel', nfft=nfft, train=True)
    train_loader = DataLoader(dataset=training_data,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=my_collate,
                              num_workers=0,
                              drop_last=True)
    testing_data = IEMOCAP(name='mel', nfft=nfft, train=False)
    test_loader = DataLoader(dataset=testing_data,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=my_collate,
                             num_workers=0,
                             drop_last=True)
    test_acc = []
    train_acc = []
    test_loss = []
    train_loss = []
    # NOTE(review): `device` was already computed at the top of the function;
    # this reassignment is redundant but harmless.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for epoch in range(
            epoch_num
    ):  # fixed 50 epochs
        print("===================================" + str(epoch + 1) +
              "==============================================")
        losses = 0
        correct = 0
        model.train()
        # Each batch from my_collate unpacks to 5 fields; only `input1` (spectrogram)
        # and `target` (one-hot labels, per the argmax below) are used here.
        for j, (input_lstm, input1, input2, target,
                seq_length) in enumerate(train_loader):
            if (j + 1) % 20 == 0:
                print("=================================Train Batch" +
                      str(j + 1) +
                      "===================================================")
            model.zero_grad()
            x = model(input1)  # DataParallel scatters input1 across device_ids
            target = target.to(device)
            target_index = torch.argmax(target, dim=1).to(device)
            correct_batch = torch.sum(target_index == torch.argmax(x, dim=1))
            # cross_entropy expects class indices, hence torch.max(target, 1)[1]
            losses_batch = F.cross_entropy(x, torch.max(target, 1)[1])
            # unsqueeze/mean/sum over a size-1 dim are no-ops kept from a
            # multi-loss version of this code.
            correct_batch = torch.unsqueeze(correct_batch, dim=0)
            losses_batch = torch.unsqueeze(losses_batch, dim=0)
            loss = torch.mean(losses_batch, dim=0)
            correct_batch = torch.sum(correct_batch, dim=0)
            losses += loss.item() * batch_size
            loss.backward()
            # weight = model.module.state_dict()["weight"]
            # weight = torch.exp(10*weight)/(1+torch.exp(10*weight)).item()
            optimizer.step()
            correct += correct_batch.item()
        # `j` is the last batch index from the loop above; drop_last makes
        # (j + 1) * batch_size the exact number of samples seen.
        accuracy = correct * 1.0 / ((j + 1) * batch_size)
        losses = losses / ((j + 1) * batch_size)
        # scheduler3.step()  # NOTE(review): no scheduler is ever stepped, so lr stays 0.001
        losses_test = 0
        correct_test = 0
        # torch.save(model.module.state_dict(), "/scratch/speech/models/classification/spec_full_joint_checkpoint_epoch_{}.pt".format(epoch+1))
        model.eval()
        with torch.no_grad():
            # Same metric computation as training, without gradients/updates.
            for j, (input_lstm, input1, input2, target,
                    seq_length) in enumerate(test_loader):
                if (j + 1) % 10 == 0:
                    print(
                        "=================================Test Batch" +
                        str(j + 1) +
                        "===================================================")
                x = model(input1)
                target = target.to(device)
                target_index = torch.argmax(target, dim=1).to(device)
                correct_batch = torch.sum(
                    target_index == torch.argmax(x, dim=1))
                losses_batch = F.cross_entropy(x, torch.max(target, 1)[1])
                correct_batch = torch.unsqueeze(correct_batch, dim=0)
                losses_batch = torch.unsqueeze(losses_batch, dim=0)
                loss = torch.mean(losses_batch, dim=0)
                correct_batch = torch.sum(correct_batch, dim=0)
                losses_test += loss.item() * batch_size
                correct_test += correct_batch.item()
        accuracy_test = correct_test * 1.0 / ((j + 1) * batch_size)
        losses_test = losses_test / ((j + 1) * batch_size)
        # data gathering
        test_acc.append(accuracy_test)
        train_acc.append(accuracy)
        test_loss.append(losses_test)
        train_loss.append(losses)
        print(
            "Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}"
            .format(epoch + 1, losses, losses_test, accuracy, accuracy_test) +
            "\n")
        # Only the final epoch writes the summary block to the stats file.
        with open("/scratch/speech/models/classification/resnet_stats.txt",
                  "a+") as f:
            # f.write("Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}".format(epoch+1,losses,losses_test, accuracy, accuracy_test)+"\n")
            if epoch == epoch_num - 1:
                f.write("Best Accuracy:{:06.5f}".format(max(test_acc)) + "\n")
                f.write("Average Top 10 Accuracy:{:06.5f}".format(
                    np.mean(np.sort(np.array(test_acc))[-10:])) + "\n")
                f.write("=============== model ends ===================" +
                        "\n")
    print("success:{}, Best Accuracy:{}".format(path, max(test_acc)))
def main():
    """Entry point: set up logging/seed/tokenizer, preprocess data, then train
    and evaluate a GPT-2 dialogue (or MMI) model.

    Relies on module-level helpers (`setup_train_args`, `create_logger`,
    `set_random_seed`, `create_model`, `preprocess_*_raw_data`, `train`,
    `evaluate`) and the module-level constant `PAD`, all defined outside this
    view.
    """
    args = setup_train_args()
    # Log simultaneously to a file and to the console.
    global logger
    logger = create_logger(args)
    # Use the GPU only when the user asked for it and one is available.
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the RNGs so results are reproducible. For multi-GPU runs,
    # torch.cuda.manual_seed_all() should seed every GPU.
    if args.seed:
        set_random_seed(args)
    # Initialize the tokenizer from the given vocabulary file.
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Size of the tokenizer's vocabulary.
    vocab_size = len(tokenizer)
    # Module-level pad id used by the collate/train helpers.
    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)
    # Create the output directory for the dialogue model.
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)
    # Create the output directory for the MMI model.
    if not os.path.exists(args.mmi_model_output_path):
        os.mkdir(args.mmi_model_output_path)
    # Load the GPT-2 model; n_ctx is its context window length.
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw corpus into token ids, choosing the MMI or the
    # dialogue-generation pipeline depending on the flags.
    if args.raw and args.train_mmi:  # training the MMI model
        preprocess_mmi_raw_data(args, tokenizer, n_ctx)
    elif args.raw and not args.train_mmi:  # training the dialogue model
        print("_______________________________________")
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Wrap the model in DataParallel when several GPUs are available.
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Log the total number of model parameters.
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))
    # Load the tokenized training data (one dialogue per line).
    logger.info("loading traing data")
    if args.train_mmi:  # MMI model data
        with open(args.train_mmi_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    else:  # dialogue-generation model data
        with open(args.train_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    data_list = data.split("\n")
    # Fixed random_state keeps the train/test split reproducible.
    train_list, test_list = train_test_split(data_list,
                                             test_size=0.2,
                                             random_state=1)
    # Train, then evaluate on the held-out split.
    train(model, device, train_list, multi_gpu, args)
    evaluate(model, device, test_list, multi_gpu, args)
def main():
    """Train a (Chinese) GPT-2 language model from tokenized corpus pieces.

    Parses CLI arguments, optionally tokenizes the raw corpus into
    `num_pieces` files, then trains with optional fp16 (apex), gradient
    accumulation, gradient clipping and a warmup-linear LR schedule, saving a
    checkpoint per epoch and a final model.

    Relies on module-level names defined outside this view: `build_files`,
    `get_encoder`, `transformers`, `tqdm`, `DataParallel`, `np`, `random`,
    `datetime`, `torch`, `os`.
    """
    parser = argparse.ArgumentParser()
    # Help strings are user-facing and intentionally left as-is.
    parser.add_argument('--device', default='0,1,2,3', type=str,
                        required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json', type=str,
                        required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt',
                        type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json',
                        type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/',
                        type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='训练循环')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int,
                        required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int,
                        required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str,
                        required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float,
                        required=False)
    parser.add_argument('--num_pieces', default=100, type=int,
                        required=False, help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=128, type=int,
                        required=False, help='最短收录文章长度')
    parser.add_argument('--output_dir', default='model/', type=str,
                        required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str,
                        required=False, help='模型训练起点路径')
    parser.add_argument('--writer_dir', default='tensorboard_summary/',
                        type=str, required=False, help='Tensorboard路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe",
                        type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level vs character-level Chinese tokenization (project-local modules).
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select visible GPUs

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx  # context window; also the training sample length
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    # Effectively disable the tokenizer's max-length truncation warning/limit.
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # Unpack args into locals used throughout the training loop.
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to (re)build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # log_step must align with optimizer steps for the loss averaging below.
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        # Tokenize the raw corpus into num_pieces files on disk.
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    # Fresh model from config, or resume from a pretrained checkpoint.
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    # Count tokens in every piece to derive the total optimizer-step budget
    # for the linear LR schedule.
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the corpus pieces in a random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                    tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                    'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slice the token stream into overlapping n_ctx-length windows
            # advanced by `stride`, plus one tail window for the remainder.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass: labels == inputs gives the LM shifted loss
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()  # average per-GPU losses
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step every `gradient_accumulation` batches
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    # tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, piece_num,
                            epoch + 1, running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        # Unwrap DataParallel before saving so the checkpoint is plain GPT-2.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def train(opt):
    """Train a ResNet backbone on CIFAR-10/100 in clean, adversarial-training
    ("at") or adversarial-logit-pairing ("alp") mode.

    The total loss is cross-entropy + lambda_loss[0] * center loss +
    lambda_loss[1] * feature-norm loss (plus alp_lambda * MSE(adv, clean
    logits) in "alp" mode). Prints per-phase metrics each epoch and saves the
    final model via the project-level `save_model`.

    NOTE(review): `pgd`, `Softmax` (the custom fc head), `CenterLoss` and
    `save_model` are project helpers defined outside this view; `model(images,
    labels)` returning (features, logits) is their contract — confirm there.
    """
    # set device to cpu/gpu
    if opt.use_gpu:
        device = torch.device("cuda", opt.gpu_id)
    else:
        device = torch.device("cpu")

    # Data transformations for data augmentation (train only; val is ToTensor).
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.RandomErasing(),
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
    ])

    # get CIFAR10/CIFAR100 train/val set
    # NOTE(review): val_set uses train=True on purpose — validation is a
    # stratified split of the training set (see train_test_split below).
    if opt.dataset == "CIFAR10":
        alp_lambda = 0.5  # weight of the logit-pairing MSE term ("alp" mode)
        lambda_loss = [0.005, 0.001]  # [center-loss weight, feature-norm weight]
        train_set = CIFAR10(root="./data",
                            train=True,
                            download=True,
                            transform=transform_train)
        val_set = CIFAR10(root="./data",
                          train=True,
                          download=True,
                          transform=transform_val)
    else:
        alp_lambda = 0.5
        lambda_loss = [0.005, 0.001]
        train_set = CIFAR100(root="./data",
                             train=True,
                             download=True,
                             transform=transform_train)
        val_set = CIFAR100(root="./data",
                           train=True,
                           download=True,
                           transform=transform_val)
    num_classes = np.unique(train_set.targets).shape[0]

    # set stratified train/val split (stratified by passing targets as y)
    idx = list(range(len(train_set.targets)))
    train_idx, val_idx, _, _ = train_test_split(idx,
                                                train_set.targets,
                                                test_size=opt.val_split,
                                                random_state=42)

    # get train/val samplers
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    # get train/val dataloaders
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=opt.batch_size,
                              num_workers=opt.num_workers)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=opt.batch_size,
                            num_workers=opt.num_workers)
    data_loaders = {"train": train_loader, "val": val_loader}

    print("Dataset -- {}, Metric -- {}, Train Mode -- {}, Backbone -- {}".
          format(opt.dataset, opt.metric, opt.train_mode, opt.backbone))
    print("Train iteration batch size: {}".format(opt.batch_size))
    print("Train iterations per epoch: {}".format(len(train_loader)))

    # get backbone model
    if opt.backbone == "resnet18":
        model = resnet18(pretrained=False)
    else:
        model = resnet34(pretrained=False)

    # set metric loss function: replace the fc layer with the project's
    # Softmax head that exposes (features, logits).
    in_features = model.fc.in_features
    model.fc = Softmax(in_features, num_classes)

    model.to(device)
    if opt.use_gpu:
        model = DataParallel(model).to(device)

    criterion = CrossEntropyLoss()
    mse_criterion = MSELoss()
    cent_criterion = CenterLoss(num_classes, in_features, device)

    # set optimizer and LR scheduler; the center-loss centers get their own
    # optimizer since they are parameters of cent_criterion, not the model.
    if opt.optimizer == "sgd":
        optimizer = SGD([{
            "params": model.parameters()
        }],
                        lr=opt.lr,
                        weight_decay=opt.weight_decay,
                        momentum=0.9)
        cent_optimizer = SGD([{
            "params": cent_criterion.parameters()
        }],
                             lr=opt.lr,
                             weight_decay=opt.weight_decay,
                             momentum=0.9)
    else:
        optimizer = Adam([{
            "params": model.parameters()
        }],
                         lr=opt.lr,
                         weight_decay=opt.weight_decay)
        cent_optimizer = Adam([{
            "params": cent_criterion.parameters()
        }],
                              lr=opt.lr,
                              weight_decay=opt.weight_decay)
    if opt.scheduler == "decay":
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=opt.lr_step,
                                        gamma=opt.lr_decay)
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   factor=0.1,
                                                   patience=10)

    # train/val loop
    for epoch in range(opt.epoch):
        for phase in ["train", "val"]:
            total_examples, total_correct, total_loss = 0, 0, 0

            if phase == "train":
                model.train()
            else:
                model.eval()

            start_time = time.time()
            for ii, data in enumerate(data_loaders[phase]):
                # load data batch to device
                images, labels = data
                images = images.to(device)
                labels = labels.to(device).long()

                # perform adversarial attack update to images (PGD with
                # eps=8/255, step 2/255, 7 iterations)
                if opt.train_mode == "at" or opt.train_mode == "alp":
                    adv_images = pgd(model, images, labels, 8. / 255,
                                     2. / 255, 7)
                else:
                    pass

                # at train mode
                if opt.train_mode == "at":
                    # get feature embedding and logits from resnet for both
                    # clean and adversarial images
                    features, predictions = model(images, labels)
                    adv_features, adv_predictions = model(adv_images, labels)

                    # get center loss (clean + adversarial)
                    cent_loss = cent_criterion(features, labels)
                    cent_loss = cent_loss + \
                        cent_criterion(adv_features, labels)

                    # get feature norm loss: mean squared L2 norm of embeddings
                    norm = features.mm(features.t()).diag()
                    adv_norm = adv_features.mm(adv_features.t()).diag()
                    norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \
                        (features.size(0) + adv_features.size(0))

                    # get cross-entropy loss (clean + adversarial)
                    ce_loss = criterion(predictions, labels)
                    ce_loss = ce_loss + criterion(adv_predictions, labels)

                    # combine cross-entropy loss, center loss and feature norm
                    # loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss

                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                    # for result accumulation: report adversarial accuracy
                    predictions = adv_predictions

                # alp train mode
                elif opt.train_mode == "alp":
                    # get feature embedding and logits from resnet
                    features, predictions = model(images, labels)
                    adv_features, adv_predictions = model(adv_images, labels)

                    # get center loss
                    cent_loss = cent_criterion(features, labels)
                    cent_loss = cent_loss + \
                        cent_criterion(adv_features, labels)

                    # get feature norm loss
                    norm = features.mm(features.t()).diag()
                    adv_norm = adv_features.mm(adv_features.t()).diag()
                    norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \
                        (features.size(0) + adv_features.size(0))

                    # get cross-entropy loss
                    ce_loss = criterion(predictions, labels)
                    ce_loss = ce_loss + criterion(adv_predictions, labels)

                    # get alp loss: pull adversarial logits toward clean logits
                    alp_loss = mse_criterion(adv_predictions, predictions)

                    # combine cross-entropy loss, center loss and feature norm
                    # loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss

                    # combine loss with alp loss
                    loss = loss + alp_lambda * alp_loss

                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                    # for result accumulation
                    predictions = adv_predictions

                # clean train mode
                else:
                    # get feature embedding and logits from resnet
                    features, predictions = model(images, labels)

                    # get center loss
                    cent_loss = cent_criterion(features, labels)

                    # get feature norm loss
                    norm = features.mm(features.t()).diag()
                    norm_loss = torch.sum(norm) / features.size(0)

                    # get cross-entropy loss
                    ce_loss = criterion(predictions, labels)

                    # combine cross-entropy loss, center loss and feature norm
                    # loss using lambda weights
                    loss = ce_loss + lambda_loss[0] * \
                        cent_loss + lambda_loss[1] * norm_loss

                    optimizer.zero_grad()
                    cent_optimizer.zero_grad()

                # only take step if in train phase
                if phase == "train":
                    loss.backward()
                    optimizer.step()
                    cent_optimizer.step()

                # accumulate train or val results
                predictions = torch.argmax(predictions, 1)
                total_examples += predictions.size(0)
                total_correct += predictions.eq(labels).sum().item()
                total_loss += loss.item()

                # print accumulated train/val results at end of epoch
                if ii == len(data_loaders[phase]) - 1:
                    end_time = time.time()
                    acc = total_correct / total_examples
                    loss = total_loss / len(data_loaders[phase])
                    print(
                        "{}: Epoch -- {} Loss -- {:.6f} Acc -- {:.6f} Time -- {:.6f}sec"
                        .format(phase, epoch, loss, acc,
                                end_time - start_time))

            # NOTE(review): scheduler.step(loss) is called even for StepLR,
            # which ignores the metric argument — works but is unconventional.
            if phase == "train":
                loss = total_loss / len(data_loaders[phase])
                scheduler.step(loss)
            else:
                print("")

    # save model after training for opt.epoch
    save_model(model, opt.dataset, opt.metric, opt.train_mode, opt.backbone)
class ProGAN:
    """ Wrapper around the Generator and the Discriminator.

    Bundles both networks, their Adam optimizers, the selected GAN loss and
    (optionally) an EMA shadow copy of the generator, and exposes one-step
    update methods for each network.
    """

    def __init__(self,
                 depth=7,
                 latent_size=512,
                 learning_rate=0.001,
                 beta_1=0,
                 beta_2=0.99,
                 eps=1e-8,
                 drift=0.001,
                 n_critic=1,
                 use_eql=True,
                 loss="wgan-gp",
                 use_ema=True,
                 ema_decay=0.999,
                 device=th.device("cpu")):
        """
        constructor for the class
        :param depth: depth of the GAN (will be used for each generator and discriminator)
        :param latent_size: latent size of the manifold used by the GAN
        :param learning_rate: learning rate for Adam
        :param beta_1: beta_1 for Adam
        :param beta_2: beta_2 for Adam
        :param eps: epsilon for Adam
        :param n_critic: number of times to update discriminator
                         (Used only if loss is wgan or wgan-gp)
        :param drift: drift penalty for the
                      (Used only if loss is wgan or wgan-gp)
        :param use_eql: whether to use equalized learning rate
        :param loss: the loss function to be used
                     Can either be a string =>
                     ["wgan-gp", "wgan", "lsgan", "lsgan-with-sigmoid"]
                     Or an instance of GANLoss
        :param use_ema: boolean for whether to use exponential moving averages
        :param ema_decay: value of mu for ema
        :param device: device to run the GAN on (GPU / CPU)
        """
        from torch.optim import Adam
        from torch.nn import DataParallel

        # Create the Generator and the Discriminator
        self.gen = Generator(depth, latent_size, use_eql=use_eql).to(device)
        self.dis = Discriminator(depth, latent_size,
                                 use_eql=use_eql).to(device)

        # if code is to be run on GPU, we can use DataParallel:
        if device == th.device("cuda"):
            self.gen = DataParallel(self.gen)
            self.dis = DataParallel(self.dis)

        # state of the object
        self.latent_size = latent_size
        self.depth = depth
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.n_critic = n_critic
        self.use_eql = use_eql
        self.device = device
        self.drift = drift

        # define the optimizers for the discriminator and generator
        self.gen_optim = Adam(self.gen.parameters(),
                              lr=learning_rate,
                              betas=(beta_1, beta_2),
                              eps=eps)
        self.dis_optim = Adam(self.dis.parameters(),
                              lr=learning_rate,
                              betas=(beta_1, beta_2),
                              eps=eps)

        # define the loss function used for training the GAN
        self.loss = self.__setup_loss(loss)

        if self.use_ema:
            from .CustomLayers import update_average

            # create a shadow copy of the generator
            self.gen_shadow = copy.deepcopy(self.gen)

            # updater function:
            self.ema_updater = update_average

            # initialize the gen_shadow weights equal to the
            # weights of gen (beta=0 means a full copy)
            self.ema_updater(self.gen_shadow, self.gen, beta=0)

    def __setup_loss(self, loss):
        """Resolve `loss` (string name or GANLoss instance) to a GANLoss
        object; raises ValueError for unknown names/types."""
        from . import Losses as losses

        if isinstance(loss, str):
            loss = loss.lower()  # lowercase the string
            if loss == "wgan":
                loss = losses.WGAN_GP(self.device,
                                      self.dis,
                                      self.drift,
                                      use_gp=False)
                # note if you use just wgan, you will have to use weight clipping
                # in order to prevent gradient exploding
            elif loss == "wgan-gp":
                loss = losses.WGAN_GP(self.device,
                                      self.dis,
                                      self.drift,
                                      use_gp=True)
            elif loss == "lsgan":
                loss = losses.LSGAN(self.device, self.dis)
            elif loss == "lsgan-with-sigmoid":
                loss = losses.LSGAN_SIGMOID(self.device, self.dis)
            else:
                raise ValueError("Unknown loss function requested")
        elif not isinstance(loss, losses.GANLoss):
            raise ValueError(
                "loss is neither an instance of GANLoss nor a string")
        return loss

    def optimize_discriminator(self, noise, real_batch, depth, alpha):
        """
        performs one step of weight update on discriminator using the batch of data
        :param noise: input noise of sample generation
        :param real_batch: real samples batch
        :param depth: current depth of optimization
        :param alpha: current alpha for fade-in
        :return: current loss (Wasserstein loss)
        """
        from torch.nn import AvgPool2d
        from torch.nn.functional import upsample

        # downsample the real_batch for the given depth
        down_sample_factor = int(np.power(2, self.depth - depth - 1))
        # NOTE(review): max(..., 0) is a no-op since the power is always >= 1;
        # presumably max(..., 1) was intended — TODO confirm.
        prior_downsample_factor = max(int(np.power(2, self.depth - depth)), 0)

        ds_real_samples = AvgPool2d(down_sample_factor)(real_batch)

        if depth > 0:
            # coarser resolution, upsampled back for the fade-in blend
            prior_ds_real_samples = upsample(
                AvgPool2d(prior_downsample_factor)(real_batch),
                scale_factor=2)
        else:
            prior_ds_real_samples = ds_real_samples

        # real samples are a combination of ds_real_samples and prior_ds_real_samples
        real_samples = (alpha * ds_real_samples) + (
            (1 - alpha) * prior_ds_real_samples)

        loss_val = 0
        for _ in range(self.n_critic):
            # generate a batch of samples (detached: generator is frozen here)
            fake_samples = self.gen(noise, depth, alpha).detach()

            loss = self.loss.dis_loss(real_samples, fake_samples, depth,
                                      alpha)

            # optimize discriminator
            self.dis_optim.zero_grad()
            loss.backward()
            self.dis_optim.step()

            loss_val += loss.item()

        return loss_val / self.n_critic

    def optimize_generator(self, noise, depth, alpha):
        """
        performs one step of weight update on generator for the given batch_size
        :param noise: input random noise required for generating samples
        :param depth: depth of the network at which optimization is done
        :param alpha: value of alpha for fade-in effect
        :return: current loss (Wasserstein estimate)
        """
        # generate fake samples:
        fake_samples = self.gen(noise, depth, alpha)

        # TODO: Change this implementation for making it compatible for relativisticGAN
        loss = self.loss.gen_loss(None, fake_samples, depth, alpha)

        # optimize the generator
        self.gen_optim.zero_grad()
        loss.backward()
        self.gen_optim.step()

        # if use_ema is true, apply ema to the generator parameters
        if self.use_ema:
            self.ema_updater(self.gen_shadow, self.gen, self.ema_decay)

        # return the loss value
        return loss.item()
def training(M):
    """Train a cascaded refinement network (CRN) on label/RGB image pairs.

    :param M: lazy-init flag — 0 means the model/optimizer are built on the
        first usable batch (it is set to 1 afterwards). NOTE(review): calling
        with M=1 would hit `optimizer`/`model` before they are defined
        (NameError) unless they exist in an enclosing scope — TODO confirm.
    :return: the trained (DataParallel-wrapped) model.

    Relies on module-level names defined outside this view: `gpu_ids`,
    `HoverDataset`, `DIMENSION`, `cascaded_model`, `Net`, `shuffle`,
    `scipy.misc`.
    """
    batch_size = len(gpu_ids) * 1  # one sample per GPU
    data_len = 100000
    hover_loader = DataLoader(dataset=HoverDataset(data_len),
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=8)
    res = 256  # output resolution passed to cascaded_model
    label_dir = 'crn0/Label256Full'
    l = os.listdir(label_dir)
    for epoch in range(200):
        running_loss = 0
        c_t = 0  # count of samples processed this epoch
        print("New Epoch")
        for data in hover_loader:
            a = time.time()  # a/b/c/d timestamps for per-stage timing printout
            label_images, input_images = data
            label_images = label_images.cuda()
            # NOTE(review): inputs go to the last GPU while labels use the
            # default device — presumably required by Net's placement; confirm.
            input_images = input_images.cuda(gpu_ids[-1])
            b = time.time()
            # Skip malformed batches (wrong batch size or channel dimension).
            if label_images.shape[0] != batch_size or label_images.shape[
                    1] != DIMENSION:
                print("skip")
                continue
            c_t += label_images.shape[0]
            # Earlier per-file loading path (kept for reference):
            # for I in enumerate(l):
            #     J = str.replace(I[1], 'gtFine_color.png', 'leftImg8bit.png')
            #     label_images1 = Variable(torch.unsqueeze(torch.from_numpy(helper.get_semantic_map(
            #         'crn0/Label256Full/'+I[1])).float().permute(2, 0, 1), dim=0))  # .cuda()  # training label
            #     input_images = Variable(torch.unsqueeze(torch.from_numpy(
            #         io.imread("crn0/RGB256Full/"+J)).float(), dim=0).permute(0, 3, 1, 2))
            if M == 0:
                # Lazy construction on the first batch; M is flipped to 1
                # below so this runs only once.
                model = cascaded_model(label_images, res)
                model = model.cuda()
                model = DataParallel(model, gpu_ids)
                # model.load_state_dict(torch.load('mynet_updated.pth'))
                # if u want to resume training from a pretrained model then add the .pth file here
                optimizer = optim.Adam(model.parameters(),
                                       lr=0.0001 * len(gpu_ids),
                                       betas=(0.9, 0.999),
                                       eps=1e-08,
                                       weight_decay=0)
            optimizer.zero_grad()
            Generator = model(label_images)
            # Net computes the training loss between real images and output.
            Loss = Net(input_images, Generator, label_images)
            c = time.time()
            print(Loss.data)
            if len(gpu_ids) > 1:
                Loss = Loss.mean()  # reduce per-GPU losses from DataParallel
            Loss.backward()
            optimizer.step()
            M = 1
            running_loss += Loss.data.item()
            d = time.time()
            print(epoch, c_t, Loss.data.item(), b - a, c - b, d - c)
            # Periodically dump the first generated image for visual inspection.
            # NOTE(review): scipy.misc.toimage was removed in scipy >= 1.2 —
            # this requires an old scipy (or a port to PIL); confirm pin.
            if c_t % 1000 == 0:
                Generator = Generator.permute(0, 2, 3, 1)
                Generator = Generator.cpu()
                Generator = Generator.data.numpy()
                output = np.minimum(np.maximum(Generator, 0.0), 255.0)
                scipy.misc.toimage(output[0, :, :, :], cmin=0,
                                   cmax=255).save(
                                       "crn0/vis/{}_{}_output_real.jpg".format(
                                           epoch, c_t))
        shuffle(l)
        # can replace the 2975 with c_t for generalization
        epoch_loss = running_loss / data_len
        print(epoch, epoch_loss)
        torch.save(model.state_dict(),
                   'crn0/mynet_epoch{}_CRN.pth'.format(epoch))
        # epoch_acc = running_corrects / 2975.0
    # return Loss
    # Round-trip through state_dict is a no-op; the final model is returned.
    best_model_wts = model.state_dict()
    model.load_state_dict(best_model_wts)
    return model
def train(self):
    """Fine-tune a GPT-2 LM on pre-tokenized split files.

    Builds the model (fresh from config or from ``self.pretrained_model``),
    optionally tokenizes the raw corpus, then trains for ``self.epochs``
    epochs with AdamW + linear warmup, optional apex fp16 and multi-GPU
    DataParallel. Saves a checkpoint per epoch and a final model under
    ``self.output_dir``.
    """
    if not self.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=self.model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            self.pretrained_model)
    model.train()
    model.to(self.device)
    # Count total model parameters for logging.
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.print_and_log('模型参数量 = {}'.format(num_parameters))
    if self.do_tokenize:
        self.print_and_log("开始加载训练集")
        self.tokenize_and_save()
        self.print_and_log("训练集加载完毕")
    # Total token count across all split files, used to derive step counts.
    full_len = 0
    for i in range(self.split_num):
        with open(self.tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    sample_num = int(full_len / self.stride)
    epoch_steps = int(full_len / self.stride / self.batch_size / self.gradient_accumulation)
    total_steps = int(full_len / self.stride * self.epochs / self.batch_size / self.gradient_accumulation)
    self.print_and_log('样本数 = {}'.format(sample_num))
    self.print_and_log('epoch 步数 = {}'.format(epoch_steps))
    self.print_and_log('总步数 = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)
    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        multi_gpu = True
    else:
        multi_gpu = False
    overall_step = 0
    running_loss = 0
    for epoch in range(self.epochs):
        self.print_and_log('epoch {}'.format(epoch + 1))
        now = datetime.now()
        self.print_and_log('time: {}'.format(now))
        optimizer.zero_grad()
        # Visit the split files in random order each epoch.
        split_indices = np.linspace(0, self.split_num - 1, self.split_num, dtype=np.int32)
        random.shuffle(split_indices)
        for split_index in split_indices:
            with open(self.tokenized_data_path + 'tokenized_train_{}.txt'.format(split_index), 'r') as f:
                line = f.read().strip()
                all_ids = line.split()
                all_ids = [int(x) for x in all_ids]
            # Slide an n_ctx-token window over the file with the given stride.
            start_point = 0
            samples = []
            while start_point < len(all_ids) - self.n_ctx:
                samples.append(all_ids[start_point:start_point + self.n_ctx])
                start_point += self.stride
            random.shuffle(samples)
            for i in range(len(samples) // self.batch_size):  # drop last
                batch = samples[i * self.batch_size:(i + 1) * self.batch_size]
                # LM objective: labels are the inputs themselves.
                batch_labels = torch.tensor(batch, dtype=torch.long).to(self.device)
                batch_inputs = torch.tensor(batch, dtype=torch.long).to(self.device)
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]
                if multi_gpu:
                    loss = loss.mean()
                if self.gradient_accumulation > 1:
                    loss = loss / self.gradient_accumulation
                # loss backward
                if self.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)
                # Step only every `gradient_accumulation` micro-batches.
                # NOTE(review): scheduler.step() runs before optimizer.step();
                # modern PyTorch recommends the reverse order — confirm this is
                # intended for the pytorch_transformers schedule.
                if (i + 1) % self.gradient_accumulation == 0:
                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    overall_step += 1
                    if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                        self.print_and_log(
                            'now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                                datetime.now().hour,
                                datetime.now().minute, overall_step + 1,
                                epoch + 1,
                                running_loss * self.gradient_accumulation / self.log_step))
                        running_loss = 0
        if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        # Save only the transformer backbone (not the LM head wrapper).
        # NOTE(review): when wrapped in DataParallel, `model.transformer`
        # would not resolve on the wrapper — confirm the multi-GPU save path.
        gpt2_model = model.transformer
        model_to_save = gpt2_model.module if hasattr(gpt2_model, 'module') else gpt2_model
        model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        then = datetime.now()
        self.print_and_log('time: {}'.format(then))
        self.print_and_log('time for one epoch: {}'.format(then - now))
    self.print_and_log('training finished')
    self.f_log.close()
    if not os.path.exists(self.output_dir + 'final_model'):
        os.makedirs(self.output_dir + 'final_model')
    gpt2_model = model.transformer
    model_to_save = gpt2_model.module if hasattr(gpt2_model, 'module') else gpt2_model
    model_to_save.save_pretrained(self.output_dir + 'final_model')
def main():
    """Entry point: train the RPN detector on one LUNA fold and optionally
    run inference (find_voi) over test / full-train / unlabeled splits.

    Fixes vs. original:
    - Python-2 `print` statements (`print 'datadir = ', datadir` and
      `print "Authorizing fold..."`) converted to Python-3 calls — they were
      SyntaxErrors in this otherwise Python-3 file.
    - Removed the redundant second `net = DataParallel(net, device_ids=[0])`
      which wrapped an already-DataParallel-wrapped module.
    """
    global args
    args = parser.parse_args()
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    # When training, tee stdout to the log file and snapshot all .py sources
    # so the run is reproducible.
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_detector['preprocess_result_path']
    print('datadir = ', datadir)

    def get_lr(epoch):
        # Step LR schedule: full LR for the first half of training,
        # then 10x / 100x decay.
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    def weights_init(m):
        # DCGAN-style init: N(0, 0.02) conv weights, N(1, 0.02) BN weights,
        # zero biases.
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)
        elif classname.find('Linear') != -1:
            m.bias.data.fill_(0)

    # Cross-Validation of 3D-semi, train
    k_fold = args.fold
    print("Authorizing fold: {:d}".format(k_fold))
    # Loading training set
    dataset = data.DataBowl3Detector(
        datadir,
        'detector/luna_file_id/subset_fold{:d}'.format(k_fold) + '/file_id_rpn_train.npy',
        config,
        phase='train')
    rpn_train_loader = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)
    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    # Training process
    train_loss_l, train_tpr_l = [], []
    # weights initialize
    net.apply(weights_init)
    for epoch in range(start_epoch, args.epochs + 1):
        if not os.path.exists(os.path.join(save_dir, 'fold{:d}'.format(k_fold))):
            os.makedirs(os.path.join(save_dir, 'fold{:d}'.format(k_fold)))
        train_loss, train_tpr = train(
            rpn_train_loader, net, loss, epoch, optimizer, get_lr,
            args.save_freq, os.path.join(save_dir, 'fold{:d}'.format(k_fold)))
        # Append loss results
        train_loss_l.append(train_loss)
        train_tpr_l.append(train_tpr)
        # Save Train-Validation results (re-saved every epoch so a crash
        # loses nothing).
        if not os.path.exists('./train-vali-results/fold{:d}'.format(k_fold)):
            os.makedirs('./train-vali-results/fold{:d}'.format(k_fold))
        np.save(
            './train-vali-results/fold{:d}'.format(k_fold) + '/rpn-train-loss.npy',
            np.asarray(train_loss_l).astype(np.float64))
        np.save(
            './train-vali-results/fold{:d}'.format(k_fold) + '/rpn-train-tpr.npy',
            np.asarray(train_tpr_l).astype(np.float64))
    # Testing process: run sliding-window inference over three splits.
    if args.test == 1:
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'], config['stride'],
                                 margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/subset_fold{:d}'.format(k_fold) + '/file_id_test.npy',
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size is fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)
        split_comber = SplitComb(sidelen, config['max_stride'], config['stride'],
                                 margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/subset_fold{:d}'.format(k_fold) + '/file_id_total_train.npy',
            config,
            phase='test',
            split_comber=split_comber)
        train_total_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size is fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)
        split_comber = SplitComb(sidelen, config['max_stride'], config['stride'],
                                 margin, config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'detector/luna_file_id/file_id_unlabel.npy',
            config,
            phase='test',
            split_comber=split_comber)
        unlabel_loader = DataLoader(
            dataset,
            batch_size=1,  # batch size is fixed to 1 at test time
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)
        test_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold), 'test')
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        find_voi(test_loader, net, get_pbb, test_dir, config)
        total_train_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold),
                                       'total_train')
        if not os.path.exists(total_train_dir):
            os.makedirs(total_train_dir)
        find_voi(train_total_loader, net, get_pbb, total_train_dir, config)
        unlabel_dir = os.path.join(save_dir, 'voi_fold{:d}'.format(k_fold),
                                   'unlabel')
        if not os.path.exists(unlabel_dir):
            os.makedirs(unlabel_dir)
        find_voi(unlabel_loader, net, get_pbb, unlabel_dir, config)
# Top-level training driver for hyperspectral-image classification:
# pick the class count for the chosen dataset, build the model, optionally
# restore a checkpoint, then train and periodically save.
#
# Fixes vs. original:
# - An unknown dataset previously only printed a message and then crashed
#   later with NameError on `num_cla`; it now raises ValueError immediately.
# - Saving used `model.cpu().state_dict()`, which permanently moved the live
#   model to CPU mid-training; the model is now moved back to the GPU after
#   each save when CUDA is in use.
if args.dataset in ['PaviaU', 'Pavia']:
    num_cla = 9
elif args.dataset in ['Indian', 'Salinas']:
    num_cla = 16
elif args.dataset == 'KSC':
    num_cla = 13
else:
    raise ValueError('undefined dataset')
make_if_not_exist(trained_model_dir)
# `dict` here is a project-level registry mapping model names to classes
# (shadows the builtin; defined elsewhere in the file).
model = DataParallel(dict[args.model_name](num_classes=num_cla, dropout_keep_prob=0))
if args.use_cuda:
    model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=args.lr,
                      momentum=args.momentum, weight_decay=1e-5)
start_epoch = 0
# Resume from the latest checkpoint if requested and any exist.
if args.restore and len(os.listdir(trained_model_dir)):
    model, start_epoch = model_restore(model, trained_model_dir)
train_info_record = trained_model_dir + 'train_info_' + args.model_name + '.txt'
for epoch in range(start_epoch + 1, args.epochs + 1):
    start = time.time()
    train(epoch, model, train_loader, optimizer, args)
    end = time.time()
    print('epoch: {} , cost {} seconds'.format(epoch, end - start))
    # Only keep checkpoints from the last 10% of training.
    if epoch % args.model_save_interval == 0 and epoch > args.epochs * 0.9:
        model_name = trained_model_dir + '/trained_model{}.pkl'.format(epoch)
        torch.save(model.cpu().state_dict(), model_name)
        if args.use_cuda:
            model = model.cuda()  # restore device after the CPU-side save
def main():
    """Joint driver for a nodule detector (nod_net) and a case classifier
    (case_net built on top of it): handles resume, logging, three test
    modes (test1/test2/test3 over different splits), and an alternating
    training schedule between the two networks.
    """
    global args
    args = parser.parse_args()
    torch.manual_seed(0)
    ##################################
    # Nodule-detector model and its LR schedule from its config.
    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']
    save_dir = args.save_dir
    ##################################
    # Case-classifier model wraps the nodule net and scores top-k proposals.
    casemodel = import_module(args.model2)
    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk=topk, nodulenet=nod_net)
    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        args.save_dir = 'debug'
    ###################################
    ################################
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    # Default end epoch: last stage boundary of the classifier LR schedule.
    if args.epochs == None:
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs
    ################################
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    # When training, tee stdout to the log and snapshot the sources.
    if args.test1 != 1 and args.test2 != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    ################################
    torch.cuda.set_device(0)
    # nod_net = nod_net.cuda()
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)
    ################################
    # --- test mode 1: predict over the full split, write allstage1.csv ---
    # NOTE(review): `df.columns = {'id', 'cancer'}` assigns a *set*, whose
    # order is not guaranteed — column names may be swapped; confirm.
    if args.test1 == 1:
        testsplit = np.load('full.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = {'id', 'cancer'}
        df.to_csv('allstage1.csv', index=False)
        return
    # --- test mode 2: predict over the test split, write 'quick' ---
    if args.test2 == 1:
        testsplit = np.load('test.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = {'id', 'cancer'}
        df.to_csv('quick', index=False)
        return
    # --- test mode 3: predict over stage2, write stage2_ans.csv ---
    if args.test3 == 1:
        testsplit3 = np.load('stage2.npy')
        dataset = DataBowl3Classifier(testsplit3, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit3], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = {'id', 'cancer'}
        df.to_csv('stage2_ans.csv', index=False)
        return
    print("save_dir", save_dir)
    print("save_freq", args.save_freq)
    # trainsplit = np.load('kaggleluna_full.npy')
    # Derive the training split from the preprocessed data directory,
    # de-duplicated while preserving first-seen order.
    train_list = [f.split('_')[0] for f in os.listdir(config1['datadir'])]
    trainsplit = sorted(set(train_list), key=train_list.index)
    # valsplit = np.load('valsplit.npy')
    # testsplit = np.load('test.npy')
    dataset = DataBowl3Detector(trainsplit, config1, phase='train')
    train_loader_nod = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)
    # dataset = DataBowl3Detector(valsplit,config1,phase = 'val')
    # val_loader_nod = DataLoader(dataset,batch_size = args.batch_size,
    # shuffle = False,num_workers = args.workers,pin_memory=True)
    optimizer = torch.optim.SGD(nod_net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    # trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit, config2, phase='train')
    train_loader_case = DataLoader(dataset,
                                   batch_size=args.batch_size2,
                                   shuffle=True,
                                   num_workers=args.workers,
                                   pin_memory=True)
    # dataset = DataBowl3Classifier(valsplit,config2,phase = 'val')
    # val_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
    # shuffle = False,num_workers = args.workers,pin_memory=True)
    # dataset = DataBowl3Classifier(trainsplit,config2,phase = 'val')
    # all_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
    # shuffle = False,num_workers = args.workers,pin_memory=True)
    optimizer2 = torch.optim.SGD(case_net.parameters(),
                                 args.lr,
                                 momentum=0.9,
                                 weight_decay=args.weight_decay)
    '''
    Overview (translated from the original Chinese notes):
    1. case_net classifier weights: load classifier weights, set init params
    2. configure the log path/file
    3. deploy case_net on GPU(s), multi-GPU setup
    4. testing:
       1. use test_casenet to classify the dataset, then stop the program
    5. training:
       1. detector training-set loading
       2. optimizer
       3. classifier training-set loading
       4. optimizer2
       5. for(start_epoch, end_epoch):
          train the detector on epochs below the last LR stage,
          train the classifier each epoch after config2['startepoch']
    '''
    for epoch in range(start_epoch, end_epoch + 1):
        # Warm-up pass at the first epoch: run the classifier once with
        # lr=0 and debug=True (args are restored immediately after).
        if epoch == start_epoch:
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug
        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer, args)
            # validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            # val_casenet(epoch,case_net,val_loader_case,args)
            # val_casenet(epoch,case_net,all_loader_case,args)
        # Periodic checkpoint: CPU-side copy of the classifier state dict.
        if epoch % args.save_freq == 0:
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save(
                {
                    'epoch': epoch,
                    'save_dir': save_dir,
                    'state_dict': state_dict,
                    'args': args
                }, os.path.join(save_dir, '%03d.ckpt' % epoch))
def main():
    """Fine-tune a Chinese GPT-2 model (Colab-style script): parse args,
    build tokenizer/config from hard-coded paths, optionally rebuild the
    tokenized dataset, then train with AdamW + linear warmup, optional
    apex fp16 and DataParallel, saving per-epoch and final models.
    """
    # drive.mount('/content/drive/')
    # path = '/content/drive/My Drive/Colab Notebooks'
    # os.chdir(path)
    '''
    import moxing as mox
    mox.file.make_dirs('/cache')
    mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/config/model_config_small.json', '/cache/config/model_config_small.json')
    mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/cache/vocab_small.txt', '/cache/cache/vocab_small.txt')
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file('/cache/config/model_config_small.json')
    mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/ghost.json', '/cache/ghost.json')
    mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/data/tokenization/', '/cache/data/tokenization/')
    mox.file.copy_parallel('obs://ghost-story/ghost/nlpdata/model/', '/cache/data/model/')
    mox.file.copy_parallel('obs://ghost-story/ghost/', '/cache/data/model/')
    args = parser.parse_args()
    args, unparsed = parser.parse_known_args()
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='/content/gpt-2-chinese-finetune/nlpdata/config/model_config_small.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='/content/gpt-2-chinese-finetune/nlpdata/cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='/content/gpt-2-chinese-finetune/nlpdata/ghost.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='/content/gpt-2-chinese-finetune/nlpdata/', help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=50, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-3, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='将训练语料分成多少份')
    # parser.add_argument('--output_dir', default='obs://ghost-story/ghost/nlpdata/model/', type=str, required=False, help='模型输出路径')
    # NOTE(review): this default looks like a pasted fragment
    # ('args = /content/...'); the option is also unused below — confirm.
    parser.add_argument('--pretrained_model', default='args = /content/gpt-2-chinese-finetune/nlpdata/cache', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    #args, unparsed = parser.parse_known_args()
    print('args:\n' + args.__repr__())
    # if args.segment:
    # from data import tokenization_bert_word_level as tokenization_bert
    # else:
    # import Tokenization
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    # model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    # model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(mox.file.read('/cache/config/model_config_small.json'))
    # NOTE: hard-coded path overrides args.model_config.
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file('/content/gpt-2-chinese-finetune/nlpdata/config/model_config_small.json')
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    # full_tokenizer = Tokenization.BertTokenizer(vocab_file=args.tokenizer_path)
    # full_tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    # NOTE: hard-coded path overrides args.tokenizer_path.
    full_tokenizer = BertTokenizer(vocab_file='/content/gpt-2-chinese-finetune/nlpdata/cache/vocab_small.txt')
    # NOTE(review): max_model_input_sizes is normally a dict on the tokenizer;
    # assigning an int here presumably just disables the length warning —
    # confirm against the transformers version in use.
    full_tokenizer.max_model_input_sizes = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)
    raw_data_path = '/content/gpt-2-chinese-finetune/nlpdata/ghost.json'
    tokenized_data_path = '/content/gpt-2-chinese-finetune/nlpdata/cache/'
    # raw = args.raw  # whether to build the tokenized dataset from scratch
    raw = True  # hard-coded: always rebuild
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = '/content/gpt-2-chinese-finetune/nlpdata/cache/'
    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')
    # if not args.pretrained_model:
    model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    # else:
    # model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    # Total token count over all pieces, used to size the LR schedule.
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the corpus pieces in random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slide an n_ctx window with the given stride; append a final
            # window anchored at the end so the tail is not dropped.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data — LM objective: labels equal inputs.
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)
                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]
                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation
                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                # optimizer step every `gradient_accumulation` micro-batches
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0
            piece_num += 1
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        #mox.file.copy_parallel(output_dir + 'model_epoch{}'.format(epoch + 1), '/content/gpt-2-chinese-finetune/nlpdata/model/')
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))
    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    """Train the resnet_lstm phase-recognition model on fixed-length frame
    sequences.

    train_num_each / val_num_each: per-video frame counts, used by
    get_useful_start_idx to find valid sequence start positions. Sample
    indices are expanded so each start index yields `sequence_length`
    frames spaced `srate` apart. Tracks the best validation accuracy
    (ties broken by training accuracy) and saves the best weights plus a
    per-epoch [train_acc, train_loss, val_acc, val_loss] record array.
    Relies on module-level config (sequence_length, num_gpu, srate,
    use_gpu, epochs, batch sizes, workers, learning_rate, crop_type, ...).
    """
    num_train = len(train_dataset)
    num_val = len(val_dataset)
    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
    # Truncate so the sample count divides evenly across GPUs.
    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 8000
    # num_val_we_use = 800
    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]  # sequence start positions used for training
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]
    np.random.seed(0)
    np.random.shuffle(train_we_use_start_idx)
    # Expand each start index into sequence_length frame indices
    # (each frame is one sample for the sampler), spaced by srate.
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j * srate)
    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j * srate)
    num_train_all = float(len(train_idx))
    num_val_all = float(len(val_idx))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(int(num_train_all)))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(int(num_val_all)))
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        # sampler=val_idx,
        sampler=SeqSampler(val_dataset, val_idx),
        num_workers=workers,
        pin_memory=False
    )
    model = resnet_lstm()
    if use_gpu:
        model = model.cuda()
        model = DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    '''
    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening,
                                  weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10, momentum=momentum, dampening=dampening,
                weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10)
    '''
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0
    # Per-epoch record: [train_acc, train_loss, val_acc, val_loss]
    record_np = np.zeros([epochs, 4])
    for epoch in range(epochs):
        # Re-shuffle sequence starts deterministically per epoch.
        np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j * srate)
        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=SeqSampler(train_dataset, train_idx),
            num_workers=workers,
            pin_memory=False
        )
        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        num = 0  # batch counter (debug printing only)
        train_num = 0
        for data in train_loader:
            num = num + 1
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)
            optimizer.zero_grad()
            #outputs = model.forward(inputs, kdatas)
            outputs = model.forward(inputs)
            # NOTE(review): softmax outputs are fed into CrossEntropyLoss
            # below, which itself applies log-softmax — double-softmax
            # weakens gradients; confirm whether this is intended.
            outputs = F.softmax(outputs, dim=1)
            _, preds = torch.max(outputs.data, 1)
            print(num)
            print(preds)
            print(labels)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data
            train_corrects += torch.sum(preds == labels.data)
            train_num += labels.shape[0]
            print(train_corrects.cpu().numpy() / train_num)
            # Snapshot whenever running train accuracy exceeds 0.75.
            if train_corrects.cpu().numpy() / train_num > 0.75:
                torch.save(copy.deepcopy(model.state_dict()), 'test.pth')
        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects.cpu().numpy() / train_num
        train_average_loss = train_loss / train_num
        # begin eval
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        val_start_time = time.time()
        for data in val_loader:
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            #labels_phase = labels_phase[(sequence_length - 1)::sequence_length]
            #kdata = kdata[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)
            # crop_type 5/10: multi-crop evaluation — flatten crops into the
            # batch, then average logits over the crop dimension.
            if crop_type == 0 or crop_type == 1:
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 3)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 3)
                outputs = torch.mean(outputs, 0)
            #outputs = outputs[sequence_length - 1::sequence_length]
            _, preds = torch.max(outputs.data, 1)
            print(num)
            print(preds)
            print(labels)
            loss = criterion(outputs, labels)
            val_loss += loss.data
            val_corrects += torch.sum(preds == labels.data)
            val_num += labels.shape[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects.cpu().numpy() / val_num
        val_average_loss = val_loss / val_num
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_average_loss,
                      train_accuracy,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_average_loss,
                      val_accuracy))
        # NOTE(review): exp_lr_scheduler is only defined in the commented-out
        # configuration block above — this branch would raise NameError if
        # optimizer_choice == 0; confirm optimizer_choice is always 1 here.
        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)
        # Track best validation accuracy; on ties prefer the higher
        # training accuracy.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())
        record_np[epoch, 0] = train_accuracy
        record_np[epoch, 1] = train_average_loss
        record_np[epoch, 2] = val_accuracy
        record_np[epoch, 3] = val_average_loss
        np.save(str(epoch) + '.npy', record_np)
    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc))
    # Encode accuracies (x10000) into the artifact file names.
    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "lstm" \
                 + "_epoch_" + str(epochs) \
                 + "_length_" + str(sequence_length) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"
    torch.save(best_model_wts, model_name)
    record_name = "lstm" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".npy"
    np.save(record_name, record_np)
def main():
    """Entry point: build the detector network, then either run test-time
    inference (args.test == 1) or train/validate it epoch by epoch.

    Relies on module-level globals: `parser`, `config_training`, `data`,
    and helpers `setgpu`, `test`, `train`, `validate`, `Logger`.
    """
    global args
    args = parser.parse_args()
    # Fix seeds for (partial) reproducibility; cudnn.benchmark below still
    # introduces nondeterminism.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.set_device(0)
    # The model module must expose get_model() -> (config, net, loss, get_pbb).
    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir
    if args.resume:
        # Resume: restore weights and, unless overridden on the CLI, the
        # epoch counter and save directory recorded in the checkpoint.
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            # Default save dir: results/<model>-<timestamp>
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    # if training, save files to know how training was done
    if args.test != 1:
        sys.stdout = Logger(logfile)  # tee stdout into the log file
        # sys.stdout = logging.getLogger(logfile)
        print sys.argv  # NOTE: Python 2 print statement
        # Snapshot every .py file (plus the training config) next to the logs.
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
        shutil.copy('config_training.py', os.path.join(save_dir))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    # Preprocessed data location: CLI override wins over the config file.
    datadir = config_training[
        'preprocess_result_path'] if args.data is None else args.data
    if args.test == 1:
        # Test mode: split each volume into overlapping crops, run the
        # detector, then return without training.
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin,
                                 config['pad_value'])
        test_set_file = args.test_filename
        dataset = data.DataBowl3Detector(datadir,
                                         test_set_file,
                                         config,
                                         phase='test',
                                         split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=data.collate,
                                 pin_memory=False)
        test(test_loader, net, get_pbb, save_dir, config, args.test_set)
        return
    #net = DataParallel(net)
    dataset = data.DataBowl3Detector(datadir, args.train_filename,
                                     config, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data.DataBowl3Detector(datadir, args.val_filename,
                                     config, phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)
    # NOTE(review): if args.optim is neither 'adam' nor 'sgd', `optimizer`
    # is never bound and the train() call below raises NameError.
    if args.optim == 'adam':
        optimizer = torch.optim.Adam(net.parameters())
    elif args.optim == 'sgd':
        optimizer = torch.optim.SGD(net.parameters(),
                                    args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)

    def get_lr(epoch):
        # Step schedule: full LR for the first half, 10% until 80%, then 1%.
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        validate(val_loader, net, loss)
', training network from scratch. Press enter to continue.')
# NOTE(review): this chunk begins mid-statement (the tail of a prompt
# string) and ends mid training loop; the enclosing definition is not
# visible here.
# Create the dataloader
classIdx = classNameToIdx[className]
trainSet = IcaDataset(classIdx, nLeadingZerosFormat, paths['rgbDir'],
                      rgbFormat, paths['segDir'], segFormat,
                      paths['posesPath'], keypoints, K)  # Torch dataset
trainSampler = RandomSampler(trainSet)
trainBatchSampler = BatchSampler(trainSampler, batchSize,
                                 drop_last=True)  # Torch sampler
trainLoader = DataLoader(trainSet,
                         batch_sampler=trainBatchSampler,
                         num_workers=8)
# Initialize the optimizer
optimizer = Adam(network.parameters(), lr=learningRate)
# Train the model
nIterations = len(trainSet) // batchSize
for iEpoch in range(nEpochs):
    print('Starting epoch #' + str(iEpoch + 1) + ' out of ' + str(nEpochs))
    tEpochStart = time.time()
    for idx, data in enumerate(trainLoader):
        # Start training loop iteration timer
        tTrainingLoopStart = time.time()
        # Extract data
        tExtractDataStart = time.time()
        # Move the batch to GPU; presumably (image, mask, vertex field,
        # vertex weights) — TODO confirm against IcaDataset.
        image, maskGT, vertexGT, vertexWeightsGT = [d.cuda() for d in data]
        tExtractDataElapsed = time.time() - tExtractDataStart
class model(base_process):
    # Two-stage trainer built on `base_process`: stage one trains the
    # `pre` + `ROI` heads on features from the (frozen) base network,
    # stage two re-trains a fresh ROI head on SMOTE-balanced pooled
    # features harvested from all three data splits.

    def train_stage_2(self):
        """Run the stage-2 training procedure.

        Phase 1 (200 epochs): train self.pre + self.ROI on features
        produced (under no_grad) by self.base_process.
        Phase 2: collect pooled features from train/eval/test loaders via
        self.base_process_2, rebalance them with SMOTE, re-create self.ROI
        and train it for 1200 epochs, evaluating with self.eval_.
        """
        batch = 240
        lr1 = 0.15
        # `loader`, `call_back`, `cfg` are project-level helpers; the test
        # and eval splits reuse the training set's index.
        data_set = loader(os.path.join(os.getcwd(), 'data_2'),
                          {"mode": "training"})
        data_set_test = loader(os.path.join(os.getcwd(), 'data_2'),
                               {"mode": "test"}, data_set.index)
        data_set_eval = loader(os.path.join(os.getcwd(), 'data_2'),
                               {"mode": "eval"}, data_set.index)
        data_loader = DataLoader(data_set, batch, True,
                                 collate_fn=call_back.detection_collate_RPN)
        data_loader_test = DataLoader(
            data_set_test, batch, False,
            collate_fn=call_back.detection_collate_RPN)
        data_loader_eval = DataLoader(
            data_set_eval, batch, False,
            collate_fn=call_back.detection_collate_RPN)
        # optim = Adadelta(self.ROI.parameters(), lr=lr1, weight_decay=1e-5)
        start_time = time.time()
        # Phase-1 optimizer covers both heads.
        optim_a = Adadelta([{'params': self.pre.parameters()},
                            {'params': self.ROI.parameters()}],
                           lr=0.15, weight_decay=1e-5)
        cfg.test = False
        count = 0
        for epoch in range(200):
            runing_losss = 0.0
            cls_loss = 0
            coor_loss = 0
            cls_loss2 = 0
            coor_loss2 = 0
            count += 1
            # base_time = RPN_time = ROI_time = nms_time = pre_gt = loss_time = linear_time = 0
            for data in data_loader:
                y = data[1]
                x = data[0].cuda()
                peak = data[2]
                num = data[3]
                optim_a.zero_grad()
                # Feature extraction runs without gradients; only the
                # pre/ROI heads below receive updates.
                with torch.no_grad():
                    if self.flag >= 2:
                        result = self.base_process(x, y, peak)
                        feat1 = result['feat_8']
                        feat2 = result['feat_16']
                        feat3 = result['feat_32']
                        feat4 = result['feat_64']
                        label = result['label']
                        loss_box = result['loss_box']
                        cross_entropy = result['cross_entropy']
                cls_score = self.pre(feat1, feat2, feat3, feat4)
                cls_score = self.ROI(cls_score)
                cross_entropy2 = self.tool2.cal_loss2(cls_score, label)
                loss_total = cross_entropy2
                loss_total.backward()
                optim_a.step()
                # cross_entropy / loss_box come from the frozen base and are
                # accumulated for logging only.
                runing_losss += loss_total.item()
                cls_loss2 += cross_entropy2.item()
                cls_loss += cross_entropy.item()
                coor_loss += loss_box.item()
            end_time = time.time()
            torch.cuda.empty_cache()
            print(
                "epoch:{a} time:{ff}: loss:{b:.4f} cls:{d:.4f} cor{e:.4f} cls2:{f:.4f} cor2:{g:.4f} date:{fff}".format(
                    a=epoch, b=runing_losss, d=cls_loss, e=coor_loss,
                    f=cls_loss2, g=coor_loss2,
                    ff=int(end_time - start_time), fff=time.asctime()))
            # if epoch % 10 == 0:
            #     adjust_learning_rate(optim, 0.9, epoch, 50, lr1)
            p = None
            # if epoch % 2 == 0:
            #     print("test result")
            #     save(self.RPN.module.state_dict(),
            #          os.path.join(os.getcwd(), str(epoch) + 'rpn_a2.p'))
            #     save(self.RPN.module.state_dict(),
            #          os.path.join(os.getcwd(), str(epoch) + 'base_a2.p'))
            start_time = end_time
        # ---- Phase 2: harvest pooled features from all three loaders ----
        all_data = []
        all_label = []
        for data in data_loader:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
            all_data.extend(data_.cpu())
            all_label.extend(label.cpu())
        for data in data_loader_eval:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
            all_data.extend(data_.cpu())
            all_label.extend(label.cpu())
        for data in data_loader_test:
            y = data[1]
            x = data[0].cuda()
            num = data[3]
            peak = data[2]
            with torch.no_grad():
                if self.flag >= 2:
                    result = self.base_process_2(x, y, peak)
                    data_ = result['x']
                    label = result['label']
                    loss_box = result['loss_box']
                    cross_entropy = result['cross_entropy']
            all_data.extend(data_.cpu())
            all_label.extend(label.cpu())
        all_data = torch.stack(all_data, 0).numpy()
        all_label = torch.LongTensor(all_label).numpy()
        # Rebalance classes with SMOTE oversampling (third-party imblearn).
        from imblearn.over_sampling import SMOTE
        fun = SMOTE()
        all_data, all_label = fun.fit_resample(all_data, all_label)
        # 70/20 split of the resampled pool; the middle 10% is unused.
        total = len(all_label)
        training_label = all_label[:int(0.7 * total)]
        training_data = all_data[:int(0.7 * total)]
        test_label = all_label[-int(0.2 * total):]
        test_data = all_data[-int(0.2 * total):]
        count = 0
        # Re-create the ROI head from scratch for phase 2.
        self.ROI = roi().cuda()
        self.ROI = DataParallel(self.ROI, device_ids=[0])
        self.ROI.apply(weights_init)
        optim_b = Adadelta(self.ROI.parameters(), lr=0.15, weight_decay=1e-5)
        for epoch in range(1200):
            runing_losss = 0.0
            cls_loss = 0
            coor_loss = 0
            cls_loss2 = 0
            coor_loss2 = 0
            count += 1
            optim_b.zero_grad()
            optim_a.zero_grad()
            # base_time = RPN_time = ROI_time = nms_time = pre_gt = loss_time = linear_time = 0
            # Iterate fixed chunks of 240 resampled samples.
            for j in range(int(len(training_label) / 240)):
                data_ = torch.Tensor(
                    training_data[j * 240:j * 240 + 240]).view(
                        240, 1024, 15).cuda()
                label_ = torch.LongTensor(
                    training_label[j * 240:j * 240 + 240]).cuda()
                optim_b.zero_grad()
                cls_score = self.ROI(data_)
                cross_entropy2 = self.tool2.cal_loss2(cls_score, label_)
                loss_total = cross_entropy2
                loss_total.backward()
                optim_b.step()
                runing_losss += loss_total.item()
                cls_loss2 += cross_entropy2.item()
                # NOTE(review): cross_entropy/loss_box here are stale values
                # left over from the harvesting loops above — logging only.
                cls_loss += cross_entropy.item()
                coor_loss += loss_box.item()
            end_time = time.time()
            torch.cuda.empty_cache()
            print(
                "epoch:{a} time:{ff}: loss:{b:.4f} cls:{d:.4f} cor{e:.4f} cls2:{f:.4f} cor2:{g:.4f} date:{fff}".format(
                    a=epoch, b=runing_losss, d=cls_loss, e=coor_loss,
                    f=cls_loss2, g=coor_loss2,
                    ff=int(end_time - start_time), fff=time.asctime()))
            if epoch % 10 == 0 and epoch > 0:
                adjust_learning_rate(optim_b, 0.9, epoch, 50, 0.3)
            p = None
            self.eval_(test_data, test_label)
            # self.ROI_eval(data_loader_eval, {"epoch": epoch})
            start_time = end_time
        print('finish')

    def eval_(self, data, label):
        """Evaluate self.ROI on `data`/`label` in fixed chunks of 240 and
        print micro-averaged precision (ppv), specificity (spe) and
        recall (sen).

        NOTE(review): switches the head to eval() mode and never restores
        train() mode for the caller — verify this is intended.
        """
        self.ROI = self.ROI.eval()
        gt = []
        pre = []
        total = int(len(label) / 240)  # trailing partial chunk is dropped
        with torch.no_grad():
            for i in range(total):
                a = i * 240
                b = a + 240
                sin_x = torch.Tensor(data[a:b]).cuda()
                sin_x = sin_x.view(240, 1024, 15)
                sin_y = label[a:b]
                predict = self.ROI(sin_x)
                predict, index = torch.max(predict, 1)
                pre.extend(index.cpu().tolist())
                gt.extend(sin_y)
        print("ppv:{}".format(
            metrics.precision_score(gt, pre, average='micro')))
        print("spe:{}".format(specificity_score(gt, pre, average='micro')))
        print("sen:{}".format(metrics.recall_score(gt, pre, average='micro')))

    def base_process_2(self, x, y, peak):
        """Extract ROI-pooled features for a batch without gradients.

        Returns a dict with 'x' (pooled features flattened to
        (-1, 1024 * 15)), 'label', and placeholder 'cross_entropy' /
        'loss_box' (ones) — the layout depends on self.flag (2 or 3).
        """
        cross_entropy, loss_box = torch.ones(1), torch.ones(1)
        with torch.no_grad():
            x1, x2, x3, x4 = self.features(x)
            if self.flag == 3:
                # Generate proposals from the RPN and match them to ground
                # truth before pooling.
                predict_confidence, box_predict = self.RPN(x1, x2, x3, x4)
                proposal, batch_offset, batch_conf = self.tool.get_proposal(
                    predict_confidence, box_predict, y, test=True)
                # save_proposal = [i.cpu().numpy() for i in proposal]
                # save_data = x.cpu().numpy()
                # save_y = [i.numpy() for i in y]
                # self.save_dict['data'].append(save_data)
                # self.save_dict['label'].append(save_y)
                # self.save_dict['predict'].append(save_proposal)
                proposal, label = self.tool2.pre_gt_match_uniform(
                    proposal, y, training=True, params={'peak': peak})
            if 1:  # always taken; kept from debugging
                # Prefix each proposal row with its batch index, then pool
                # each pyramid level at its stride.
                for i in range(len(proposal)):
                    tmp = torch.zeros(proposal[i].size()[0],
                                      1).fill_(i).cuda()
                    proposal[i] = torch.cat([tmp, proposal[i]], 1)
                proposal = torch.cat(proposal, 0)
                feat4, label, class_num = self.tool2.roi_pooling_cuda(
                    x4, proposal, label=label, stride=64, pool=self.pool4,
                    batch=True)
                feat3 = \
                    self.tool2.roi_pooling_cuda(x3, proposal, stride=64,
                                                pool=self.pool3, batch=True,
                                                label=None)[0]
                feat2 = \
                    self.tool2.roi_pooling_cuda(x2, proposal, stride=32,
                                                pool=self.pool2, batch=True,
                                                label=None)[0]
                feat1 = \
                    self.tool2.roi_pooling_cuda(x1, proposal, stride=16,
                                                pool=self.pool1, batch=True,
                                                label=None, )[0]
                x = self.pre(feat1, feat2, feat3, feat4)
                x = x.view(-1, 1024 * 15)
        if self.flag == 2:
            result = {}
            result['x'] = x
            result['label'] = label
            result['predict_offset'] = 0
            result['class_num'] = class_num
            result['batch_cor_weight'] = 0
            result['cross_entropy'] = cross_entropy
            result['loss_box'] = loss_box
            return result
        elif self.flag == 3:
            result = {}
            result['x'] = x
            result['label'] = label
            result['class_num'] = class_num
            result['cross_entropy'] = cross_entropy
            result['loss_box'] = loss_box
            return result
def run_once(self, opt, run_engine_opt, log_dir, prev_log_dir=None, fold_idx=0):
    """Simply run the defined run_step of the related method once.

    Args:
        opt: phase description dict — batch sizes, target generators,
            per-network run_info (desc/optimizer/lr_scheduler/pretrained).
        run_engine_opt: per-engine config (nr_procs, run_step, callbacks).
        log_dir: output directory for stats.json and TensorBoard events.
        prev_log_dir: previous phase's log dir, used to locate the last
            checkpoint when a network's `pretrained` is -1.
        fold_idx: cross-validation fold forwarded to the data generators.
    """
    check_manual_seed(self.seed)
    log_info = {}
    if self.logging:
        # check_log_dir(log_dir)
        rm_n_mkdir(log_dir)
        tfwriter = SummaryWriter(log_dir=log_dir)
        json_log_file = log_dir + "/stats.json"
        with open(json_log_file, "w") as json_file:
            json.dump({}, json_file)  # create empty file
        log_info = {
            "json_file": json_log_file,
            "tfwriter": tfwriter,
        }

    #### one dataloader per engine (e.g. train / valid)
    loader_dict = {}
    for runner_name, runner_opt in run_engine_opt.items():
        loader_dict[runner_name] = self._get_datagen(
            opt["batch_size"][runner_name],
            runner_name,
            opt["target_info"]["gen"],
            nr_procs=runner_opt["nr_procs"],
            fold_idx=fold_idx,
        )

    ####
    def get_last_chkpt_path(prev_phase_dir, net_name):
        # Pick the checkpoint of the highest epoch recorded in the previous
        # phase's stats.json.
        stat_file_path = prev_phase_dir + "/stats.json"
        with open(stat_file_path) as stat_file:
            info = json.load(stat_file)
        epoch_list = [int(v) for v in info.keys()]
        last_chkpts_path = "%s/%s_epoch=%d.tar" % (
            prev_phase_dir,
            net_name,
            max(epoch_list),
        )
        return last_chkpts_path

    # TODO: adding way to load pretrained weight or resume the training
    # parsing the network and optimizer information
    net_run_info = {}
    net_info_opt = opt["run_info"]
    for net_name, net_info in net_info_opt.items():
        assert inspect.isclass(net_info["desc"]) or inspect.isfunction(
            net_info["desc"]
        ), "`desc` must be a Class or Function which instantiate NEW objects !!!"
        net_desc = net_info["desc"]()
        # TODO: customize print-out for each run ?
        # summary_string(net_desc, (3, 270, 270), device='cpu')
        pretrained_path = net_info["pretrained"]
        if pretrained_path is not None:
            if pretrained_path == -1:
                # * depend on logging format so may be broken if logging format has been changed
                pretrained_path = get_last_chkpt_path(
                    prev_log_dir, net_name)
                net_state_dict = torch.load(pretrained_path)["desc"]
            else:
                # Support .npz (numpy archives) and .tar (torch) checkpoints.
                chkpt_ext = os.path.basename(pretrained_path).split(
                    ".")[-1]
                if chkpt_ext == "npz":
                    net_state_dict = dict(np.load(pretrained_path))
                    net_state_dict = {
                        k: torch.from_numpy(v)
                        for k, v in net_state_dict.items()
                    }
                elif chkpt_ext == "tar":  # ! assume same saving format we desire
                    net_state_dict = torch.load(pretrained_path)["desc"]
            colored_word = colored(net_name, color="red", attrs=["bold"])
            print("Model `%s` pretrained path: %s" %
                  (colored_word, pretrained_path))
            # load_state_dict returns (missing keys, unexpected keys)
            net_state_dict = convert_pytorch_checkpoint(net_state_dict)
            load_feedback = net_desc.load_state_dict(net_state_dict,
                                                     strict=False)
            # * uncomment for your convenience
            print("Missing Variables: \n", load_feedback[0])
            print("Detected Unknown Variables: \n", load_feedback[1])
        # * extremely slow to pass this on DGX with 1 GPU, why (?)
        net_desc = DataParallel(net_desc)
        net_desc = net_desc.to("cuda")
        # print(net_desc)
        # * dump network definition or not?
        optimizer, optimizer_args = net_info["optimizer"]
        optimizer = optimizer(net_desc.parameters(), **optimizer_args)
        # TODO: expand for external aug for scheduler
        nr_iter = opt["nr_epochs"] * len(loader_dict["train"])
        scheduler = net_info["lr_scheduler"](optimizer)
        net_run_info[net_name] = {
            "desc": net_desc,
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            # TODO: standardize API for external hooks
            "extra_info": net_info["extra_info"],
        }

    # parsing the running engine configuration
    assert ("train" in run_engine_opt
            ), "No engine for training detected in description file"

    # initialize runner and attach callback afterward
    # * all engine shared the same network info declaration
    runner_dict = {}
    for runner_name, runner_opt in run_engine_opt.items():
        runner_dict[runner_name] = RunEngine(
            dataloader=loader_dict[runner_name],
            engine_name=runner_name,
            run_step=runner_opt["run_step"],
            run_info=net_run_info,
            log_info=log_info,
        )

    # Wire callbacks; a callback may trigger another engine (e.g. train
    # epoch end triggering validation).
    for runner_name, runner in runner_dict.items():
        callback_info = run_engine_opt[runner_name]["callbacks"]
        for event, callback_list, in callback_info.items():
            for callback in callback_list:
                if callback.engine_trigger:
                    triggered_runner_name = callback.triggered_engine_name
                    callback.triggered_engine = runner_dict[
                        triggered_runner_name]
                runner.add_event_handler(event, callback)

    # retrieve main runner
    main_runner = runner_dict["train"]
    main_runner.state.logging = self.logging
    main_runner.state.log_dir = log_dir
    # start the run loop
    main_runner.run(opt["nr_epochs"])

    print("\n")
    print("########################################################")
    print("########################################################")
    print("\n")
    return
def main():
    """Entry point for the nodule detector: test on 'full.npy' when
    --test 1, otherwise train on 'kaggleluna_full.npy' and validate on
    'valsplit.npy'.

    Relies on module-level globals: `parser`, `config_training`, `data`,
    and helpers `setgpu`, `test`, `train`, `validate`, `Logger`.
    """
    global args
    args = parser.parse_args()
    torch.manual_seed(0)
    torch.cuda.set_device(0)
    # The model module must expose get_model() -> (config, net, loss, get_pbb).
    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir
    if args.resume:
        # Resume weights and, unless overridden, epoch counter and save dir.
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results',save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            # Default save dir: results/<model>-<timestamp>
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results',save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir,'log')
    if args.test!=1:
        # Training run: tee stdout into the log and snapshot all .py sources.
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f,os.path.join(save_dir,f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']
    if args.test == 1:
        # Test mode: split each volume into overlapping crops for inference.
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen,config['max_stride'],config['stride'],margin,config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'full.npy',
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size = 1,
            shuffle = False,
            num_workers = args.workers,
            collate_fn = data.collate,
            pin_memory=False)
        test(test_loader, net, get_pbb, save_dir,config)
        return
    #net = DataParallel(net)
    dataset = data.DataBowl3Detector(
        datadir,
        'kaggleluna_full.npy',
        config,
        phase = 'train')
    train_loader = DataLoader(
        dataset,
        batch_size = args.batch_size,
        shuffle = True,
        num_workers = args.workers,
        pin_memory=True)
    dataset = data.DataBowl3Detector(
        datadir,
        'valsplit.npy',
        config,
        phase = 'val')
    val_loader = DataLoader(
        dataset,
        batch_size = args.batch_size,
        shuffle = False,
        num_workers = args.workers,
        pin_memory=True)
    optimizer = torch.optim.SGD(
        net.parameters(),
        args.lr,
        momentum = 0.9,
        weight_decay = args.weight_decay)

    def get_lr(epoch):
        # Step schedule: full LR for the first half, 10% until 80%, then 1%.
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr, args.save_freq, save_dir)
        validate(val_loader, net, loss)
def main(): torch.manual_seed(0) # torch.cuda.set_device(1) setgpu("all") epochs = 1000 def getlr(epoch, epochs): lr = 0.01 if epoch <= epochs * 0.5: lr = lr elif epoch <= epochs * 0.8: lr = 0.1 * lr else: lr = 0.01 * lr return lr datadir = "/home/user/disk2/video/2017/" savedir = "/home/user/disk2/video/saveV2/" logfile = os.path.join(savedir, 'log.txt') logfileVal = os.path.join(savedir, 'logVal.txt') if not os.path.exists(savedir): os.makedirs(savedir) dataset = dataLoader.DataSet(datadir) datasetVal = dataLoader.DataSetVal(datadir) net = nets.EmbeddingNet() # checkpoint = torch.load(savedir+"428.ckpt") # net.load_state_dict(checkpoint) net = DataParallel(net) net = net.cuda() loss = nets.Loss() loss = loss.cuda() trainLoader = DataLoader(dataset, batch_size=48, shuffle=True, num_workers=12, pin_memory=True) valLoader = DataLoader(datasetVal, batch_size=6, shuffle=True, num_workers=18, pin_memory=True) cudnn.benchmark = True lr = 0.01 optimizer = torch.optim.SGD(net.parameters(), lr, momentum=0.9, weight_decay=1e-4) for epoch in range(epochs): train(trainLoader, net, loss, epoch, optimizer, getlr, savedir, logfile, epochs) if epoch % 10 == 0: val(valLoader, net, loss, epoch, getlr, savedir, logfileVal, epochs) state_dict = net.module.state_dict() for key in state_dict.keys(): state_dict[key] = state_dict[key].cpu() torch.save(state_dict, os.path.join(savedir, '%03d.ckpt' % epoch)) print "save " + str(epoch) file = open(logfile, "a") file.write("save " + str(epoch)) file.close()
# --- GAN training setup: data pipeline, generator/discriminator, optimizers ---
transforms = Compose([
    Resize(config.IMAGE_SIZE),
    CenterCrop(config.IMAGE_SIZE),
    ToTensor(),
    # Scale image tensors from [0, 1] to [-1, 1] per channel.
    Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
dataset = ImageFolder("../ganData/face/", transform=transforms)
# drop_last keeps every batch at exactly BATCH_SIZE, matching the
# fixed-size label/noise tensors below.
dataLoader = DataLoader(dataset=dataset,
                        batch_size=config.BATCH_SIZE,
                        shuffle=True,
                        num_workers=config.NUM_WORKERS_LOAD_IMAGE,
                        drop_last=True)
netG, netD = DataParallel(GeneratorNet()), DataParallel(DiscriminatorNet())
# For loading checkpoints onto CPU regardless of where they were saved.
map_location = lambda storage, loc: storage
optimizer_generator = Adam(netG.parameters(),
                           config.LR_GENERATOR,
                           betas=(config.BETA1, 0.999))
optimizer_discriminator = Adam(netD.parameters(),
                               config.LR_DISCRIMINATOR,
                               betas=(config.BETA1, 0.999))
criterion = BCELoss()
# Real/fake targets and noise inputs; `Variable` is the legacy (pre-0.4)
# autograd wrapper. fix_noises is presumably kept constant to visualize
# generator progress — TODO confirm against the training loop.
true_labels = Variable(t.ones(config.BATCH_SIZE))
fake_labels = Variable(t.zeros(config.BATCH_SIZE))
fix_noises = Variable(t.randn(config.BATCH_SIZE, config.NOISE_Z, 1, 1))
noises = Variable(t.randn(config.BATCH_SIZE, config.NOISE_Z, 1, 1))
# errord_meter = AverageValueMeter()
# errorg_meter = AverageValueMeter()
def main():
    """Entry point for the .mhd-based detector (Python 2 script): builds
    train/val/test file lists from configured folders (minus a black list),
    then either tests or trains/validates.

    Relies on module-level globals: `parser`, and helpers `setgpu`, `test`,
    `train`, `validate`, `Logger`; the `data` module is imported lazily.
    """
    global args
    args = parser.parse_args()
    # The config module must expose a `config` dict of paths and lists.
    config_training = import_module(args.config)
    config_training = config_training.config
    # from config_training import config as config_training
    torch.manual_seed(0)
    torch.cuda.set_device(0)
    # The model module must expose get_model() -> (config, net, loss, get_pbb).
    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir
    if args.resume:
        checkpoint = torch.load(args.resume)
        # if start_epoch == 0:
        #     start_epoch = checkpoint['epoch'] + 1
        # if not save_dir:
        #     save_dir = checkpoint['save_dir']
        # else:
        #     save_dir = os.path.join('results',save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    # else:
    if start_epoch == 0:
        start_epoch = 1
    if not save_dir:
        # Default save dir: results/<model>-<timestamp>
        exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
        save_dir = os.path.join('results', args.model + '-' + exp_id)
    else:
        save_dir = os.path.join('results', save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        # Training run: tee stdout into the log and snapshot all .py sources.
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = False  # True
    net = DataParallel(net)
    traindatadir = config_training['train_preprocess_result_path']
    valdatadir = config_training['val_preprocess_result_path']
    testdatadir = config_training['test_preprocess_result_path']
    # Build file lists: '<parent-folder>/<name-without-.mhd>' per scan,
    # skipping black-listed ids. NOTE: Python 2 print statements.
    trainfilelist = []
    print config_training['train_data_path']
    for folder in config_training['train_data_path']:
        print folder
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                trainfilelist.append(folder.split('/')[-2]+'/'+f[:-4])
    valfilelist = []
    for folder in config_training['val_data_path']:
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                valfilelist.append(folder.split('/')[-2]+'/'+f[:-4])
    testfilelist = []
    for folder in config_training['test_data_path']:
        for f in os.listdir(folder):
            if f.endswith('.mhd') and f[:-4] not in config_training['black_list']:
                testfilelist.append(folder.split('/')[-2]+'/'+f[:-4])
    if args.test == 1:
        # Test mode: split each volume into overlapping crops for inference.
        margin = 32
        sidelen = 144
        import data
        split_comber = SplitComb(
            sidelen,
            config['max_stride'],
            config['stride'],
            margin,
            config['pad_value'])
        dataset = data.DataBowl3Detector(
            testdatadir,
            testfilelist,
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            num_workers=args.workers,
            collate_fn=data.collate,
            pin_memory=False)
        # Dry-run iteration over the loader before testing.
        for i, (data, target, coord, nzhw) in enumerate(test_loader):  # check data consistency
            if i >= len(testfilelist)/args.batch_size:
                break
        test(test_loader, net, get_pbb, save_dir, config)
        return
    #net = DataParallel(net)
    import data
    print len(trainfilelist)
    dataset = data.DataBowl3Detector(
        traindatadir,
        trainfilelist,
        config,
        phase='train')
    train_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)
    dataset = data.DataBowl3Detector(
        valdatadir,
        valfilelist,
        config,
        phase='val')
    val_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)
    # Dry-run iterations over both loaders before training.
    for i, (data, target, coord) in enumerate(train_loader):  # check data consistency
        if i >= len(trainfilelist)/args.batch_size:
            break
    for i, (data, target, coord) in enumerate(val_loader):  # check data consistency
        if i >= len(valfilelist)/args.batch_size:
            break
    optimizer = torch.optim.SGD(
        net.parameters(),
        args.lr,
        momentum=0.9,
        weight_decay=args.weight_decay)

    def get_lr(epoch):
        # Four-step schedule: 100% -> 10% -> 5% -> 1% of the base LR.
        if epoch <= args.epochs * 1/3:  # 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 2/3:  # 0.8:
            lr = 0.1 * args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.05 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, start_epoch + args.epochs):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        validate(val_loader, net, loss)
def main():
    """Train a GPT-2 language model on pre-tokenized text.

    Uses module-level configuration globals (raw, raw_data_path, num_pieces,
    tokenized_data_path, n_ctx, stride, epochs, batch_size, lr, warmup_steps,
    gradient_accumulation, fp16, fp16_opt_level, max_grad_norm, log_step,
    output_dir, device, model_config). Saves a checkpoint after every epoch
    and the final model at the end.
    """
    if raw:
        # Tokenize the raw corpus into numbered piece files first.
        print('building files')
        build_files(data_path=raw_data_path)
        print('files built')
    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    model.to(device)
    multi_gpu = False
    print('calculating total steps')
    # Concatenate all tokenized pieces. FIX: join once instead of the
    # original quadratic `full_line += f.read()` string accumulation.
    pieces = []
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            pieces.append(f.read())
    full_line = ''.join(pieces).strip()
    full_line = [int(item) for item in full_line.split()]
    len_full_line = len(full_line)
    # Slice the token stream into overlapping n_ctx-sized training samples.
    samples = []
    start_point = 0
    while start_point + n_ctx < len_full_line:
        samples.append(full_line[start_point:start_point + n_ctx])
        start_point += stride
    total_steps = int(
        len(samples) * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    # Linear warmup then linear decay over the full run.
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        running_loss = 0
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):
            # prepare data; labels == inputs for causal LM training (the
            # model shifts them internally).
            batch = samples[step * batch_size:(step + 1) * batch_size]
            batch_inputs = [[int(x) for x in ids] for ids in batch]
            batch_inputs = torch.tensor(batch_inputs).long().to(device)
            batch_labels = batch_inputs.clone()
            # forward pass
            outputs = model(input_ids=batch_inputs, labels=batch_labels)
            loss, logits = outputs[:2]
            # get loss
            if multi_gpu:
                # DataParallel returns one loss per replica; average them.
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation
            # loss backward
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
            # optimizer step, once per accumulation window
            if (step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                # FIX: step the optimizer before the LR scheduler. The
                # original called scheduler.step() first, advancing the
                # warmup/decay schedule before the parameter update —
                # the wrong order since PyTorch 1.1.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            if (step + 1) % log_step == 0:
                print('step {} of epoch {}, loss {}'.format(
                    (step + 1) // gradient_accumulation, epoch + 1,
                    running_loss * gradient_accumulation**2 / log_step))
                running_loss = 0
        # Save a per-epoch checkpoint (unwrap DataParallel if present).
        print('saving model for epoch {}'.format(epoch + 1))
        epoch_dir = output_dir + 'model_epoch{}'.format(epoch + 1)
        if not os.path.exists(epoch_dir):
            os.mkdir(epoch_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(epoch_dir)
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))
    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def run():
    """Fine-tune a GPT-2 dialogue model and keep the best checkpoint.

    Splits the processed corpus 80/20 into train/test sets, optionally wraps
    the model in DataParallel on multi-GPU hosts, trains with AdamW plus a
    linear warmup schedule, and saves the model whenever test loss or
    accuracy improves. Relies on module-level helpers: process_raw_data,
    create_model, DialogueDataset, collate_fn, train_fn, eval_fn, config,
    logger.
    """
    logger.info("using device: {}".format(config.DEVICE))
    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)
    # Load the GPT-2 model.
    model, n_ctx = create_model(False)
    model.to(config.DEVICE)
    # Use multiple GPUs in parallel when available; config.DEVICE_NUM
    # selects which cards participate.
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPUs to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model,
            device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True
    # Log the total number of model parameters.
    num_parameters = sum(
        [parameter.numel() for parameter in model.parameters()])
    logger.info("number of model parameters: {}".format(num_parameters))
    # Build datasets and loaders.
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)
    # Total number of optimizer updates across all epochs.
    # FIX: len(train_data_loader) is already the number of batches, so the
    # original extra division by BATCH_SIZE undercounted total_steps and
    # made the warmup/decay schedule finish far too early.
    total_steps = int(
        len(train_data_loader) * config.EPOCHS /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))
    # AdamW with a linear warmup followed by linear decay.
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)
    logger.info("start training...")
    best_loss = 100
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            model_to_save = model.module if hasattr(model,
                                                    'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            # FIX: only advance each "best" metric when it actually
            # improved; the original overwrote both unconditionally, so a
            # better accuracy could *raise* best_loss (and vice versa).
            best_loss = min(best_loss, loss)
            best_accuracy = max(best_accuracy, accuracy)
def main(args):
    """Train a ResNet-50 person re-id model on Market-1501.

    Each epoch: apply a staircase LR decay (with a 0.1 multiplier on the
    pretrained backbone), train one pass, evaluate rank-1 on the
    query/gallery split, append the score to losses/rank1.txt, and snapshot
    the model to models_epoch/.
    """
    # Seed everything from a fresh random seed (printed for reproducibility).
    manual_seed = random.randint(1, 100000)
    print("Random Seed: ", manual_seed)
    random.seed(manual_seed)
    torch.manual_seed(manual_seed)
    torch.cuda.manual_seed_all(manual_seed)
    cudnn.benchmark = True
    cudnn.enabled = True

    # Build the train/gallery/query splits from the Market-1501 layout.
    root = ''
    train_source, num_classes = preprocess(root + 'market/bounding_box_train',
                                           relabel=True)
    gallery, _ = preprocess(root + 'market/bounding_box_test', relabel=False)
    query, _ = preprocess(root + 'market/query', relabel=False)
    marketTrain = Market('train', train_source,
                         root + 'market/bounding_box_train/', 'train',
                         args.height, args.width, 'data/pose_train.json')
    galleryds = Market('val', gallery, root + 'market/bounding_box_test/',
                       'gallery', args.height, args.width,
                       'data/pose_gallery.json')
    querds = Market('val', query, root + 'market/query/', 'query',
                    args.height, args.width, 'data/pose_query.json')

    num_epochs = args.epochs
    train_batch_size = 32  # args.batch_size
    # NOTE(review): the evaluation loaders also use train_batch_size here.
    train_loader = DataLoader(marketTrain, batch_size=train_batch_size,
                              shuffle=True, num_workers=8, pin_memory=False)
    query_loader = DataLoader(querds, batch_size=train_batch_size,
                              shuffle=False, num_workers=8, pin_memory=False)
    gallery_loader = DataLoader(galleryds, batch_size=train_batch_size,
                                shuffle=False, num_workers=8, pin_memory=False)

    reidNet = resnet50(pretrained=True, num_classes=num_classes)
    model = DataParallel(reidNet).cuda()

    # Optimizer: the pretrained backbone ('base') gets a 0.1 lr multiplier,
    # freshly initialised heads train at the full rate.
    if hasattr(model.module, 'base'):
        base_param_ids = set(map(id, model.module.base.parameters()))
        new_params = [
            p for p in model.parameters() if id(p) not in base_param_ids
        ]
        param_groups = [{
            'params': model.module.base.parameters(),
            'lr_mult': 0.1
        }, {
            'params': new_params,
            'lr_mult': 1.0
        }]
        print('Learning rate is set.')
    else:
        param_groups = model.parameters()
    optimiser = torch.optim.SGD(param_groups,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True)

    # Staircase LR schedule: multiply by lr_factor every step_size epochs.
    step_size = args.step_size

    def adjust_lr(epoch):
        _lr = args.lr * (args.lr_factor**(epoch // step_size))
        print(_lr)
        for g in optimiser.param_groups:
            g['lr'] = _lr * g.get('lr_mult', 1)

    # 'elementwise_mean' is the pre-1.0 spelling of reduction='mean'; kept
    # for compatibility with the torch version this project pins.
    criterion = torch.nn.CrossEntropyLoss(reduction='elementwise_mean').cuda()

    start_epoch = 0  # checkpoint['epoch'] + 1 when resuming
    for epoch in range(start_epoch, num_epochs):
        adjust_lr(epoch)
        print("Starting Epoch [%d]" % (epoch))
        train(train_loader, model, optimiser, criterion)
        state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimiser.state_dict(),
        }
        evaluator = Evaluator(model)
        # 'metrics' was previously named 'all', shadowing the builtin.
        metrics = evaluator.evaluate(query_loader, gallery_loader, query,
                                     gallery, args.output_feature, args.rerank)
        # The 'with' block closes the file; the old explicit close() inside
        # it was redundant.
        with open('losses/rank1.txt', 'a') as the_file:
            the_file.write(str(metrics[0] * 100) + '\n')
        model_name = 'models_epoch/reidNet_' \
            + str(epoch) + '_' + str(metrics[0] * 100)[:5] + '.pth'
        torch.save(state, model_name)
class ProGAN(BaseModel):
    """ Wrapper around the Generator and the Discriminator """

    def __init__(self,
                 depth=7,
                 latent_size=256,
                 num_channels=3,
                 learning_rate=1e-3,
                 beta_1=0,
                 beta_2=0.99,
                 eps=1e-8,
                 drift=0.001,
                 use_eql=True,
                 use_ema=True,
                 ema_decay=0.999,
                 checkpoint=None,
                 **kwargs):
        """ constructor for the class ProGAN, extends BaseModel
        :param depth: depth of the GAN, 2^depth is the final size of generated images
        :param latent_size: latent size of the manifold used by the GAN
        :param num_channels: *NOT YET IMPLEMENTED* will control number of channels of in/outputs
        :param drift: drift penalty for the discriminator (Used only if loss is wgan or wgan-gp)
        :param use_eql: whether to use equalized learning rate
        :param use_ema: boolean for whether to use exponential moving averages
        :param ema_decay: value of mu for ema
        :param checkpoint: generator checkpoint to load for inference
        :param learning_rate: base learning rate for Adam
        :param beta_1: beta_1 parameter for Adam
        :param beta_2: beta_2 parameter for Adam
        :param eps: epsilon parameter for Adam
        """
        super(ProGAN, self).__init__(**kwargs)

        # state of the object
        self.latent_size = latent_size
        self.num_channels = num_channels
        self.depth = depth - 1  # ensures generated images are size 2^depth
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.use_eql = use_eql
        self.drift = drift
        self.dataloader = None  # populated lazily by train()

        # Create the Generator and the Discriminator
        self.G = Generator(self.depth, self.latent_size,
                           use_eql=self.use_eql).to(self.device)
        self.D = Discriminator(self.depth, self.latent_size,
                               use_eql=self.use_eql).to(self.device)

        # if code is to be run on GPU, we can use DataParallel:
        if self.device == th.device("cuda"):
            self.G = DataParallel(self.G)
            self.D = DataParallel(self.D)

        # define the optimizers for the discriminator and generator
        self.default_rate = learning_rate
        self.G_optim = Adam(self.G.parameters(), lr=learning_rate,
                            betas=(beta_1, beta_2), eps=eps)
        self.D_optim = Adam(self.D.parameters(), lr=learning_rate,
                            betas=(beta_1, beta_2), eps=eps)

        # setup the ema for the generator
        if self.use_ema:
            # create a shadow copy of the generator
            self.G_shadow = copy.deepcopy(self.G)
            # initialize the G_shadow weights equal to the weights of G
            self.update_average(self.G_shadow, self.G, beta=0)

        # inference-only mode: load generator weights and freeze them
        if checkpoint is not None:
            self.model_names = ['G']
            self.load_networks(checkpoint)
            self.set_requires_grad(self.G, requires_grad=False)

    def setup_loss(self, loss):
        """Resolve *loss* (a string key or a GANLoss instance) to a GANLoss.

        :raises ValueError: for an unknown string or a non-GANLoss object.
        """
        if isinstance(loss, str):
            loss = loss.lower()  # lowercase the string
            if loss == "wgan":
                loss = WGAN_GP(self.device, self.D, self.drift, use_gp=False)
                # note if you use just wgan, you will have to use weight clipping
                # in order to prevent gradient exploding
            elif loss == "wgan-gp":
                loss = WGAN_GP(self.device, self.D, self.drift, use_gp=True)
            elif loss == "lsgan":
                loss = LSGAN(self.D)
            elif loss == "lsgan-sig":
                loss = LSGAN_SIGMOID(self.D)
            elif loss == "hinge":
                loss = HingeLoss(self.D)
            elif loss == "rel-avg":
                loss = RelativisticAverageHinge(self.D)
            elif loss == "r1-reg":
                loss = R1Regularized(self.device, self.D)
            else:
                raise ValueError("Unknown loss function requested")
        elif not isinstance(loss, GANLoss):
            raise ValueError(
                "loss is neither an instance of GANLoss nor a string")
        return loss

    # This function updates the exponential average weights based on the current training
    def update_average(self, model_tgt, model_src, beta):
        """ update the target model using exponential moving averages
        :param model_tgt: target model
        :param model_src: source model
        :param beta: value of decay beta (beta=0 copies the source outright)
        :return: None (updates the target model)
        """
        # turn off gradient calculation
        self.set_requires_grad(model_tgt, False)
        self.set_requires_grad(model_src, False)

        param_dict_src = dict(model_src.named_parameters())
        for p_name, p_tgt in model_tgt.named_parameters():
            p_src = param_dict_src[p_name]
            assert (p_src is not p_tgt)
            # in-place EMA update: tgt = beta * tgt + (1 - beta) * src
            p_tgt.copy_(beta * p_tgt + (1. - beta) * p_src)

        # turn back on the gradient calculation
        self.set_requires_grad(model_tgt, True)
        self.set_requires_grad(model_src, True)

    def forward(self, real_A):
        # inference at the highest trained resolution, fully faded in
        return self.G(real_A, self.depth - 1, alpha=1)

    def optimize_D(self, noise, real_batch, depth, alpha):
        """One discriminator update (n_critic inner steps); returns mean loss."""
        self.set_requires_grad(self.G, False)
        self.set_requires_grad(self.D, True)

        # downsample the real_batch for the given depth
        # (skipped when the dataloader already serves pre-scaled images)
        down_sample_factor = int(
            np.power(2, self.depth - depth - 1)
        ) if not self.dataloader.prescaled_data else 1
        prior_downsample_factor = max(int(
            np.power(2, self.depth - depth)), 0) \
            if not self.dataloader.prescaled_data else 2
        ds_real_samples = AvgPool2d(down_sample_factor)(real_batch)

        if depth > 0:
            # previous-resolution version, upsampled back for fade-in blending
            prior_ds_real_samples = interpolate(
                AvgPool2d(prior_downsample_factor)(real_batch),
                scale_factor=2)
        else:
            prior_ds_real_samples = ds_real_samples

        # real samples are a combination of ds_real_samples and prior_ds_real_samples
        real_samples = (alpha * ds_real_samples) + (
            (1 - alpha) * prior_ds_real_samples)

        loss_val = 0
        for _ in range(self.n_critic):
            # optimize discriminator
            self.D_optim.zero_grad()

            # generate a batch of samples
            fake_samples = self.G(noise, depth, alpha).detach()
            loss = self.loss.loss_D(real_samples.requires_grad_(),
                                    fake_samples.requires_grad_(),
                                    depth=depth,
                                    alpha=alpha)
            # R1Regularized performs its own backward/step internally
            # -- presumably; confirm against the loss implementation.
            if not isinstance(self.loss, R1Regularized):
                loss.backward()
                self.D_optim.step()
            loss_val += loss.item()

        return loss_val / self.n_critic

    def optimize_G(self, noise, real_batch, depth, alpha):
        """One generator update; returns the scalar loss value."""
        self.set_requires_grad(self.G, True)
        self.set_requires_grad(self.D, False)

        # optimize the generator
        self.G_optim.zero_grad()
        fake_samples = self.G(noise, depth, alpha)
        loss = self.loss.loss_G(real_batch,
                                fake_samples,
                                depth=depth,
                                alpha=alpha)
        loss.backward()
        self.G_optim.step()

        # if use_ema is true, apply ema to the generator parameters
        if self.use_ema:
            self.update_average(self.G_shadow, self.G, self.ema_decay)

        # return the loss value
        return loss.item()

    def train(self,
              continue_train=False,
              data_path='maua/datasets/default_progan',
              dataloader=None,
              start_epoch=1,
              start_depth=1,
              until_depth=None,
              fade_in=0.5,
              save_freq=25,
              log_freq=5,
              num_epochs=50,
              learning_rates_dict={
                  256: 5e-4,
                  512: 2.5e-4,
                  1024: 1e-4
              },
              n_critic=1,
              loss="wgan-gp"):
        """ Training function for ProGAN object
        :param continue_train: whether to continue training or not
        :param data_path: path to folder containing images to train on
        :param dataloader: custom dataloader to use, otherwise images will only be resized to max resolution
        :param start_epoch: epoch to continue training from (defaults to most recent, if continuing training)
        :param start_depth: depth to continue training from (defaults to most recent, if continuing training)
        :param until_depth: depth to continue training until (defaults to self.depth)
        :param fade_in: fraction of epochs per depth to fade into the new resolution
        :param save_freq: frequency to save checkpoints in number of epochs
        :param log_freq: frequency to log images in number of or fraction of epochs
        :param num_epochs: number of epochs to train at each depth
        :param learning_rates_dict: dictionary of learning rates per resolution (defaults to self.learning_rate)
        :param n_critic: number of times to update discriminator (Used only if loss is wgan or wgan-gp)
        :param loss: the loss function to be used. Can either be a string =>
            ["wgan-gp", "wgan", "lsgan", "lsgan-sig", "hinge", "rel-avg", "r1-reg"]
            or an instance of GANLoss
        """
        self.model_names = ["G", "D"]
        self.n_critic = n_critic
        self.loss = self.setup_loss(loss)
        os.makedirs(os.path.join(self.save_dir, "images"), exist_ok=True)

        start_epoch = epoch = 1
        total_epochs = num_epochs * self.depth
        if continue_train:
            # resume from the most recent checkpoint and derive which depth /
            # epoch within that depth we were at
            epoch = self.get_latest_network(start_epoch,
                                            max_epoch=total_epochs)
            start_depth = start_depth if start_depth != 1 else math.ceil(
                epoch / num_epochs)
            start_epoch = epoch - math.floor(epoch / num_epochs) * num_epochs

        # create dataloader
        if dataloader is None and self.dataloader is None:
            transforms = tv.transforms.Compose(
                [tn.Resize(2**(self.depth + 1)),
                 tn.ToTensor()])
            dataloader = ProGANDataLoader(data_path=data_path,
                                          transforms=transforms)
            dataloader.generate_prescaled_dataset(
                sizes=list(map(lambda x: 2**(x + 3), range(self.depth - 1))))
            self.dataloader = dataloader
        batches_dict = self.dataloader.get_batch_sizes(self)
        dataset_size = len(dataloader)
        print('# training images = %d' % dataset_size)

        # create fixed_input for logging
        fixed_input = th.randn(12, self.latent_size).to(self.device)

        print("Starting training on " + str(self.device))
        global_time = time.time()
        for depth in range(start_depth,
                           self.depth if until_depth is None else until_depth):
            current_res = 2**(depth + 2)
            print("Current resolution: %d x %d" % (current_res, current_res))

            # update batch size and learning rate for scale
            dataloader.set_batch_size(current_res, batches_dict[current_res])
            total_batches = dataloader.batches()
            learning_rate = learning_rates_dict.get(current_res,
                                                    self.default_rate)
            # NOTE(review): setting the .lr attribute on an Adam optimizer
            # does not change its param_groups' learning rate -- confirm
            # whether the per-resolution LR actually takes effect.
            self.D_optim.lr = self.G_optim.lr = learning_rate

            for e in range(start_epoch if depth == start_depth else 1,
                           num_epochs + 1):
                start = time.time()
                # calculate the value of alpha for fade-in effect
                alpha = min(e / (num_epochs * fade_in), 1)
                if log_freq < 1:
                    print("Start of epoch: %s / %s \t Fade in: %s" %
                          (epoch, total_epochs, alpha))

                loss_D, loss_G = 0, 0
                for i, batch in enumerate(dataloader, 1):
                    images = batch.to(self.device)
                    noise = th.randn(images.shape[0],
                                     self.latent_size).to(self.device)

                    loss_D += self.optimize_D(noise, images, depth, alpha)
                    loss_G += self.optimize_G(noise, images, depth, alpha)

                    # sub-epoch logging when log_freq is a fraction of an epoch
                    if i % math.ceil(total_batches * log_freq) == 0 and not (
                            i == 0 or i == total_batches):
                        elapsed = str(
                            datetime.timedelta(seconds=time.time() -
                                               global_time))
                        print(
                            "Elapsed: [%s] Batch: %d / %d  d_loss: %f  g_loss: %f"
                            % (elapsed, i, total_batches,
                               loss_D / math.ceil(total_batches * log_freq),
                               loss_G / math.ceil(total_batches * log_freq)))
                        loss_D, loss_G = 0, 0

                        # create a grid of samples and save it
                        gen_img_file = os.path.join(
                            self.save_dir, "images", "sample_res%d_e%d_b%d" %
                            (current_res, epoch, i) + ".png")
                        with th.no_grad():
                            self.create_grid(
                                samples=self.G(fixed_input, depth, alpha),
                                scale_factor=int(
                                    np.power(2, self.depth - depth - 2)),
                                img_file=gen_img_file,
                            )

                if log_freq < 1:
                    print("End of epoch:", epoch, "Took: ",
                          time.time() - start, "sec")

                # per-epoch logging when log_freq is a whole number of epochs
                if log_freq >= 1 and epoch % log_freq == 0 or epoch == total_epochs:
                    elapsed = str(
                        datetime.timedelta(seconds=time.time() - global_time))
                    print(
                        "Elapsed: [%s] Epoch: %d / %d  Fade in: %.02f  d_loss: %f  g_loss: %f"
                        % (elapsed, epoch, num_epochs *
                           (self.depth - 1), alpha, loss_D, loss_G))

                    # create a grid of samples and save it
                    gen_img_file = os.path.join(
                        self.save_dir, "images",
                        "sample_res%d_e%d" % (current_res, epoch) + ".png")
                    with th.no_grad():
                        self.create_grid(
                            samples=self.G(fixed_input, depth, alpha),
                            scale_factor=int(
                                np.power(2, self.depth - depth) / 4),
                            img_file=gen_img_file,
                        )

                if epoch % save_freq == 0 or epoch == total_epochs:
                    self.save_networks(epoch)

                epoch += 1

        print("Training finished, took: ",
              datetime.timedelta(seconds=time.time() - global_time))
        self.save_networks("final")

    # used to create grid of training images for logging
    def create_grid(self, samples, scale_factor, img_file, real_imgs=False):
        """Save a square grid image of *samples*, upscaled by *scale_factor*."""
        samples = th.clamp(samples, min=0, max=1)
        if scale_factor > 1 and not real_imgs:
            samples = interpolate(samples, scale_factor=scale_factor)
        save_image(samples, img_file, nrow=int(np.sqrt(len(samples)) + 1))
def get_model(dev, z_dim, nc):
    """Build a data-parallel, double-precision VAE on *dev* with its Adam optimizer.

    :param dev: target device
    :param z_dim: latent dimensionality of the VAE
    :param nc: number of input channels
    :return: (model, optimizer) pair
    """
    network = VAE(dev=dev, z_dim=z_dim, nc=nc)
    wrapped = DataParallel(network).to(dev).double()
    optimiser = torch.optim.Adam(wrapped.parameters(), lr=1e-3)
    return wrapped, optimiser
def train_Ours(args, train_loader, val_loader, knownclass, Encoder, Decoder,
               NorClsfier, SSDClsfier, summary_writer, saver):
    """Adversarially train the open-set model (encoder + decoder + classifiers).

    Each step perturbs the inputs with PGD or FGSM (chosen by args.adv),
    then minimises a weighted sum of classification, rotation (self-supervised)
    and reconstruction losses.  Periodically evaluates on val_loader under
    attack, logs to tensorboard, and checkpoints the networks.
    """
    init_random_seed(args.manual_seed)

    criterionCls = nn.CrossEntropyLoss()
    criterionRec = nn.MSELoss()

    if args.parallel_train:
        Encoder = DataParallel(Encoder)
        Decoder = DataParallel(Decoder)
        NorClsfier = DataParallel(NorClsfier)
        SSDClsfier = DataParallel(SSDClsfier)

    optimizer = optim.Adam(
        list(Encoder.parameters()) + list(NorClsfier.parameters()) +
        list(SSDClsfier.parameters()) + list(Decoder.parameters()),
        lr=args.lr)

    # Build the adversaries.  String comparison must use '==': the original
    # 'is' relied on CPython string interning and is not guaranteed to match.
    if args.adv == 'PGDattack':
        print("**********Defense PGD Attack**********")
        from advertorch.attacks import PGDAttack
        nor_adversary = PGDAttack(predict1=Encoder,
                                  predict2=NorClsfier,
                                  nb_iter=args.adv_iter)
        rot_adversary = PGDAttack(predict1=Encoder,
                                  predict2=SSDClsfier,
                                  nb_iter=args.adv_iter)
    elif args.adv == 'FGSMattack':
        print("**********Defense FGSM Attack**********")
        from advertorch.attacks import GradientSignAttack
        nor_adversary = GradientSignAttack(predict1=Encoder,
                                           predict2=NorClsfier)
        rot_adversary = GradientSignAttack(predict1=Encoder,
                                           predict2=SSDClsfier)

    global_step = 0
    # ----------
    #  Training
    # ----------
    for epoch in range(args.n_epoch):

        Encoder.train()
        Decoder.train()
        NorClsfier.train()
        SSDClsfier.train()

        for steps, (orig, label, rot_orig,
                    rot_label) in enumerate(train_loader):

            label = lab_conv(knownclass, label)
            orig, label = orig.cuda(), label.long().cuda()
            rot_orig, rot_label = rot_orig.cuda(), rot_label.long().cuda()

            # craft adversarial examples without tracking gradients of the
            # attacked networks
            with ctx_noparamgrad_and_eval(Encoder):
                with ctx_noparamgrad_and_eval(NorClsfier):
                    with ctx_noparamgrad_and_eval(SSDClsfier):
                        adv = nor_adversary.perturb(orig, label)
                        rot_adv = rot_adversary.perturb(rot_orig, rot_label)

            latent_feat = Encoder(adv)
            norpred = NorClsfier(latent_feat)
            norlossCls = criterionCls(norpred, label)

            recon = Decoder(latent_feat)
            lossRec = criterionRec(recon, orig)

            ssdpred = SSDClsfier(Encoder(rot_adv))
            rotlossCls = criterionCls(ssdpred, rot_label)

            # weighted total loss
            loss = args.norClsWgt * norlossCls + args.rotClsWgt * rotlossCls \
                + args.RecWgt * lossRec

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            global_step += 1

            #============ print the log info ============#
            if (steps + 1) % args.log_step == 0:
                errors = OrderedDict([
                    ('loss', loss.item()),
                    ('norlossCls', norlossCls.item()),
                    ('lossRec', lossRec.item()),
                    ('rotlossCls', rotlossCls.item()),
                ])
                saver.print_current_errors((epoch + 1), (steps + 1), errors)

        # evaluate performance on validation set periodically
        if ((epoch + 1) % args.val_epoch == 0):

            # switch model to evaluation mode
            Encoder.eval()
            NorClsfier.eval()

            running_corrects = 0.0
            epoch_size = 0.0
            val_loss_list = []

            # calculate accuracy on validation set under attack
            for steps, (images, label) in enumerate(val_loader):

                label = lab_conv(knownclass, label)
                images, label = images.cuda(), label.long().cuda()

                adv = nor_adversary.perturb(images, label)

                with torch.no_grad():
                    logits = NorClsfier(Encoder(adv))
                    _, preds = torch.max(logits, 1)
                    running_corrects += torch.sum(preds == label.data)
                    epoch_size += images.size(0)

                    val_loss = criterionCls(logits, label)
                    val_loss_list.append(val_loss.item())

            val_loss_mean = sum(val_loss_list) / len(val_loss_list)
            val_acc = running_corrects.double() / epoch_size
            print('Val Acc: {:.4f}, Val Loss: {:.4f}'.format(
                val_acc, val_loss_mean))

            valinfo = {
                'Val Acc': val_acc.item(),
                # log the mean loss that is printed above (the old code
                # logged only the last batch's loss)
                'Val Loss': val_loss_mean,
            }
            for tag, value in valinfo.items():
                summary_writer.add_scalar(tag, value, (epoch + 1))

            orig_show = vutils.make_grid(orig, normalize=True, scale_each=True)
            recon_show = vutils.make_grid(recon,
                                          normalize=True,
                                          scale_each=True)
            summary_writer.add_image('Ori_Image', orig_show, (epoch + 1))
            summary_writer.add_image('Rec_Image', recon_show, (epoch + 1))

        # periodic checkpoints
        if ((epoch + 1) % args.model_save_epoch == 0):
            model_save_path = os.path.join(args.results_path,
                                           args.training_type, 'snapshots',
                                           args.datasetname + '-' + args.split,
                                           args.denoisemean,
                                           args.adv + str(args.adv_iter))
            mkdir(model_save_path)
            torch.save(
                Encoder.state_dict(),
                os.path.join(model_save_path,
                             "Encoder-{}.pt".format(epoch + 1)))
            torch.save(
                NorClsfier.state_dict(),
                os.path.join(model_save_path,
                             "NorClsfier-{}.pt".format(epoch + 1)))
            torch.save(
                Decoder.state_dict(),
                os.path.join(model_save_path,
                             "Decoder-{}.pt".format(epoch + 1)))

    # final checkpoints
    torch.save(Encoder.state_dict(),
               os.path.join(model_save_path, "Encoder-final.pt"))
    torch.save(NorClsfier.state_dict(),
               os.path.join(model_save_path, "NorClsfier-final.pt"))
    torch.save(Decoder.state_dict(),
               os.path.join(model_save_path, "Decoder-final.pt"))
def train(args):
    """Train a CCF segmentation network with visdom loss/accuracy monitoring.

    Builds train and validation loaders, optionally restores a DataParallel
    snapshot, then trains with SGD, plotting the loss each iteration and
    the validation accuracy (plus a model snapshot) each epoch.
    """
    # Setup TrainDataLoader
    trainloader = CCFLoader(args.traindir,
                            split=args.split,
                            is_transform=True,
                            img_size=(args.img_rows, args.img_cols))
    n_classes = trainloader.n_classes
    TrainDataLoader = data.DataLoader(trainloader,
                                      batch_size=args.batch_size,
                                      num_workers=8,
                                      shuffle=True)

    # Setup for validate
    valloader = CCFLoader(args.traindir,
                          split='val',
                          is_transform=True,
                          img_size=(args.img_rows, args.img_cols))
    VALDataLoader = data.DataLoader(valloader,
                                    batch_size=4,
                                    num_workers=4,
                                    shuffle=False)

    # Setup visdom for visualization
    vis = visdom.Visdom()
    assert vis.check_connection()
    loss_window = vis.line(X=np.zeros((1,)),
                           Y=np.zeros((1)),
                           opts=dict(xlabel='minibatches',
                                     ylabel='Loss',
                                     title=args.arch + ' Training Loss',
                                     legend=['Loss']))
    valacc_window = vis.line(X=np.zeros((1,)),
                             Y=np.zeros((1)),
                             opts=dict(xlabel='minibatches',
                                       ylabel='ACC',
                                       title='Val ACC',
                                       legend=['ACC']))

    # Setup Model (optionally resuming from a DataParallel snapshot)
    start_epoch = 0
    if args.snapshot is None:  # was '== None'; identity test is the idiom
        model = get_model(args.arch, n_classes)
        model = DataParallel(model.cuda(args.gpu[0]), device_ids=args.gpu)
    else:
        model = get_model(args.arch, n_classes)
        state_dict = torch.load(args.snapshot).state_dict()
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # strip the 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        model = DataParallel(model.cuda(),
                             device_ids=[i for i in range(len(args.gpu))])
        # snapshot filename encodes the epoch it was saved at
        start_epoch = int(os.path.basename(args.snapshot).split('.')[0])

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.l_rate,
                                momentum=0.99,
                                weight_decay=5e-4)

    for epoch in range(args.n_epoch):
        adjust_learning_rate(optimizer, args.l_rate, epoch, args.step)
        # fast-forward past epochs already covered by the snapshot
        if epoch < start_epoch:
            continue
        for i, (images, labels) in enumerate(TrainDataLoader):
            # Variable / loss.data[0] are pre-0.4 torch idioms kept for the
            # torch version this project targets.
            if torch.cuda.is_available():
                images = Variable(images.cuda(args.gpu[0]))
                labels = Variable(labels.cuda(args.gpu[0]))
            else:
                images = Variable(images)
                labels = Variable(labels)

            # 'n_iter' was previously named 'iter', shadowing the builtin.
            n_iter = len(TrainDataLoader) * epoch + i
            model.train()
            optimizer.zero_grad()
            outputs = model(images)
            # NOTE(review): weights_per_class is not defined in this function;
            # presumably a module-level constant -- confirm.
            if isinstance(outputs, tuple):
                # auxiliary binary-classification head contributes a weighted term
                loss = cross_entropy2d(outputs[0], labels, weights_per_class) \
                    + args.clsloss_weight * bin_clsloss(outputs[1], labels)
            else:
                loss = cross_entropy2d(outputs, labels, weights_per_class)
            loss.backward()
            optimizer.step()

            vis.line(X=torch.ones((1, 1)).cpu() * n_iter,
                     Y=torch.Tensor([loss.data[0]]).unsqueeze(0).cpu(),
                     win=loss_window,
                     update='append')
            print("Epoch [%d/%d] iteration: %d with Loss: %.4f" %
                  (epoch + 1, args.n_epoch, n_iter + 1, loss.data[0]))

        # validation
        loss, acc = validate(model, VALDataLoader, n_classes)
        vis.line(X=torch.ones((1, 1)).cpu() * (epoch + 1),
                 Y=torch.ones((1, 1)).cpu() * acc,
                 win=valacc_window,
                 update='append')
        if not os.path.exists("snapshot/{}".format(args.arch)):
            os.mkdir("snapshot/{}".format(args.arch))
        torch.save(model, "snapshot/{}/{}.pkl".format(args.arch, epoch + 1))
def train(self):
    """Fine-tune (or train from config) a GPT-2 language model.

    Supports gradient accumulation, optional apex fp16 and multi-GPU
    DataParallel.  Saves a checkpoint after every epoch and a final model
    at the end.  Progress is written through self.print_and_log.
    """
    if not self.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=self.model_config)
    else:
        self.print_and_log('加载预训练模型')
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            self.pretrained_model)
    model.train()
    model.to(self.device)

    # Report the model's parameter count.
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.print_and_log('模型参数量: {}'.format(num_parameters))

    self.print_and_log("开始加载训练集")
    train_loader = self.create_dataloader()
    self.print_and_log("训练集加载完毕")

    # Optimizer steps per epoch / in total (accounting for accumulation).
    epoch_steps = int(train_loader.sampler.num_samples / self.batch_size /
                      self.accumulation_steps)
    total_steps = epoch_steps * self.epochs
    self.print_and_log('总样本数 = {}'.format(
        train_loader.sampler.num_samples))
    self.print_and_log('epoch 步数 = {}'.format(epoch_steps))
    self.print_and_log('总步数 = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=self.lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

    # Optional mixed-precision training via NVIDIA apex.
    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=self.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        multi_gpu = True
    else:
        multi_gpu = False

    overall_step = 0
    running_loss = 0
    model.train()
    for epoch in range(self.epochs):
        self.print_and_log('epoch {}'.format(epoch + 1))
        now = datetime.now()
        self.print_and_log('time: {}'.format(now))
        optimizer.zero_grad()
        for i, batch_data in enumerate(train_loader):
            # Language modelling: labels are the (shifted-internally) inputs.
            if torch.cuda.is_available():
                # keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                passage_ids = batch_data[1].to(self.device, non_blocking=True)
                label_ids = passage_ids.clone().to(self.device,
                                                   non_blocking=True)
            else:
                # keyword_ids = batch_data[0]
                passage_ids = batch_data[1]
                label_ids = passage_ids.clone()
            outputs = model(input_ids=passage_ids, labels=label_ids)
            loss, logits = outputs[:2]

            # Multi-GPU: DataParallel returns one loss per device.
            if multi_gpu:
                loss = loss.mean()

            # Gradient accumulation: scale so the summed gradient matches.
            if self.gradient_accumulation > 1:
                loss = loss / self.gradient_accumulation

            # Mixed-precision or normal backward pass, with grad clipping.
            if self.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               self.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               self.max_grad_norm)

            # Update weights once per accumulation window.
            if (i + 1) % self.gradient_accumulation == 0:
                running_loss += loss.item()
                # optimizer.step() must run before scheduler.step(); the old
                # order skewed the warmup/decay schedule by one step.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                overall_step += 1
                # Report train loss periodically.
                if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log(
                        'now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                            datetime.now().hour,
                            datetime.now().minute, overall_step + 1,
                            epoch + 1, running_loss *
                            self.gradient_accumulation / self.log_step))
                    running_loss = 0

        # Save a checkpoint every epoch.
        if (epoch + 1) % 1 == 0:
            if not os.path.exists(self.output_dir +
                                  'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir +
                            'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(self.output_dir +
                                          'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))
            model.train()

    self.print_and_log('training finished')
    self.f_log.close()
    if not os.path.exists(self.output_dir + 'final_model'):
        os.makedirs(self.output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(self.output_dir + 'final_model')
def main():
    """Entry point for DSB3 joint nodule-detector / case-classifier training.

    Loads the two model modules named by args.model1/args.model2, optionally
    resumes from a checkpoint, handles the three test-only modes (which dump
    per-case cancer predictions to CSV and return), and otherwise alternates
    nodule-net and case-net training with periodic validation and checkpoints.
    """
    global args
    args = parser.parse_args()
    torch.manual_seed(0)

    ##################################
    # Nodule-detection model.
    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']
    save_dir = args.save_dir

    ##################################
    # Case-classification model wraps the nodule net.
    casemodel = import_module(args.model2)
    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk=topk, nodulenet=nod_net)
    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        # NOTE(review): this sets args.save_dir but the local save_dir was
        # captured above, so debug mode may not redirect output -- confirm.
        args.save_dir = 'debug'

    ################################
    # Resolve start/end epochs and the results directory.
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if args.epochs is None:  # was '== None'
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs

    ################################
    # Log and archive the source files alongside the results.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test1 != 1 and args.test2 != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    ################################
    torch.cuda.set_device(0)
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    # NOTE(review): checkpoint saving below uses case_net.module, which only
    # exists when DataParallel wraps the net (i.e. not in debug mode).
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)

    ################################
    # Test-only modes: predict cancer probabilities and write a CSV.
    # Column names must be a *list* -- the old set literal {'id','cancer'}
    # had arbitrary iteration order, so the headers could be swapped.
    if args.test1 == 1:
        testsplit = np.load('full.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('allstage1.csv', index=False)
        return

    if args.test2 == 1:
        testsplit = np.load('test.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('quick', index=False)
        return
    if args.test3 == 1:
        testsplit3 = np.load('stage2.npy')
        dataset = DataBowl3Classifier(testsplit3, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit3], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('stage2_ans.csv', index=False)
        return

    print(save_dir)
    print(args.save_freq)

    # Data loaders for the nodule detector.
    trainsplit = np.load('kaggleluna_full.npy')
    valsplit = np.load('valsplit.npy')
    testsplit = np.load('test.npy')
    dataset = DataBowl3Detector(trainsplit, config1, phase='train')
    train_loader_nod = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)
    dataset = DataBowl3Detector(valsplit, config1, phase='val')
    val_loader_nod = DataLoader(dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.workers,
                                pin_memory=True)
    optimizer = torch.optim.SGD(nod_net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    # Data loaders for the case classifier.
    trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit, config2, phase='train')
    train_loader_case = DataLoader(dataset,
                                   batch_size=args.batch_size2,
                                   shuffle=True,
                                   num_workers=args.workers,
                                   pin_memory=True)
    dataset = DataBowl3Classifier(valsplit, config2, phase='val')
    val_loader_case = DataLoader(dataset,
                                 batch_size=max([args.batch_size2, 1]),
                                 shuffle=False,
                                 num_workers=args.workers,
                                 pin_memory=True)
    dataset = DataBowl3Classifier(trainsplit, config2, phase='val')
    all_loader_case = DataLoader(dataset,
                                 batch_size=max([args.batch_size2, 1]),
                                 shuffle=False,
                                 num_workers=args.workers,
                                 pin_memory=True)
    optimizer2 = torch.optim.SGD(case_net.parameters(),
                                 args.lr,
                                 momentum=0.9,
                                 weight_decay=args.weight_decay)

    for epoch in range(start_epoch, end_epoch + 1):
        # Warm-up pass with lr=0 in debug mode on the first epoch
        # (presumably to initialise buffers/statistics -- confirm).
        if epoch == start_epoch:
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug

        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer,
                            args)
            validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            val_casenet(epoch, case_net, val_loader_case, args)
            val_casenet(epoch, case_net, all_loader_case, args)

        if epoch % args.save_freq == 0:
            # move weights to CPU so the checkpoint loads anywhere
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save({
                'epoch': epoch,
                'save_dir': save_dir,
                'state_dict': state_dict,
                'args': args
            }, os.path.join(save_dir, '%03d.ckpt' % epoch))
def main(verbose=1,
         print_freq=100,
         restore=True,
         ckpt_path=None,
         val_freq=1,
         run_id="model",
         dset_mode="grayscale_mask",
         model_type="siamese",
         dataset_name="deepfashion",
         ckpt_type="siamese",
         freeze_encoder_until_it=1000):
    """Train a contrastive (siamese/dual) model.

    Args:
        verbose, print_freq: kept for interface compatibility (unused here).
        restore: resume from this run's own checkpoint file if it exists.
        ckpt_path: explicit checkpoint to initialize from; mutually exclusive
            with ``restore``.
        val_freq: run validation every ``val_freq`` epochs.
        run_id: name used for the checkpoint file and the log directory.
        dset_mode / model_type / dataset_name / ckpt_type: project-specific
            selectors forwarded to ``get_dataset`` / ``get_model``.
        freeze_encoder_until_it: iteration after which every model parameter
            gets ``requires_grad = True``.
    """
    print("TRAINING MODEL {} ON DATASET {}".format(model_type, dataset_name))
    if restore and ckpt_path:
        # Bug fix: the original message read "0R" (zero-R) instead of "OR".
        raise RuntimeError("Specify restore OR ckpt_path")

    ckpt_savepath = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("Saving ckpts to {}".format(ckpt_savepath))
    logs_savepath = os.path.join(cfg.LOGDIR, run_id)
    print("Saving logs to {}".format(logs_savepath))
    if restore or ckpt_path:
        print("Restoring weights from {}".format(
            ckpt_savepath if restore else ckpt_path))

    if cfg.USE_GPU:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    print('DEVICE', device)

    # model
    model = get_model(model_type)
    model = DataParallel(model)
    # Must move to the target device BEFORE constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)

    # set up training
    # TODO better one?
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                                weight_decay=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)
    criterion = ContrastiveLoss()

    initial_epoch = 0
    iteration = 0
    unfrozen = False
    if ckpt_path:
        # Initialize from an explicit checkpoint, possibly of a different
        # model type (siamese weights loaded into a dual model).
        ckpt = torch.load(ckpt_path)
        state_dict = ckpt['model_state_dict']
        if ckpt_type == model_type:
            model.load_state_dict(state_dict)
        elif model_type == 'dual' and ckpt_type == 'siamese':
            model = load_siamese_ckpt_into_dual(model, state_dict)
        else:
            raise NotImplementedError()
    elif restore:
        # Resume this run's own checkpoint, if one was written already;
        # otherwise fall through and train from scratch.
        if os.path.exists(ckpt_savepath):
            print("LOADING MODEL")
            ckpt = torch.load(ckpt_savepath)
            model.load_state_dict(ckpt['model_state_dict'])
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
            dset_mode = ckpt.get('dset_mode', dset_mode)
    else:
        # NOTE(review): the original indentation was lost; this else is read
        # as belonging to the if/elif chain (unreachable while ``restore``
        # defaults to True) — confirm against upstream history.
        raise RuntimeError("Should not get here! Check for bugs")
    print("Using dset_mode {}".format(dset_mode))

    # dataset
    train_ds, test_ds = get_dataset(dataset_name, dset_mode)
    train_dl = DataLoader(train_ds, batch_size=cfg.BATCH_SIZE, shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds, batch_size=cfg.BATCH_SIZE, shuffle=False,
                         num_workers=cfg.NUM_WORKERS)

    # Bug fix: create ONE writer for the whole run instead of re-instantiating
    # (and leaking) a SummaryWriter at the top of every epoch.
    logger = SummaryWriter(logs_savepath)

    # training loop
    start = time.time()
    epoch = initial_epoch  # defined even if interrupted before the first epoch
    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            # Effectively puts the model in train mode (opposite of .eval()).
            model.train()
            print("Epoch {}".format(epoch))
            # len(train_dl) is the exact (integer) batch count; the original
            # passed the float len(train_ds)/cfg.BATCH_SIZE to tqdm.
            for i, (im1, im2, y) in tqdm(enumerate(train_dl),
                                         total=len(train_dl)):
                iteration += 1
                if not unfrozen and iteration > freeze_encoder_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True
                    for param in model.parameters():
                        param.requires_grad = True
                logger.add_scalar('DataTime', time.time() - start, iteration)

                im1 = im1.to(device)
                im2 = im2.to(device)
                y = y.to(device)
                enc1, enc2 = model(im1, im2)
                loss = criterion(enc1, enc2, y)

                # Clear gradients accumulated by the previous step.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # logging
                logger.add_scalar('TrainLoss', loss.item(), iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()

            # do some validation
            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # puts model in validation mode
                with torch.no_grad():
                    for i, (im1, im2, y) in tqdm(enumerate(test_dl),
                                                 total=len(test_dl)):
                        im1 = im1.to(device)
                        im2 = im2.to(device)
                        y = y.to(device)
                        enc1, enc2 = model(im1, im2)
                        loss = criterion(enc1, enc2, y)
                        # .item(): log a float, not a 0-d tensor.
                        logger.add_scalar('ValLoss', loss.item(), iteration)

            # end of epoch
            lr_scheduler.step()
            save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                      dset_mode, dataset_name, model_type)
    except KeyboardInterrupt:
        print('Got keyboard interrupt, saving model...')
        save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                  dset_mode, dataset_name, model_type)
    finally:
        logger.close()
class Train():
    """Trainer for attribute (race/gender/age) and face-recognition heads.

    Builds the backbone/head from ``config``, wires data loaders, optimizer
    and (optionally) LR-plateau scheduling / early stopping, and exposes
    ``run()`` as the training entry point.
    """

    def __init__(self, config):
        self.config = config
        ATTR_HEAD = {'race': RaceHead, 'gender': GenderHead, 'age': AgeHead,
                     'recognition': self.config.recognition_head}
        self.writer = SummaryWriter(config.log_path)

        # LMDB file vs. directory+list sources use different loader classes.
        if path.isfile(self.config.train_source):
            self.train_loader = LMDBDataLoader(self.config,
                                               self.config.train_source)
        else:
            self.train_loader = CustomDataLoader(self.config,
                                                 self.config.train_source,
                                                 self.config.train_list)
        class_num = self.train_loader.class_num()
        print(len(self.train_loader.dataset))
        print(f'Classes: {class_num}')

        self.model = ResNet(self.config.depth, self.config.drop_ratio,
                            self.config.net_mode)
        if self.config.attribute == 'recognition':
            # Recognition heads additionally take the margin hyperparameter.
            self.head = ATTR_HEAD[self.config.attribute](
                classnum=class_num, m=self.config.margin)
        else:
            self.head = ATTR_HEAD[self.config.attribute](classnum=class_num)

        paras_only_bn, paras_wo_bn = separate_bn_param(self.model)

        # Log the model graph with a dummy 112x112 RGB input.
        dummy_input = torch.zeros(1, 3, 112, 112)
        self.writer.add_graph(self.model, dummy_input)

        if torch.cuda.device_count() > 1:
            print(f"Model will use {torch.cuda.device_count()} GPUs!")
            self.model = DataParallel(self.model)
            self.head = DataParallel(self.head)
        self.model = self.model.to(self.config.device)
        self.head = self.head.to(self.config.device)

        # Inverse-frequency class weights for the imbalanced attributes.
        self.weights = None
        if self.config.attribute in ['race', 'gender']:
            _, self.weights = np.unique(
                self.train_loader.dataset.get_targets(), return_counts=True)
            self.weights = np.max(self.weights) / self.weights
            self.weights = torch.tensor(self.weights, dtype=torch.float,
                                        device=self.config.device)
            self.config.weights = self.weights
            print(self.weights)

        if self.config.val_source is not None:
            if self.config.attribute != 'recognition':
                if path.isfile(self.config.val_source):
                    self.val_loader = LMDBDataLoader(
                        self.config, self.config.val_source, False)
                else:
                    self.val_loader = CustomDataLoader(
                        self.config, self.config.val_source,
                        self.config.val_list, False)
            else:
                # Recognition validates on (pairs, is-same) benchmark sets.
                self.validation_list = []
                for val_name in config.val_list:
                    dataset, issame = get_val_pair(self.config.val_source,
                                                   val_name)
                    self.validation_list.append([dataset, issame, val_name])

        # BN params get no weight decay, per common fine-tuning practice.
        self.optimizer = optim.SGD(
            [{'params': paras_wo_bn,
              'weight_decay': self.config.weight_decay},
             {'params': self.head.parameters(),
              'weight_decay': self.config.weight_decay},
             {'params': paras_only_bn}],
            lr=self.config.lr, momentum=self.config.momentum)

        if self.config.resume:
            print(f'Resuming training from {self.config.resume}')
            load_state(self.model, self.head, self.optimizer,
                       self.config.resume, False)
        if self.config.pretrained:
            print(f'Loading pretrained weights from {self.config.pretrained}')
            load_state(self.model, self.head, None,
                       self.config.pretrained, True)

        print(self.config)
        self.save_file(self.config, 'config.txt')
        print(self.optimizer)
        self.save_file(self.optimizer, 'optimizer.txt')

        # Log the loss ~100x per epoch, evaluate ~5x per epoch.
        self.tensorboard_loss_every = max(len(self.train_loader) // 100, 1)
        self.evaluate_every = max(len(self.train_loader) // 5, 1)

        if self.config.lr_plateau:
            self.scheduler = ReduceLROnPlateau(
                self.optimizer, mode=self.config.max_or_min, factor=0.1,
                patience=3, verbose=True, threshold=0.001, cooldown=1)
        if self.config.early_stop:
            self.early_stop = EarlyStop(mode=self.config.max_or_min)

    def run(self):
        """Main training loop; periodically evaluates and checkpoints."""
        self.model.train()
        self.head.train()
        running_loss = 0.
        step = 0
        val_acc = 0.
        val_loss = 0.
        best_step = 0
        # Initialize to +inf for 'min' mode, -inf for 'max' mode.
        best_acc = float('Inf')
        if self.config.max_or_min == 'max':
            best_acc *= -1

        for epoch in range(self.config.epochs):
            train_logger = TrainLogger(self.config.batch_size,
                                       self.config.frequency_log)
            if epoch + 1 in self.config.reduce_lr \
                    and not self.config.lr_plateau:
                self.reduce_lr()

            for idx, data in enumerate(self.train_loader):
                imgs, labels = data
                imgs = imgs.to(self.config.device)
                labels = labels.to(self.config.device)

                self.optimizer.zero_grad()
                embeddings = self.model(imgs)
                if self.config.attribute == 'recognition':
                    # Margin-based heads need the labels at forward time.
                    outputs = self.head(embeddings, labels)
                else:
                    outputs = self.head(embeddings)

                if self.weights is not None:
                    loss = self.config.loss(outputs, labels,
                                            weight=self.weights)
                else:
                    loss = self.config.loss(outputs, labels)
                loss.backward()
                running_loss += loss.item()
                self.optimizer.step()

                if step % self.tensorboard_loss_every == 0:
                    loss_board = running_loss / self.tensorboard_loss_every
                    self.writer.add_scalar('train_loss', loss_board, step)
                    running_loss = 0.

                if step % self.evaluate_every == 0 and step != 0:
                    if self.config.val_source is not None:
                        val_acc, val_loss = self.evaluate(step)
                        self.model.train()
                        self.head.train()
                        best_acc, best_step = self.save_model(
                            val_acc, best_acc, step, best_step)
                        print(f'Best accuracy: {best_acc:.5f} '
                              f'at step {best_step}')
                    else:
                        save_state(self.model, self.head, self.optimizer,
                                   self.config, 0, step)

                train_logger(epoch, self.config.epochs, idx,
                             len(self.train_loader), loss.item())
                step += 1

            if self.config.lr_plateau:
                self.scheduler.step(val_acc)
            if self.config.early_stop:
                self.early_stop(val_acc)
                if self.early_stop.stop:
                    print("Early stopping model...")
                    break

        val_acc, val_loss = self.evaluate(step)
        # Bug fix: save_model returns (best_acc, best_step); the original
        # assigned the whole tuple to best_acc, so the final print showed a
        # tuple and best_step was stale.
        best_acc, best_step = self.save_model(val_acc, best_acc, step,
                                              best_step)
        print(f'Best accuracy: {best_acc} at step {best_step}')

    def save_model(self, val_acc, best_acc, step, best_step):
        """Checkpoint if ``val_acc`` beats ``best_acc``; return new bests."""
        if (self.config.max_or_min == 'max' and val_acc > best_acc) or \
           (self.config.max_or_min == 'min' and val_acc < best_acc):
            best_acc = val_acc
            best_step = step
            save_state(self.model, self.head, self.optimizer, self.config,
                       val_acc, step)
        return best_acc, best_step

    def reduce_lr(self):
        """Divide every param group's learning rate by 10."""
        for params in self.optimizer.param_groups:
            params['lr'] /= 10
        print(self.optimizer)

    def tensorboard_val(self, accuracy, step, loss=0, dataset=''):
        """Log validation accuracy (and, for attributes, loss)."""
        self.writer.add_scalar('{}val_acc'.format(dataset), accuracy, step)
        if self.config.attribute != 'recognition':
            self.writer.add_scalar('val_loss', loss, step)

    def evaluate(self, step):
        """Run validation; return (val_acc, val_loss).

        Attributes use the val loader; recognition averages accuracy over
        every benchmark in ``self.validation_list`` (val_loss stays 0).
        """
        if self.config.attribute != 'recognition':
            val_acc, val_loss = self.evaluate_attribute()
            self.tensorboard_val(val_acc, step, val_loss)
        elif self.config.attribute == 'recognition':
            val_loss = 0
            val_acc = 0
            print('Validating...')
            for idx, validation in enumerate(self.validation_list):
                dataset, issame, val_name = validation
                acc, std = self.evaluate_recognition(dataset, issame)
                self.tensorboard_val(acc, step, dataset=f'{val_name}_')
                print(f'{val_name}: {acc:.5f}+-{std:.5f}')
                val_acc += acc
            val_acc /= (idx + 1)
            self.tensorboard_val(val_acc, step)
            print(f'Mean accuracy: {val_acc:.5f}')
        return val_acc, val_loss

    def evaluate_attribute(self):
        """Full pass over the val loader; return (accuracy, loss).

        For 'age' the metric is mean absolute error on summed per-bin
        predictions; otherwise plain top-1 accuracy.
        """
        self.model.eval()
        self.head.eval()
        y_true = torch.tensor([], dtype=self.config.output_type,
                              device=self.config.device)
        all_outputs = torch.tensor([], device=self.config.device)
        with torch.no_grad():
            for imgs, labels in iter(self.val_loader):
                imgs = imgs.to(self.config.device)
                labels = labels.to(self.config.device)
                embeddings = self.model(imgs)
                outputs = self.head(embeddings)
                y_true = torch.cat((y_true, labels), 0)
                all_outputs = torch.cat((all_outputs, outputs), 0)

        if self.weights is not None:
            loss = round(self.config.loss(all_outputs, y_true,
                                          weight=self.weights).item(), 4)
        else:
            loss = round(self.config.loss(all_outputs, y_true).item(), 4)

        y_true = y_true.cpu().numpy()
        if self.config.attribute == 'age':
            y_pred = all_outputs.cpu().numpy()
            y_pred = np.round(y_pred, 0)
            y_pred = np.sum(y_pred, axis=1)
            y_true = np.sum(y_true, axis=1)
            # NOTE: "accuracy" here is actually the MAE (lower is better),
            # matching config.max_or_min == 'min' for age.
            accuracy = round(mean_absolute_error(y_true, y_pred), 4)
        else:
            _, y_pred = torch.max(all_outputs, 1)
            y_pred = y_pred.cpu().numpy()
            accuracy = round(np.sum(y_true == y_pred) / len(y_pred), 4)
        return accuracy, loss

    def evaluate_recognition(self, samples, issame, nrof_folds=10, tta=False):
        """Embed ``samples`` in batches and run pair verification.

        Returns (mean accuracy, std) across ``nrof_folds`` folds. ``tta`` is
        kept for interface compatibility (unused).
        """
        self.model.eval()
        embeddings = np.zeros([len(samples), self.config.embedding_size])
        with torch.no_grad():
            # Cleanup: dropped the dead `idx = 0` / `idx += batch_size`
            # bookkeeping — the range() loop already advances idx.
            for idx in range(0, len(samples), self.config.batch_size):
                batch = torch.tensor(
                    samples[idx:idx + self.config.batch_size])
                embeddings[idx:idx + self.config.batch_size] = \
                    self.model(batch.to(self.config.device)).cpu()
        tpr, fpr, accuracy, best_thresholds = verification.evaluate(
            embeddings, issame, nrof_folds)
        return round(accuracy.mean(), 5), round(accuracy.std(), 5)

    def save_file(self, string, file_name):
        """Write str(string) into ``work_path/file_name``."""
        # Bug fix: use a context manager so the handle is closed even if
        # the write raises.
        with open(path.join(self.config.work_path, file_name), "w") as file:
            file.write(str(string))
def main():
    """Entry point for detector training: parse args, build net/data, run
    the train/validate loop.

    Uses the module-level ``parser`` and ``config_training``; imports the
    model module named by ``args.model``; writes logs, a source snapshot and
    checkpoints under ``results/``.
    """
    global args
    args = parser.parse_args()
    # Fixed seed / device 0 for reproducibility.
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        checkpoint = torch.load(args.resume)
        # Resume only restores weights; epoch/save_dir come from the args.
        save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            # Default to a timestamped run directory when none was given.
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    # exist_ok avoids the racy exists()-then-makedirs() pattern.
    os.makedirs(save_dir, exist_ok=True)

    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        # Training run: tee stdout to the log file and snapshot the sources.
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    print("arg", args.gpu)
    print("num_gpu", n_gpu)
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)

    datadir = config_training['preprocess_result_path']
    print("datadir", datadir)
    print("pad_val", config['pad_value'])
    print("aug type", config['augtype'])

    dataset = data.DataBowl3Detector(datadir, 'train_luna_9.npy', config,
                                     phase='train')
    print("len train_dataset", len(dataset))  # len() over dunder call
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data.DataBowl3Detector(datadir, 'val9.npy', config,
                                     phase='val')
    print("len val_dataset", len(dataset))
    val_loader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        """Step schedule: lr for the first 50% of epochs, 0.1*lr to 80%,
        0.01*lr after that."""
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    best_val_loss = 100
    best_mal_loss = 100
    for epoch in range(start_epoch, args.epochs + 1):
        print("epoch", epoch)
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        best_val_loss, best_mal_loss = validate(val_loader, net, loss,
                                                best_val_loss, best_mal_loss,
                                                epoch, save_dir)