def densenet201(config_channels, anchors, num_cls, **kwargs):
    """Build a DenseNet-201 detection backbone, optionally warm-started from the model zoo.

    When the config flag [model] pretrained is set, downloads the zoo weights
    and copies over every tensor whose key also exists in this model.
    """
    model = DenseNet(
        config_channels, anchors, num_cls,
        num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
        **kwargs)
    if config_channels.config.getboolean('model', 'pretrained'):
        url = model_urls['densenet201']
        logging.info('use pretrained model: ' + url)
        state_dict = model.state_dict()
        zoo_weights = model_zoo.load_url(url)
        # Keep only the keys this model actually has; extra zoo keys are dropped.
        state_dict.update({k: v for k, v in zoo_weights.items() if k in state_dict})
        model.load_state_dict(state_dict)
    return model
def resnet50(config_channels, anchors, num_cls, **kwargs):
    """Build a ResNet-50 detection backbone, optionally warm-started from the model zoo.

    Zoo weights are copied only for keys that exist in this model's state dict.
    """
    model = ResNet(config_channels, anchors, num_cls, Bottleneck, [3, 4, 6, 3], **kwargs)
    if not config_channels.config.getboolean('model', 'pretrained'):
        return model
    url = _model.model_urls['resnet50']
    logging.info('use pretrained model: ' + url)
    state_dict = model.state_dict()
    for name, tensor in model_zoo.load_url(url).items():
        if name in state_dict:
            state_dict[name] = tensor
    model.load_state_dict(state_dict)
    return model
def finetune(self, model, path):
    """Load matching weights from a finetune checkpoint into `model`.

    `path` may be a checkpoint file or a directory (resolved via
    utils.train.load_model). Keys matching self.args.ignore are skipped;
    keys missing from the checkpoint are logged and left unchanged.
    """
    if os.path.isdir(path):
        path, _step, _epoch = utils.train.load_model(path)
    loaded = torch.load(path, map_location=lambda storage, loc: storage)
    current = model.state_dict()
    ignore = utils.RegexList(self.args.ignore)
    for name in current:
        if ignore(name):
            continue
        try:
            current[name] = loaded[name]
        except KeyError:
            logging.warning('%s not in finetune file %s' % (name, path))
    model.load_state_dict(current)
def __init__(self, args=args):
    """Set up the multi-model super-resolution trainer from parsed `args`.

    Builds (or reloads) `args.action_space` copies of the chosen SR network,
    one Adam optimizer and StepLR scheduler per copy, plus the RL agent.
    NOTE(review): the default `args=args` binds the module-level `args`
    object at import time — TODO confirm that is intentional.
    """
    # RANDOM MODEL INITIALIZATION FUNCTION
    def init_weights(m):
        # Xavier-init only linear / conv layers; other modules untouched.
        if isinstance(m, torch.nn.Linear) or isinstance(m, torch.nn.Conv2d):
            torch.nn.init.xavier_uniform_(m.weight.data)

    # INITIALIZE VARIABLES
    self.SR_COUNT = args.action_space
    SRMODEL_PATH = args.srmodel_path  # NOTE(review): assigned but unused in this method
    self.batch_size = args.batch_size
    self.TRAINING_LRPATH = glob.glob(os.path.join(args.training_lrpath, "*"))
    self.TRAINING_HRPATH = glob.glob(os.path.join(args.training_hrpath, "*"))
    # Sort so LR/HR file lists line up index-for-index.
    self.TRAINING_LRPATH.sort()
    self.TRAINING_HRPATH.sort()
    self.PATCH_SIZE = args.patchsize
    self.patchinfo_dir = args.patchinfo
    self.TESTING_PATH = glob.glob(os.path.join(args.testing_path, "*"))
    self.LR = args.learning_rate
    self.UPSIZE = args.upsize
    self.step = 0
    self.name = args.name
    if args.name != 'none':
        self.logger = logger.Logger(args.name)  # create our logger for tensorboard in log directory
    else:
        self.logger = None
    self.device = torch.device(args.device)  # determine cpu/gpu

    # DEFAULT START OR START ON PREVIOUSLY TRAINED EPOCH
    if args.model_dir != "":
        # Resume: load() rebuilds models, optimizers, schedulers and the agent.
        self.load(args)
        print('continue training for model: ' + args.model_dir)
    else:
        self.SRmodels = []
        self.SRoptimizers = []
        self.schedulers = []
        # LOAD A COPY OF THE MODEL N TIMES
        for i in range(self.SR_COUNT):
            if args.model == 'ESRGAN':
                model = arch.RRDBNet(3, 3, 64, 23, gc=32)
                model.load_state_dict(torch.load(args.ESRGAN_PATH), strict=True)
                print('ESRGAN loaded')
            elif args.model == 'random':
                model = arch.RRDBNet(3, 3, 64, 23, gc=32)
                model.apply(init_weights)
                print('Model RRDB Loaded with random weights...')
            elif args.model == 'RCAN':
                torch.manual_seed(args.seed)
                checkpoint = utility.checkpoint(args)
                if checkpoint.ok:
                    module = import_module('model.' + args.model.lower())
                    model = module.make_model(args).to(self.device)
                    kwargs = {}
                    model.load_state_dict(torch.load(args.pre_train, **kwargs), strict=False)
                else:
                    # NOTE(review): if checkpoint is not ok, `model` may be unbound
                    # (first iteration) or stale — the append below would then
                    # raise NameError or reuse the previous copy. Verify upstream.
                    print('error')
            self.SRmodels.append(model)
            self.SRmodels[-1].to(self.device)
            self.SRoptimizers.append(torch.optim.Adam(model.parameters(), lr=1e-4))
            self.schedulers.append(
                torch.optim.lr_scheduler.StepLR(self.SRoptimizers[-1], 10000, gamma=0.1))
        # self.patchinfo = np.load(self.patchinfo_dir)
        # NOTE(review): agent creation placed in the fresh-start branch because
        # load() already constructs self.agent on resume — confirm against the
        # original (pre-collapse) indentation.
        self.agent = agent.Agent(args)
# NOTE(review): fragment of a larger script — `padding`, `sample_size`,
# `batch_size`, `use_gpu`, `low_resolution_samples` and the imports are
# defined outside this view.
half_padding = padding / 2
output_length = sample_size - padding
# NOTE(review): Python 2 print *statement* — this fragment targets
# Python 2 / legacy PyTorch (it also uses `Variable`). Do not assume Py3.
print low_resolution_samples.shape
# Wrap the low-res samples (with dummy zero labels) for batched iteration.
lowres_set = data.TensorDataset(
    torch.from_numpy(low_resolution_samples),
    torch.from_numpy(np.zeros(low_resolution_samples.shape[0])))
lowres_loader = torch.utils.data.DataLoader(lowres_set, batch_size=batch_size, shuffle=False)
# The "high-res" loader aliases the low-res loader here (inference-only pass).
hires_loader = lowres_loader
model = model.Net(40, 28)
model.load_state_dict(torch.load('../model/pytorch_model_12000'))
if use_gpu:
    model = model.cuda()
_loss = nn.MSELoss()
running_loss = 0.0
running_loss_validate = 0.0
reg_loss = 0.0
for i, (v1, v2) in enumerate(zip(lowres_loader, hires_loader)):
    _lowRes, _ = v1
    _highRes, _ = v2
    _lowRes = Variable(_lowRes).float()
    _highRes = Variable(_highRes).float()
def load(self, args):
    """Rebuild the agent plus `args.action_space` SR models/optimizers/schedulers.

    If args.model_dir is set, agent and per-model weights ("sisr<i>") come from
    that checkpoint; otherwise weights come from the architecture-specific
    pretrained files (or random init when args.random is set).
    """
    if args.model_dir != "":
        loadedparams = torch.load(args.model_dir, map_location=self.device)
        self.agent = agent.Agent(args, chkpoint=loadedparams)
    else:
        self.agent = agent.Agent(args)
    self.SRmodels = []
    self.SRoptimizers = []
    self.schedulers = []
    for i in range(args.action_space):
        # CREATE THE ARCH
        if args.model == 'basic':
            model = arch.RRDBNet(3, 3, 32, args.d, gc=8, upsize=args.upsize)
        elif args.model == 'ESRGAN':
            model = arch.RRDBNet(3, 3, 64, 23, gc=32, upsize=args.upsize)
        elif args.model == 'RCAN':
            torch.manual_seed(args.seed)
            checkpoint = utility.checkpoint(args)
            if checkpoint.ok:
                module = import_module('model.rcan')
                model = module.make_model(args).to(self.device)
                # NOTE(review): `kwargs` is only bound on this path; the RCAN
                # load below relies on it — fine today, fragile if branches move.
                kwargs = {}
            else:
                print('error loading RCAN model. QUITING')
                quit()
        # LOAD THE WEIGHTS
        if args.model_dir != "":
            model.load_state_dict(loadedparams["sisr" + str(i)])
            print('continuing training')
        elif args.random:
            print('random init')
        elif args.model == 'ESRGAN':
            # model.load_state_dict(torch.load(args.ESRGAN_PATH),strict=True)
            # Partial load: keep only checkpoint keys that exist in this model.
            loaded_dict = torch.load(args.ESRGAN_PATH)
            model_dict = model.state_dict()
            loaded_dict = {k: v for k, v in loaded_dict.items() if k in model_dict}
            model_dict.update(loaded_dict)
            model.load_state_dict(model_dict)
        elif args.model == 'RCAN':
            print('RCAN loaded!')
            model.load_state_dict(torch.load(args.pre_train, **kwargs), strict=True)
        elif args.model == 'basic':
            # NOTE(review): d==1 loads with strict=False while d==2/4/8 use
            # strict=True — confirm the asymmetry is intentional.
            if args.d == 1:
                model.load_state_dict(torch.load(args.basicpath_d1), strict=False)
            elif args.d == 2:
                model.load_state_dict(torch.load(args.basicpath_d2), strict=True)
            elif args.d == 4:
                model.load_state_dict(torch.load(args.basicpath_d4), strict=True)
            elif args.d == 8:
                model.load_state_dict(torch.load(args.basicpath_d8), strict=True)
            else:
                print('no pretrained model available. Random initialization of basic block')
        self.SRmodels.append(model)
        self.SRmodels[-1].to(self.device)
        self.SRoptimizers.append(torch.optim.Adam(model.parameters(), lr=1e-5))
        scheduler = torch.optim.lr_scheduler.StepLR(self.SRoptimizers[-1], 200, gamma=0.8)
        self.schedulers.append(scheduler)
train_inputs = train_inputs.cuda() train_labels = train_labels.cuda() test_inputs = test_inputs.cuda() test_labels = test_labels.cuda() epoch = 0 lr = args.lr # network part model = model.network(args, input_dim=train_inputs.shape[1], class_num= args.class_num) print(train_inputs.shape, train_labels.shape) optimizer = torch.optim.Adam(model.parameters(), lr = args.lr, weight_decay = args.l2) # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9) # optimizer = torch.optim.Adadelta(model.parameters(), lr = args.lr, weight_decay = args.l2) if args.load_model_path != "": opti_path = args.load_model_path + "_opti" model.load_state_dict(torch.load(args.load_model_path)) if args.gpu: model.cuda() epoch = 0 log_test = utils.setup_logger(0, 'test_log', os.path.join(args.log_path, 'ds_test_log.txt')) log_train = utils.setup_logger(0, 'train_log', os.path.join(args.log_path, 'ds_train_log.txt')) best_accuracy, best_f1, best_train_accuracy = 0.0, 0.0, 0.0 early_stop_counter = 0 loss_function = nn.CrossEntropyLoss() ftrain_accuracy = open((os.path.join(args.log_path, 'l2_'+str(args.l2)+'ds_train_accuracy.txt')), "w") floss = open((os.path.join(args.log_path, 'l2_'+str(args.l2)+'ds_loss.txt')), "w") ftest = open((os.path.join(args.log_path, 'l2_'+str(args.l2)+'_ds_test_accuracy.txt')), "w") # train while epoch<args.max_epoch:
# NOTE(review): script fragment — `data`, `config` and imports are defined
# earlier in the original file.
img = cv2.imread('../wheel.jfif', 0)  # steering-wheel image, grayscale
rows, cols = img.shape
path = data.path.values
target = data.target.values
dataset = dataset.DrivingDataset(path, target, config.IMAGE_FOLDER, (66, 200))
train_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
model = model.SelfDrivingModel()
model.load_state_dict(torch.load('../models/2.h5'))
model.eval()
i = 0
s_a = 0
# Replay frames until 'q' is pressed; compare predicted vs. recorded steering.
while (cv2.waitKey(10) != ord('q')):
    # NOTE(review): `i` is never incremented in the visible code — the loop
    # re-reads frame 0 each pass unless the increment lies outside this view.
    data = dataset[i]
    image = data['image']
    image = image.view(1, 3, 66, 200)  # add batch dim: (1, C, H, W)
    target = data['target']
    full_img = cv2.imread(config.IMAGE_FOLDER + str(i) + ".jpg")
    rad = model.forward(image).detach().numpy()
    degree = rad * 180.0 / 3.14159265  # radians -> degrees
    print(f'predicted values : {rad} , original value {target}')
def trainClassier(params):
    """Train an image classifier on ImageFolder data described by `params`.

    Expects params to provide: processed, val_data_set, useGPU, batchSize,
    model_path, learningRate, numEpochs, saved_path. Saves a checkpoint every
    5 epochs and runs validation after every epoch.
    """
    def validation(test_loader, model):
        # Returns (accuracy, mean cross-entropy) over test_loader.
        correct = 0
        total_test = 0
        cnt = 0
        # NOTE(review): accumulated as a tensor (loss, not loss.item()), so the
        # returned mean is a 0-dim tensor — works for printing, but verify
        # callers don't expect a float.
        cross_entropy = 0
        model.eval()
        with torch.no_grad():
            for sample_batch in test_loader:
                images, labels = sample_batch
                if params.useGPU:
                    images, labels = Variable(images.cuda()), Variable(labels.cuda())
                out = model.forward(images)
                loss = torch.nn.CrossEntropyLoss()(out, labels)
                _, pred = torch.max(out, 1)
                correct += (pred == labels).sum().item()
                cross_entropy += loss
                total_test += labels.size(0)
                cnt += 1
        return correct / total_test, cross_entropy / cnt

    # Training data: resize -> random crop/flip/rotate -> normalize.
    train_data = ImageFolder(
        root=params.processed,
        transform=transforms.Compose([
            # transforms.Grayscale(),
            transforms.Resize(380),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(360),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ]))
    print(train_data.classes)
    train_loader = DataLoader(train_data, batch_size=params.batchSize, shuffle=True)
    # Validation data: no augmentation (and no resize — assumes pre-sized images).
    testset = ImageFolder(
        root=params.val_data_set,
        transform=transforms.Compose([
            # transforms.Grayscale(),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ]))
    test_loader = DataLoader(testset, batch_size=4, shuffle=False)
    model = model.classifier(pre_train=True)
    # Always wrap in DataParallel so saved/loaded keys carry the `module.` prefix.
    if params.useGPU:
        print('gpu is available')
        model = torch.nn.DataParallel(model, device_ids=[0]).cuda()
    else:
        model = torch.nn.DataParallel(model)
    try:
        model.load_state_dict(torch.load(params.model_path))
        print('load model successfully')
    except:
        # NOTE(review): bare except silently swallows *any* failure (bad file,
        # key mismatch, CUDA error), not just a missing checkpoint.
        print('cannot find model')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), params.learningRate)
    for epoch in range(0, params.numEpochs):
        model.train()
        loss_sum = 0
        for batch_n, batch in enumerate(train_loader):
            start_time = time.time()
            inputs, labels = batch
            inputs, labels = Variable(inputs), Variable(labels)
            if params.useGPU:
                inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            optimizer.zero_grad()
            outputs = model.forward(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            # Report a 10-batch rolling average plus batch accuracy.
            if batch_n % 10 == 9:
                _, pred = torch.max(outputs, 1)
                correct = (pred == labels).sum().item()
                print('Epoch: [{}/{}], batch: {}, took: {:.3f}, loss: {:.5f}, Acc: {:.5f}'
                      .format(epoch, params.numEpochs, batch_n,
                              time.time() - start_time, loss_sum / 10,
                              correct / labels.size(0)))
                loss_sum = 0
        acc, loss = validation(test_loader, model)
        print('Epoch: [{}/{}], acc: {:.5f}, loss: {:.5f}'.format(
            epoch, params.numEpochs, acc, loss))
        if epoch % 5 == 4:
            torch.save(model.state_dict(), params.saved_path + str(epoch) + 'resnet34.pt')
def train(model, train_loader):
    """Train a character-RNN language classifier with manual SGD updates.

    Resumes from "rnn_state.pt" when present; otherwise starts a fresh state
    dict. Saves the full training state (weights + loss history + epoch)
    after every epoch. Uses legacy PyTorch idioms (`Variable`,
    `loss.data[0]`, two-arg `Tensor.add_`) — this file predates PyTorch 0.4.
    """
    filename = "rnn_state.pt"
    try:
        state = torch.load(filename)
        model.load_state_dict(state["state_dict"])
        # optimizer.load_state_dict(state["optimizer_dict"])
    except:
        # raise
        # NOTE(review): bare except treats any load failure as "no checkpoint".
        print("Could not load model file")
        state = {}
        state["train_loss_history"] = []
        state["test_loss_history"] = []
        state["epoch"] = 0
    criterion = nn.NLLLoss()
    lr = 0.005
    print_every = 5000  # NOTE(review): defined but unused in the visible code
    plot_every = 1000
    n_epoch = 50
    train_loss = 0.0
    count = 0
    while state["epoch"] < n_epoch:
        n_batch = len(train_loader)
        model.train()
        for i_batch, batch_data in enumerate(train_loader, 0):
            name_tensor = Variable(batch_data["name_tensor"])
            lang_tensor = Variable(batch_data["lang_tensor"])
            # Drop the batch dimension: one name per step (batch size 1).
            name_tensor = name_tensor.view(name_tensor.size()[1:])
            lang_tensor = lang_tensor.view(1)
            model.zero_grad()
            hidden = model.initHidden()
            # Feed the name one character at a time; classify from last output.
            n_letters = name_tensor.size()[0]
            for i in range(n_letters):
                output, hidden = model(name_tensor[i], hidden)
            loss = criterion(output, lang_tensor)
            loss.backward()
            train_loss += loss.data[0]
            # Manual SGD step (legacy two-arg add_: p -= lr * grad).
            for p in model.parameters():
                p.data.add_(-lr, p.grad.data)
            if count % plot_every == 0:
                train_loss_avg = train_loss / plot_every
                print("Epoch: %i/%i, Batch: %i/%i, Loss: %f, %s"
                      % (state["epoch"], n_epoch, i_batch, n_batch,
                         train_loss_avg, batch_data["lang"]))
                state["train_loss_history"].append(train_loss_avg)
                train_loss = 0.0
                # Live-update the loss curves.
                plt.cla()
                plt.plot(state["train_loss_history"])
                plt.plot(state["test_loss_history"])
                plt.draw()
                plt.pause(0.1)
            count += 1
        print("\nEpoch: %i/%i Saved!" % (state["epoch"], n_epoch))
        state["state_dict"] = model.state_dict()
        # state["optimizer_dict"] = optimizer.state_dict()
        state["epoch"] += 1
        torch.save(state, filename)
"WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed(args.seed) if args.temperature < 1e-3: parser.error("--temperature has to be greater or equal 1e-3") #with open(args.checkpoint, 'rb') as f: vocab_obj = Vocab(args.vocab_dir, 0, args.glove_file) ntokens, emsize = vocab_obj.size() model = model.RNNModel(vocab_obj, args.model, ntokens, emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied) model.load_state_dict(torch.load('/content/drive/My Drive/lngmodel')) model.eval() if args.model == 'QRNN': model.reset() if args.cuda: model.cuda() else: model.cpu() #corpus = data.Corpus(args.data) #ntokens = len(corpus.dictionary) hidden = model.init_hidden(1) input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) if args.cuda: input.data = input.data.cuda()
def main():
    """Embed test images and print clusters of matching posting_ids.

    Loads a backbone+linear-embedding checkpoint, embeds every image in the
    dataset, then groups postings whose pairwise embedding distance is under
    a fixed threshold, printing one comma-separated group per row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', required=True)
    parser.add_argument('--dataset', default='/home/nax/Downloads/shopee-product-matching/test.csv')
    parser.add_argument('--config', default='configs/baseline.py')
    parser.add_argument('--apex', action='store_true')
    parser.add_argument('--embedding-size', type=int)
    parser.add_argument('--batch-size', type=int)
    parser.add_argument('--image-size', type=int)
    args = parser.parse_args()
    # Distance cutoff for "same product" — tuned offline; source of the value
    # is not visible here.
    threshold = 0.9075778192639249
    config = util.load_config(args.config)
    util.update_args(args, config)
    if args.apex:
        from apex import amp
    val_dataset = data.DMLDataset(args.dataset, is_training=False, is_testing=True)
    val_loader = data_util.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        collate_fn=val_dataset.collate_fn
    )
    backbone = util.get_class_fn(config['model'])()
    # Probe the backbone's output width with a dummy forward pass.
    backbone.eval()
    in_size = backbone(torch.rand(1, 3, 224, 224)).squeeze().size(0)
    backbone.train()
    emb = torch.nn.Linear(in_size, args.embedding_size)
    model = torch.nn.Sequential(backbone, emb)
    model.eval()
    # NOTE(review): DataParallel wrapping order differs between the apex and
    # non-apex paths on purpose (amp.initialize must see the bare module) —
    # the exact nesting here is reconstructed; confirm against the original.
    if not args.apex:
        model = torch.nn.DataParallel(model)
    model = model.cuda()
    if args.apex:
        model = amp.initialize(model, opt_level='O1')
        model = torch.nn.DataParallel(model)
    states = torch.load(args.model)
    model.load_state_dict(states['state_dict'])
    if args.apex:
        amp.load_state_dict(states['amp'])
    model.eval()
    all_fvecs = []
    all_ids = []
    for batch in val_loader:
        all_fvecs.append(model(batch['image'].cuda()).detach().cpu().numpy())
        all_ids += batch['posting_id']
    all_fvecs = np.vstack(all_fvecs)
    all_ids = np.asarray(all_ids)
    # Full pairwise distance matrix; each row's sub-threshold columns form a group.
    D = cdist(all_fvecs, all_fvecs)
    preds = D <= threshold
    for i, p in enumerate(preds):
        print(','.join(list(all_ids[p])))
# NOTE(review): fragment — `debias_model`, `tagging_model`, `ARGS`, `CUDA`,
# `eval_dataloader`, `tok2id` and the *_utils modules are defined above this view.
debias_model.load_state_dict(torch.load(ARGS.debias_checkpoint, map_location='cpu'))
print('DONE.')
joint_model = joint_model.JointModel(
    debias_model=debias_model, tagging_model=tagging_model)
if CUDA:
    joint_model = joint_model.cuda()
if ARGS.checkpoint is not None and os.path.exists(ARGS.checkpoint):
    print('LOADING FROM ' + ARGS.checkpoint)
    # TODO(rpryzant): is there a way to do this more elegantly?
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
    # Load on CPU first, then move to GPU, to survive cross-device checkpoints.
    if CUDA:
        joint_model.load_state_dict(torch.load(ARGS.checkpoint, map_location='cpu'))
        joint_model = joint_model.cuda()
    else:
        joint_model.load_state_dict(torch.load(ARGS.checkpoint, map_location='cpu'))
    print('...DONE')

# # # # # # # # # # # # EVAL # # # # # # # # # # # # # #
joint_model.eval()
hits, preds, golds, srcs = joint_utils.run_eval(
    joint_model, eval_dataloader, tok2id, ARGS.inference_output,
    ARGS.max_seq_len, ARGS.beam_width)
print('eval/bleu', seq2seq_utils.get_bleu(preds, golds), 0)
print('eval/true_hits', np.mean(hits), 0)
# learning_rate=6e-4 # lr_decay=True # warmup_tokens=512*20 # final_tokens=200*len(pretrain_dataset)*block_size # num_workers=4 # Hyperparameters for finetuning WITH a pretrained model: # max_epochs=10 # batch_size=256 # learning_rate=6e-4 # lr_decay=True # warmup_tokens=512*20 # final_tokens=200*len(pretrain_dataset)*block_size # num_workers=4 if args.reading_params_path: model.load_state_dict(torch.load(args.reading_params_path), strict=False) tconf = trainer.TrainerConfig(max_epochs=10, batch_size=256, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size, num_workers=4) else: tconf = trainer.TrainerConfig(max_epochs=75, batch_size=256, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size, num_workers=4) finetune_dataset = dataset.NameDataset(pretrain_dataset, open(args.finetune_corpus_path, encoding="utf8").read()) t = trainer.Trainer(model, finetune_dataset, None, tconf) t.train() torch.save(model.state_dict(), args.writing_params_path) elif args.function == 'evaluate': assert args.outputs_path is not None
os.chdir(CHEXNET_PATH) # original saved file with DataParallel loaded = torch.load('./pretrained/model.pth.tar') state_dict = loaded['state_dict'] # create new OrderedDict that does not contain `module.` # initialize and load the model model = model.DenseNet121(N_CLASSES).cuda() model = torch.nn.DataParallel(model).cuda() new_state_dict = OrderedDict() for k, v in state_dict.items(): for layer in ['norm', 'relu', 'conv']: if re.search(r'.' + layer + '.[0-9]', k): k = k.replace('.' + layer + '.', '.' + layer) new_state_dict[k] = v # load params model.load_state_dict(new_state_dict) model.cpu() print('Now see converted state dict:') print(new_state_dict.keys()) # saving model: state = { 'epoch': loaded['epoch'], 'arch': loaded['arch'], 'state_dict': model.state_dict(), 'optimizer': loaded['optimizer'], } torch.save(state, './pretrained/model2.pth')
def densenet161(config, anchors, num_cls, pretrained=False, **kwargs):
    """Build a DenseNet-161 detection backbone.

    When `pretrained` is true, loads the torchvision model-zoo weights
    (strict load: zoo keys must match the model exactly).
    """
    net = DenseNet(
        config, anchors, num_cls,
        num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
        **kwargs)
    if pretrained:
        zoo_weights = model_zoo.load_url(model_urls['densenet161'])
        net.load_state_dict(zoo_weights)
    return net
# NOTE(review): fragment of a fine-tuning branch — `args`, `model`,
# `pretrain_dataset`, `block_size`, `dataset`, `trainer` are defined earlier.
text = open(args.finetune_corpus_path).read()
finetune_dataset = dataset.NameDataset(pretrain_dataset, text)
# Hyperparameters for finetuning WITHOUT a pretrained model:
tconf = trainer.TrainerConfig(max_epochs=75,
                              batch_size=256,
                              learning_rate=6e-4,
                              lr_decay=True,
                              warmup_tokens=512 * 20,
                              final_tokens=200 * len(pretrain_dataset) * block_size,
                              num_workers=4)
# 1. If args.reading_params_path is specified, load these parameters
#    into the model
if args.reading_params_path is not None:
    model.load_state_dict(torch.load(args.reading_params_path))
    # Hyperparameters for finetuning WITH a pretrained model:
    # fewer epochs, since we start from trained weights.
    tconf = trainer.TrainerConfig(max_epochs=10,
                                  batch_size=256,
                                  learning_rate=6e-4,
                                  lr_decay=True,
                                  warmup_tokens=512 * 20,
                                  final_tokens=200 * len(pretrain_dataset) * block_size,
                                  num_workers=4)
# 3. Save the resulting model in args.writing_params_path
tconf.ckpt_path = args.writing_params_path
# NOTE(review): rebinding `trainer` shadows the module of the same name.
trainer = trainer.Trainer(model, finetune_dataset, None, tconf)
trainer.train()
random.shuffle(dataset) n_datasets = [] for dataset in datasets: img = [e[0] for e in dataset] qst = [e[1] for e in dataset] ans = [e[2] for e in dataset] n_datasets.append((img, qst, ans)) return tuple(n_datasets) rel_train, rel_test, norel_train, norel_test = load_data() try: os.makedirs(model_dirs) except: print('directory {} already exists'.format(model_dirs)) if args.resume: filename = os.path.join(model_dirs, args.resume) if os.path.isfile(filename): print('==> loading checkpoint {}'.format(filename)) checkpoint = torch.load(filename) model.load_state_dict(checkpoint) print('==> loaded checkpoint {}'.format(filename)) for epoch in range(1, args.epochs + 1): train(epoch, rel_train, norel_train) test(epoch, rel_test, norel_test) model.save_model(epoch)
def train(dataset="kaggle_pna",
          train_ds="train",
          arch="couplenet",
          net="res152",
          start_epoch=1,
          max_epochs=20,
          disp_interval=100,
          save_dir="save",
          num_workers=4,
          cuda=True,
          large_scale=False,
          mGPUs=True,
          batch_size=4,
          class_agnostic=False,
          anchor_scales=4,
          optimizer="sgd",
          lr_decay_step=10,
          lr_decay_gamma=.1,
          session=1,
          resume=False,
          checksession=1,
          checkepoch=1,
          checkpoint=0,
          use_tfboard=False,
          flip_prob=0.0,
          scale=0.0,
          scale_prob=0.0,
          translate=0.0,
          translate_prob=0.0,
          angle=0.0,
          dist="cont",
          rotate_prob=0.0,
          shear_factor=0.0,
          shear_prob=0.0,
          rpn_loss_cls_wt=1,
          rpn_loss_box_wt=1,
          RCNN_loss_cls_wt=1,
          RCNN_loss_bbox_wt=1,
          **kwargs):
    """Train an R-CNN-family detector (rcnn/rfcn/couplenet) on the PNA dataset.

    Config overrides arrive via **kwargs (TRAIN/RESNET/MOBILENET sub-dicts are
    merged into `cfg`). Checkpoints are written every epoch; `resume` restores
    model/optimizer state from a prior session. Uses legacy PyTorch
    (`Variable`, `loss.data[0]`, tensor `resize_`).
    """
    print("Train Arguments: {}".format(locals()))
    # Import network definition
    if arch == 'rcnn':
        from model.faster_rcnn.resnet import resnet
    elif arch == 'rfcn':
        from model.rfcn.resnet_atrous import resnet
    elif arch == 'couplenet':
        from model.couplenet.resnet_atrous import resnet
    from roi_data_layer.pnaRoiBatchLoader import roibatchLoader
    from roi_data_layer.pna_roidb import combined_roidb
    print('Called with kwargs:')
    print(kwargs)
    # Set up logger
    if use_tfboard:
        from model.utils.logger import Logger
        # Set the logger
        logger = Logger('./logs')
    # Anchor settings: ANCHOR_SCALES: [8, 16, 32] or [4, 8, 16, 32]
    if anchor_scales == 3:
        scales = [8, 16, 32]
    elif anchor_scales == 4:
        scales = [4, 8, 16, 32]
    # Dataset related settings: MAX_NUM_GT_BOXES: 20, 30, 50
    if train_ds == "train":
        imdb_name = "pna_2018_train"
    elif train_ds == "trainval":
        imdb_name = "pna_2018_trainval"
    set_cfgs = [
        'ANCHOR_SCALES', str(scales), 'ANCHOR_RATIOS', '[0.5,1,2]',
        'MAX_NUM_GT_BOXES', '30'
    ]
    # Locate the repo root relative to the installed `model` package to find cfgs/.
    import model
    model_repo_path = os.path.dirname(
        os.path.dirname(os.path.dirname(model.__file__)))
    cfg_file = "cfgs/{}_ls.yml".format(net) if large_scale else "cfgs/{}.yml".format(net)
    if cfg_file is not None:
        cfg_from_file(os.path.join(model_repo_path, cfg_file))
    if set_cfgs is not None:
        cfg_from_list(set_cfgs)
    # Merge caller-supplied config sections into the global cfg.
    train_kwargs = kwargs.pop("TRAIN", None)
    resnet_kwargs = kwargs.pop("RESNET", None)
    mobilenet_kwargs = kwargs.pop("MOBILENET", None)
    if train_kwargs is not None:
        for key, value in train_kwargs.items():
            cfg["TRAIN"][key] = value
    if resnet_kwargs is not None:
        for key, value in resnet_kwargs.items():
            cfg["RESNET"][key] = value
    if mobilenet_kwargs is not None:
        for key, value in mobilenet_kwargs.items():
            cfg["MOBILENET"][key] = value
    if kwargs is not None:
        for key, value in kwargs.items():
            cfg[key] = value
    print('Using config:')
    cfg.MODEL_DIR = os.path.abspath(cfg.MODEL_DIR)
    cfg.TRAIN_DATA_CLEAN_PATH = os.path.abspath(cfg.TRAIN_DATA_CLEAN_PATH)
    pprint.pprint(cfg)
    np.random.seed(cfg.RNG_SEED)
    print("LEARNING RATE: {}".format(cfg.TRAIN.LEARNING_RATE))
    # Warning to use cuda if available
    if torch.cuda.is_available() and not cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    # Train set
    # Note: Use validation set and disable the flipped to enable faster loading.
    cfg.TRAIN.USE_FLIPPED = True
    cfg.USE_GPU_NMS = cuda
    imdb, roidb, ratio_list, ratio_index = combined_roidb(imdb_name)
    train_size = len(roidb)
    print('{:d} roidb entries'.format(len(roidb)))
    # output_dir = os.path.join(save_dir, arch, net, dataset)
    output_dir = cfg.MODEL_DIR
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    sampler_batch = sampler(train_size, batch_size)
    dataset = roibatchLoader(roidb, ratio_list, ratio_index, batch_size,
                             imdb.num_classes, training=True)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_batch,
                                             num_workers=num_workers)
    # Initilize the tensor holder — resized/filled in-place each iteration.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    # Copy tensors in CUDA memory
    if cuda:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
    # Make variable
    im_data = Variable(im_data)
    im_info = Variable(im_info)
    num_boxes = Variable(num_boxes)
    gt_boxes = Variable(gt_boxes)
    if cuda:
        cfg.CUDA = True
    # Initilize the network:
    if net == 'vgg16':
        # model = vgg16(imdb.classes, pretrained=True, class_agnostic=args.class_agnostic)
        print("Pretrained model is not downloaded and network is not used")
    elif net == 'res18':
        model = resnet(imdb.classes, 18, pretrained=False,
                       class_agnostic=class_agnostic)  # TODO: Check dim error
    elif net == 'res34':
        model = resnet(imdb.classes, 34, pretrained=False,
                       class_agnostic=class_agnostic)  # TODO: Check dim error
    elif net == 'res50':
        model = resnet(imdb.classes, 50, pretrained=False,
                       class_agnostic=class_agnostic)  # TODO: Check dim error
    elif net == 'res101':
        model = resnet(imdb.classes, 101, pretrained=True,
                       class_agnostic=class_agnostic)
    elif net == 'res152':
        model = resnet(imdb.classes, 152, pretrained=True,
                       class_agnostic=class_agnostic)
    else:
        print("network is not defined")
        pdb.set_trace()
    # Create network architecture
    model.create_architecture()
    # Update model parameters
    lr = cfg.TRAIN.LEARNING_RATE
    # tr_momentum = cfg.TRAIN.MOMENTUM
    # tr_momentum = args.momentum
    # Per-parameter groups: biases may get doubled LR and no weight decay.
    params = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1), \
                            'weight_decay': cfg.TRAIN.BIAS_DECAY and cfg.TRAIN.WEIGHT_DECAY or 0}]
            else:
                params += [{
                    'params': [value],
                    'lr': lr,
                    'weight_decay': cfg.TRAIN.WEIGHT_DECAY
                }]
    # Optimizer (the string parameter is rebound to the optimizer object here).
    if optimizer == "adam":
        lr = lr * 0.1
        optimizer = torch.optim.Adam(params)
    elif optimizer == "sgd":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)
    # Resume training
    if resume:
        load_name = os.path.join(
            output_dir,
            '{}_{}_{}_{}.pth'.format(arch, checksession, checkepoch, checkpoint))
        print("loading checkpoint %s" % (load_name))
        checkpoint = torch.load(load_name)
        session = checkpoint['session'] + 1
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        if 'pooling_mode' in checkpoint.keys():
            cfg.POOLING_MODE = checkpoint['pooling_mode']
        print("loaded checkpoint %s" % (load_name))
    # Train on Multiple GPUS
    if mGPUs:
        model = nn.DataParallel(model)
    # Copy network to CUDA memroy
    if cuda:
        model.cuda()
    # Training loop
    iters_per_epoch = int(train_size / batch_size)
    sys.stdout.flush()
    for epoch in range(start_epoch, max_epochs + 1):
        # remove batch re-sizing for augmentation or adjust?
        dataset.resize_batch()
        # Set model to train mode
        model.train()
        loss_temp = 0
        start = time.time()
        # Update learning rate as per decay step
        if epoch % (lr_decay_step + 1) == 0:
            adjust_learning_rate(optimizer, lr_decay_gamma)
            lr *= lr_decay_gamma
        # Get batch data and train
        data_iter = iter(dataloader)
        for step in range(iters_per_epoch):
            sys.stdout.flush()
            data = next(data_iter)
            # Apply augmentations
            aug_img_tensors, aug_bbox_tensors = apply_augmentations(
                data[0],
                data[2],
                flip_prob=flip_prob,
                scale=scale,
                scale_prob=scale_prob,
                translate=translate,
                translate_prob=translate_prob,
                angle=angle,
                dist=dist,
                rotate_prob=rotate_prob,
                shear_factor=shear_factor,
                shear_prob=shear_prob)
            # im_data.data.resize_(data[0].size()).copy_(data[0])
            im_data.data.resize_(aug_img_tensors.size()).copy_(aug_img_tensors)
            im_info.data.resize_(data[1].size()).copy_(data[1])
            # gt_boxes.data.resize_(data[2].size()).copy_(data[2])
            gt_boxes.data.resize_(
                aug_bbox_tensors.size()).copy_(aug_bbox_tensors)
            num_boxes.data.resize_(data[3].size()).copy_(data[3])
            # Compute multi-task loss (weighted sum of RPN + RCNN cls/box terms)
            model.zero_grad()
            rois, cls_prob, bbox_pred, \
                rpn_loss_cls, rpn_loss_box, \
                RCNN_loss_cls, RCNN_loss_bbox, \
                rois_label = model(im_data, im_info, gt_boxes, num_boxes)
            loss = rpn_loss_cls_wt * rpn_loss_cls.mean() + rpn_loss_box_wt * rpn_loss_box.mean() + \
                RCNN_loss_cls_wt * RCNN_loss_cls.mean() + RCNN_loss_bbox_wt * RCNN_loss_bbox.mean()
            loss_temp += loss.data[0]
            # Backward pass to compute gradients and update weights
            optimizer.zero_grad()
            loss.backward()
            if net == "vgg16":
                clip_gradient(model, 10.)
            optimizer.step()
            # Display training stats on terminal
            if step % disp_interval == 0:
                end = time.time()
                if step > 0:
                    loss_temp /= disp_interval
                if mGPUs:
                    # Losses come back per-GPU; average before reporting.
                    batch_loss = loss.data[0]
                    loss_rpn_cls = rpn_loss_cls.mean().data[0]
                    loss_rpn_box = rpn_loss_box.mean().data[0]
                    loss_rcnn_cls = RCNN_loss_cls.mean().data[0]
                    loss_rcnn_box = RCNN_loss_bbox.mean().data[0]
                    fg_cnt = torch.sum(rois_label.data.ne(0))
                    bg_cnt = rois_label.data.numel() - fg_cnt
                else:
                    batch_loss = loss.data[0]
                    loss_rpn_cls = rpn_loss_cls.data[0]
                    loss_rpn_box = rpn_loss_box.data[0]
                    loss_rcnn_cls = RCNN_loss_cls.data[0]
                    loss_rcnn_box = RCNN_loss_bbox.data[0]
                    fg_cnt = torch.sum(rois_label.data.ne(0))
                    bg_cnt = rois_label.data.numel() - fg_cnt
                print("[session %d][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" \
                      % (session, epoch, step, iters_per_epoch, loss_temp, lr))
                print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start))
                print("\t\t\t batch_loss: %.4f, rpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
                      % (batch_loss, loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))
                if use_tfboard:
                    info = {
                        'loss': loss_temp,
                        'loss_rpn_cls': loss_rpn_cls,
                        'loss_rpn_box': loss_rpn_box,
                        'loss_rcnn_cls': loss_rcnn_cls,
                        'loss_rcnn_box': loss_rcnn_box
                    }
                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, step)
                loss_temp = 0
                start = time.time()
        # Save model at checkpoints (unwrap .module when DataParallel is used)
        if mGPUs:
            save_name = os.path.join(
                output_dir,
                '{}_{}_{}_{}.pth'.format(arch, session, epoch, step))
            save_checkpoint(
                {
                    'session': session,
                    'epoch': epoch + 1,
                    'model': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': cfg.POOLING_MODE,
                    'class_agnostic': class_agnostic,
                }, save_name)
        else:
            save_name = os.path.join(
                output_dir,
                '{}_{}_{}_{}.pth'.format(arch, session, epoch, step))
            save_checkpoint(
                {
                    'session': session,
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': cfg.POOLING_MODE,
                    'class_agnostic': class_agnostic,
                }, save_name)
        print('save model: {}'.format(save_name))
        end = time.time()
        # NOTE(review): `i` is not defined anywhere in this function — this call
        # will raise NameError (or pick up a leaked global). Probably meant
        # `session` or a glob over all epochs; needs fixing upstream.
        delete_older_checkpoints(
            os.path.join(cfg.MODEL_DIR, "couplenet_{}_*.pth".format(i)))
        print("Run Time: ", end - start)
def main():
    """End-to-end driver: train, validate and test an Inception-v4 classifier.

    Trains in learning-rate "stages" (lr divided by ``lr_decay`` between
    stages), logs per-epoch precision/loss to a text file, keeps
    latest/best/lowest-loss checkpoints, and finally writes a submission CSV
    for the test set.  Everything is configured by the constants defined in
    the body; takes no arguments and returns nothing.
    """
    # Fix all random seeds for reproducibility.
    np.random.seed(666)
    torch.manual_seed(666)
    torch.cuda.manual_seed_all(666)
    random.seed(666)
    # Use this file's basename to name the model/result directories.
    file_name = os.path.basename(__file__).split('.')[0]
    # Create the folders that hold saved models and results.
    if not os.path.exists('./model/%s' % file_name):
        os.makedirs('./model/%s' % file_name)
    if not os.path.exists('./result/%s' % file_name):
        os.makedirs('./result/%s' % file_name)
    # Create the log file if missing, then append a timestamped header.
    if not os.path.exists('./result/%s.txt' % file_name):
        with open('./result/%s.txt' % file_name, 'w') as acc_file:
            pass
    with open('./result/%s.txt' % file_name, 'a') as acc_file:
        acc_file.write('\n%s %s\n' % (time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(time.time())), file_name))

    # Default image loader: PIL, forced to 3-channel RGB.
    def default_loader(path):
        # return Image.open(path)
        return Image.open(path).convert('RGB')

    # Training-set dataset: yields (transformed image, label).
    class TrainDataset(Dataset):
        def __init__(self, label_list, transform=None, target_transform=None,
                     loader=default_loader):
            imgs = []
            for index, row in label_list.iterrows():
                imgs.append((row['img_path'], row['label']))
            self.imgs = imgs
            self.transform = transform
            self.target_transform = target_transform
            self.loader = loader

        def __getitem__(self, index):
            filename, label = self.imgs[index]
            img = self.loader(filename)
            if self.transform is not None:
                img = self.transform(img)
            return img, label

        def __len__(self):
            return len(self.imgs)

    # Validation-set dataset: identical structure to TrainDataset.
    class ValDataset(Dataset):
        def __init__(self, label_list, transform=None, target_transform=None,
                     loader=default_loader):
            imgs = []
            for index, row in label_list.iterrows():
                imgs.append((row['img_path'], row['label']))
            self.imgs = imgs
            self.transform = transform
            self.target_transform = target_transform
            self.loader = loader

        def __getitem__(self, index):
            filename, label = self.imgs[index]
            img = self.loader(filename)
            if self.transform is not None:
                img = self.transform(img)
            return img, label

        def __len__(self):
            return len(self.imgs)

    # Test-set dataset: no labels, yields (transformed image, filename).
    class TestDataset(Dataset):
        def __init__(self, label_list, transform=None, target_transform=None,
                     loader=default_loader):
            imgs = []
            for index, row in label_list.iterrows():
                imgs.append((row['img_path']))
            self.imgs = imgs
            self.transform = transform
            self.target_transform = target_transform
            self.loader = loader

        def __getitem__(self, index):
            filename = self.imgs[index]
            img = self.loader(filename)
            if self.transform is not None:
                img = self.transform(img)
            return img, filename

        def __len__(self):
            return len(self.imgs)

    # Augmentation: rotate by one angle picked at random from a fixed set.
    class FixedRotation(object):
        def __init__(self, angles):
            self.angles = angles

        def __call__(self, img):
            return fixed_rotate(img, self.angles)

    def fixed_rotate(img, angles):
        angles = list(angles)
        angles_num = len(angles)
        index = random.randint(0, angles_num - 1)
        return img.rotate(angles[index])

    # Train for one epoch.
    def train(train_loader, model, criterion, optimizer, epoch):
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()

        # switch to train mode
        model.train()

        end = time.time()
        for i, (images, target) in enumerate(train_loader):
            # Measure how long data loading took.
            data_time.update(time.time() - end)

            # Move batch to GPU.  BUGFIX: `async` became a reserved word in
            # Python 3.7; the supported keyword is `non_blocking`.
            image_var = torch.tensor(images).cuda(non_blocking=True)
            label = torch.tensor(target).cuda(non_blocking=True)

            # Forward pass.
            y_pred = model(image_var)
            loss = criterion(y_pred, label)
            losses.update(loss.item(), images.size(0))

            # Top-1 accuracy for this batch.
            prec, PRED_COUNT = accuracy(y_pred.data, target, topk=(1, 1))
            acc.update(prec, PRED_COUNT)

            # Backward pass and optimizer step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Measure elapsed batch time.
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Accuray {acc.val:.3f} ({acc.avg:.3f})'.format(
                          epoch, i, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses, acc=acc))

    # Evaluate on the validation set; returns (avg accuracy, avg loss).
    def validate(val_loader, model, criterion):
        batch_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()

        # switch to evaluate mode
        model.eval()

        end = time.time()
        for i, (images, labels) in enumerate(val_loader):
            # BUGFIX: non_blocking replaces the pre-3.7 `async` keyword.
            image_var = torch.tensor(images).cuda(non_blocking=True)
            target = torch.tensor(labels).cuda(non_blocking=True)

            # No weight updates during evaluation, so skip gradient tracking.
            with torch.no_grad():
                y_pred = model(image_var)
                loss = criterion(y_pred, target)

            # measure accuracy and record loss
            prec, PRED_COUNT = accuracy(y_pred.data, labels, topk=(1, 1))
            losses.update(loss.item(), images.size(0))
            acc.update(prec, PRED_COUNT)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                print('TrainVal: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Accuray {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, acc=acc))

        print(' * Accuray {acc.avg:.3f}'.format(acc=acc),
              '(Previous Best Acc: %.3f)' % best_precision,
              ' * Loss {loss.avg:.3f}'.format(loss=losses),
              'Previous Lowest Loss: %.3f)' % lowest_loss)
        return acc.avg, losses.avg

    # Predict the test set and write a submission file.
    def test(test_loader, model):
        csv_map = OrderedDict({'filename': [], 'probability': []})
        # switch to evaluate mode
        model.eval()
        for i, (images, filepath) in enumerate(tqdm(test_loader)):
            # bs, ncrops, c, h, w = images.size()
            filepath = [os.path.basename(i) for i in filepath]
            image_var = torch.tensor(images, requires_grad=False)  # for pytorch 0.4

            with torch.no_grad():
                y_pred = model(image_var)
                # Softmax turns logits into per-class probabilities.
                smax = nn.Softmax(1)
                smax_out = smax(y_pred)

            # Record filenames and their probability vectors.
            csv_map['filename'].extend(filepath)
            for output in smax_out:
                prob = ';'.join([str(i) for i in output.data.tolist()])
                csv_map['probability'].append(prob)

        result = pd.DataFrame(csv_map)
        result['probability'] = result['probability'].map(
            lambda x: [float(i) for i in x.split(';')])

        # Convert to the format of the submission example.
        sub_filename, sub_label = [], []
        for index, row in result.iterrows():
            sub_filename.append(row['filename'])
            pred_label = np.argmax(row['probability'])
            if pred_label == 0:
                sub_label.append('norm')
            else:
                sub_label.append('defect%d' % pred_label)

        # Write the submission file into the result folder.
        submission = pd.DataFrame({
            'filename': sub_filename,
            'label': sub_label
        })
        submission.to_csv('./result/%s/submission.csv' % file_name,
                          header=None, index=False)
        return

    # Save the latest checkpoint, plus copies for best-acc / lowest-loss.
    def save_checkpoint(state, is_best, is_lowest_loss,
                        filename='./model/%s/checkpoint.pth.tar' % file_name):
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename, './model/%s/model_best.pth.tar' % file_name)
        if is_lowest_loss:
            shutil.copyfile(filename, './model/%s/lowest_loss.pth.tar' % file_name)

    # Tracks the running average of a metric (loss, accuracy, time).
    class AverageMeter(object):
        """Computes and stores the average and current value"""

        def __init__(self):
            self.reset()

        def reset(self):
            self.val = 0
            self.avg = 0
            self.sum = 0
            self.count = 0

        def update(self, val, n=1):
            self.val = val
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count

    # Learning-rate decay: lr = lr / lr_decay; returns a fresh optimizer.
    def adjust_learning_rate():
        nonlocal lr
        lr = lr / lr_decay
        return optim.Adam(model.parameters(), lr,
                          weight_decay=weight_decay, amsgrad=True)

    # Top-K accuracy (with topk=(1, 1) this is plain top-1).
    def accuracy(y_pred, y_actual, topk=(1, )):
        """Computes the precision@k for the specified values of k"""
        final_acc = 0
        maxk = max(topk)
        # for prob_threshold in np.arange(0, 1, 0.01):
        PRED_COUNT = y_actual.size(0)
        PRED_CORRECT_COUNT = 0
        prob, pred = y_pred.topk(maxk, 1, True, True)
        # prob = np.where(prob > prob_threshold, prob, 0)
        for j in range(pred.size(0)):
            if int(y_actual[j]) == int(pred[j]):
                PRED_CORRECT_COUNT += 1
        if PRED_COUNT == 0:
            final_acc = 0
        else:
            final_acc = PRED_CORRECT_COUNT / PRED_COUNT
        return final_acc * 100, PRED_COUNT

    # ---------------- main body ----------------
    # Select GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1 , 2, 3'
    # Keep batch size modest on small datasets; lower it on OOM.
    batch_size = 24
    # Number of loader workers; use 0 on Windows if it errors.
    workers = 12
    # Epochs per stage; lr is decayed between stages.
    stage_epochs = [20, 10, 10]
    # Initial learning rate.
    lr = 1e-4
    # Learning-rate decay factor (new_lr = lr / lr_decay).
    lr_decay = 5
    # Weight-decay (L2) coefficient.
    weight_decay = 1e-4
    # State initialisation.
    stage = 0
    start_epoch = 0
    total_epochs = sum(stage_epochs)
    best_precision = 0
    lowest_loss = 100
    # Print frequency (steps between progress prints).
    print_freq = 1
    # Fraction of data held out for validation.
    val_ratio = 0.12
    # Evaluate-only mode (no training).
    evaluate = False
    # Resume from checkpoint.
    resume = False
    # Build the inception_v4 model.  NOTE: this rebinds the module name
    # `model` to the network instance.
    model = model.v4(num_classes=12)
    model = torch.nn.DataParallel(model).cuda()

    # optionally resume from a checkpoint
    if resume:
        checkpoint_path = './model/%s/checkpoint.pth.tar' % file_name
        if os.path.isfile(checkpoint_path):
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path)
            start_epoch = checkpoint['epoch'] + 1
            best_precision = checkpoint['best_precision']
            lowest_loss = checkpoint['lowest_loss']
            stage = checkpoint['stage']
            lr = checkpoint['lr']
            model.load_state_dict(checkpoint['state_dict'])
            # Special handling when the checkpoint sits exactly on a stage
            # boundary: advance the stage and reload the best weights.
            if start_epoch in np.cumsum(stage_epochs)[:-1]:
                stage += 1
                optimizer = adjust_learning_rate()
                model.load_state_dict(
                    torch.load('./model/%s/model_best.pth.tar' %
                               file_name)['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            # BUGFIX: report the missing path, not the boolean `resume` flag.
            print("=> no checkpoint found at '{}'".format(checkpoint_path))

    # Read the training image list.
    all_data = pd.read_csv('data/label.csv')
    # Stratified train/validation split.
    train_data_list, val_data_list = train_test_split(
        all_data, test_size=val_ratio, random_state=666,
        stratify=all_data['label'])
    # Read the test image list.
    test_data_list = pd.read_csv('data/test.csv')

    # Normalise with ImageNet statistics (pretrained backbone).
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Training transforms; network input is 384*384.
    train_data = TrainDataset(
        train_data_list,
        transform=transforms.Compose([
            transforms.Resize((400, 400)),
            transforms.ColorJitter(0.15, 0.15, 0.15, 0.075),
            transforms.RandomHorizontalFlip(),
            transforms.RandomGrayscale(),
            # transforms.RandomRotation(20),
            FixedRotation([0, 90, 180, 270]),
            transforms.RandomCrop(384),
            transforms.ToTensor(),
            normalize,
        ]))

    # Validation transforms.
    val_data = ValDataset(val_data_list,
                          transform=transforms.Compose([
                              transforms.Resize((400, 400)),
                              transforms.CenterCrop(384),
                              transforms.ToTensor(),
                              normalize,
                          ]))

    # Test transforms.
    test_data = TestDataset(test_data_list,
                            transform=transforms.Compose([
                                transforms.Resize((400, 400)),
                                transforms.CenterCrop(384),
                                transforms.ToTensor(),
                                normalize,
                            ]))

    # Data loaders.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                              pin_memory=True, num_workers=workers)
    val_loader = DataLoader(val_data, batch_size=batch_size * 2,
                            shuffle=False, pin_memory=False,
                            num_workers=workers)
    test_loader = DataLoader(test_data, batch_size=batch_size * 2,
                             shuffle=False, pin_memory=False,
                             num_workers=workers)

    # Cross-entropy loss.
    criterion = nn.CrossEntropyLoss().cuda()

    # Adam with amsgrad.
    optimizer = optim.Adam(model.parameters(), lr,
                           weight_decay=weight_decay, amsgrad=True)

    if evaluate:
        validate(val_loader, model, criterion)
    else:
        # Main training loop.
        for epoch in range(start_epoch, total_epochs):
            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            precision, avg_loss = validate(val_loader, model, criterion)

            # Log this epoch's precision and loss.
            with open('./result/%s.txt' % file_name, 'a') as acc_file:
                acc_file.write('Epoch: %2d, Precision: %.8f, Loss: %.8f\n' %
                               (epoch, precision, avg_loss))

            # Track best precision / lowest loss; save checkpoints.
            is_best = precision > best_precision
            is_lowest_loss = avg_loss < lowest_loss
            best_precision = max(precision, best_precision)
            lowest_loss = min(avg_loss, lowest_loss)
            state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_precision': best_precision,
                'lowest_loss': lowest_loss,
                'stage': stage,
                'lr': lr,
            }
            save_checkpoint(state, is_best, is_lowest_loss)

            # Move to the next stage when a stage boundary is reached:
            # decay lr and restart from the best weights so far.
            if (epoch + 1) in np.cumsum(stage_epochs)[:-1]:
                stage += 1
                optimizer = adjust_learning_rate()
                model.load_state_dict(
                    torch.load('./model/%s/model_best.pth.tar' %
                               file_name)['state_dict'])
                print('Step into next stage')
                with open('./result/%s.txt' % file_name, 'a') as acc_file:
                    acc_file.write(
                        '---------------Step into next stage----------------\n'
                    )

    # Record the best offline score.
    with open('./result/%s.txt' % file_name, 'a') as acc_file:
        acc_file.write('* best acc: %.8f %s\n' %
                       (best_precision, os.path.basename(__file__)))
    with open('./result/best_acc.txt', 'a') as acc_file:
        acc_file.write(
            '%s * best acc: %.8f %s\n' %
            (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(
                time.time())), best_precision, os.path.basename(__file__)))

    # Load the best model and generate the submission for the test set.
    best_model = torch.load('./model/%s/model_best.pth.tar' % file_name)
    model.load_state_dict(best_model['state_dict'])
    test(test_loader=test_loader, model=model)
    # Release cached GPU memory.
    torch.cuda.empty_cache()
def train_model(dataloaders, model, criterion, optimizer, scheduler,
                num_epochs, save_epoch, save_name='model', save_path='./pkl'):
    """Train ``model`` with a train/val loop and checkpoint on val improvement.

    Args:
        dataloaders: dict with 'train' and 'val' DataLoaders.
        model: the network to train (moved to the module-level ``device``).
        criterion: loss function.
        optimizer: optimizer over ``model.parameters()``.
        scheduler: lr scheduler or None; ReduceLROnPlateau gets the val loss.
        num_epochs: number of epochs to run.
        save_epoch: unused; kept for backward compatibility with callers.
        save_name: base name for saved .pkl files.
        save_path: directory for checkpoints; best model goes to ``best/``.

    Returns:
        The model loaded with the best (lowest val loss) weights seen.
    """
    is_plateau = isinstance(scheduler, lr_scheduler.ReduceLROnPlateau)
    since = time.time()
    best_model_wts = None
    best_loss = float("inf")
    trainLoss = []
    valLoss = []
    lrs = []
    epochs = []
    plt.ion()
    for epoch in range(1, num_epochs + 1):
        epochs += [epoch]
        lrs += [optimizer.param_groups[0]['lr']]

        # ---- training phase ----
        model.train()
        running_loss = 0.0
        data_size = 0
        for inputs, labels in dataloaders['train']:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            # BUGFIX: the original called torch.set_grad_enabled(True)/(False)
            # as bare statements, which flips the *process-wide* grad mode and
            # leaves it disabled after this function returns.  Using the
            # context-manager form restores the previous mode automatically.
            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            # statistics: weight by batch size so the epoch average is exact
            # even when the last batch is smaller.
            data_size += inputs.size(0)
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / data_size  # mean train loss this epoch
        trainLoss += [epoch_loss]

        # ---- validation phase ----
        model.eval()
        running_loss = 0.0
        data_size = 0
        with torch.no_grad():
            for inputs, labels in dataloaders['val']:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                data_size += inputs.size(0)
                running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / data_size  # mean val loss this epoch
        valLoss += [epoch_loss]

        # Step the scheduler; ReduceLROnPlateau needs the monitored metric.
        if scheduler:
            if is_plateau:
                scheduler.step(epoch_loss)
            else:
                scheduler.step()

        # Optional per-epoch progress report.
        if args.show_each_epoch:
            print('Epoch {}/{}\n{}'.format(epoch, num_epochs, '-' * 10))
            print(
                'train_loss: {:.4f}\n  val_loss: {:.4f}\nlearning_rate: {:.4f}\n'
                .format(trainLoss[-1], valLoss[-1],
                        optimizer.param_groups[0]['lr']))

        # Checkpoint whenever validation loss improves.
        if valLoss[-1] < best_loss:
            best_loss = valLoss[-1]
            best_model_wts = copy.deepcopy(model.state_dict())
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(
                model,
                '{}/{}_{}-trainLoss_{:.4f}-valLoss_{:.4f}.pkl'.format(
                    save_path, save_name, epoch, trainLoss[-1], valLoss[-1]))
    # printHistory(epochs,trainLoss,valLoss,lrs)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Loss: {:4f}'.format(best_loss))

    # BUGFIX: guard against the degenerate case where no epoch produced a
    # finite improvement (e.g. NaN losses) and best_model_wts is still None —
    # load_state_dict(None) would raise.
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)
    if not os.path.exists('{}/best/'.format(save_path)):
        os.makedirs('{}/best/'.format(save_path))
    torch.save(model, '{}/best/{}.pkl'.format(save_path, save_name))
    return model
def trainDetector(params):
    """Train a license-plate detector on a 12x12 anchor grid.

    Builds train/val datasets from ``params``, optionally warm-starts from a
    saved checkpoint, then optimizes a per-cell classification loss (NLL) plus
    an L1 loss on predicted box offsets.  Saves weights every 5 epochs and
    reports mean IoU on the validation set each epoch.
    """
    def compute_iou(box1, box2):
        '''Compute the intersection over union of two boxes, each box is [x1,y1,x2,y2].

        Args:
            box1: (tensor) bounding boxes, sized [4].
            box2: (tensor) bounding boxes, sized [4].
        Return:
            (tensor) iou.
        '''
        xmin1, ymin1, xmax1, ymax1 = box1
        xmin2, ymin2, xmax2, ymax2 = box2
        # Intersection rectangle corners.
        xx1 = np.max([xmin1, xmin2])
        yy1 = np.max([ymin1, ymin2])
        xx2 = np.min([xmax1, xmax2])
        yy2 = np.min([ymax1, ymax2])
        area1 = ((xmax1 - xmin1) * (ymax1 - ymin1))
        area2 = ((xmax2 - xmin2) * (ymax2 - ymin2))
        # Clamp at 0 so disjoint boxes give zero intersection.
        inter_area = (np.max([0, xx2 - xx1])) * (np.max([0, yy2 - yy1]))
        # 1e-6 avoids division by zero for degenerate boxes.
        iou = inter_area / (area1 + area2 - inter_area + 1e-6)
        return iou

    # Dataset of plate images with per-grid-cell labels and box offsets.
    class PDdata(torch.utils.data.Dataset):
        def __init__(self, mode=0, is_aug=True):
            self.mode = mode
            self.is_aug = is_aug
            if mode == 0:  # 0 for train
                self.img_path = params.data_dir
                self.anno_path = params.addi_path
            elif mode == 1:  # 1 for validation
                self.img_path = params.val_data_set
                self.anno_path = params.addi_path
            # Annotations: JSON mapping image id -> [x, y, width, height].
            with open(self.anno_path, 'r') as f:
                self.bbox = json.load(f)["bbox"]
            # Index -> image id lookup for __getitem__.
            self.map = [i for i in self.bbox.keys()]

        def __len__(self):
            return len(self.bbox)

        def __getitem__(self, item):
            '''
            return:
                img: (C, H, W)
                labels (H, W)
                affine (4, H, W) --- dim0: [xmin, ymin, xmax, ymax]
                bbax [xmin, ymin, xmax, ymax]
            '''
            img = cv2.imread(self.img_path + '/' + self.map[item] + '.png')
            # OpenCV loads BGR; convert to RGB for torchvision transforms.
            img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            if self.is_aug:
                img = transforms.ColorJitter(contrast=1)(img)
            img = transforms.ToTensor()(img)
            img = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])(img)
            # img = torch.unsqueeze(img, 0)
            # compute labels and Affine for img
            x, y, width, height = self.bbox[self.map[item]]
            bbox = torch.tensor([x, y, x + width, y + height])
            labels = torch.LongTensor(12, 12).zero_()
            affine = torch.zeros(4, 12, 12)
            # Work in grid units: one cell = 32 pixels (12x12 grid).
            x = x / 32
            y = y / 32
            width = width / 32
            height = height / 32
            box2 = torch.tensor([x, y, x + width, y + height])
            # A cell is positive when a box of the target size centered on it
            # overlaps the ground truth with IoU >= 0.5; offsets are stored
            # relative to the cell coordinates.
            for i in range(12):
                for j in range(12):
                    box1 = torch.zeros(4)
                    box1[0] = j - width / 2
                    box1[1] = i - height / 2
                    box1[2] = j + width / 2
                    box1[3] = i + height / 2
                    if compute_iou(box1, box2) >= 0.5:
                        labels[i][j] = 1.0
                        affine[0][i][j] = x - j
                        affine[1][i][j] = y - i
                        affine[2][i][j] = x + width - j
                        affine[3][i][j] = y + height - i
            return img, labels, affine, bbox

    # Mean IoU over the validation set, using the single most confident cell.
    def validation(model, test_loader):
        model.eval()
        iou_sum = 0
        cnt = 0
        for batch_n, (inputs, labels, _, bboxes) in enumerate(test_loader):
            if params.useGPU:
                inputs, labels, bboxes = \
                    Variable(inputs.cuda()), Variable(
                        labels.cuda()), Variable(bboxes.cuda())
            else:
                inputs, labels, bboxes = \
                    Variable(inputs), Variable(labels), Variable(bboxes)
            xProb, xAffine = model(inputs)
            # Indices of the highest plate-probability cell.
            a, b, c, d = torch.where(xProb == torch.max(xProb[:, 1, :, :]))
            print(xProb[a, b, c, d])
            # NOTE(review): xAffine is indexed up to channel 5 here, so the
            # model's affine head apparently has >= 6 channels, while the
            # dataset ground truth uses 4 — confirm against the model code.
            affine = xAffine[0, :, c[0], d[0]]
            ymin = float(-0.5 * affine[0] - 0.5 * affine[1] + affine[4] +
                         c[0]) * 32
            xmin = float(-0.5 * affine[2] - 0.5 * affine[3] + affine[5] +
                         d[0]) * 32
            ymax = float(0.5 * affine[0] + 0.5 * affine[1] + affine[4] +
                         c[0]) * 32
            # NOTE(review): by symmetry with xmin this looks like it should
            # use affine[2], not affine[0] — possible copy-paste bug; verify.
            xmax = float(0.5 * affine[0] + 0.5 * affine[3] + affine[5] +
                         d[0]) * 32
            bbox_pred = torch.tensor([xmin, ymin, xmax, ymax])
            iou = compute_iou(bbox_pred, bboxes[0])
            iou_sum += iou
            cnt += 1
        return iou_sum / cnt

    train_data = PDdata()
    # NOTE: "trian_loader" is a typo kept for fidelity (local name only).
    trian_loader = DataLoader(train_data, params.batchSize, shuffle=True)
    test_data = PDdata(mode=1, is_aug=False)
    test_loader = DataLoader(test_data, 1, shuffle=False)
    model = model.PlateDetector()
    if params.useGPU:
        print('gpu is available')
        model = torch.nn.DataParallel(model, device_ids=[0]).cuda()
    # Best-effort warm start from a saved checkpoint.
    # NOTE(review): the bare `except:` hides *all* errors, not just a missing
    # file — consider narrowing to (FileNotFoundError, RuntimeError).
    try:
        model.load_state_dict(torch.load(
            os.path.join(params.model_path, "99PD.pt")),
                              strict=False)
        print('load pretrained model successfully')
    except:
        print('fail to load pretrained model')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params.learningRate,
                                 weight_decay=0.0005)
    loss1 = nn.NLLLoss()  # per-cell plate / no-plate classification
    loss2 = nn.L1Loss()   # box-offset regression
    for epoch in range(0, params.numEpochs):
        model.train()
        loss_sum = 0
        for batch_n, (inputs, labels, affines, _) in enumerate(trian_loader):
            start_time = time.time()
            if params.useGPU:
                inputs, labels, affines = \
                    Variable(inputs.cuda()), Variable(
                        labels.cuda()), Variable(affines.cuda())
            else:
                inputs, labels, affines = \
                    Variable(inputs), Variable(labels), Variable(affines)
            optimizer.zero_grad()
            xProb, xAffine = model(inputs)
            loc_loss = loss1(xProb, labels)
            # Mask so the offset loss only applies to positive cells.
            mask = torch.unsqueeze(labels, 1)
            # Reconstruct predicted corner offsets from the 6-channel affine
            # output (same formulas as in validation above).
            ymin = (-0.5 * xAffine[:, 0, :, :].unsqueeze(1) -
                    0.5 * xAffine[:, 1, :, :].unsqueeze(1) +
                    xAffine[:, 4, :, :].unsqueeze(1)) * mask
            xmin = (-0.5 * xAffine[:, 2, :, :].unsqueeze(1) -
                    0.5 * xAffine[:, 3, :, :].unsqueeze(1) +
                    xAffine[:, 5, :, :].unsqueeze(1)) * mask
            ymax = (0.5 * xAffine[:, 0, :, :].unsqueeze(1) +
                    0.5 * xAffine[:, 1, :, :].unsqueeze(1) +
                    xAffine[:, 4, :, :].unsqueeze(1)) * mask
            xmax = (0.5 * xAffine[:, 2, :, :].unsqueeze(1) +
                    0.5 * xAffine[:, 3, :, :].unsqueeze(1) +
                    xAffine[:, 5, :, :].unsqueeze(1)) * mask
            affine_box = torch.cat((xmin, ymin, xmax, ymax), dim=1)
            affine_loss = loss2(affine_box, affines)
            loss = loc_loss + affine_loss
            loss.backward()
            optimizer.step()
            # NOTE(review): accumulating the tensor `loss` (not loss.item())
            # keeps graph references alive until loss_sum is reset.
            loss_sum += loss
            if batch_n % 10 == 9:
                print('Epoch: [{}/{}], batch: {}, took: {:.3f}, loss: {:.5f}'.
                      format(epoch, params.numEpochs, batch_n,
                             time.time() - start_time, loss_sum / 10))
                loss_sum = 0
        # Persist weights every 5 epochs.
        if epoch % 5 == 4:
            torch.save(model.state_dict(),
                       params.saved_path + str(epoch) + 'PD.pt')
        iou = validation(model, test_loader)
        print('Epoch: [{}/{}], aver_iou: {:.5}'.format(
            epoch, params.numEpochs, iou))
from argParser import optionsTest import torchvision as tv args = optionsTest().parse() torch.manual_seed(args.seed) print('Instantiating test model') in_c = 3 + args.in_light * 3 model = model.PS_FCN_run(args.use_BN, in_c) print('Loading Saved Model') saved_model = torch.load(args.model_path) if args.cuda: saved_model = saved_model.cuda() model = model.cuda() model.load_state_dict(saved_model.state_dict()) test_set = DiLiGenT_main(args, 'test') test_loader = torch.utils.data.DataLoader(test_set, batch_size=args.test_batch, num_workers=args.workers, pin_memory=args.cuda, shuffle=False) model.eval() print('---- Testing for %d images - DiLiGent Dataset ----' % (len(test_loader))) err_mean = 0 with torch.no_grad(): for i, sample in enumerate(test_loader):
valid_batch_size=config.VALID_BATCH_SIZE, test_batch_size=config.TEST_BATCH_SIZE) # Accessing the process_data_for_model method of Preprocess class training_loader, valid_loader, testing_loader = Preprocess.process_data_for_model( ) ################################################################################# model = model.DistillBERTClass() # Creating the model shape model.to(device) # Loading back the model from checkpoint checkpoint = torch.load( config.checkpoint_path, map_location=device) # Loading the model from check point model.load_state_dict(checkpoint['model_state_dict']) model.eval() model.to(device) # Loading model to GPU # Validation on test data # Creating the loss function # Optimizer is not needed since its for prediction loss_function = torch.nn.CrossEntropyLoss() test_loss, test_accu, y_test_actual, y_test_predicted, y_test_predicted_prob_list = valid( model=model, testing_loader=testing_loader, loss_fn=loss_function) print("Loss on test data = %0.2f%%" % test_loss) print("Accuracy on test data = %0.2f%%" % test_accu) test_confusion_matrix_df, classification_report = utility.report(
def load_model():
    """Restore the module-level `model`'s weights from `model_path`."""
    with open(model_path, 'rb') as checkpoint_file:
        state_dict = torch.load(checkpoint_file)
    model.load_state_dict(state_dict)
import torch from torch.autograd import Variable import torch.nn as nn import torchvision.transforms as transforms import cv2 import numpy as np import predict as pt import time import model model = model.YoloModel().cuda() model = torch.nn.DataParallel(model).cuda() model.load_state_dict(torch.load('yolo.h5')) # cap = cv2.VideoCapture('http://172.16.1.226:8081') cap = cv2.VideoCapture('test.avi') if not cap.isOpened(): print('not open') while (1): now = time.time() ret, origin = cap.read() if not ret: break origin = cv2.resize(origin, (1024, 768)) h, w, _ = origin.shape frame = cv2.resize(origin, (224, 224)) result = pt.predict_gpu_img(model, frame) for left_up, right_bottom, class_name, _, prob in result: if prob > .6: x1 = int(left_up[0] * w / 224.)
args = parser.parse_args() # set random seed torch.manual_seed(args.seed) # read test data print("Loading test data...") test_loader = data_loader.make_dataloader(args.test_dir, batch_size=1) file_names = sorted([ s.split('.')[0] for s in os.listdir(os.path.join(args.test_dir, 'frames')) ]) # read model print("Loading trained model...") model = model.CNN() model.load_state_dict( torch.load(args.model, map_location=lambda storage, loc: storage)) model.eval() print("Generating predictions...") for i, (data, labels) in tqdm(enumerate(test_loader)): print(file_names[i]) data = Variable(data) output = torch.exp(model(data)).data.squeeze() with open(args.output, 'a') as f: for j in range(30): f.write("%s, %d, %.8f\n" % (file_names[i], j + 1, output[j]))
# NOTE: this section is Python 2 code (bare `print` statements).
# Load ConceptNet Numberbatch word embeddings and build the RNN model.
loc = '/home/ml/ksinha4/word_vectors/conceptnet/numberbatch-en-17.06.txt'
embeddings = corpus.get_word_embeddings(
    loc, save_name='concept_embeddings.mod')
# embeddings = corpus.load_embeddings('concept_embeddings.mod')
# Sanity check: print the embedding vector for the word 'man'.
print embeddings[corpus.dictionary.word2idx['man']]
# Rebinds the module name `model` to the network instance.
model = model.RNNModel(args.model,
                       ntokens,
                       args.emsize,
                       args.nhid,
                       args.nlayers,
                       args.dropout,
                       args.tied,
                       embeddings=embeddings,
                       use_embeddings=True)
if args.cuda:
    model.cuda()
# Optionally warm-start from a previously saved full-model checkpoint.
if args.load:
    print 'Loading model {}'.format(args.load)
    # NOTE(review): the file object passed to torch.load is never closed.
    prev_model = torch.load(open(args.load))
    model.load_state_dict(prev_model.state_dict())
    print 'Model loaded'

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    # Recurses through (possibly nested) tuples of hidden states, detaching
    # each Variable so BPTT does not backprop through previous batches.
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
print('-' * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss and args.rank <= 0: with open(args.save, 'wb') as f: torch.save(model.state_dict(), f) best_val_loss = val_loss torch.cuda.synchronize() except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') # Load the best saved model. if os.path.exists(args.save): with open(args.save, 'rb') as f: model.load_state_dict(torch.load(f)) if not args.no_weight_norm and args.rank <= 0: remove_weight_norm(rnn_model) with open(args.save, 'wb') as f: torch.save(model.state_dict(), f) torch.cuda.synchronize() if test_data is not None: # Run on test data. test_loss = evaluate(test_data) print('=' * 89) print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( test_loss, math.exp(test_loss))) print('=' * 89)
# Character-level CNN (CLCNN) idol-name classifier: load trained weights and
# the id -> idol-name dictionary, and expose helpers for inference.
import torch
import sys, os.path, glob
import pandas as pd
import model

csv_file = 'millionlive_idol_dict.csv'
weights_file = 'clcnn_50.pkl'
max_length = 110  # fixed input length in Unicode code points
device = torch.device('cpu')

# Rebinds the module name `model` to the network instance; inference on CPU.
model = model.CLCNN(max_length=max_length)
model.load_state_dict(torch.load(weights_file, map_location=device))
model.eval()

# Generate an idol dictionary (class id -> idol name).
idol_data_frame = pd.read_csv(csv_file)
idol_df = idol_data_frame.set_index('id')
idol_dict = idol_df['idol_name'].to_dict()


def string_to_codepoint(_str, max_length=max_length):
    """Encode a string as a fixed-length list of Unicode code points.

    The string is stripped, truncated to ``max_length`` and zero-padded.
    """
    _encoded_str = [ord(_x) for _x in str(_str).strip()]
    _encoded_str = _encoded_str[:max_length]
    _str_len = len(str(_str))  # String length
    # NOTE(review): the length used for padding is the *unstripped* length
    # while the encoding uses the stripped string — a string with surrounding
    # whitespace may be under-padded.  Verify intended behavior.
    if _str_len < max_length:
        # If string length is less than a num of max_length, do zero padding
        _encoded_str += ([0] * (max_length - _str_len))
    return _encoded_str


def predict(model, input_str):
    # NOTE(review): this function appears truncated in this chunk; only the
    # eval-mode switch is visible here.
    model = model.eval()
# create model model = model.GenModel(args.encoder_dim, args.input_dim, args.input_nf, args.coarse_feat_dim, args.refine_feat_dim, args.num_hierarchy_levels, not args.no_pass_occ, not args.no_pass_feats, args.use_skip_sparse, args.use_skip_dense).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) if args.retrain: print('loading model:', args.retrain) checkpoint = torch.load(args.retrain) args.start_epoch = args.start_epoch if args.start_epoch != 0 else checkpoint[ 'epoch'] model.load_state_dict(checkpoint['state_dict']) #, strict=False) optimizer.load_state_dict(checkpoint['optimizer']) last_epoch = -1 if not args.retrain else args.start_epoch - 1 scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.decay_lr, gamma=0.5, last_epoch=last_epoch) # data files train_files, val_files = data_util.get_train_files(args.data_path, args.train_file_list, args.val_file_list) _OVERFIT = False if len(train_files) == 1: _OVERFIT = True args.use_loss_masking = False
if args.seed >= 0: torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) #if args.temperature < 1e-3: # parser.error("--temperature has to be greater or equal 1e-3") model = model.RNNModel(args.model, args.data_size, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).cuda() if args.fp16: model.half() with open(args.load_model, 'rb') as f: sd = torch.load(f) try: model.load_state_dict(sd) except: apply_weight_norm(model.rnn) model.load_state_dict(sd) remove_weight_norm(model) def get_neuron_and_polarity(sd, neuron): """return a +/- 1 indicating the polarity of the specified neuron in the module""" if neuron == -1: neuron = None if 'classifier' in sd: sd = sd['classifier'] if 'weight' in sd: weight = sd['weight'] else:
def distill(model, buffer, config, criterion, train_loader, valid_loader, id):
    """Meta-learn a distilled data buffer and per-step learning rates.

    Uses the `higher` library for differentiable inner-loop SGD: the model is
    trained for ``inner_steps`` on the buffer images, the resulting loss on
    real training batches is backpropagated through the unrolled updates into
    both the buffer pixels and a per-inner-step learning rate.  Learning rates
    are parameterized through a softplus to stay positive.

    Returns:
        (Buffer, list[float]): the optimized buffer and the effective
        (softplus-transformed) per-step learning rates.
    """
    # Work on a copy so the caller's model is left untouched.
    model = copy.deepcopy(model)
    run_config = config['run_config']
    param_config = config['param_config']
    log_config = config['log_config']
    model.train()
    eval_trainloader = copy.deepcopy(train_loader)
    # Materialize the whole buffer as one batch; pixels become trainable.
    buff_imgs, buff_trgs = next(
        iter(DataLoader(buffer, batch_size=len(buffer))))
    buff_imgs, buff_trgs = buff_imgs.to(run_config['device']), buff_trgs.to(
        run_config['device'])
    buff_imgs.requires_grad = True
    # Datasets of model initializations: 10 held-out inits for evaluation,
    # the rest cycled through during distillation.
    init_valid = DataLoader(ModelInitDataset(model, 10),
                            batch_size=1,
                            collate_fn=lambda x: x)
    init_loader = DataLoader(ModelInitDataset(model, -1),
                             batch_size=1,
                             collate_fn=lambda x: x)
    init_iter = iter(init_loader)
    buff_opt = torch.optim.SGD([buff_imgs], lr=param_config['meta_lr'], )
    lr_list = []
    lr_opts = []
    # One trainable lr (and its own optimizer) per inner step.
    for _ in range(param_config['inner_steps']):
        lr = np.log(np.exp([param_config[
            'model_lr']]) - 1)  # Inverse of softplus (so that the starting learning rate is actually the specified one)
        lr = torch.tensor(lr, requires_grad=True, device=run_config['device'])
        lr_list.append(lr)
        lr_opts.append(torch.optim.SGD([lr], param_config['lr_lr'], ))
    for i in range(param_config['outer_steps']):
        for step, (ds_imgs, ds_trgs) in enumerate(train_loader):
            # Cycle through model initializations indefinitely.
            try:
                init_batch = next(init_iter)
            except StopIteration:
                init_iter = iter(init_loader); init_batch = next(init_iter)
            ds_imgs = ds_imgs.to(run_config['device'])
            ds_trgs = ds_trgs.to(run_config['device'])
            acc_loss = None
            epoch_loss = [None for _ in range(param_config['inner_steps'])]
            for r, sigma in enumerate(init_batch):
                model.load_state_dict(sigma)
                # lr=1 here: the effective step size is folded into the loss
                # via the softplus(lr) multiplier below.
                model_opt = torch.optim.SGD(model.parameters(), lr=1)
                with higher.innerloop_ctx(model, model_opt) as (fmodel, diffopt):
                    for j in range(param_config['inner_steps']):
                        buff_out = fmodel(buff_imgs)
                        buff_loss = criterion(buff_out, buff_trgs)
                        # softplus keeps the learned lr positive.
                        buff_loss = buff_loss * torch.log(1 + torch.exp(lr_list[j]))
                        diffopt.step(buff_loss)
                        # Update the buffer (actually we just record the loss and update it outside the inner loop)
                        ds_out = fmodel(ds_imgs)
                        ds_loss = criterion(ds_out, ds_trgs)
                        epoch_loss[j] = epoch_loss[j] + ds_loss if epoch_loss[j] is not None else ds_loss
                        acc_loss = acc_loss + ds_loss if acc_loss is not None else ds_loss
                        # Metrics (20 samples of loss and accuracy at the last inner step)
                        if (((step + i * len(train_loader)) % int(
                                round(len(train_loader) * param_config['outer_steps'] * 0.1)) == \
                             int(round(len(train_loader) * param_config['outer_steps'] * 0.1)) - 1) or (
                                step + i * len(train_loader)) == 0) \
                                and j == param_config['inner_steps'] - 1 and r == 0:
                            lrs = [np.log(np.exp(lr.item()) + 1) for lr in lr_list]
                            lrs_log = {f'Learning rate {i} - {id}': lr for (i, lr) in enumerate(lrs)}
                            train_loss, train_accuracy = test_distill(init_valid, lrs, [buff_imgs, buff_trgs],
                                                                      model, criterion, eval_trainloader,
                                                                      run_config)
                            test_loss, test_accuracy = test_distill(init_valid, lrs, [buff_imgs, buff_trgs],
                                                                    model, criterion, valid_loader, run_config)
                            metrics = {f'Distill train loss {id}': train_loss,
                                       f'Distill train accuracy {id}': train_accuracy,
                                       f'Distill test loss {id}': test_loss,
                                       f'Distill test accuracy {id}': test_accuracy,
                                       f'Distill step {id}': step + i * len(train_loader)}
                            if log_config['wandb']:
                                wandb.log({**metrics, **lrs_log})
                            if log_config['print']:
                                print(metrics)
            # Update the lrs: each per-step lr gets the gradient of the real
            # data loss recorded at that inner step.
            for j in range(param_config['inner_steps']):
                lr_opts[j].zero_grad()
                grad, = autograd.grad(epoch_loss[j], lr_list[j], retain_graph=True)
                lr_list[j].grad = grad
                lr_opts[j].step()
            # Update the buffer pixels from the accumulated real-data loss.
            buff_opt.zero_grad()
            acc_loss.backward()
            buff_opt.step()
    # Pack the optimized buffer back into (image, target) pairs on CPU.
    aux = []
    buff_imgs, buff_trgs = buff_imgs.detach().cpu(), buff_trgs.detach().cpu()
    for i in range(buff_imgs.size(0)):
        aux.append([buff_imgs[i], buff_trgs[i]])
    # Report the effective (softplus-transformed) learning rates.
    lr_list = [np.log(1 + np.exp(lr.item())) for lr in lr_list]
    return Buffer(aux, len(aux), ), lr_list