def main():
    """Parse the ``--model`` CLI flag and train the selected architecture.

    Recognised values: 'alexnet', 'resnet18', 'resnet34', 'resnet'
    (ResNet-101) or 'all' (finetuned AlexNet plus ResNet-50/101 back to
    back).  Any other value raises ValueError.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--model", help="The model to train on")
    args = parser.parse_args()

    if args.model == 'alexnet':
        model = models.AlexNet(1)
        train(model, args.model)
    elif args.model == 'resnet18':
        model = models.ResNet(18, 1)
        train(model, args.model)
    elif args.model == 'resnet34':
        model = models.ResNet(34, 1)
        train(model, args.model)
    elif args.model == 'resnet':
        model = models.ResNet(101, 1)
        train(model, args.model)
    elif args.model == 'all':
        # model = models.AlexNet()
        # train(model, 'AlexNet')
        # del model
        model = models.create_pretrained_alexnet(1)
        train(model, 'AlexNetFinetuned')
        del model  # free memory before building the next network
        # model = models.ResNet(34, 1)
        # train(model, 'ResNet34')
        # del model
        model = models.ResNet(50, 1)
        train(model, 'ResNet50')
        del model
        model = models.ResNet(101, 1)
        train(model, 'ResNet101')
        del model
    else:
        # BUG FIX: the ValueError used to be raised unconditionally at the
        # end of the 'all' branch, so even a successful `--model all` run
        # aborted with "Did not provide a valid model".
        raise ValueError("Did not provide a valid model")
def load_model(self):
    """Build the network, optimizer, LR scheduler and loss criterion.

    Chooses CUDA when ``self.cuda`` is set (enabling cudnn autotuning),
    otherwise falls back to CPU.  The commented lines are the alternative
    architectures this trainer was experimented with; exactly one
    ``self.model = ...`` line should be active at a time.
    """
    if self.cuda:
        self.device = torch.device('cuda')
        # Let cudnn benchmark conv algorithms (good for fixed input sizes).
        cudnn.benchmark = True
    else:
        self.device = torch.device('cpu')
    # self.model = LeNet().to(self.device)
    self.model = models.AlexNet().to(self.device)
    # self.model = VGG11().to(self.device)
    # self.model = VGG13().to(self.device)
    # self.model = VGG16().to(self.device)
    # self.model = VGG19().to(self.device)
    # self.model = GoogLeNet().to(self.device)
    # self.model = resnet18().to(self.device)
    # self.model = resnet34().to(self.device)
    # self.model = resnet50().to(self.device)
    # self.model = resnet101().to(self.device)
    # self.model = resnet152().to(self.device)
    # self.model = DenseNet121().to(self.device)
    # self.model = DenseNet161().to(self.device)
    # self.model = DenseNet169().to(self.device)
    # self.model = DenseNet201().to(self.device)
    # self.model = WideResNet(depth=28, num_classes=10).to(self.device)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
    # Halve the learning rate at epochs 75 and 150.
    self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=[75, 150], gamma=0.5)
    self.criterion = nn.CrossEntropyLoss().to(self.device)
def visual(**kwargs):
    """Render a class-activation map (CAM) overlay for one hard-coded image.

    Loads an AlexNet checkpoint, runs a single forward pass on ``3.jpg``,
    turns the feature map into a heatmap via ``returnCAM`` and writes the
    blended overlay next to the input.  Returns 1 on completion.
    """
    opt.parse(kwargs)
    model = models.AlexNet()
    checkpoint = torch.load('/home/hdc/yfq/CAG/checkpoints/AlexNet1.pth')
    # Keep only the checkpoint entries whose names exist in this model.
    model_dict = model.state_dict()
    state_dict = {k: v for k, v in checkpoint.items() if k in model_dict}
    model.load_state_dict(state_dict, False)
    # Classifier weights project the feature map into class space.
    fc_weight = checkpoint['module.classifier.weight']
    normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms0 = T.Compose([T.RandomResizedCrop(512)])
    transforms1 = T.Compose([T.ToTensor(), normalize])
    # data
    img_path = '/home/hdc/yfq/CAG/data/visual1/3.jpg'
    data = Image.open(img_path)
    data0 = transforms0(data)      # PIL image, 512x512 random crop
    data1 = transforms1(data0)     # normalized tensor
    data1 = data1.unsqueeze(0)     # add batch dimension
    model.eval()
    # NOTE(review): assumes the model's forward returns (score, feature) —
    # confirm against the models.AlexNet definition.
    score, feature = model(data1)
    CAMs = returnCAM(feature, fc_weight)
    _, _, height, width = data1.size()
    heatmap = cv2.applyColorMap(cv2.resize(CAMs[1], (width, height)), cv2.COLORMAP_JET)
    # Blend heatmap (30%) with the cropped source image (50%).
    result = heatmap * 0.3 + np.array(data0) * 0.5
    cv2.imwrite('/home/hdc/yfq/CAG/data/visual1/3.CAM.bmp', result)
    return 1
def __init__(self, checkpoint_path, cp_name, loader, cuda):
    """Stash checkpoint paths, the data loader and device flag; build models.

    checkpoint_path: directory holding the saved checkpoints.
    cp_name: optional suffix selecting a specific checkpoint.  When falsy,
        template filenames containing '{}' placeholders are kept instead —
        presumably filled in later with an epoch / domain index; TODO
        confirm against the code that consumes these paths.
    loader: evaluation dataloader.
    cuda: whether to run on GPU.
    """
    self.cp_task = os.path.join(checkpoint_path, 'task'+cp_name+'.pt') if cp_name else os.path.join(checkpoint_path, 'task_checkpoint_{}ep.pt')
    self.cp_domain = os.path.join(checkpoint_path, 'Domain_{}'+cp_name+'.pt') if cp_name else os.path.join(checkpoint_path, 'Domain_{}.pt')
    self.dataloader = loader
    self.cuda = cuda
    # Feature extractor (non-baseline variant) plus a separate task head.
    self.feature_extractor = models.AlexNet(num_classes = 7, baseline = False)
    self.task_classifier = models.task_classifier()
def train_running():
    """Train the two-stream (RGB + optical-flow) AlexNet with TF1 queues.

    Optimizes for MAX_STEP steps, logging train summaries every 50 steps.
    The `if step % 200` / `if step % 2000` gates are commented out, so the
    validation pass and the checkpoint save currently run every step —
    NOTE(review): confirm that is intended (it is expensive).
    Relies on module-level constants: train_txt, val_txt, IMG_W, IMG_H,
    BATCH_SIZE, CAPACITY, N_CLASSES, LEARNING_RATE, MAX_STEP,
    logs_train_dir, logs_val_dir, model_dir.
    """
    with tf.name_scope('input'):
        train_RGB_batch, train_FLOW_batch, train_label_batch, _ = input_data.get_batch(train_txt, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
        val_RGB_batch, val_FLOW_batch, val_label_batch, _ = input_data.get_batch(val_txt, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    # Two image placeholders: RGB frames (x1) and optical-flow frames (x2).
    x1 = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    x2 = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    y_ = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    logits = models.AlexNet(x1, x2, N_CLASSES)
    loss = tools.loss(logits, y_)
    acc = tools.accuracy(logits, y_)
    train_op = tools.optimize(loss, LEARNING_RATE)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess= sess, coord=coord)
        summary_op = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
        val_writer = tf.summary.FileWriter(logs_val_dir, sess.graph)
        try:
            for step in np.arange(MAX_STEP):
                if coord.should_stop():
                    break
                tra_RGB_images,tra_FLOW_images, tra_labels = sess.run([train_RGB_batch, train_FLOW_batch, train_label_batch])
                _, tra_loss, tra_acc = sess.run([train_op, loss, acc], feed_dict={x1:tra_RGB_images, x2:tra_FLOW_images, y_:tra_labels})
                if step % 50 == 0:
                    print('Step %d, train loss = %.4f, train accuracy = %.2f%%' %(step, tra_loss, tra_acc))
                    summary_str = sess.run(summary_op, feed_dict={x1:tra_RGB_images, x2:tra_FLOW_images, y_:tra_labels})
                    train_writer.add_summary(summary_str, step)
                # if step % 200 == 0 or (step + 1) == MAX_STEP:
                val_RGB_images, val_FLOW_images, val_labels = sess.run([val_RGB_batch, val_FLOW_batch, val_label_batch])
                val_loss, val_acc = sess.run([loss, acc], feed_dict={x1:val_RGB_images, x2: val_FLOW_images, y_:val_labels})
                print('** Step %d, val loss = %.4f, val accuracy = %.2f%% **' %(step, val_loss, val_acc))
                summary_str = sess.run(summary_op, feed_dict={x1:val_RGB_images, x2: val_FLOW_images, y_:val_labels})
                val_writer.add_summary(summary_str, step)
                # if step % 2000 == 0 or (step + 1) == MAX_STEP:
                checkpoint_path = os.path.join(model_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
def evaluate_running():
    """Evaluate the latest checkpoint on the validation split.

    Restores the newest checkpoint from ``model_dir``, then measures the
    fraction of validation examples whose top-1 prediction is correct and
    prints it as ``precision``.  Returns early when no checkpoint exists.
    """
    with tf.Graph().as_default():
        val_batch, val_label_batch, n_test = input_data.get_batch(
            val_txt, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
        x = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
        y_ = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        logits = models.AlexNet(x, N_CLASSES)
        # Boolean per example: is the true label in the top-1 prediction?
        top_k_op = tf.nn.in_top_k(logits, y_, 1)
        saver = tf.train.Saver(tf.global_variables())
        with tf.Session() as sess:
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(model_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # The step number is the suffix of the checkpoint filename.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                num_iter = int(math.ceil(n_test / BATCH_SIZE))
                true_count = 0
                total_sample_count = num_iter * BATCH_SIZE
                step = 0
                while step < num_iter and not coord.should_stop():
                    val_images, val_labels = sess.run(
                        [val_batch, val_label_batch])
                    predictions = sess.run([top_k_op], feed_dict={
                        x: val_images,
                        y_: val_labels
                    })
                    true_count += np.sum(predictions)
                    step += 1
                precision = true_count / total_sample_count
                print('precision = %.2f' % precision)
            except Exception as e:
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)
def evaluate_running():
    """Evaluate a KTH-RGB checkpoint: top-1 precision on the val split.

    Variant of the evaluation routine that re-derives the train/val split
    from ``data_dir`` with ``get_files`` (the train batch is built but
    unused here).
    """
    with tf.Graph().as_default():
        data_dir = './data/KTH_RGB/'
        model_dir = './model/KTH_RGB6000/'
        train_image, train_label, val_image, val_label, n_test = input_data.get_files(
            data_dir, RATIO, ret_val_num=True)
        train_batch, train_label_batch = input_data.get_batch(
            train_image, train_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
        val_batch, val_label_batch = input_data.get_batch(
            val_image, val_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
        # NOTE(review): the line below is commented out, yet `logits` is
        # referenced right after — as written this raises NameError unless
        # `logits` is defined at module level; confirm intent.
        # logits = models.AlexNet(val_batch, N_CLASSES)
        top_k_op = tf.nn.in_top_k(logits, val_label_batch, 1)
        saver = tf.train.Saver(tf.global_variables())
        with tf.Session() as sess:
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(model_dir)
            if ckpt and ckpt.model_checkpoint_path:
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                num_iter = int(math.ceil(n_test / BATCH_SIZE))
                true_count = 0
                total_sample_count = num_iter * BATCH_SIZE
                step = 0
                while step < num_iter and not coord.should_stop():
                    # Batches come straight from the input queue; no
                    # feed_dict is needed for top_k_op here.
                    val_images_, val_labels_ = sess.run(
                        [val_batch, val_label_batch])
                    predictions = sess.run([top_k_op])
                    true_count += np.sum(predictions)
                    step += 1
                precision = true_count / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)
def classify(model_data_path, image_paths): '''Classify the given images using GoogleNet.''' # Get the data specifications for the GoogleNet model spec = models.get_data_spec(model_class=models.AlexNet) # Create a placeholder for the input image input_node = tf.placeholder(tf.float32, shape=(None, spec.crop_size, spec.crop_size, spec.channels)) # Construct the network net = models.AlexNet({'data': input_node}) # Create an image producer (loads and processes images in parallel) image_producer = dataset.ImageProducer(image_paths=image_paths, data_spec=spec) with tf.Session() as sesh: # Start the image processing workers coordinator = tf.train.Coordinator() threads = image_producer.start(session=sesh, coordinator=coordinator) # Load the converted parameters print('Loading the model') net.load(model_data_path, sesh) # Load the input image print('Loading the images') indices, input_images = image_producer.get(sesh) # Perform a forward pass through the network to get the class probabilities print('Classifying') probs = sesh.run(net.get_output(), feed_dict={input_node: input_images}) display_results([image_paths[i] for i in indices], probs) # Stop the worker threads coordinator.request_stop() coordinator.join(threads, stop_grace_period_secs=2)
# ---- Multi-source domain-generalization setup ----------------------------
# HDF5 paths for the third training source, the per-source validation sets
# and the held-out target domain.  NOTE(review): train_source_1/2 and the
# img_transform_* objects are defined earlier in the file (not visible here).
train_source_3 = args.data_path + 'train_' + args.source3 + '.hdf'
test_source_1 = args.data_path + 'val_' + args.source1 + '.hdf'
test_source_2 = args.data_path + 'val_' + args.source2 + '.hdf'
test_source_3 = args.data_path + 'val_' + args.source3 + '.hdf'
target_path = args.data_path + 'test_' + args.target + '.hdf'

# Uniformly samples across the three source domains.
source_dataset = Loader_unif_sampling(hdf_path1=train_source_1, hdf_path2=train_source_2, hdf_path3=train_source_3, transform=img_transform_train)
source_loader = torch.utils.data.DataLoader(dataset=source_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
test_source_dataset = Loader_unif_sampling(hdf_path1=test_source_1, hdf_path2=test_source_2, hdf_path3=test_source_3, transform=img_transform_test)
test_source_loader = torch.utils.data.DataLoader(dataset=test_source_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
target_dataset = Loader_validation(hdf_path=target_path, transform=img_transform_test)
target_loader = torch.utils.data.DataLoader(dataset=target_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)

# Caffe-AlexNet pretrained weights; drop the fc8 head so strict=False
# loading can attach the fresh 7-class classifier.
model = models.AlexNet(num_classes = 7, baseline = True)
state_dict = torch.load("../alexnet_caffe.pth.tar")
del state_dict["classifier.fc8.weight"]
del state_dict["classifier.fc8.bias"]
not_loaded = model.load_state_dict(state_dict, strict = False)

optimizer = optim.SGD(list(model.features.parameters())+list(model.classifier.parameters()), lr=args.lr, momentum=args.momentum, nesterov=True)

if args.cuda:
    model = model.cuda()
    torch.backends.cudnn.benchmark=True

trainer = TrainLoop(model, optimizer, source_loader, test_source_loader, target_loader, args.patience, args.l2, args.penalty_weight, args.penalty_anneal_epochs, checkpoint_path=args.checkpoint_path, checkpoint_epoch=args.checkpoint_epoch, cuda=args.cuda)
err = trainer.train(n_epochs=args.epochs, save_every=args.save_every)
def __init__(self, checkpoint, cuda):
    """Remember the checkpoint path and the device flag, then build the
    baseline 7-class AlexNet that will be evaluated."""
    # Configuration first, network construction last.
    self.cuda = cuda
    self.cp_task = checkpoint
    self.model = models.AlexNet(num_classes=7, baseline=True)
# Test-set generator: shuffle=False keeps predictions aligned with the
# generator's file order.
generator_test = datagen_test.flow_from_directory(directory=test_dir,
                                                  batch_size=batch_size,
                                                  target_size=(ncols, nrows),
                                                  color_mode='grayscale',
                                                  class_mode='categorical',
                                                  shuffle=False)
nclasses = len(generator_train.class_indices)
steps_test = generator_test.n // batch_size
print(steps_test)
steps_per_epoch = generator_train.n // batch_size
print(steps_per_epoch, "Steps Per Epoch")

#model = AlexNet(input_shape, nclasses)
model = models.AlexNet(input_shape, nclasses)
model.summary()
# NOTE(review): lowercase `keras.optimizers.adam` is the legacy alias and
# the name `Adam` shadows the class — works on old Keras; confirm for the
# Keras version pinned by this project.
Adam = keras.optimizers.adam(lr=lr, amsgrad=True)
model.compile(loss='categorical_crossentropy', optimizer= Adam,\
    metrics=['accuracy'])
history = model.fit_generator(generator_train,
                              epochs=epochs,
                              steps_per_epoch=steps_per_epoch,
                              validation_data=generator_test,
                              validation_steps=steps_test)

'''show me the plots'''
plt.figure(figsize=[8, 6])
plt.plot(history.history['loss'], 'r', linewidth=3.0)
plt.plot(history.history['val_loss'], 'b', linewidth=3.0)
def main():
    """Horovod-distributed CIFAR-10 training entry point.

    Builds per-rank CIFAR-10 loaders with a DistributedSampler, constructs
    the requested architecture (``args.arch``: any name in
    ``resnet.__dict__``, 'alexnet' or 'vgg16'), optionally resumes from a
    checkpoint, wraps SGD in ``hvd.DistributedOptimizer`` with a gradient
    compression strategy, and runs the train/validate loop while tracking
    best top-1/top-5 precision.  Rank 0 owns all logging.
    """
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize the process set.
    hvd.init()
    log = None
    if hvd.rank() == 0:
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.cuda:
        # Horovod: pin this process to a single GPU.
        torch.cuda.set_device(hvd.local_rank())

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of
    # 'fork' to prevent issues with Infiniband implementations that are not
    # fork-safe.
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    # Per-local-rank data directories avoid concurrent-download clashes.
    train_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(), train=True,
                                     transform=transforms.Compose([
                                         transforms.RandomCrop(32, padding=4),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                     ]), download=True)
    val_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(), train=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                       normalize,
                                   ]))

    # Horovod: partition the data among workers.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler, **kwargs)

    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()
    else:
        # ROBUSTNESS FIX: an unknown arch used to leave `model` unbound and
        # crash later with a confusing NameError.
        raise ValueError('Unknown architecture: %s' % args.arch)

    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))

    # Horovod: scale the learning rate by the number of workers.
    lr_scaler = hvd.size()
    if args.cuda:
        model.cuda()
        # With Adasum + NCCL, scale by local size instead.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            # BUG FIX: this message previously printed args.evaluate instead
            # of the checkpoint that was actually loaded (args.resume).
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.half:
        model.half()
        criterion.half()

    base_optimizer = torch.optim.SGD(model.parameters(), args.lr * lr_scaler,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #     milestones=[100, 150], last_epoch=args.start_epoch - 1)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)

    # Gradient-compression strategy; commented alternatives kept for
    # experimentation.
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(base_optimizer, compression,
                                         named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)

    for epoch in range(args.start_epoch + 1, args.epochs + 1):
        adjust_learning_rate(optimizer, epoch, size=lr_scaler)
        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)
        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)

        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)
        if hvd.rank() == 0:
            print('Best Pred@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(best_prec1, best_prec5))

        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        #     save_checkpoint({
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    if hvd.rank() == 0:
        log.close()
def training(args,*k,**kw):
    """Decentralized GoSGD-style training loop for one edge node.

    Coordinates with peer edges through a Redis helper: each epoch it
    (1) merges parameter sets pushed by other edges, weighted by their
    model scores, (2) selects "critical" training samples, (3) trains
    locally on those samples, then (4) halves its own score and pushes
    it plus its parameters to a randomly chosen edge.  Per-epoch accuracy
    is written to a log file and TensorBoard.  Returns the trained net.
    """
    # if use gpus
    device = torch.device("cuda:{}".format(args.gpuindex) if torch.cuda.is_available() and args.gpu else "cpu")
    print("user device: {}".format(device))
    # redis helper related
    redis_helper = redishelper.GoSGDHelper(host=args.host, port=args.port)
    redis_helper.signin()
    # Block until every expected edge node has signed in.
    while redis_helper.cur_edge_num() < args.edgenum:
        time.sleep(1) # sleep 1 second
    model_score = 1.0 / args.edgenum # the initial model parameters score
    # log_file and summary path
    log_file = "{0}-{1}-edge-{2}.log".format(time.strftime('%Y%m%d-%H%M%S',time.localtime(time.time())), args.model,redis_helper.ID)
    log_dir = "tbruns/{0}-{1}-cifar10-edge-{2}".format(time.strftime('%Y%m%d%H%M%S',time.localtime(time.time())),args.model,redis_helper.ID)
    logger = open(log_file,'w')
    swriter = SummaryWriter(log_dir)
    # load traing data
    trainset = dataset.AGGData(root=args.dataset, train=True, download=False, transform=None)
    testset = dataset.AGGData(root=args.dataset, train=False, download=False, transform=None)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batchsize, shuffle=False, num_workers=0)
    # construct neural network
    net = None
    if args.model == "lenet5":
        net = models.LeNet5()
    elif args.model == "resnet18":
        net = models.ResNet18()
    elif args.model == "alexnet":
        net = models.AlexNet(args.num_classes)
    elif args.model == "alexnetimg8":
        net = models.AlexNetImg8(args.num_classes)
    elif args.model == "squeezenet":
        net = models.SqueezeNet()
    elif args.model == "mobilenetv2":
        net = models.MobileNetV2()
    elif args.model == "resnet34":
        net = models.ResNet34()
    elif args.model == "resnet50":
        net = models.ResNet50()
    elif args.model == "resnet101":
        net = models.ResNet101()
    else:
        net = models.ResNet152()
    net.to(device)
    # define optimizer
    criterion = nn.CrossEntropyLoss()
    # Per-sample losses (reduction='none') used to rank sample difficulty.
    criterion_loss = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,milestones=list(args.lrschstep), gamma=0.1)
    # start training
    wallclock = 0.0
    iteration = 0 # global iterations
    for epoch in range(0,args.epoch,1):
        starteg = time.time()
        # merge parameters of other edge
        if epoch > 0:
            mintime,maxtime,param_list = redis_helper.min2max_time_params()
            print("The min/max time cost of last epoch: {}/{}".format(mintime,maxtime))
            for item in param_list:
                # Weighted average of local and received parameters,
                # weights proportional to the two model scores.
                w1 = model_score / (model_score + item[0])
                w2 = item[0] / (model_score + item[0])
                for local,other in zip(net.parameters(),item[1]):
                    local.data = local.data * w1 + other.data.to(device) * w2
                model_score = model_score + item[0]
            # Wait for all edges to finish their merge phase.
            while redis_helper.finish_update() is False:
                time.sleep(1.0)
        critical_extra_start = time.time()
        # identify critical training samples
        critrainset = critical_identify(net,trainset,criterion_loss,device,args)
        critrainloader = torch.utils.data.DataLoader(critrainset, batch_size=args.batchsize, shuffle=True, num_workers=0)
        critical_extra_cost = time.time() - critical_extra_start
        training_start = time.time()
        running_loss = 0.0
        record_running_loss = 0.0
        for i, data in enumerate(critrainloader, 0):
            iteration += 1
            # get the inputs
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.squeeze().to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            record_running_loss += loss.item()
            if i % 10 == 9:
                swriter.add_scalar("Training loss",record_running_loss / 10,epoch*len(critrainloader)+i)
                record_running_loss = 0.0
            if i % 2000 == 1999: # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        training_cost = time.time() - training_start
        # push time and parameters to Redis
        model_score = model_score / 2
        sel_edge_id = redis_helper.random_edge_id(can_be_self=True)
        paramls = list(map(lambda x: x.cpu(),list(net.parameters())))
        redis_helper.ins_time_params(sel_edge_id,training_cost,model_score,paramls)
        # Wait for all edges to finish their push phase.
        while not redis_helper.finish_push():
            time.sleep(1.0)
        wallclock += time.time() - starteg
        total, kaccuracy = validation(net,testloader,device,topk=(1,5))
        curtime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        _header="[ {} Epoch {} /Iteration {} Wallclock {}]".format(curtime,epoch+1,iteration, wallclock)
        print('{} Accuracy of the network on the {} test images: {} %'.format(_header, total, kaccuracy_str(kaccuracy)))
        # NOTE(review): the print above calls kaccuracy_str(...) while the
        # next line calls accuracy_str(...) — one of the two names is likely
        # undefined (NameError); confirm which helper actually exists.
        logger.write('{},{},{},{}\n'.format(epoch+1 ,iteration, wallclock, accuracy_str(kaccuracy)))
        logger.flush() # write to disk
        for item in kaccuracy:
            swriter.add_scalar("Top{}Accuracy".format(item[0]), item[1], epoch)
        # adopt learning rate of optimizer
        if args.lrscheduler:
            lr_scheduler.step()
    print('Finished Training')
    redis_helper.register_out()
    logger.close() # close log file writer
    return net
# Three equivalent ways of constructing AlexNet from the project-local
# ``models`` package.
from models import AlexNet

import models
model = models.AlexNet()

import models  # NOTE(review): duplicate import — harmless but redundant.
# Dynamic lookup by class name (useful when the name comes from config).
model = getattr(models, 'AlexNet')()
# Validation loaders for domains 3 and 4 (domains 1 and 2 are built
# earlier in the file, not visible in this chunk).
domain3_dataset = Loader_validation(hdf_path=domain3_path, transform=img_transform)
domain3_loader = torch.utils.data.DataLoader(dataset=domain3_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
domain4_dataset = Loader_validation(hdf_path=domain4_path, transform=img_transform)
domain4_loader = torch.utils.data.DataLoader(dataset=domain4_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)

# Restore the feature extractor.  ERM checkpoints store weights under
# 'model_state'; the other DG variants use 'feature_extractor_state'.
feature_extractor = models.AlexNet(baseline=False)
ckpt = torch.load(args.encoder_path)
if args.dg_type == 'erm':
    not_loaded = feature_extractor.load_state_dict(ckpt['model_state'], strict=False)
else:
    not_loaded = feature_extractor.load_state_dict(
        ckpt['feature_extractor_state'], strict=False)

# Grab one batch per domain and tag it with an integer domain id (0..3).
data1, labels1, _ = next(iter(domain1_loader))
domain1 = torch.zeros_like(labels1)
data2, labels2, _ = next(iter(domain2_loader))
domain2 = torch.ones_like(labels2)
data3, labels3, _ = next(iter(domain3_loader))
domain3 = 2 * torch.ones_like(labels3)
data4, labels4, _ = next(iter(domain4_loader))
def retrain_running():
    """Resume AlexNet training from the latest checkpoint (TFRecord input).

    Restores the newest checkpoint from ``model_dir`` and continues
    optimizing for MAX_STEP steps, saving new checkpoints under
    ``new_model_dir``.  The `if step % 200` / `if step % 2000` gates are
    commented out, so validation and checkpoint saving currently run every
    step — NOTE(review): confirm that is intended.
    """
    with tf.name_scope('input'):
        train_batch, train_label_batch = input_data.read_and_decode(
            train_tfrecords_file, IMG_W, IMG_H, BATCH_SIZE, MIN_AFTER_DEQUENE)
        val_batch, val_label_batch = input_data.read_and_decode(
            val_tfrecords_file, IMG_W, IMG_H, BATCH_SIZE, MIN_AFTER_DEQUENE)
    x = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    y_ = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    logits = models.AlexNet(x, N_CLASSES)
    loss = tools.loss(logits, y_)
    acc = tools.accuracy(logits, y_)
    train_op = tools.optimize(loss, LEARNING_RATE)
    with tf.Session() as sess:
        saver = tf.train.Saver(tf.global_variables())
        print("Reading checkpoints...")
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # The step number is the suffix of the checkpoint filename.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                '-')[-1]
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Loading success, global_step is %s' % global_step)
        else:
            print('No checkpoint file found')
            return
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_op = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
        val_writer = tf.summary.FileWriter(logs_val_dir, sess.graph)
        try:
            for step in np.arange(MAX_STEP):
                if coord.should_stop():
                    break
                tra_images, tra_labels = sess.run(
                    [train_batch, train_label_batch])
                _, tra_loss, tra_acc = sess.run([train_op, loss, acc],
                                                feed_dict={
                                                    x: tra_images,
                                                    y_: tra_labels
                                                })
                if step % 50 == 0:
                    print(
                        'Step %d, train loss = %.4f, train accuracy = %.2f%%'
                        % (step, tra_loss, tra_acc))
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               x: tra_images,
                                               y_: tra_labels
                                           })
                    train_writer.add_summary(summary_str, step)
                # if step % 200 == 0 or (step + 1) == MAX_STEP:
                val_images, val_labels = sess.run(
                    [val_batch, val_label_batch])
                val_loss, val_acc = sess.run([loss, acc],
                                             feed_dict={
                                                 x: val_images,
                                                 y_: val_labels
                                             })
                print(
                    '** Step %d, val loss = %.4f, val accuracy = %.2f%% **'
                    % (step, val_loss, val_acc))
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           x: val_images,
                                           y_: val_labels
                                       })
                val_writer.add_summary(summary_str, step)
                # if step % 2000 == 0 or (step + 1) == MAX_STEP:
                checkpoint_path = os.path.join(new_model_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
import models

# Registry: model name -> factory taking a kwargs dict.  Entries whose
# lambdas ignore `args` take no constructor arguments.
name_to_model = {
    'LeNet': lambda args: models.LeNet(**args),
    'AlexNet': lambda args: models.AlexNet(**args),
    'MLP': lambda args: models.MLP(**args),
    'ResNet18': lambda args: models.ResNet18(**args),
    'PResNet18': lambda args: models.PResNet18(**args),
    'Permutation': lambda args: models.TensorPermutation(32, 32, **args),
    'ResNet20Original': lambda args: models.resnet20original(),
    'MobileNet': lambda args: models.MobileNet(**args),
    'ShuffleNet': lambda args: models.ShuffleNetG2(),
    'WideResNet28': lambda args: models.WideResNet28(**args),
}


def get_model(model_config):
    """Instantiate the model named in ``model_config``.

    model_config: mapping with 'name' (a key of ``name_to_model``) and an
    optional 'args' dict of constructor keyword arguments.

    BUG FIX: previously ``model_config.get('args', None)`` passed None to
    factories that immediately do ``models.X(**args)``, raising TypeError
    whenever 'args' was omitted; default to an empty dict instead.
    """
    name = model_config['name']
    return name_to_model[name](model_config.get('args') or {})
# Task head plus (a single) domain discriminator.  With rp_size == 4096 or
# the 'RP' ablation, the random-projection layer is bypassed.
task_classifier = models.task_classifier()
domain_discriminator_list = []
for i in range(1):
    if args.rp_size == 4096 or args.ablation == 'RP':
        disc = models.domain_discriminator_ablation_RP(
            optim.SGD, args.lr_domain, args.momentum_domain, args.l2).train()
    else:
        disc = models.domain_discriminator(args.rp_size, optim.SGD,
                                           args.lr_domain, args.momentum_domain, args.l2).train()
    domain_discriminator_list.append(disc)

# Caffe-AlexNet backbone pretrained on ImageNet; drop the fc8 head so
# load_state_dict(strict=False) can attach the new 5-class classifier.
feature_extractor = models.AlexNet(num_classes=5, baseline=False)
state_dict = torch.load("../alexnet_caffe.pth.tar")
del state_dict["classifier.fc8.weight"]
del state_dict["classifier.fc8.bias"]
not_loaded = feature_extractor.load_state_dict(state_dict, strict=False)

# One optimizer over the backbone and the task head together.
optimizer_task = optim.SGD(list(feature_extractor.parameters()) + list(task_classifier.parameters()),
                           lr=args.lr_task, momentum=args.momentum_task, weight_decay=args.l2)

models_dict = {}
models_dict['feature_extractor'] = feature_extractor
models_dict['task_classifier'] = task_classifier
models_dict['domain_discriminator_list'] = domain_discriminator_list
def train_and_test():
    """Train AlexNet for MAX_STEP steps, then measure test precision.

    Validation summaries are written every 200 steps and checkpoints saved
    every 2000 steps; after training completes, top-1 precision is
    computed over the validation queue and printed as a percentage.
    """
    with tf.name_scope('input'):
        train_image, train_label, val_image, val_label, n_test = input_data.get_files(
            data_dir, RATIO, ret_val_num=True)
        train_batch, train_label_batch = input_data.get_batch(
            train_image, train_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
        val_batch, val_label_batch = input_data.get_batch(
            val_image, val_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    x = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    y_ = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    logits = models.AlexNet(x, N_CLASSES)
    loss = tools.loss(logits, y_)
    acc = tools.accuracy(logits, y_)
    train_op = tools.optimize(loss, LEARNING_RATE)
    # Boolean per example: is the true label the top-1 prediction?
    top_k_op = tf.nn.in_top_k(logits, y_, 1)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_op = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
        val_writer = tf.summary.FileWriter(logs_val_dir, sess.graph)
        try:
            for step in np.arange(MAX_STEP):
                if coord.should_stop():
                    break
                tra_images, tra_labels = sess.run(
                    [train_batch, train_label_batch])
                _, tra_loss, tra_acc = sess.run([train_op, loss, acc],
                                                feed_dict={
                                                    x: tra_images,
                                                    y_: tra_labels
                                                })
                if step % 50 == 0:
                    print(
                        'Step %d, train loss = %.4f, train accuracy = %.2f%%'
                        % (step, tra_loss, tra_acc))
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               x: tra_images,
                                               y_: tra_labels
                                           })
                    train_writer.add_summary(summary_str, step)
                if step % 200 == 0 or (step + 1) == MAX_STEP:
                    val_images, val_labels = sess.run(
                        [val_batch, val_label_batch])
                    val_loss, val_acc = sess.run([loss, acc],
                                                 feed_dict={
                                                     x: val_images,
                                                     y_: val_labels
                                                 })
                    print(
                        '** Step %d, val loss = %.4f, val accuracy = %.2f%% **'
                        % (step, val_loss, val_acc))
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               x: val_images,
                                               y_: val_labels
                                           })
                    val_writer.add_summary(summary_str, step)
                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    checkpoint_path = os.path.join(model_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
            # ---- testing phase, after the training loop completes ----
            print('----------------')
            print('Testing Now!')
            print('There are %d test examples' % (n_test))
            num_iter = int(math.ceil(n_test / BATCH_SIZE))
            true_count = 0
            total_sample_count = num_iter * BATCH_SIZE
            step = 0
            while step < num_iter:
                if coord.should_stop():
                    break
                val_images, val_labels = sess.run([val_batch, val_label_batch])
                predictions = sess.run([top_k_op],
                                       feed_dict={
                                           x: val_images,
                                           y_: val_labels
                                       })
                true_count += np.sum(predictions)
                step += 1
            precision = true_count / total_sample_count * 100.0
            print('precision = %.2f%%' % precision)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
def train(**kwargs):
    """Train a face-expression classifier, validating each epoch and plotting
    train/val/test accuracy curves to visdom.

    kwargs: accepted for config overrides (config.parse is currently disabled),
    so all settings come from the module-level `config` object.
    Side effects: moves the model to GPU when available, updates the optimizer
    LR in place when the epoch loss stops improving, and pushes a plot to the
    visdom server every 20 epochs.
    """
    # config.parse(kwargs)
    vis = visdom.Visdom(env=config.env)

    # step1: configure model
    # NOTE(review): 'num_classees' is kept as-is — it must match the keyword
    # that models.AlexNet actually declares; confirm the spelling there.
    model = models.AlexNet(num_classees=config.num_classes)
    # model = models.ResNet34(num_classes = config.num_classes)
    if config.use_gpu:
        # Fix: CUDA reads CUDA_VISIBLE_DEVICES (plural); the previous
        # 'CUDA_VISIBLE_DEVICE' spelling was silently ignored by the driver.
        os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu_id
    if config.pretrained_model_path:
        model.load(config.pretrained_model_path)
    if config.use_gpu and torch.cuda.is_available():
        model.cuda()

    # step2: data — note val shares train_data_root, test uses test_data_root.
    train_data = FaceExpression(config.train_data_root, train=True)
    val_data = FaceExpression(config.train_data_root, train=False)
    test_data = FaceExpression(config.test_data_root, test=True)
    train_dataloader = DataLoader(train_data,
                                  config.batch_size,
                                  shuffle=True,
                                  num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data,
                                config.batch_size,
                                shuffle=False,
                                num_workers=config.num_workers)
    test_dataloader = DataLoader(test_data,
                                 config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers)

    # step3: criterion and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    lr = config.lr
    #optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = config.weight_decay)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                weight_decay=config.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(config.num_classes)
    previous_loss = 1e100  # sentinel so the first epoch never triggers decay

    # train
    total_train_accuracy, total_val_accuracy, total_test_accuracy = [], [], []
    train_accuracy, val_accuracy, test_accuracy = 0, 0, 0
    for epoch in tqdm(range(config.max_epoch)):
        loss_meter.reset()
        confusion_matrix.reset()

        for data, label in train_dataloader:
            input = Variable(data)
            target = Variable(label)
            if config.use_gpu and torch.cuda.is_available():
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            # Fix: loss.data[0] raises IndexError on 0-dim loss tensors in
            # PyTorch >= 0.5; .item() is the supported scalar accessor.
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
        # model.save()

        # update learning rate: decay when the mean epoch loss regressed.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * config.lr_decay
            # Mutate param_groups in place so optimizer momentum state survives.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

        # validate and visualize
        train_cm_values = confusion_matrix.value()
        # Trace of the confusion matrix over its total = overall accuracy.
        train_accuracy = sum(
            [train_cm_values[i][i]
             for i in range(config.num_classes)]) / train_cm_values.sum()
        val_cm, val_accuracy = val(model, val_dataloader)
        test_cm, test_accuracy = val(model, test_dataloader)

        # NOTE(review): `content` is built but never logged anywhere visible
        # in this chunk — presumably consumed elsewhere; verify.
        content = '''*******epoch:{epoch} , lr:{lr}, loss:{loss} train_cm:{train_cm} val_cm:{val_cm}\n '''.format(
            epoch=epoch,
            loss=loss_meter.value()[0],
            val_cm=str(val_cm.value()),
            train_cm=str(confusion_matrix.value()),
            lr=lr)

        total_train_accuracy.append(train_accuracy)
        total_val_accuracy.append(val_accuracy)
        total_test_accuracy.append(test_accuracy)

        title = 'brightness_random_crop_1_channel_total_accuracy'
        if epoch % 20 == 0:
            # Fix: the accuracy lists already hold epoch + 1 entries at this
            # point, so the x axis must match their length; range(epoch) was
            # one element short and produced mismatched visdom traces.
            x_epoch = list(range(len(total_train_accuracy)))
            train_acc = dict(x=x_epoch,
                             y=total_train_accuracy,
                             type='custom',
                             name='train')
            val_acc = dict(x=x_epoch,
                           y=total_val_accuracy,
                           type='custom',
                           name='val')
            test_acc = dict(x=x_epoch,
                            y=total_test_accuracy,
                            type='custom',
                            name='test')
            layout = dict(title=title,
                          xaxis={'title': 'epochs'},
                          yaxis={'title': 'accuracy'})
            data = [train_acc, val_acc, test_acc]
            vis._send({'data': data, 'layout': layout, 'win': title})
def classify(model_data_path, image_pathFolder):
    '''Classify every image in a folder with a converted AlexNet.

    model_data_path: path to converted network weights (loaded via net.load).
    image_pathFolder: folder whose entries are all treated as image files;
        NOTE(review): plain string concatenation assumes the folder path ends
        with a separator — confirm at the call sites.
    Prints timing/progress to stdout and hands the class probabilities to
    display_results; returns nothing.
    '''
    print(model_data_path)
    print(image_pathFolder)
    image_paths = []
    for filename in os.listdir(image_pathFolder):
        image_paths.append(image_pathFolder + filename)
    print(image_paths)
    #if(True)
    #sys.exit(0)

    # Get the data specifications for the AlexNet model
    spec = models.get_data_spec(model_class=models.AlexNet)
    #print(spec)

    # Create a placeholder for the input image
    input_node = tf.placeholder(
        tf.float32,
        shape=(None, spec.crop_size, spec.crop_size, spec.channels))
    print(input_node)

    # Construct the network
    net = models.AlexNet({'data': input_node})
    print("net---------------------")

    # Create an image producer (loads and processes images in parallel)
    image_producer = dataset.ImageProducer(image_paths=image_paths,
                                           data_spec=spec)
    print(image_producer)
    # Reduce TF C++ log noise (show INFO and above).
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '1'

    with tf.Session() as sesh:
        print(
            'start -----------------------------------------------------------------%s'
            % datetime.now())
        sesh.run(tf.global_variables_initializer())

        # Start the image processing workers
        coordinator = tf.train.Coordinator()
        threads = image_producer.start(session=sesh, coordinator=coordinator)

        # Load the converted parameters
        print(
            'Loading the model -----------------------------------------------------------------%s'
            % datetime.now())
        net.load(model_data_path, sesh)

        # Load the input image
        print(
            'Loading the images-----------------------------------------------------------------%s'
            % datetime.now())
        # indices map the batch rows back to their entry in image_paths.
        indices, input_images = image_producer.get(sesh)

        # Perform a forward pass through the network to get the class probabilities
        print(
            'Classifying -----------------------------------------------------------------%s'
            % datetime.now())
        probs = sesh.run(net.get_output(),
                         feed_dict={input_node: input_images})
        print(
            'Classifying END -----------------------------------------------------------------%s'
            % datetime.now())
        display_results([image_paths[i] for i in indices], probs)

        # Stop the worker threads
        coordinator.request_stop()
        coordinator.join(threads, stop_grace_period_secs=2)
# NOTE(review): this chunk opens mid-function — the two statements below are
# the tail of an argparse-builder whose `def` line lies above this view, so
# they keep the function-body indent.
    parser.add_argument("--num_classes", type=int, default=1000)
    return parser.parse_args()


def adjust_learning_rate(optimizer, epoch, args):
    """Step decay: multiply args.learning_rate by 0.1 every 30 epochs and
    write the result into every optimizer param group in place."""
    lr = args.learning_rate * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr


if __name__ == "__main__":
    args = parse_args()
    args = utils.analyze_arguments(args)
    model = models.AlexNet(
        args.num_classes,
        pooling=args.pooling,
        activation=args.activation,
        normalization=args.normalization
    ).to(args.device)
    if args.starting_epoch > 0:
        # Resume from the checkpoint saved for the previous epoch.
        utils.load_model(model, "%s-epoch-%03d" % (args.name, args.starting_epoch - 1))
    # optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss().to(args.device)
    # Tracks top-1/3/5 accuracy.
    accuracy_meter = meters.AccuracyMeter((1, 3, 5))
    writer = tensorboardX.SummaryWriter(os.path.join("log", args.name))
    train_loader = args.dataset(args.batch_size, train=True)
    valid_loader = args.dataset(args.batch_size, train=False)
    training_steps = 0
    for epoch in range(args.starting_epoch, args.starting_epoch + args.epochs):
        adjust_learning_rate(optimizer, epoch, args)
        model.train()
        # NOTE(review): the chunk ends here, mid training loop — the rest of
        # the loop body lies past this view.
def draw_confusion_matrix(show_confusion_matrix=True):
    """Restore the newest checkpoint from model_dir, run AlexNet over the
    validation queue, and (optionally) plot the resulting confusion matrix.

    show_confusion_matrix: when True, prints and plots the matrix via
        tools.plot_confusion_matrix after all batches are classified.
    Uses module-level configuration (val_txt, IMG_W/IMG_H, BATCH_SIZE,
    CAPACITY, N_CLASSES, model_dir). Returns nothing; returns early if no
    checkpoint is found.
    """
    with tf.Graph().as_default():
        predictions_label = []
        true_label = []
        val_batch, val_label_batch, n_test = input_data.get_batch(
            val_txt, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)

        x = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
        logits = models.AlexNet(x, N_CLASSES)
        y_pred = tf.nn.softmax(logits)
        # Predicted class id per example.
        y_pred_cls = tf.argmax(y_pred, axis=1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(model_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Checkpoint paths look like ".../model.ckpt-1234"; the final
                # '-'-separated token is the global step.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return
            # Queue runners must be live before sess.run on the batch ops.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                num_iter = int(math.ceil(n_test / BATCH_SIZE))
                step = 0
                while step < num_iter and not coord.should_stop():
                    val_images, val_labels = sess.run(
                        [val_batch, val_label_batch])
                    true_label = np.append(true_label, val_labels)
                    predictions = sess.run([y_pred_cls],
                                           feed_dict={x: val_images})
                    # sess.run on a list returns a list — squeeze back to a
                    # flat per-example vector before accumulating.
                    predictions = np.array(predictions)
                    predictions = np.squeeze(predictions)
                    predictions_label = np.append(predictions_label,
                                                  predictions)
                    step += 1
                predictions_label = np.int32(predictions_label)
                true_label = np.int32(true_label)

                # Plot the confusion matrix, if desired.
                if show_confusion_matrix:
                    print("Confusion Matrix:")
                    tools.plot_confusion_matrix(label_true=true_label,
                                                label_pred=predictions_label,
                                                num_classes=N_CLASSES)
            except Exception as e:
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)
# Head-pose estimation training script: trains an AlexNet-based model on the
# Biwi Kinect head-pose dataset, then evaluates it.
# NOTE(review): sio, dlib, face_utils, utils and the AFLW2000_* constants are
# not used in this visible chunk — presumably consumed elsewhere; verify.
import scipy.io as sio
import dlib
from imutils import face_utils

import datasets
import utils
import models

# Absolute Windows paths — machine-specific; adjust per environment.
PROJECT_DIR = "E:/demo/python/head_pose/"
AFLW2000_DATA_DIR = 'E:/data/AFLW2000/'
AFLW2000_MODEL_FILE = PROJECT_DIR + 'model/aflw2000_model.h5'
AFLW2000_TEST_SAVE_DIR = 'E:/ml/data/aflw2000_test/'

BIWI_DATA_DIR = 'E:/ml/data/Biwi/kinect_head_pose_db/hpdb/'
BIWI_MODEL_FILE = PROJECT_DIR + 'model/biwi_model.h5'
BIWI_TEST_SAVE_DIR = 'E:/ml/data/biwi_test/'

BIN_NUM = 66     # presumably the number of angle bins the model predicts — verify in models
INPUT_SIZE = 64  # square input resolution fed to the network
BATCH_SIZE = 16
EPOCHS = 20

# NOTE(review): this module trains at import time — there is no
# `if __name__ == "__main__"` guard.
# ratio=0.95 presumably is the train/val split fraction — confirm in datasets.Biwi.
dataset = datasets.Biwi(BIWI_DATA_DIR, 'filename_list.txt',
                        batch_size=BATCH_SIZE, input_size=INPUT_SIZE,
                        ratio=0.95)
net = models.AlexNet(dataset, BIN_NUM,
                     batch_size=BATCH_SIZE, input_size=INPUT_SIZE)
# load_weight=False: train from scratch, saving weights to BIWI_MODEL_FILE.
net.train(BIWI_MODEL_FILE, max_epoches=EPOCHS, load_weight=False)
net.test(BIWI_TEST_SAVE_DIR)
import models

if __name__ == "__main__":
    # Smoke test: build each architecture and dump its module structure.
    print("t")

    alexnet_model = models.AlexNet()
    print(alexnet_model)

    resnet34_model = models.ResNet34()
    print(resnet34_model)