def init_pred(load_network_path='/home/herbertwangwrt/fa18-cs242-final/yolo/best_detector.pth'):
    """Build the detector network for prediction and return it in eval mode.

    Args:
        load_network_path: Path to a saved state dict to load into a fresh
            ``resnet50()``. Pass ``None`` to build ``resnet50(pretrained=True)``
            instead of loading a checkpoint. Defaults to the path this
            function previously hard-coded.

    Returns:
        The network, moved to ``cuda:0`` when CUDA is available (otherwise the
        CPU), with ``eval()`` already called on it.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if load_network_path is not None:
        net = resnet50().to(device)
        # Deserialize every tensor onto CPU storage so a checkpoint written on
        # a GPU machine still loads when no GPU is present; load_state_dict
        # then fills the parameters of the network already placed on `device`.
        state_dict = torch.load(load_network_path,
                                map_location=lambda storage, loc: storage)
        net.load_state_dict(state_dict)
    else:
        net = resnet50(pretrained=True).to(device)

    # NOTE: the train/test datasets and data loaders that used to be built
    # here were never used or returned, so they are no longer constructed;
    # prediction only needs the network itself.
    net.eval()
    return net
for i, box in enumerate(boxes): x1 = int(box[0] * w) x2 = int(box[2] * w) y1 = int(box[1] * h) y2 = int(box[3] * h) cls_index = cls_indexs[i] cls_index = int(cls_index) # convert LongTensor to int prob = probs[i] prob = float(prob) result.append([(x1, y1), (x2, y2), VOC_CLASSES[cls_index], image_name, prob]) return result if __name__ == '__main__': model = resnet50() print('load model...') model.load_state_dict(torch.load('best.pth')) model.eval() model.cuda() image_name = 'person.jpg' image = cv2.imread(image_name) print('predicting...') result = predict_gpu(model, image_name) for left_up, right_bottom, class_name, _, prob in result: color = Color[VOC_CLASSES.index(class_name)] cv2.rectangle(image, left_up, right_bottom, color, 2) label = class_name + str(round(prob, 2)) text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1) p1 = (left_up[0], left_up[1] - text_size[1])
from resnet_yolo import resnet50, resnet18
from yoloLoss import YoloLoss
from dataset import yoloDataset
from visualize import Visualizer
import numpy as np

# --- Training configuration -------------------------------------------------
use_gpu = torch.cuda.is_available()
file_root = '/home/xzh/data/VOCdevkit/VOC2012/allimgs/'

learning_rate = 0.001
num_epochs = 50
batch_size = 24

# --- Backbone selection -----------------------------------------------------
# Flip `use_resnet` to False to build the VGG16-BN backbone instead.
use_resnet = True
net = resnet50() if use_resnet else vgg16_bn()

# Earlier experiments, kept for reference (disabled):
# net.classifier = nn.Sequential(
#     nn.Linear(512 * 7 * 7, 4096),
#     nn.ReLU(True),
#     nn.Dropout(),
#     #nn.Linear(4096, 4096),
#     #nn.ReLU(True),
#     #nn.Dropout(),
#     nn.Linear(4096, 1470),
# )
#net = resnet18(pretrained=True)
#net.fc = nn.Linear(512,1470)
# initial Linear
# for m in net.modules():
import matplotlib.pyplot as plt

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# YOLO network hyperparameters
B = 2  # number of bounding box predictions per cell
S = 14  # width/height of network output grid (larger than 7x7 from paper since we use a different network)

# Set to a checkpoint path to resume from a previously trained network;
# leave as None to start from the pre-trained backbone.
load_network_path = None
pretrained = True

if load_network_path is not None:
    print('Loading saved network from {}'.format(load_network_path))
    net = resnet50().to(device)
    # map_location keeps loading consistent with the CPU fallback above: a
    # checkpoint saved on a GPU would otherwise fail to deserialize on a
    # machine without CUDA.
    net.load_state_dict(torch.load(load_network_path, map_location=device))
else:
    print('Load pre-trained model')
    net = resnet50(pretrained=pretrained).to(device)

# Optimisation settings
learning_rate = 0.001
num_epochs = 50
batch_size = 24

# Yolo loss component coefficients (as given in Yolo v1 paper)
lambda_coord = 5
lambda_noobj = 0.5
def _load_saved_network(load_network_path):
    """Build a ``resnet50()`` on ``device`` and load a saved state dict into it."""
    print('Loading saved network from {}'.format(load_network_path))
    net = resnet50().to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on whatever
    # `device` this process is actually using (including a CPU-only host).
    net.load_state_dict(torch.load(load_network_path, map_location=device))
    return net


def main():
    """Parse the command-line arguments, then evaluate or train the detector.

    Reads ``model_path``, ``batch_size``, ``S`` and ``eval`` from the parsed
    arguments (also published through the module-level ``args``).

    * With ``eval`` set, loads the network from ``model_path``, runs
      ``evaluate`` on the test annotation file and writes the result with
      ``output_submission_csv('my_solution.csv', ...)``.
    * Otherwise, builds the train/test loaders, calls ``train`` and saves the
      loss curve to ``training_loss.png``.

    Raises:
        SystemExit: with status 1 when ``eval`` is requested without a model
            path.
    """
    import sys

    global args
    args = parser.parse_args()
    load_network_path = args.model_path
    batch_size = args.batch_size
    S = args.S

    # To implement Yolo we rely on a pretrained classifier as the backbone for
    # our detection network. PyTorch offers a variety of models which are
    # pretrained on ImageNet in the `torchvision.models` package
    # (https://pytorch.org/docs/stable/torchvision/models.html). In
    # particular, we use the ResNet50 architecture as a base for our detector.
    # This is different from the base architecture in the Yolo paper and also
    # results in a different output grid size (14x14 instead of 7x7).
    #
    # Models are typically pretrained on ImageNet since the dataset is very
    # large. The pretrained model provides a very useful weight initialization
    # for our detector, so that the network is able to learn quickly and
    # effectively.

    if args.eval:
        if load_network_path is None:
            print("Model path not specified!!")
            # A missing required argument is a failure: report a non-zero
            # status so calling scripts can detect it.
            sys.exit(1)

        net = _load_saved_network(load_network_path)

        # To evaluate detection results we use mAP (mean of average precision
        # over each class).
        net.eval()
        test_aps = evaluate(net, test_dataset_file=annotation_file_test)
        output_submission_csv('my_solution.csv', test_aps)
        return

    pretrained = True

    # Use to load a previously trained network.
    if load_network_path is not None:
        net = _load_saved_network(load_network_path)
    else:
        print('Load pre-trained model')
        net = resnet50(pretrained=pretrained).to(device)

    # Since Pascal is a small dataset (5000 in train+val) we have combined the
    # train and val splits to train our detector. The train dataset loader
    # also uses a variety of data augmentation techniques including random
    # shift, scaling, crop, and flips. Data augmentation is slightly more
    # complicated for a detection dataset since the bounding box annotations
    # must be kept consistent through the transformations.
    #
    # Since the output of the detector network we train is SxSx(B*5+C), we use
    # an encoder to convert the original bounding box coordinates into
    # relative grid bounding box coordinates corresponding to the expected
    # output. We also use a decoder which allows us to convert in the opposite
    # direction, into image coordinate bounding boxes.
    train_dataset = VocDetectorDataset(root_img_dir=file_root_train,
                                       dataset_file=annotation_file_train,
                                       train=True, S=S)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, num_workers=4)
    print('Loaded %d train images' % len(train_dataset))

    test_dataset = VocDetectorDataset(root_img_dir=file_root_test,
                                      dataset_file=annotation_file_test,
                                      train=False, S=S)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False, num_workers=4)
    print('Loaded %d test images' % len(test_dataset))

    loss_history = train(args, net, train_loader, test_loader)

    # Persist the training-loss curve next to the script.
    plt.plot(np.squeeze(loss_history))
    plt.ylabel('loss')
    plt.xlabel('iterations')
    plt.title("Training Loss")
    plt.savefig('training_loss.png')
# print(h, w, count)
# means = mean / (1.0 * h * w * count)
# print('b, g, r = ', means)
#
# new_state_dict = resnet.state_dict()
# for k in new_state_dict.keys():
#     print(k)

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Instantiate the three networks in this fixed order, each moved to `device`.
# `resnet50` here is the bare name already in scope, as opposed to
# `models.resnet50`, which is asked for pretrained weights.
vgg = models.vgg16()
vgg = vgg.to(device)

net = resnet50()
net = net.to(device)

resnet = models.resnet50(pretrained=True)
resnet = resnet.to(device)
# summary(net,(3, 448, 448))
# print(net)

# State dicts of the pretrained reference model and of the local network.
new_state_dict = resnet.state_dict()
dd = net.state_dict()
# for k in dd.keys():
#     print(k)

# net = models.resnet50().to(device)
# # summary(net,(3,224,224))
# summary(net,(3,448,448))
# # print(net)