def test():
    import test
    pygame.init()
    tb = TextBox(text="Fishy")
    clock = pygame.time.Clock()
    group = pygame.sprite.LayeredDirty(tb, layer=0, _use_update=True)

    def testlogic(event):
        if event == None:
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            if event.key == K_BACKSPACE:
                if len(tb.text) > 0:
                    tb.text = tb.text[:-1]
                return
            if event.key >= 256:
                return
            tb.text += chr(event.key)

    def testrender(screen):
        clock.tick()
        time = clock.get_time()
        group.update(time)
        bgd = pygame.Surface((screen.get_width(), screen.get_height()))
        group.draw(screen, bgd=bgd)

    test.test(testlogic, testrender)
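# The pygame snippets in this collection hand a pair of callbacks to an external
# test.test(testlogic, testrender) harness that is not shown here. Judging only from
# how the callbacks are used (the logic callback receives each event plus a None
# "tick" call per frame and returns True to quit; the render callback receives the
# display surface), a hypothetical minimal harness consistent with that usage is:
import pygame

def test(logic, render, size=(640, 480)):
    screen = pygame.display.set_mode(size)
    running = True
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT or logic(event):
                running = False
        logic(None)      # per-frame call with no event (held keys, animation)
        render(screen)
        pygame.display.flip()
    pygame.quit()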
def startSouth(below, ordered=[]):
    "Sort Southern cities into order"
    # NOTE: the mutable default for `ordered` is shared across calls; pass a fresh list when reusing.
    jyunban = sortx(below, 'ascending')  # Decide starting city
    minami = len(jyunban)
    ordered.append(jyunban[0])
    jyunban.pop(0)
    alldistance = getdistances(ordered[0], jyunban)  # Decide last southern city
    index = alldistance.index(max(alldistance))
    furthest = jyunban[index]
    jyunban.pop(index)
    while len(ordered) < minami / 2:  # Route from start point
        decidenext(jyunban, ordered, 'south')
    test(ordered)
    second = sortx(jyunban, 'descending')  # Route from end point
    lensecond = len(second)
    gyaku = []
    gyaku.append(furthest)
    while len(gyaku) < lensecond + 1:  # Route from end point
        decidenext(second, gyaku, 'north')
    for i in range(1, len(gyaku) + 1):  # Put two routes together
        ordered.append(gyaku[-i])
    # checkintercept(ordered)
    return ordered
def run(train_file, valid_file, test_file, output_file):
    '''Run your ML algorithm on the given datasets, generate the output,
    and save it to the provided file path.

    Parameters
    ----------
    train_file: string
        the path to the training file
    valid_file: string
        the path to the validation file
    test_file: string
        the path to the testing file
    output_file: string
        the path where the output predictions are to be saved
    '''
    ## your implementation here
    # read data from input
    train_samples, word2num = train_data_prepare(train_file)
    valid_samples = test_data_prepare(valid_file, word2num, 'valid')
    # your training algorithm
    model = train(train_samples, valid_samples, word2num)
    # your prediction code
    test(test_file, output_file, word2num, model)
def main(argv):
    logger = initLogger()
    if len(argv) == 0:
        print "Missing argument. Options: init, store, list, test, get, restore"
    elif argv[0] == "init":
        init.init(archiveDir)
    elif argv[0] == "store":
        if len(argv) < 2:
            print "Usage: mybackup store <directory>"
        else:
            store.store(archiveDir, argv[1], logger)
    elif argv[0] == "list":
        if len(argv) < 2:
            listBackups.list(archiveDir)
        else:
            listBackups.list(archiveDir, argv[1])
    elif argv[0] == "get":
        if len(argv) < 2:
            print "Usage: mybackup get <pattern>"
        else:
            restore.getFile(archiveDir, argv[1])
    elif argv[0] == "restore":
        if len(argv) < 2:
            restore.restoreAll(archiveDir)
        else:
            restore.restoreAll(archiveDir, argv[1])
    elif argv[0] == "test":
        test.test(archiveDir, logger)
    else:
        print "Unknown option: " + argv[0]
def main(): print "In Main Experiment\n" # get the classnames from the directory structure directory_names = list(set(glob.glob(os.path.join("train", "*"))).difference(set(glob.glob(os.path.join("train", "*.*"))))) # get the number of rows through image count numberofImages = parseImage.gestNumberofImages(directory_names) num_rows = numberofImages # one row for each image in the training dataset # We'll rescale the images to be 25x25 maxPixel = 25 imageSize = maxPixel * maxPixel num_features = imageSize + 2 + 128 # for our ratio X = np.zeros((num_rows, num_features), dtype=float) y = np.zeros((num_rows)) # numeric class label files = [] namesClasses = list() #class name list # Get the image training data parseImage.readImage(True, namesClasses, directory_names,X, y, files) print "Training" # get test result train.train(X, y, namesClasses) print "Testing" test.test(num_rows, num_features, X, y, namesClasses = list())
def cli_main():
    if sys.version_info[0] < 3:
        print("STK needs Python 3.x to run. Check your execution path and file associations.")
        print("Your Python version is: ")
        print(sys.version)
        return
    if len(sys.argv) >= 2:
        if sys.argv[1] == "--test":
            import test
            test.test(sys.argv[2:])
        elif sys.argv[1] == "--markdown2html" and len(sys.argv) == 4:
            input_file = sys.argv[2]
            output_dir = sys.argv[3]
            parser = MDParser()
            parser.parse_file(input_file)
            exporter = HtmlExporter()
            exporter.export(parser.saga, output_dir)
        elif sys.argv[1] == "--generate_ep_card" and len(sys.argv) == 4:
            # generate_episode_card(sys.argv[2], sys.argv[3])  # TODO generate_episode_card
            print("Not implemented yet...")
        else:
            print_usage()
    else:
        print_usage()
def mainHandler(threadNum, link, deep, key, test):
    event = threading.Event()  # create an Event object; it maintains an internal flag
    event.clear()  # set the event's flag to False
    pool = threadPool(threadNum, event)  # initialize a pool of threadNum threads; the event notifies the main thread to continue
    showProgress(pool.getQueue(), deep, event)
    pool.putJob((link, deep), key)  # a job is a (link, deep) tuple; key is the keyword
    pool.wait()  # block the main thread
    if test:  # run the self-test module if requested
        import test
        test.test(key, dbFile)
def test3():
    cost = [1, 1, 1]
    eqs = [[1, 1, 0], [2, 2, 2]]
    eqB = [2, 5]

    expectedCost = [1, 1, 1]
    expectedConstraints = [[3, -2, 0], [1, 1, 0]]
    expectedThresholds = [2, 5]

    test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, equalities=eqs, eqThreshold=eqB))
def mainHandler(threadNum, link, deep, key, test):
    event = threading.Event()
    event.clear()
    pool = threadPool(threadNum, event)
    showProgress(pool.getQueue(), deep, event)
    pool.putJob((link, deep), key)
    pool.wait()
    if test:  # run the self-test module if requested
        import test
        test.test(key, dbFile)
def test():
    import test
    pygame.init()
    image = pygame.image.load("qbird.png")
    sprite = ManipulatableDirtySprite(image=image)
    clock = pygame.time.Clock()
    group = pygame.sprite.LayeredDirty(sprite, layer=0, _use_update=True)
    keysused = {}

    def testlogic(event):
        if event == None:
            time = clock.get_time()
            if K_UP in keysused and keysused[K_UP]:
                sprite.y -= 1
            if K_DOWN in keysused and keysused[K_DOWN]:
                sprite.y += 1
            if K_LEFT in keysused and keysused[K_LEFT]:
                sprite.x -= 1
            if K_RIGHT in keysused and keysused[K_RIGHT]:
                sprite.x += 1
            if K_q in keysused and keysused[K_q]:
                sprite.rotation -= 1
            if K_e in keysused and keysused[K_e]:
                sprite.rotation += 1
            if K_w in keysused and keysused[K_w]:
                sprite.ycenter -= 0.015625
            if K_s in keysused and keysused[K_s]:
                sprite.ycenter += 0.015625
            if K_a in keysused and keysused[K_a]:
                sprite.xcenter -= 0.015625
            if K_d in keysused and keysused[K_d]:
                sprite.xcenter += 0.015625
            if K_r in keysused and keysused[K_r]:
                sprite.yscale += 0.015625
            if K_f in keysused and keysused[K_f]:
                sprite.yscale -= 0.015625
            if K_t in keysused and keysused[K_t]:
                sprite.xscale += 0.015625
            if K_g in keysused and keysused[K_g]:
                sprite.xscale -= 0.015625
            if K_y in keysused and keysused[K_y]:
                sprite.opacity += 0.015625
            if K_h in keysused and keysused[K_h]:
                sprite.opacity -= 0.015625
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            keysused[event.key] = True
        if event.type == pygame.KEYUP:
            keysused[event.key] = False

    def testrender(screen):
        clock.tick()
        bgd = pygame.Surface((screen.get_width(), screen.get_height()))
        group.draw(screen, bgd=bgd)

    test.test(testlogic, testrender)
def test2():
    cost = [1, 1, 1]
    lts = [[3, -2, 0]]
    ltB = [7]
    eqs = [[1, 1, 0]]
    eqB = [2]

    expectedCost = [1, 1, 1, 0]
    expectedConstraints = [[3, -2, 0, 1], [1, 1, 0, 0]]
    expectedThresholds = [7, 2]

    test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, lessThans=lts, ltThreshold=ltB, equalities=eqs, eqThreshold=eqB))
def evalQuiz(user, data):
    f = open(join(quiz_requests, '%s.quiz-%s.%d' % (user, data['page'].split('-')[-1], int(time() * 1000))), 'w')
    f.write(str(data))
    f.close()
    path.insert(0, join(course_material, str(data['page'])))
    # test_mod = __import__(join(course_material, str(data['page']), 'test.py'))
    import test as test_mod
    test_mod.test(user, data)
    del test_mod
def trainrf(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(n_estimators=random.randint(50, 5000),
                                 criterion='gini',
                                 max_depth=random.randint(10, 1000),
                                 min_samples_split=random.randint(2, 50),
                                 min_samples_leaf=random.randint(1, 10),
                                 min_weight_fraction_leaf=random.uniform(0.0, 0.5),
                                 max_features=random.uniform(0.1, 1.0),
                                 max_leaf_nodes=random.randint(1, 10),
                                 bootstrap=False,
                                 oob_score=False,
                                 n_jobs=30,
                                 random_state=random_state,
                                 verbose=0,
                                 warm_start=True,
                                 class_weight=None)
    clf.fit(train_x, train_y)

    valid_predictions1 = clf.predict_proba(valid_x)
    test_predictions1 = clf.predict_proba(test_x)
    t1 = test(valid_y, valid_predictions1)

    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv='prefit')
    ccv.fit(valid_x, valid_y)
    valid_predictions2 = ccv.predict_proba(valid_x)
    test_predictions2 = ccv.predict_proba(test_x)
    t2 = test(valid_y, valid_predictions2)

    if t2 < t1:
        valid_predictions = valid_predictions2
        test_predictions = test_predictions2
        t = t2
    else:
        valid_predictions = valid_predictions1
        test_predictions = test_predictions1
        t = t1

    if t < 0.450:
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def testFromPost():
    cost = [1, 1, 1]
    gts = [[0, 1, 4]]
    gtB = [10]
    lts = [[3, -2, 0]]
    ltB = [7]
    eqs = [[1, 1, 0]]
    eqB = [2]

    expectedCost = [1, 1, 1, 0, 0]
    expectedConstraints = [[0, 1, 4, -1, 0], [3, -2, 0, 0, 1], [1, 1, 0, 0, 0]]
    expectedThresholds = [10, 7, 2]

    test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, gts, gtB, lts, ltB, eqs, eqB))
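# The standardForm test cases above all encode one convention: every >= row gains a
# surplus column (-1), every <= row gains a slack column (+1), equality rows are
# padded with zeros, and the objective gets zero coefficients for the new columns.
# A minimal sketch of that padding, written independently of the simplex module and
# using a hypothetical helper name (it reproduces the expected values above):
def to_standard_form(cost, gts=(), gtB=(), lts=(), ltB=(), eqs=(), eqB=()):
    rows = [list(r) for r in list(gts) + list(lts) + list(eqs)]
    thresholds = list(gtB) + list(ltB) + list(eqB)
    extra = len(gts) + len(lts)              # one new column per inequality
    for i, row in enumerate(rows):
        pad = [0] * extra
        if i < len(gts):
            pad[i] = -1                      # surplus variable for a >= row
        elif i < len(gts) + len(lts):
            pad[i] = 1                       # slack variable for a <= row
        row.extend(pad)                      # equality rows get only zeros
    return list(cost) + [0] * extra, rows, thresholds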
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                                 max_depth=29008, max_features=36, max_leaf_nodes=None,
                                 min_samples_leaf=5, min_samples_split=3,
                                 min_weight_fraction_leaf=0.0, n_estimators=4494,
                                 n_jobs=8, oob_score=False, random_state=979271,
                                 verbose=0, warm_start=False)
    clf.fit(train_x, train_y)

    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv="prefit")
    ccv.fit(valid_x, valid_y)
    valid_predictions = ccv.predict_proba(valid_x)
    test_predictions = ccv.predict_proba(test_x)

    loss = test(valid_y, valid_predictions, True)
    if loss < 0.52:
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def cal(code):
    try:
        a = gc.getData(code)
    except:
        print('{} is wrong'.format(code))
    else:
        print('{} is running'.format(code))
        if a is not None and len(a) > 60:
            global total
            total = total + 1
            a.sort_index(inplace=True)
            gc.ma(a, 'close', [5, 10, 15, 20, 25])
            MA_column = a.columns[-5:]
            a['Diff'] = 100 * ((a[MA_column].max(axis=1) - a[MA_column].min(axis=1)) / a[MA_column].max(axis=1))
            b = a[-65:-5]['Diff'].mean()
            if b < 2 and a[-65:-5]['Diff'].max() < 10:
                good.append(code)
                global can_try
                can_try = can_try + 1
                # print('{},{}'.format(a[-2:-1].index.values[0], a[-1:].index.values[0]))
                diff = test.test(code, a[-6:-5].index.values[0], a[-1:].index.values[0])
                diff_list.append(diff)
                if diff > 0:
                    global test_try
                    test_try = test_try + 1
def k_result(k):
    train_k = random.sample(train_set, k)
    scp_k = os.path.join(tempdir, 'scp_k')
    with open(scp_k, 'w') as f:
        f.writelines(train_k)
    final_dir = train(outdir, config, scp_k, proto, htk_dict, words_mlf, monophones, tempdir)
    return test(outdir, final_dir, wdnet, htk_dict, monophones, scp_test, words_mlf, tempdir)
def run():
    args = parse_args()
    params = parser.Yaml(file_name=args.params)
    env = rave.Environment()
    env.SetViewer("qtcoin")
    env.Load(params.scene)
    env.UpdatePublishedBodies()
    robot = env.GetRobots()[0]
    time.sleep(0.1)  # give time for environment to update
    navi = navigation.Navigation(robot, params, verbose=args.verbose)
    if args.test:
        test.test(navi)
    else:
        navi.run()
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'wsgi/vhost1/template')
    if environ['PATH_INFO'] == '/vhost1':
        if environ['REQUEST_METHOD'] == 'POST':
            return publish.post(environ, template)
        return publish.edit(environ, template)
    elif environ['PATH_INFO'] == '/vhost/pub':
        return publish.pub(environ)
    else:
        return test.test(environ, template)
def run(optim):
    progress = make_progressbar("Training with " + str(optim), 5)
    progress.start()

    model = net()
    model.training()
    for epoch in range(5):
        train(Xtrain, ytrain, model, optim, criterion, batch_size, "train")
        train(Xtrain, ytrain, model, optim, criterion, batch_size, "stats")
        progress.update(epoch + 1)
    progress.finish()

    model.evaluate()
    nll, _ = test(Xtrain, ytrain, model, batch_size)
    _, nerr = test(Xval, yval, model, batch_size)

    print("Trainset NLL: {:.2f}".format(nll))
    print("Testset errors: {}".format(nerr))
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'test/template')
    if environ['PATH_INFO'] == '/test/work':
        return test.work()
    elif environ['PATH_INFO'] == '/test/html':
        return test.html(environ, template)
    elif environ['PATH_INFO'] == '/test2':
        return test2.test2()
    else:
        return test.test()
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    if not os.path.exists(FLAGS.sample_dir):
        os.makedirs(FLAGS.sample_dir)

    random.seed(31241)
    np.random.seed(41982)
    tf.set_random_seed(1327634)

    color = True  # Must change this and the dataset Flags to the correct path to use color
    if FLAGS.is_debug:
        reader = Bouncing_Balls_Data_Reader(FLAGS.dataset, FLAGS.batch_size, color=color,
                                            train_size=160 * 5, validation_size=8 * 5,
                                            test_size=8 * 5, num_partitions=5)
    else:
        reader = Bouncing_Balls_Data_Reader(FLAGS.dataset, FLAGS.batch_size, color=color)
    data_fn = lambda epoch, batch_index: reader.read_data(batch_index, reader.TRAIN)
    frame_shape = reader.read_data(0, reader.TRAIN).shape[2:]
    print("Frame shape: ", frame_shape)
    num_batches = reader.num_batches(reader.TRAIN)
    print("Num batches: %d" % num_batches)

    input_sequence_range = range(5, 16)
    print("Input sequence range min: %d, max: %d" % (min(input_sequence_range), max(input_sequence_range)))
    save_sample_fn = utils.gen_save_sample_fn(FLAGS.sample_dir, image_prefix="train")

    with tf.Session() as sess:
        pgn = PGN(sess, FLAGS.dataset_name, FLAGS.epoch, num_batches, FLAGS.batch_size,
                  input_sequence_range, data_fn, frame_shape=frame_shape,
                  save_sample_fn=save_sample_fn, checkpoint_dir=FLAGS.checkpoint_dir,
                  lambda_adv_loss=FLAGS.lambda_adv_loss)

        if FLAGS.is_train:
            pgn.train()
        else:
            print("Loading from: %s" % (FLAGS.checkpoint_dir,))
            if pgn.load(FLAGS.checkpoint_dir):
                print(" [*] Successfully loaded")
            else:
                print(" [!] Load failed")

        if FLAGS.is_test:
            result = test.test(pgn, reader)
            result_str = pp.pformat(result)
            fid = open(os.path.join(FLAGS.sample_dir, 'test_out.txt'), mode='w')
            fid.write(unicode(result_str))
            fid.close()

        if FLAGS.is_visualize:
            for i in range(3):
                vid_seq = reader.read_data(i, data_set_type=reader.TEST, batch_size=1)[:, 0, :, :, :]
                utils.make_prediction_gif(pgn, os.path.join(FLAGS.sample_dir, 'vis_%d.gif' % i), video_sequence=vid_seq)
            utils.plot_convergence(pgn.get_MSE_history(), "MSE Convergence",
                                   path=os.path.join(FLAGS.sample_dir, "vis_MSE_convergence.png"))
def call_script(self):
    # Disable all the buttons
    self.button.config(state='disabled')
    self.slogan.config(state='disabled')

    # Perform the task
    message = test.test()
    self.message['text'] = message
    time.sleep(0.5)

    # Enable all the buttons
    self.button.config(state='active')
    self.slogan.config(state='active')
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)

    # normalization
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    valid_x = scaler.transform(valid_x)
    test_x = scaler.transform(test_x)

    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                             fit_intercept=True, intercept_scaling=1,
                             class_weight=None, random_state=None,
                             solver='lbfgs', max_iter=1000,
                             multi_class='ovr', verbose=True)
    clf.fit(train_x, train_y)

    valid_predictions = clf.predict_proba(valid_x)
    test(valid_y, valid_predictions)

    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv='prefit')
    ccv.fit(train_x, train_y)
    valid_predictions = ccv.predict_proba(valid_x)
    test(valid_y, valid_predictions)

    test_predictions = ccv.predict_proba(test_x)
    data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
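# trainrf and the two train functions above share the same calibration pattern: fit
# the base classifier first, then wrap it in CalibratedClassifierCV(cv='prefit') and
# fit the wrapper in a second step (ideally on a held-out split, as trainrf does with
# valid_x, so the sigmoid is not tuned on data the base model has already seen).
# A self-contained sketch on synthetic data, assuming the older scikit-learn API with
# base_estimator=... and cv='prefit' used in these snippets:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_train, y_train)
raw_loss = log_loss(y_valid, clf.predict_proba(X_valid))

ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv="prefit")
ccv.fit(X_valid, y_valid)                     # calibrate on the held-out split
cal_loss = log_loss(y_valid, ccv.predict_proba(X_valid))
print("raw log-loss: %.4f  calibrated log-loss: %.4f" % (raw_loss, cal_loss))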
def test():
    import test
    import Bar
    pygame.init()
    bar = Bar.Bar(12340, 12345)
    bar2 = Bar.Bar(120, 135, color=(0, 0, 255, 255))
    bar2.rect.top = 30
    clock = pygame.time.Clock()
    window = LayeredDirtySprite(width=120, height=100)
    window.add(bar)
    window.add(bar2)
    window.x = 100
    window.y = 200
    window.ycenter = -2.0
    group = pygame.sprite.LayeredDirty(window, layer=0, _use_update=True)
    bgd = []

    def testlogic(event):
        if event == None:
            window.rotation += .01
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            if event.key >= 256:
                return
            import random
            if event.mod & 4095 == 0:
                bar.actual -= min((random.randint(5, 500), bar.actual))
            else:
                bar.actual += min((random.randint(5, 500), bar._max - bar.actual))

    def testrender(screen):
        clock.tick()
        time = clock.get_time()
        group.update(time)
        if len(bgd) == 0:
            bgd.append(pygame.Surface((screen.get_width(), screen.get_height())))
            bgd[0].fill((0, 255, 0))
            group.clear(screen, bgd[0])
        group.draw(screen)

    test.test(testlogic, testrender)
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'wsgi/test/template')
    if environ['PATH_INFO'] == '/test/vhost':
        return test.vhost(environ, template)
    elif environ['PATH_INFO'] == '/test/post':
        return test.post(environ, template)
    elif environ['PATH_INFO'] == '/ws':
        return test.ws(environ, template)
    elif environ['PATH_INFO'] == '/foobar/':
        return test.foobar(environ, template)
    else:
        return test.test(environ, template)
def Prune(fileName, tree, Train, Validate, Predict, testFile, NeedPrune):
    csv = csvRead(fileName)
    columnCsv = []
    modeList = []
    traininfo = []
    # print len(csv[1][0])
    for num in range(len(csv[1][0])):
        # columnCsv.append(columnCreate(csv[1], num))
        columnCsv = columnCreate(csv[1], num)
        categoryOrnumeric(columnCsv, csv, num, modeList, traininfo)
    myTree = createTree(csv[1], csv[0], modeList)
    if NeedPrune == True:
        if Validate:
            test.test(testFile, myTree, traininfo)
        if Predict:
            output.test(testFile, myTree, traininfo)
    # f = open("tree.txt", "w")
    # output.test('btest.csv', myTree, traininfo)
    # drawTree(myTree, 0, f)
    # plotting.accurancyGet("bvalidate.csv", myTree, traininfo)
    return myTree, csv[1], traininfo
def run_for_values_comb(filename, result_file):
    print 'Data set: {}\n'.format(filename)
    for depth, trees, repl, sam_size, fold, dis in [(depth, trees, repl, sam_size, fold, dis)
                                                    for depth in depth_values
                                                    for trees in trees_values
                                                    for repl in replacement_values
                                                    for sam_size in sample_size_values
                                                    for fold in folds_values
                                                    for dis in discretization_values]:
        result = test(os.path.join(sample_data_dir, filename), depth, trees, repl, sam_size, fold, 0, dis)
        print '{} for: Depth: {}, tree no: {}, replacement: {}, sample size: {}, folds no: {}, discretization: {}' \
            .format(result, depth, trees, repl, sam_size, fold, dis)
        result_file.write('{},{},{},{},{},{},{},{}\n'
                          .format(filename, depth, trees, repl, sam_size, fold, dis, result))
def main(passed_args):
    args = init.getArgs(passed_args)

    # If auth arg is present, use it for 'get' calls.
    # Create the requests session to use for later 'get' calls.
    session = requests.session()

    # If there is an 'auth' arg, post the session the correct login info for the session
    if 'custom-auth' in args.keys():
        if args['custom-auth'] == 'dvwa':
            payload = {'password': '******', 'username': '******'}
            response = session.post('http://127.0.0.1/dvwa/login.php', data=payload)
            session.cookies = response.cookies

    # Call the appropriate function using the mode param.
    if args['mode'] == 'discover':
        discovered = discover.discover(args, session)
        outputResults(discovered)
    elif args['mode'] == 'test':
        test.test(args, session)
    else:
        # This else is somewhat redundant. Mode input is verified in getArgs.
        print('Not a valid <mode> param. Must be \'discover\' or \'test\'')
        sys.exit(0)
def full_test(name, query, train_set, test_set, method):
    '''Performs a full automated test.

    @param name: test and model name, arbitrary
    @param query: question to train on
    @param train_set: name of the training set of games
    @param test_set: name of testing set of games
    @param method: feature extraction method
    '''
    path = 'games/{}'.format(train_set)
    begin = time.time()
    dts.build_and_dump(name, query, method=method, path=path)
    trn.train_and_save(name, 'logistic_regression')
    print time.time() - begin
    return tst.test(name, test_set, query, method)
def train(): cfg = opt.cfg data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 64 # (pixels) grid size assert math.fmod( imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = grid_min * gs, grid_max * gs img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.png') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
load_darknet_weights(model, weights) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://github.com/ultralytics/yolov3/issues/238 lf = lambda x: ( ((1 + math.cos(x * math.pi / epochs)) / 2 )**1.0) * 0.95 + 0.05 # cosine https://arxiv.org/pdf/1812.01187.pdf scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # scheduler = lr_scheduler.MultiStepLR(optimizer, [round(epochs * x) for x in [0.8, 0.9]], 0.1, start_epoch - 1) # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Model EMA ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Burn-in if ni <= n_burn * 2: model.gr = np.interp( ni, [0, n_burn * 2], [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) if ni == n_burn: # burnin complete print_model_biases(model) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, [0, n_burn], [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, [0, n_burn], [0.9, hyp['momentum']]) # Multi-Scale training if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize accumulated gradient if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot images with bounding boxes if ni < 1: f = 'train_batch%g.png' % i # filename plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data'] ]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, img_size=imgsz_test, model=ema.ema, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader) # Write 
epoch results with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Write Tensorboard results if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) # Save best checkpoint if (best_fitness == fi) and not final_epoch: torch.save(chkpt, best) # Save backup every 10 epochs (optional) # if epoch > 0 and epoch % 10 == 0: # torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
def train( cfg, data_cfg, weights_from="", weights_to="", save_every=10, img_size=(1088, 608), resume=False, epochs=100, batch_size=16, accumulated_batches=1, freeze_backbone=False, opt=None, ): # The function starts timme = strftime("%Y-%d-%m %H:%M:%S", gmtime()) timme = timme[5:-3].replace('-', '_') timme = timme.replace(' ', '_') timme = timme.replace(':', '_') weights_to = osp.join(weights_to, 'run' + timme) mkdir_if_missing(weights_to) if resume: latest_resume = osp.join(weights_from, 'latest.pt') torch.backends.cudnn.benchmark = True # unsuitable for multiscale # Configure run f = open(data_cfg) data_config = json.load(f) trainset_paths = data_config['train'] dataset_root = data_config['root'] f.close() transforms = T.Compose([T.ToTensor()]) # Get dataloader dataset = JointDataset(dataset_root, trainset_paths, img_size, augment=True, transforms=transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn) # Initialize model model = Darknet(cfg, dataset.nID) # model = ResNet(cfg, dataset.nID) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 if resume: checkpoint = torch.load(latest_resume, map_location='cpu') # Load weights to resume from model.load_state_dict(checkpoint['model']) model.cuda().train() # Set optimizer optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=opt.lr, momentum=.9) start_epoch = checkpoint['epoch'] + 1 if checkpoint['optimizer'] is not None: optimizer.load_state_dict(checkpoint['optimizer']) del checkpoint # current, saved else: # Initialize model with backbone (optional) if cfg.endswith('yolov3.cfg'): load_darknet_weights(model, osp.join(weights_from, 'darknet53.conv.74')) cutoff = 75 elif cfg.endswith('yolov3-tiny.cfg'): load_darknet_weights(model, osp.join(weights_from, 'yolov3-tiny.conv.15')) cutoff = 15 model.cuda().train() # Set optimizer optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=opt.lr, momentum=.9, weight_decay=1e-4) model = torch.nn.DataParallel(model) # Set scheduler scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[int(0.5 * opt.epochs), int(0.75 * opt.epochs)], gamma=0.1) # An important trick for detection: freeze bn during fine-tuning if not opt.unfreeze_bn: for i, (name, p) in enumerate(model.named_parameters()): p.requires_grad = False if 'batch_norm' in name else True # model_info(model) t0 = time.time() for epoch in range(epochs): epoch += start_epoch logger.info( ('%8s%12s' + '%10s' * 6) % ('Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time')) # Freeze darknet53.conv.74 for first epoch if freeze_backbone and (epoch < 2): for i, (name, p) in enumerate(model.named_parameters()): if int(name.split('.')[2]) < cutoff: # if layer < 75 p.requires_grad = False if (epoch == 0) else True ui = -1 rloss = defaultdict(float) # running loss optimizer.zero_grad() for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader): if sum([len(x) for x in targets]) < 1: # if no targets continue continue # SGD burn-in burnin = min(1000, len(dataloader)) if (epoch == 0) & (i <= burnin): lr = opt.lr * (i / burnin)**4 for g in optimizer.param_groups: g['lr'] = lr # Compute loss, compute gradient, update parameters loss, components = model(imgs.cuda(), targets.cuda(), targets_len.cuda()) components = torch.mean(components.view(-1, 5), dim=0) loss = torch.mean(loss) loss.backward() # accumulate gradient for x batches before 
optimizing if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader) - 1): optimizer.step() optimizer.zero_grad() # Running epoch-means of tracked metrics ui += 1 for ii, key in enumerate(model.module.loss_names): rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1) # rloss indicates running loss values with mean updated at every epoch s = ('%8s%12s' + '%10.3g' * 6) % ( '%g/%g' % (epoch, epochs - 1), '%g/%g' % (i, len(dataloader) - 1), rloss['box'], rloss['conf'], rloss['id'], rloss['loss'], rloss['nT'], time.time() - t0) t0 = time.time() if i % opt.print_interval == 0: logger.info(s) # Save latest checkpoint checkpoint = { 'epoch': epoch, 'model': model.module.state_dict(), 'optimizer': optimizer.state_dict() } copyfile(cfg, weights_to + '/cfg/yolo3.cfg') copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json') latest = osp.join(weights_to, 'latest.pt') torch.save(checkpoint, latest) if epoch % save_every == 0 and epoch != 0: # making the checkpoint lite checkpoint["optimizer"] = [] torch.save( checkpoint, osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt")) # Calculate mAP if epoch % opt.test_interval == 0: with torch.no_grad(): mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) # Call scheduler.step() after opimizer.step() with pytorch > 1.1.0 scheduler.step()
def testare(self):
    self.assertTrue(test(5, 4))
    self.assertFalse(test(3, 4))
def train(): cfg = opt.cfg data = opt.data img_size, img_size_test = opt.img_size if len( opt.img_size) == 2 else opt.img_size * 2 # train, test sizes epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights # Initialize init_seeds() if opt.multi_scale: img_sz_min = round(img_size / 32 / 1.5) img_sz_max = round(img_size / 32 * 1.5) img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes # Remove previous results for f in glob.glob('*_batch*.png') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg, arc=opt.arc).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) del pg0, pg1, pg2 # https://github.com/alphadl/lookahead.pytorch # optimizer = torch_utils.Lookahead(optimizer, k=5, alpha=0.5) start_epoch = 0 best_fitness = 0.0 attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
load_darknet_weights(model, weights) # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8) # gradual fall to 0.1*lr0 scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count() > 1: dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_labels=True, cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, img_size_test, batch_size * 2, hyp=hyp, rect=True, cache_labels=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size * 2, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Start training nb = len(dataloader) prebias = start_epoch == 0 model.nc = nc # attach number of classes to model model.arc = opt.arc # attach yolo architecture model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() torch_utils.model_info(model, report='summary') # 'full' or 'summary' print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) for epoch in range(start_epoch, epochs): # epoch ------------------------------ model.train() # Prebias if prebias: if epoch < 3: # prebias ps = 0.1, 0.9 # prebias settings (lr=0.1, momentum=0.9) else: # normal training ps = hyp['lr0'], hyp['momentum'] # normal training settings print_model_biases(model) prebias = False # Bias optimizer settings optimizer.param_groups[2]['lr'] = ps[0] if optimizer.param_groups[2].get( 'momentum') is not None: # for SGD but not Adam optimizer.param_groups[2]['momentum'] = ps[1] # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Multi-Scale training if opt.multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes if ni == 0: fname = 'train_batch%g.png' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model, not prebias) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available( ) else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # end batch ------------------------------------------------------------------------------------------------ # Process epoch results final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 
'coco2017.data'] ]) and model.nc == 80 results, maps = test.test( cfg, data, batch_size=batch_size * 2, img_size=img_size_test, model=model, conf_thres=0.001 if final_epoch and is_coco else 0.1, # 0.1 for speed iou_thres=0.6, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader) # Update scheduler scheduler.step() # Write epoch results with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = [ 'GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) # Save best checkpoint if best_fitness == fi: torch.save(chkpt, best) # Save backup every 10 epochs (optional) # if epoch > 0 and epoch % 10 == 0: # torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n os.rename('results.txt', fresults) os.rename(wdir + 'last.pt', wdir + flast) if os.path.exists(wdir + 'last.pt') else None os.rename(wdir + 'best.pt', wdir + fbest) if os.path.exists(wdir + 'best.pt') else None if opt.bucket: # save to cloud os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket)) os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket)) # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket)) if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
def do_experiment(model_config): tf.reset_default_graph() experiment_id = ex.current_run._id print('Experiment ID: {eid}'.format(eid=experiment_id)) # Prepare data print('Preparing dataset') train_data, val_data, test_data = dataset.prepare_datasets(model_config) print('Dataset ready') # Start session tf_config = tf.ConfigProto() #tf_config.gpu_options.allow_growth = True tf_config.gpu_options.visible_device_list = str(model_config['GPU']) sess = tf.Session(config=tf_config) #sess = tf.Session() #sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type="readline") print('Session started') # Create data iterators handle = tf.placeholder(tf.string, shape=[]) iterator = tf.data.Iterator.from_string_handle(handle, train_data.output_types, train_data.output_shapes) mixed_spec, voice_spec, background_spec, mixed_audio, voice_audio, background_audio = iterator.get_next( ) training_iterator = train_data.make_initializable_iterator() validation_iterator = val_data.make_initializable_iterator() testing_iterator = test_data.make_initializable_iterator() training_handle = sess.run(training_iterator.string_handle()) validation_handle = sess.run(validation_iterator.string_handle()) testing_handle = sess.run(testing_iterator.string_handle()) print('Iterators created') # Create variable placeholders and model is_training = tf.placeholder(shape=(), dtype=bool) mixed_phase = tf.expand_dims(mixed_spec[:, :, :-1, 3], 3) print('Creating model') # Restructure data from pipeline based on data type required if model_config['data_type'] == 'mag': mixed_input = tf.expand_dims(mixed_spec[:, :, :-1, 2], 3) voice_input = tf.expand_dims(voice_spec[:, :, :-1, 2], 3) elif model_config['data_type'] in ['mag_phase', 'mag_phase_diff']: mixed_input = mixed_spec[:, :, :-1, 2:4] voice_input = voice_spec[:, :, :-1, 2:4] elif model_config['data_type'] == 'real_imag': mixed_input = mixed_spec[:, :, :-1, 0:2] voice_input = voice_spec[:, :, :-1, 0:2] elif model_config['data_type'] in ['mag_real_imag', 'mag_phase2']: mixed_input = tf.concat([ tf.expand_dims(mixed_spec[:, :, :-1, 2], 3), mixed_spec[:, :, :-1, 0:2] ], 3) voice_input = tf.concat([ tf.expand_dims(voice_spec[:, :, :-1, 2], 3), voice_spec[:, :, :-1, 0:2] ], 3) elif model_config['data_type'] in [ 'mag_phase_real_imag', 'complex_to_mag_phase' ]: mixed_input = mixed_spec[:, :, :-1, :] voice_input = voice_spec[:, :, :-1, :] model = audio_models.MagnitudeModel(mixed_input, voice_input, mixed_phase, mixed_audio, voice_audio, background_audio, model_config['model_variant'], is_training, model_config['learning_rate'], model_config['data_type'], model_config['phase_weight'], name='Magnitude_Model') sess.run(tf.global_variables_initializer()) if model_config['loading']: print('Loading checkpoint') checkpoint = os.path.join(model_config['model_base_dir'], model_config['checkpoint_to_load']) restorer = tf.train.Saver() restorer.restore(sess, checkpoint) # Summaries model_folder = str(experiment_id) writer = tf.summary.FileWriter(os.path.join(model_config["log_dir"], model_folder), graph=sess.graph) # Get baseline metrics at initialisation test_count = 0 if model_config['initialisation_test']: print('Running initialisation test') initial_test_loss, test_count = test(sess, model, model_config, handle, testing_iterator, testing_handle, test_count, experiment_id) # Train the model model = train(sess, model, model_config, model_folder, handle, training_iterator, training_handle, validation_iterator, validation_handle, writer) # Test the trained model mean_test_loss, test_count = 
test(sess, model, model_config, handle, testing_iterator, testing_handle, test_count, experiment_id) print('{ts}:\n\tAll done with experiment {exid}!'.format( ts=datetime.datetime.now(), exid=experiment_id)) if model_config['initialisation_test']: print('\tInitial test loss: {init}'.format(init=initial_test_loss)) print('\tFinal test loss: {final}'.format(final=mean_test_loss))
model = Darknet(opt.model_def).to(device)

if opt.model:
    if opt.model.endswith(".pt"):
        model.load_state_dict(torch.load(opt.model, map_location=device)['model'])
    else:
        _ = load_darknet_weights(model, opt.model)

data_config = parse_data_cfg(opt.data_config)
valid_path = data_config["valid"]
class_names = load_classes(data_config["names"])

eval_model = lambda model: test(model=model, cfg=opt.model_def, data=opt.data_config)
obtain_num_parameters = lambda model: sum([param.nelement() for param in model.parameters()])

# This must not be commented out; it will be needed again later.
with torch.no_grad():
    origin_model_metric = eval_model(model)
origin_nparameters = obtain_num_parameters(model)

'''
module_defs is a list whose items are dicts; it only stores the network structure
information, which does not take effect by itself, e.g.
{'type': 'convolutional', 'batch_normalize': '1', 'filters': '32', 'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}
module_list is a list whose items are module containers, for example:
Sequential(
  (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
batches = datamanager.batch_iter(train, FLAGS.batch_size, FLAGS.num_epochs)
num_batches_per_epoch = int(len(train) / FLAGS.batch_size) + 1
print("Batch data")

# Training loop. For each batch...
num_batch = 1
num_epoch = 1
dev_x_batch = datamanager.generate_x(dev)
dev_p1_batch, dev_p2_batch = datamanager.generate_p(dev)
dev_y_batch = datamanager.generate_y(dev)
for batch in batches:
    if num_batch == num_batches_per_epoch:
        num_epoch += 1
        num_batch = 1
        test(testing_data, cnn.input_x, cnn.input_p1, cnn.input_p2, cnn.scores,
             cnn.predictions, cnn.dropout_keep_prob, datamanager, sess, num_epoch)
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
    num_batch += 1
    x_batch = datamanager.generate_x(batch)
    p1_batch, p2_batch = datamanager.generate_p(batch)
    y_batch = datamanager.generate_y(batch)
    loss = train_step(x_batch, y_batch, p1_batch, p2_batch)
    current_step = tf.train.global_step(sess, global_step)
    if current_step % FLAGS.evaluate_every == 0:
        print("Num_batch: {}".format(num_batch))
        print("Num_epoch: {}".format(num_epoch))
        dev_step(dev_x_batch, dev_y_batch, dev_p1_batch, dev_p2_batch)
def main_finetune_channels(): # 波段选择依据 # 最优的波段排序: # [850, 870, 930, 730, 790, 910, 770, 750, 670, 950, 990, 830, 890, 810, 970, 690, 710, 650, 590, 570, 630, 610, 550] # 依次增加一个波段, 前一个模型进行微调 CHANNEL_SORT = [ 850, 870, 930, 730, 790, 910, 770, 750, 670, 950, 990, 830, 890, 810, 970, 690, 710, 650, 590, 570, 630, 610, 550 ] for splitidx in range(4, 5): usedChannelsList = [CHANNEL_SORT[:i + 1] for i in range(23)] # for i_usedChannels in range(len(usedChannelsList)): for i_usedChannels in [4, 6]: usedChannels = usedChannelsList[i_usedChannels] print(getTime(), splitidx, len(usedChannels), '...') configer = EasyDict() configer.dsize = (64, 64) configer.datatype = 'Multi' configer.n_epoch = 300 configer.lrbase = 0.001 configer.n_channel = 23 configer.n_class = 63 configer.batchsize = 32 configer.stepsize = 250 configer.gamma = 0.2 configer.cuda = True configer.splitmode = 'split_{}x{}_{}'.format( configer.dsize[0], configer.dsize[1], splitidx) configer.modelbase = 'recognize_vgg11_bn' configer.usedChannels = usedChannels configer.n_usedChannels = len(configer.usedChannels) configer.modelname = '{}_{}_{}_finetune'.\ format(configer.modelbase, configer.splitmode, '_'.join(list(map(str, configer.usedChannels)))) configer.datapath = '/home/louishsu/Work/Workspace/ECUST2019_{}x{}'.\ format(configer.dsize[0], configer.dsize[1]) configer.logspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/logs/{}_{}_{}subjects_logs'.\ format(configer.modelbase, configer.splitmode, configer.n_class) configer.mdlspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/modelfiles/{}_{}_{}subjects_models'.\ format(configer.modelbase, configer.splitmode, configer.n_class) ## datasets trainset = RecognizeDataset(configer.datapath, configer.datatype, configer.splitmode, 'train', configer.usedChannels) validset = RecognizeDataset(configer.datapath, configer.datatype, configer.splitmode, 'valid', configer.usedChannels) trainloader = DataLoader(trainset, configer.batchsize, shuffle=True) validloader = DataLoader(validset, configer.batchsize, shuffle=False) ## ============================================================================================ ## model modelpath = os.path.join(configer.mdlspath, configer.modelname) + '.pkl' modeldir = '/'.join(modelpath.split('/')[:-1]) if not os.path.exists(modeldir): os.makedirs(modeldir) if i_usedChannels == 0: model = modeldict[configer.modelbase](configer.n_usedChannels, configer.n_class, configer.dsize[0]) params = model.parameters() torch.save(model, modelpath) else: modelpath_pretrain = os.path.join( modeldir, '{}_{}_{}_finetune.pkl'.format( configer.modelbase, configer.splitmode, '_'.join( list(map(str, usedChannelsList[i_usedChannels - 1]))))) model = torch.load(modelpath_pretrain) model.features[0] = nn.Conv2d(len(usedChannels), 64, 3, stride=1, padding=1) params = [{ 'params': model.features[1:].parameters(), 'lr': configer.lrbase * 0.01, }, { 'params': model.features[0].parameters(), }] torch.save(model, modelpath) if configer.cuda and is_available(): model.cuda() ## ============================================================================================ ## optimizer optimizer = optim.Adam(params, configer.lrbase, weight_decay=1e-3) ## loss loss = nn.CrossEntropyLoss() ## learning rate scheduler scheduler = lr_scheduler.StepLR(optimizer, configer.stepsize, configer.gamma) ## log logpath = os.path.join(configer.logspath, configer.modelname) if not os.path.exists(logpath): os.makedirs(logpath) logger = SummaryWriter(logpath) ## initialize acc_train = 0. acc_valid = 0. 
loss_train = float('inf') loss_valid = float('inf') loss_valid_last = float('inf') ## start training for i_epoch in range(configer.n_epoch): if configer.cuda and is_available(): empty_cache() scheduler.step(i_epoch) acc_train = [] acc_valid = [] loss_train = [] loss_valid = [] model.train() for i_batch, (X, y) in enumerate(trainloader): # get batch X = Variable(X.float()) y = Variable(y) if configer.cuda and is_available(): X = X.cuda() y = y.cuda() # forward y_pred_prob = model(X) loss_i = loss(y_pred_prob, y) acc_i = accuracy(y_pred_prob, y) # backward optimizer.zero_grad() loss_i.backward() optimizer.step() loss_train += [loss_i.detach().cpu().numpy()] acc_train += [acc_i.cpu().numpy()] model.eval() for i_batch, (X, y) in enumerate(validloader): # get batch X = Variable(X.float()) y = Variable(y) if configer.cuda and is_available(): X = X.cuda() y = y.cuda() # forward y_pred_prob = model(X) loss_i = loss(y_pred_prob, y) acc_i = accuracy(y_pred_prob, y) loss_valid += [loss_i.detach().cpu().numpy()] acc_valid += [acc_i.cpu().numpy()] loss_train = np.mean(np.array(loss_train)) acc_train = np.mean(np.array(acc_train)) loss_valid = np.mean(np.array(loss_valid)) acc_valid = np.mean(np.array(acc_valid)) logger.add_scalars('accuracy', { 'train': acc_train, 'valid': acc_valid }, i_epoch) logger.add_scalars('logloss', { 'train': loss_train, 'valid': loss_valid }, i_epoch) logger.add_scalar('lr', scheduler.get_lr()[-1], i_epoch) if loss_valid_last > loss_valid: loss_valid_last = loss_valid torch.save(model, modelpath) test(configer)
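The loop above averages a per-batch accuracy(y_pred_prob, y) helper that is imported elsewhere in the project and not shown here. A minimal top-1 accuracy helper with that interface might look like the sketch below (an illustrative stand-in, not the project's own implementation):

import torch

def accuracy(y_pred_prob, y):
    """Top-1 accuracy for one batch: y_pred_prob is an (N, n_class) score tensor,
    y is an (N,) tensor of integer class labels. Returns a 0-dim tensor so the
    caller's .cpu().numpy() works."""
    pred = y_pred_prob.argmax(dim=1)       # predicted class per sample
    return (pred == y).float().mean().detach()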
def train(
        cfg,
        data_cfg,
        img_size=416,
        resume=False,  # whether to load previously trained weights
        epochs=273,  # how many passes over the dataset
        batch_size=16,
        accumulate=1,  # accumulate gradients over `accumulate` batches before stepping the optimizer
        multi_scale=False,  # whether to use multi-scale training
        freeze_backbone=False,  # freeze the backbone network
        transfer=False  # whether to do transfer learning (train only YOLO layers)
):
    init_seeds()  # initialize random seeds, including numpy's and torch's
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()
    if multi_scale:
        img_size = 608  # initiate with maximum multi_scale size
        opt.num_workers = 0  # bug https://github.com/ultralytics/yolov3/issues/174
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    data_dict = parse_data_cfg(data_cfg)  # parse voc.data or coco.data
    train_path = data_dict['train']
    nc = int(data_dict['classes'])  # nc = number of classes

    # Initialize model (the full darknet model)
    model = Darknet(cfg, img_size).to(device)

    # SGD, the most commonly used optimizer in PyTorch
    optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'],
                          weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255), 18 = (class+5) * 3

    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            model.load_state_dict({k: v for k, v in chkpt['model'].items()
                                   if v.numel() > 1 and v.shape[0] != 255}, strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False
        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])
            start_epoch = chkpt['epoch'] + 1
            if chkpt['optimizer'] is not None:
                optimizer.load_state_dict(chkpt['optimizer'])
                best_loss = chkpt['best_loss']
            del chkpt
    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[218, 245], gamma=0.1, last_epoch=start_epoch-1)

    # Plot lr schedule (note: stepping the scheduler here also advances it before training starts)
    y = []
    for _ in range(epochs):
        scheduler.step()
        y.append(optimizer.param_groups[0]['lr'])
    plt.plot(y, label='LambdaLR')
    plt.xlabel('epoch')
    plt.ylabel('LR')
    plt.tight_layout()
    plt.savefig('LR.png', dpi=300)

    # Dataset
    img_size = 608
    dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, rect=False, image_weights=False)

    # Initialize distributed training (multi-GPU)
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend, init_method=opt.dist_url,
                                world_size=opt.world_size, rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)

    # Dataloader (data loading is fairly involved; read it alongside the DataLoader API)
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=opt.num_workers,
                            shuffle=False,  # disable rectangular training if True
                            pin_memory=True,
                            collate_fn=dataset.collate_fn)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Start training
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights (inverse class-frequency weights computed from the dataset labels)
    model_info(model, 'full')
    nb = len(dataloader)  # number of batches per epoch
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    # for f in glob.glob('train_batch*.jpg') + glob.glob('test_batch*.jpg'):
    #     os.remove(f)
    t, t0 = time.time(), time.time()
    for epoch in range(start_epoch, epochs):
        model.train()  # switch the model to train mode
        print(('\n%8s%12s' + '%10s' * 7) %
              ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        w = model.class_weights.cpu().numpy() * (1 - maps)  # class weights
        image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)  # per-image sampling weights: images containing low-mAP classes are sampled more often
        dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # random weighted index (resampled dataset indices)

        mloss = torch.zeros(5).to(device)  # mean losses, initialized to zero; tracks xy, wh, conf, cls, total
        for i, (imgs, targets, _, _) in enumerate(dataloader):  # the dataset returns (torch.from_numpy(img), labels_out, img_path, (h, w))
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)  # number of targets

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run the model: images in, prediction tensors out
            pred = model(imgs)

            # Compute the loss between predictions and labels (this is the loss computation from the YOLO paper)
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:  # mixed precision
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Print batch results
            s = ('%8s%12s' + '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1),
                                              '%g/%g' % (i, nb - 1), *mloss, nt, time.time() - t)
            t = time.time()
            print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % dataset.img_size)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or (opt.nosave and epoch < 10)) or epoch == epochs - 1:
            with torch.no_grad():
                results, maps = test.test(cfg, data_cfg, batch_size=batch_size,
                                          img_size=img_size, model=model, conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results + '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = (not opt.nosave) or (epoch == epochs - 1)
        if save:
            # Create checkpoint
            chkpt = {'epoch': epoch,
                     'best_loss': best_loss,
                     'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                     'optimizer': optimizer.state_dict()}

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    dt = (time.time() - t0) / 3600
    print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, dt))
    return results
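The loop above only steps the optimizer every `accumulate` batches, so the effective batch size is batch_size * accumulate. Stripped of the YOLO specifics, the pattern is just the generic sketch below (dividing the loss by `accumulate` is one common convention; the scripts in this document instead scale by a nominal batch size or not at all):

import torch

def train_accumulated(model, loader, optimizer, loss_fn, accumulate=4):
    """Generic gradient accumulation: backward() on every batch, optimizer.step()
    every `accumulate` batches and once more on the final batch of the epoch."""
    nb = len(loader)
    optimizer.zero_grad()
    for i, (x, y) in enumerate(loader):
        loss = loss_fn(model(x), y) / accumulate  # scale so summed grads match one large batch
        loss.backward()
        if (i + 1) % accumulate == 0 or (i + 1) == nb:
            optimizer.step()
            optimizer.zero_grad()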
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info( colorstr("hyperparameters: ") + ", ".join(f"{k}={v}" for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = ( Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank, ) # Directories wdir = save_dir / "weights" wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / "last.pt" best = wdir / "best.pt" results_file = save_dir / "results.txt" # Save run settings with open(save_dir / "hyp.yaml", "w") as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / "opt.yaml", "w") as f: # yaml.dump(vars(opt), f, sort_keys=False) # opt 実行パラメータ yaml.dump(str(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != "cpu" init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict["train"] test_path = data_dict["val"] nc = 1 if opt.single_cls else int(data_dict["nc"]) # number of classes names = (["item"] if opt.single_cls and len(data_dict["names"]) != 1 else data_dict["names"]) # class names assert len(names) == nc, "%g names found for nc=%g dataset in %s" % ( len(names), nc, opt.data, ) # check # Model pretrained = weights.endswith(".pt") if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get("anchors"): ckpt["model"].yaml["anchors"] = round( hyp["anchors"]) # force autoanchor model = Model(opt.cfg or ckpt["model"].yaml, ch=3, nc=nc).to(device) # create exclude = ["anchor"] if opt.cfg or hyp.get("anchors") else [ ] # exclude keys state_dict = ckpt["model"].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( "Transferred %g/%g items from %s" % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print("freezing %s" % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp["weight_decay"] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp["lr0"], betas=(hyp["momentum"], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp["lr0"], momentum=hyp["momentum"], nesterov=True) optimizer.add_param_group({ "params": pg1, "weight_decay": hyp["weight_decay"] }) # add pg1 with weight_decay optimizer.add_param_group({"params": pg2}) # add pg2 (biases) logger.info("Optimizer groups: %g .bias, %g conv.weight, %g other" % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler 
https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = (lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp["lrf"]) + hyp["lrf"]) # linear else: lf = one_cycle(1, hyp["lrf"], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project="YOLOv5" if opt.project == "runs/train" else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get("wandb_id") if "ckpt" in locals() else None, ) loggers = {"wandb": wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt["optimizer"] is not None: optimizer.load_state_dict(ckpt["optimizer"]) best_fitness = ckpt["best_fitness"] # Results if ckpt.get("training_results") is not None: with open(results_file, "w") as file: file.write(ckpt["training_results"]) # write results.txt # Epochs start_epoch = ckpt["epoch"] + 1 if opt.resume: assert ( start_epoch > 0 ), "%s training to %g epochs is finished, nothing to resume." % ( weights, epochs, ) if epochs < start_epoch: logger.info( "%s has been trained for %g epochs. Fine-tuning for %g additional epochs." % (weights, ckpt["epoch"], epochs)) epochs += ckpt["epoch"] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info("Using SyncBatchNorm()") # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader( train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr("train: "), ) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert ( mlc < nc ), "Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g" % ( mlc, nc, opt.data, nc - 1, ) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr("val: "), )[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram("classes", c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz) # Model parameters hyp["box"] *= 3.0 / nl # scale to layers hyp["cls"] *= nc / 80.0 * 3.0 / nl # scale to classes and layers hyp["obj"] *= (imgsz / 640)**2 * 3.0 / nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = ( labels_to_class_weights(dataset.labels, nc).to(device) * nc ) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp["warmup_epochs"] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f"Image sizes {imgsz} train, {imgsz_test} test\n" f"Using {dataloader.num_workers} dataloader workers\n" f"Logging results to {save_dir}\n" f"Starting training for {epochs} epochs...") for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = (model.class_weights.cpu().numpy() * (1 - maps)**2 / nc ) # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ("\n" + "%10s" * 8) % ("Epoch", "gpu_mem", "box", "obj", "cls", "total", "targets", "img_size")) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _, ) in ( pbar ): # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = (imgs.to(device, non_blocking=True).float() / 255.0 ) # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x["lr"] = np.interp( ni, xi, [ hyp["warmup_bias_lr"] if j == 2 else 0.0, x["initial_lr"] * lf(epoch), ], ) if "momentum" in x: x["momentum"] = np.interp( ni, xi, [hyp["warmup_momentum"], hyp["momentum"]]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = 
sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode="bilinear", align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= (opt.world_size ) # gradient averaged between devices in DDP mode if opt.quad: loss *= 4.0 # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = "%.3gG" % (torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0) # (GB) s = ("%10s" * 2 + "%10.4g" * 6) % ( "%g/%g" % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1], ) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f"train_batch{ni}.jpg" # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 10 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob("train*.jpg") if x.exists() ] }, commit=False, ) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x["lr"] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=[ "yaml", "nc", "hyp", "gr", "names", "stride", "class_weights", ], ) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, compute_loss=compute_loss, ) # Write with open(results_file, "a") as f: f.write( s + "%10.4g" * 7 % results + "\n") # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system("gsutil cp %s gs://%s/results/results%s.txt" % (results_file, opt.bucket, opt.name)) # Log tags = [ "train/box_loss", "train/obj_loss", "train/cls_loss", # train loss "metrics/precision", "metrics/recall", "metrics/mAP_0.5", "metrics/mAP_0.5:0.95", "val/box_loss", "val/obj_loss", "val/cls_loss", # val loss "x/lr0", "x/lr1", "x/lr2", ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, "r") as f: # create checkpoint ckpt = { "epoch": epoch, "best_fitness": best_fitness, "training_results": f.read(), "model": ema.ema, "optimizer": None if final_epoch else 
optimizer.state_dict(), "wandb_id": wandb_run.id if wandb else None, } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f"gsutil cp {final} gs://{opt.bucket}/weights") # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ "results.png", "confusion_matrix.png", *[f"{x}_curve.png" for x in ("F1", "PR", "P", "R")], ] wandb.log({ "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] }) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type="model", name=save_dir.stem) # Test best.pt logger.info("%g epochs completed in %.3f hours.\n" % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith("coco.yaml") and nc == 80: # if COCO for conf, iou, save_json in ( [0.25, 0.45, False], [0.001, 0.65, True], ): # speed, mAP tests results, _, _ = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False, ) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() # mlflow with mlflow.start_run() as run: # Log args into mlflow for key, value in hyp.items(): mlflow.log_param(key, value) for key, value in vars(opt).items(): mlflow.log_param(key, value) # Log results into mlflow for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): # xがtorch.Tensorだったらfloatに直す if torch.is_tensor(x): x = x.item() # tag名に特殊記号があれば削除する if ":" in tag: tag = re.sub(r":", " ", tag) mlflow.log_metric(tag, x) # Log model mlflow.pytorch.log_model(model, "model") return results
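The checkpoint selection above keys off fitness(), described in the inline comment as a weighted combination of [P, R, mAP@0.5, mAP@0.5:0.95]. A plausible sketch of such a reduction is below; the exact weights are an assumption for illustration, not necessarily the ones this script uses:

import numpy as np

def fitness(x):
    """x is an (n, >=4) array whose first four columns are [P, R, mAP@0.5, mAP@0.5:0.95].
    Returns one scalar per row; here the mAP terms dominate (illustrative weights)."""
    w = np.array([0.0, 0.0, 0.1, 0.9])
    return (x[:, :4] * w).sum(1)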
def main_best_channels(): # Rationale for band selection # Using the best data split: # Run an experiment with each band selection in turn for splitidx in range(46, 51): for datatype in ['Multi', 'RGB']: if datatype == 'Multi': # usedChannelsList = [[i] for i in range(23)] usedChannelsList = [[550 + 20 * i for i in range(23)]] else: # usedChannelsList = ['R', 'G', 'B'] usedChannelsList = [ 'RGB', ] for usedChannels in usedChannelsList: print(getTime(), splitidx, datatype, usedChannels, '...') configer = EasyDict() configer.dsize = (64, 64) configer.datatype = datatype configer.n_epoch = 300 if datatype == 'Multi' else 350 configer.lrbase = 0.001 if datatype == 'Multi' else 0.0005 configer.n_channel = 23 configer.n_class = 63 configer.batchsize = 32 configer.stepsize = 250 configer.gamma = 0.2 configer.cuda = True configer.splitmode = 'split_{}x{}_{}'.format( configer.dsize[0], configer.dsize[1], splitidx) configer.modelbase = 'recognize_vgg11_bn' if configer.datatype == 'Multi': configer.usedChannels = usedChannels configer.n_usedChannels = len(configer.usedChannels) configer.modelname = '{}_{}_{}'.\ format(configer.modelbase, configer.splitmode, '_'.join(list(map(str, configer.usedChannels)))) elif configer.datatype == 'RGB': configer.usedChannels = usedChannels configer.n_usedChannels = len(configer.usedChannels) configer.modelname = '{}_{}_{}'.\ format(configer.modelbase, configer.splitmode, configer.usedChannels) configer.datapath = '/datasets/ECUST2019_{}x{}'.\ format(configer.dsize[0], configer.dsize[1]) configer.logspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/logs/{}_{}_{}subjects_logs'.\ format(configer.modelbase, configer.splitmode, configer.n_class) configer.mdlspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/modelfiles/{}_{}_{}subjects_models'.\ format(configer.modelbase, configer.splitmode, configer.n_class) train(configer) test(configer)
def train(): cfg = opt.cfg data = opt.data img_size = opt.img_size epochs = 1 if opt.prebias else opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights if 'pw' not in opt.arc: # remove BCELoss positive weights hyp['cls_pw'] = 1. hyp['obj_pw'] = 1. # Initialize init_seeds() wdir = 'weights' + os.sep # weights dir last = wdir + 'last.pt' best = wdir + 'best.pt' multi_scale = opt.multi_scale if multi_scale: img_sz_min = round(img_size / 32 / 1.5) + 1 img_sz_max = round(img_size / 32 * 1.5) - 1 img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'): os.remove(f) # Initialize model model = Darknet(cfg, arc=opt.arc).to(device) # Optimizer pg0, pg1 = [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if 'Conv2d.weight' in k: pg1 += [v] # parameter group 1 (apply weight_decay) else: pg0 += [v] # parameter group 0 # optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay del pg0, pg1 cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_fitness = 0. if weights.endswith('.pt'): # pytorch format # possible weights are 'last.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. if opt.bucket: os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last)) # download from bucket chkpt = torch.load(weights, map_location=device) # load model if opt.transfer: chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(chkpt['model'], strict=False) else: model.load_state_dict(chkpt['model']) # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open('results.txt', 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are 'yolov3.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. cutoff = load_darknet_weights(model, weights) if opt.transfer or opt.prebias: # transfer learning edge (yolo) layers nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) for p in optimizer.param_groups: # lower param count allows more aggressive training settings: i.e. 
SGD ~0.1 lr0, ~0.9 momentum p['lr'] *= 100 if p.get('momentum') is not None: # for SGD but not Adam p['momentum'] *= 0.9 for p in model.parameters(): if opt.prebias and p.numel() == nf: # train (yolo biases) p.requires_grad = True elif opt.transfer and p.shape[0] == nf: # train (yolo biases+weights) p.requires_grad = True else: # freeze layer p.requires_grad = False # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group(backend='nccl', # 'distributed backend' init_method='tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training image_weights=opt.img_weights, cache_images=False if opt.prebias else opt.cache_images) # Dataloader dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=min(os.cpu_count(), batch_size), shuffle=not opt.rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Start training model.nc = nc # attach number of classes to model model.arc = opt.arc # attach yolo architecture model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model_info(model, report='summary') # 'full' or 'summary' nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Starting %s for %g epochs...' 
% ('prebias' if opt.prebias else 'training', epochs)) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) freeze_backbone = False if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training if multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes if ni == 0: fname = 'train_batch%g.jpg' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ( '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results final_epoch = epoch + 1 == epochs if opt.prebias: print_model_biases(model) else: # Calculate mAP (always test final epoch, skip first 10 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch: with torch.no_grad(): results, maps = test.test(cfg, data, 
batch_size=batch_size, img_size=opt.img_size, model=model, conf_thres=0.001 if final_epoch and epoch > 0 else 0.1, # 0.1 for speed save_json=final_epoch and epoch > 0 and 'coco.data' in data) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = ['GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fitness = results[2] # mAP if fitness > best_fitness: best_fitness = fitness # Save training results save = (not opt.nosave) or ((not opt.evolve) and final_epoch) if save: with open('results.txt', 'r') as file: # Create checkpoint chkpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': file.read(), 'model': model.module.state_dict() if type( model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last checkpoint torch.save(chkpt, last) if opt.bucket: os.system('gsutil cp %s gs://%s' % (last, opt.bucket)) # upload to bucket # Save best checkpoint if best_fitness == fitness: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if len(opt.name): os.rename('results.txt', 'results_%s.txt' % opt.name) plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
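Several of these scripts carry commented-out alternative learning-rate lambdas (linear ramp, exponential ramp, inverse-exponential ramp) next to the schedule they actually use. To compare the shapes side by side without touching a real model, a LambdaLR can be driven on a dummy optimizer; the sketch below is standalone, and the value of the lrf exponent is an assumption for illustration:

import torch
from torch import optim

epochs, lr0, lrf = 100, 0.01, -4  # assumed values, matching the style of hyp['lr0'] / hyp['lrf'] above
param = [torch.nn.Parameter(torch.zeros(1))]
schedules = {
    'linear':      lambda x: 1 - x / epochs,                   # linear ramp to zero
    'exp':         lambda x: 10 ** (lrf * x / epochs),         # exp ramp
    'inverse_exp': lambda x: 1 - 10 ** (lrf * (1 - x / epochs)),  # inverse exp ramp
}
for name, lf in schedules.items():
    opt_ = optim.SGD(param, lr=lr0)
    sched = optim.lr_scheduler.LambdaLR(opt_, lr_lambda=lf)
    lrs = []
    for _ in range(epochs):
        lrs.append(opt_.param_groups[0]['lr'])  # record, then advance the schedule
        sched.step()
    print(name, 'first/last lr:', lrs[0], lrs[-1])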
img_size = opt.img_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Darknet(opt.cfg, (img_size, img_size)).to(device)
if opt.weights.endswith('.pt'):
    model.load_state_dict(torch.load(opt.weights)['model'])
else:
    load_darknet_weights(model, opt.weights)
print('\nloaded weights from ', opt.weights)
eval_model = lambda model: test(opt.cfg, opt.data, weights=opt.weights, batch_size=16, img_size=img_size,
                                iou_thres=0.5, conf_thres=0.001, nms_thres=0.5, save_json=False, model=model)
obtain_num_parameters = lambda model: sum([param.nelement() for param in model.parameters()])
print("\nlet's test the original model first:")
with torch.no_grad():
    origin_model_metric = eval_model(model)
origin_nparameters = obtain_num_parameters(model)
CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.module_defs)
0.7, 0.8, 0.9, 1 ] MODELS = {} net = Net(HN1, HN2) init_state = copy.deepcopy(net.state_dict()) for lr in LR: net.load_state_dict(init_state) training_inputs = training_data[:, 0:5] training_add = training_data[:, 8:] training_inputs = np.append(training_inputs, training_add, axis=1) training_labels = training_data[:, 5:8] test_inputs = testing_data[:, 0:5] test_add = testing_data[:, 8:] test_inputs = np.append(test_inputs, test_add, axis=1) test_labels = testing_data[:, 5:8] train(net, training_inputs, training_labels, EPOCHS, lr, BATCH_SIZE) avg_mse = test(test_inputs, test_labels, net) MODELS['{a}_{x}-{y}_{z}_{b}'.format(a=HL, x=HN1, y=HN2, z=EPOCHS, b=lr)] = avg_mse with open('Data/Search/manual_search_results_{x}HL_lr.csv'.format(x=HL), 'w') as f: for key in MODELS.keys(): f.write("%s: %s\n" % (key, MODELS[key])) print(MODELS)
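The grid search above stores one average test MSE per learning-rate configuration in MODELS before dumping everything to a text file; picking the winner is just taking the key with the smallest value, for example:

# Assuming MODELS maps 'HL_HN1-HN2_EPOCHS_lr' strings to average MSE values:
best_cfg = min(MODELS, key=MODELS.get)
print('best config:', best_cfg, 'avg MSE:', MODELS[best_cfg])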
test_dataset = LoadImagesAndLabels(valid_path, img_size, batch_size, hyp=hyp, rect=True, cache_images=False)
testloader = torch.utils.data.DataLoader(test_dataset,
                                         batch_size=batch_size,
                                         num_workers=min([os.cpu_count(), batch_size, 8]),
                                         shuffle=False,
                                         pin_memory=True,
                                         collate_fn=test_dataset.collate_fn)
with torch.no_grad():
    origin_model_metric = test.test(opt.cfg, opt.data, batch_size=batch_size, imgsz=img_size,
                                    model=model, dataloader=testloader, rank=-1, plot=False)
origin_nparameters = obtain_num_parameters(model)
origin_macs = performance_summary(model)
CBL_idx, Conv_idx, prune_idx, _, _ = parse_module_defs2(model.module_defs)
print("-------------------------------------------------------")
max_mAP = 0
for i in range(number):
    compact_module_defs, current_parameters, compact_model = rand_prune_and_eval(model, 0, 1)
    with torch.no_grad():
        # Raise the NMS conf threshold so that randomly generated, poorly pruned models do not exhaust GPU memory
# Construct validation memory val_mem = ReplayMemory(args, args.evaluation_size) T, done = 0, True while T < args.evaluation_size - args.history_length + 1: if done: state, done = env.reset(), False val_mem.preappend() # Set up memory for beginning of episode val_mem.append(state, None, None) state, _, done = env.step(random.randint(0, action_space - 1)) T += 1 # No need to postappend on done in validation memory if args.evaluate: dqn.eval() # Set DQN (policy network) to evaluation mode avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True) # Test print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q)) else: # Training loop dqn.train() T, done = 0, True while T < args.T_max: if done: state, done = Variable(env.reset()), False dqn.reset_noise( ) # Draw a new set of noisy weights for each episode (better for before learning starts) mem.preappend() # Set up memory for beginning of episode action = dqn.act( state) # Choose an action greedily (with noisy weights)
def train(hyp): epochs = opt.epochs # 300 batch_size = opt.batch_size # 64 weights = opt.weights # initial training weights # Configure init_seeds(1) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg).to(device) assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % ( opt.data, nc, opt.cfg, model.md['nc']) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \ optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if model.state_dict()[k].shape == v.shape } # to FP32, filter model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s." 
\ % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt start_epoch = ckpt['epoch'] + 1 del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # do not move # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # distributed backend init_method='tcp://127.0.0.1:9999', # init method world_size=1, # number of nodes rank=0) # node rank model = torch.nn.parallel.DistributedDataParallel(model) # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html # Dataset dataset = LoadImagesAndLabels( train_path, imgsz, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Correct your labels or your model.' % ( mlc, nc, opt.cfg) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = data_dict['names'] # Class frequency labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# model._initialize_biases(cf.to(device)) if tb_writer: plot_labels(labels) tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Exponential moving average ema = torch_utils.ModelEMA(model) # Start training t0 = time.time() nb = len(dataloader) # number of batches n_burn = max(3 * nb, 1e3) # burn-in iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4, device=device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = 'train_batch%g.jpg' % i # filename res = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch 
------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # mAP ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema, single_cls=opt.single_cls, dataloader=testloader) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(model, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group( ) if device.type != 'cpu' and torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
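Both YOLOv5-style loops above keep a ModelEMA shadow copy of the weights and evaluate and checkpoint ema.ema rather than the raw model. The core of such an EMA tracker is only a few lines; the sketch below is a simplified stand-in, not the ultralytics implementation (it omits the decay ramp-up, for instance):

import copy
import torch

class SimpleEMA:
    """Keep an exponential moving average of a model's parameters and buffers."""
    def __init__(self, model, decay=0.999):
        self.ema = copy.deepcopy(model).eval()  # shadow model used for eval/checkpoints
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        msd = model.state_dict()
        for k, v in self.ema.state_dict().items():
            if v.dtype.is_floating_point:
                # ema <- decay * ema + (1 - decay) * current weights
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)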
def train( cfg, data_cfg, img_size=416, resume=False, epochs=100, # 500200 batches at bs 4, 117263 images = 68 epochs batch_size=16, accumulate=4, # effective bs = 64 = batch_size * accumulate freeze_backbone=False, transfer=False # Transfer learning (train only YOLO layers) ): init_seeds() weights = 'weights' + os.sep latest = weights + 'latest.pt' best = weights + 'best.pt' device = torch_utils.select_device() torch.backends.cudnn.benchmark = True # possibly unsuitable for multiscale img_size_test = img_size # image size for testing if opt.multi_scale: img_size_min = round(img_size / 32 / 1.5) img_size_max = round(img_size / 32 * 1.5) img_size = img_size_max * 32 # initiate with maximum multi_scale size # Configure run data_dict = parse_data_cfg(data_cfg) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Initialize model model = Darknet(cfg).to(device) # Optimizer optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay']) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_loss = float('inf') nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) if resume: # Load previously saved model if transfer: # Transfer learning chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device) model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255}, strict=False) for p in model.parameters(): p.requires_grad = True if p.shape[0] == nf else False else: # resume from latest.pt chkpt = torch.load(latest, map_location=device) # load checkpoint model.load_state_dict(chkpt['model']) start_epoch = chkpt['epoch'] + 1 if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_loss = chkpt['best_loss'] del chkpt else: # Initialize model with backbone (optional) if '-tiny' in cfg: cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') else: cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in (0.8, 0.9)], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.xlabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, rect=False) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank) model = torch.nn.parallel.DistributedDataParallel(model) # sampler = torch.project.data.distributed.DistributedSampler(dataset) # Dataloader dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=opt.num_workers, shuffle=True, # disable rectangular training if True pin_memory=True, collate_fn=dataset.collate_fn) # Mixed precision training https://github.com/NVIDIA/apex # install help: https://github.com/NVIDIA/apex/issues/259 mixed_precision = False if mixed_precision: from apex 
import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Remove old results for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'): os.remove(f) # Start training model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model_info(model) nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0) # P, R, mAP, F1, test_loss n_burnin = min(round(nb / 5 + 1), 1000) # burn-in batches t, t0 = time.time(), time.time() for epoch in range(start_epoch, epochs): print(epoch) model.train() print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'targets', 'time')) # Update scheduler scheduler.step() # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # # Update image weights (optional) # w = model.class_weights.cpu().numpy() * (1 - maps) # class weights # image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) # dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # random weighted index mloss = torch.zeros(5).to(device) # mean losses for i, (imgs, targets, _, _) in enumerate(dataloader): imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training if opt.multi_scale: if (i + 1 + nb * epoch) % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.choice(range(img_size_min, img_size_max + 1)) * 32 print('img_size = %g' % img_size) scale_factor = img_size / max(imgs.shape[-2:]) imgs = F.interpolate(imgs, scale_factor=scale_factor, mode='bilinear', align_corners=False) # Plot images with bounding boxes if epoch == 0 and i == 0: plot_images(imgs=imgs, targets=targets, fname='train_batch%g.jpg' % i) # SGD burn-in if epoch == 0 and i <= n_burnin: lr = hyp['lr0'] * (i / n_burnin) ** 4 for x in optimizer.param_groups: x['lr'] = lr # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if torch.isnan(loss): print('WARNING: nan loss detected, ending training') return results # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if (i + 1) % accumulate == 0 or (i + 1) == nb: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses s = ('%8s%12s' + '%10.3g' * 7) % ( '%g/%g' % (epoch, epochs - 1), '%g/%g' % (i, nb - 1), *mloss, len(targets), time.time() - t) t = time.time() print(s) # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 10)) or epoch == epochs - 1: with torch.no_grad(): results, maps = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size_test, model=model, conf_thres=0.1) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%11.3g' * 5 % results + '\n') # P, R, mAP, F1, test_loss # Update best loss test_loss = results[4] if test_loss < best_loss: best_loss = test_loss # Save training results save = (not opt.nosave) or (epoch == epochs - 1) if save: # Create checkpoint chkpt = {'epoch': epoch, 'best_loss': best_loss, 'model': model.module.state_dict() if type( model) is nn.parallel.DistributedDataParallel else 
model.state_dict(), 'optimizer': optimizer.state_dict()} # Save latest checkpoint torch.save(chkpt, latest) # Save best checkpoint if best_loss == test_loss: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, weights + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt dt = (time.time() - t0) / 3600 print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, dt)) return results
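# The train() above reaches its effective batch size of 64 (batch_size * accumulate) by summing
# gradients over several small batches before each optimizer step. A minimal, self-contained sketch
# of that accumulation pattern with a dummy model and data (not the YOLOv3 pipeline itself):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

batch_size, accumulate = 16, 4   # effective batch size = 64
data = [(torch.randn(batch_size, 10), torch.randn(batch_size, 1)) for _ in range(8)]

optimizer.zero_grad()
for i, (x, y) in enumerate(data):
    loss = criterion(model(x), y)
    loss.backward()                                   # gradients add up across iterations
    if (i + 1) % accumulate == 0 or (i + 1) == len(data):
        optimizer.step()                              # update once per `accumulate` batches
        optimizer.zero_grad()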
if not os.path.exists(backup): os.makedirs(backup) model_name = os.path.splitext(os.path.split( opt.cfg)[-1])[0] self.save_path = os.path.join(backup, model_name + '_best.pth') self.best_map = 0 def save(self, mAP): if mAP > self.best_map: self.best_map = mAP if os.path.exists(self.save_path): os.remove(self.save_path) torch.save(model.state_dict(), self.save_path) if __name__ == "__main__": args = create_args() torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) model, model_cfg = create_model(args) optimizer, lr = create_optimizer(model, model_cfg) class_names, train_loader, valid_loader, test_loader = create_dataset( args, model_cfg) save = save_model(args) #import pdb;pdb.set_trace() mAP = 0 for epoch in range(1, args.epochs + 1): train(args, model, train_loader, optimizer, epoch) if epoch % 3 == 2 or epoch == args.epochs: mAP = test(args, model, model_cfg, test_loader, class_names) optimizer, lr = adjust_learning_rate(optimizer, lr, epoch) save.save(mAP)
def train( cfg, data_cfg, img_size=416, resume=False, epochs=273, # 500200 batches at bs 64, dataset length 117263 batch_size=16, accumulate=1, multi_scale=False, freeze_backbone=False, transfer=False # Transfer learning (train only YOLO layers) ): init_seeds() weights = 'weights' + os.sep latest = weights + 'latest.pt' best = weights + 'best.pt' device = torch_utils.select_device() if multi_scale: img_size = 608 # initiate with maximum multi_scale size opt.num_workers = 0 # bug https://github.com/ultralytics/yolov3/issues/174 else: torch.backends.cudnn.benchmark = True # unsuitable for multiscale # Configure run train_path = parse_data_cfg(data_cfg)['train'] # Initialize model model = Darknet(cfg, img_size).to(device) # Optimizer optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay']) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_loss = float('inf') nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) if resume: # Load previously saved model if transfer: # Transfer learning chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device) model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255}, strict=False) for p in model.parameters(): p.requires_grad = True if p.shape[0] == nf else False else: # resume from latest.pt chkpt = torch.load(latest, map_location=device) # load checkpoint model.load_state_dict(chkpt['model']) start_epoch = chkpt['epoch'] + 1 if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_loss = chkpt['best_loss'] del chkpt else: # Initialize model with backbone (optional) if '-tiny.cfg' in cfg: #cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') pass else: #cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') pass # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[218, 245], gamma=0.1, last_epoch=start_epoch-1) # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.xlabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Dataset dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank) model = torch.nn.parallel.DistributedDataParallel(model) # sampler = torch.utils.data.distributed.DistributedSampler(dataset) # Dataloader dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=opt.num_workers, shuffle=True, pin_memory=True, collate_fn=dataset.collate_fn) # Mixed precision training https://github.com/NVIDIA/apex # install help: https://github.com/NVIDIA/apex/issues/259 mixed_precision = False if mixed_precision: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Start training t = time.time() model.hyp = hyp # attach hyperparameters to model model_info(model) nb = len(dataloader) results = (0, 0, 0, 0, 0) 
# P, R, mAP, F1, test_loss n_burnin = min(round(nb / 5 + 1), 1000) # burn-in batches os.remove('train_batch0.jpg') if os.path.exists('train_batch0.jpg') else None os.remove('test_batch0.jpg') if os.path.exists('test_batch0.jpg') else None for epoch in range(start_epoch, epochs): model.train() print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time')) # Update scheduler scheduler.step() # Freeze backbone at epoch 0, unfreeze at epoch 1 if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True mloss = torch.zeros(5).to(device) # mean losses for i, (imgs, targets, _, _) in enumerate(dataloader): imgs = imgs.to(device) targets = targets.to(device) nt = len(targets) # Plot images with bounding boxes if epoch == 0 and i == 0: plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg') # SGD burn-in if epoch == 0 and i <= n_burnin: lr = hyp['lr0'] * (i / n_burnin) ** 4 for x in optimizer.param_groups: x['lr'] = lr # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if torch.isnan(loss): print('WARNING: nan loss detected, ending training') return results # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if (i + 1) % accumulate == 0 or (i + 1) == nb: optimizer.step() optimizer.zero_grad() # Update running mean of tracked metrics mloss = (mloss * i + loss_items) / (i + 1) # Print batch results s = ('%8s%12s' + '%10.3g' * 7) % ( '%g/%g' % (epoch, epochs - 1), '%g/%g' % (i, nb - 1), *mloss, nt, time.time() - t) t = time.time() print(s) # Multi-Scale training (320 - 608 pixels) every 10 batches if multi_scale and (i + 1) % 10 == 0: dataset.img_size = random.choice(range(10, 20)) * 32 print('multi_scale img_size = %g' % dataset.img_size) # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 5)) or epoch == epochs - 1: with torch.no_grad(): results = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size, model=model, conf_thres=0.1) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%11.3g' * 5 % results + '\n') # P, R, mAP, F1, test_loss # Update best loss test_loss = results[4] if test_loss < best_loss: best_loss = test_loss # Save training results save = True and not opt.nosave if save: # Create checkpoint chkpt = {'epoch': epoch, 'best_loss': best_loss, 'model': model.module.state_dict() if type( model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': optimizer.state_dict()} # Save latest checkpoint torch.save(chkpt, latest) # Save best checkpoint if best_loss == test_loss: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, weights + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt return results
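# Both YOLOv3 train() variants above warm the learning rate up during the first n_burnin batches of
# epoch 0 with lr = lr0 * (i / n_burnin) ** 4 before handing control to the epoch scheduler.
# A standalone sketch of that quartic burn-in ramp (hyperparameter values are illustrative):
import torch
import torch.optim as optim

lr0, n_burnin = 0.001, 1000                 # illustrative values
model = torch.nn.Linear(4, 2)
optimizer = optim.SGD(model.parameters(), lr=lr0, momentum=0.9)

for i in range(n_burnin + 1):
    lr = lr0 * (i / n_burnin) ** 4          # quartic ramp from 0 up to lr0
    for group in optimizer.param_groups:
        group['lr'] = lr
    # ... forward / backward / step for batch i would go here ...
# after burn-in, the epoch scheduler (LambdaLR / MultiStepLR) takes over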
optimizer = None else: optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr) optimizer.share_memory() shared_stepcount = SharedCounter() if not args.debug: processes = [] p = mp.Process(target=test, args=(args.num_processes, args, shared_model, Model, make_env, shared_stepcount)) p.start() processes.append(p) for rank in range(0, args.num_processes): p = mp.Process(target=train, args=(rank, args, shared_model, Model, make_env, shared_stepcount, optimizer)) p.start() processes.append(p) for p in processes: p.join() else: ## debug is enabled # run only one process in a main, easier to debug args.num_test_episodes = 1 args.descr = 'debug' args.max_step_count = 1000 # test both train and debug train(0, args, shared_model, Model, make_env, shared_stepcount, optimizer) args.max_step_count += 1000 # needed to perform test test(args.num_processes, args, shared_model, Model, make_env, shared_stepcount)
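# The A3C-style launcher above starts one test process and several train processes that all read and
# update a single model held in shared memory. A minimal sketch of that hogwild-style setup with a
# dummy worker (the real Model, make_env and SharedCounter are not reproduced here):
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def worker(rank, shared_model):
    # every process sees (and can update) the same underlying parameter storage
    with torch.no_grad():
        shared_model.weight.add_(0.01 * rank)

if __name__ == '__main__':
    shared_model = nn.Linear(4, 2)
    shared_model.share_memory()             # move parameters into shared memory
    processes = []
    for rank in range(4):
        p = mp.Process(target=worker, args=(rank, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()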
if opt.weights: if opt.weights.endswith(".pt"): model.load_state_dict( torch.load(opt.weights, map_location=device)['model']) else: _ = load_darknet_weights(model, opt.weights) data_config = parse_data_cfg(opt.data) valid_path = data_config["valid"] class_names = load_classes(data_config["names"]) eval_model = lambda model: test(model=model, cfg=opt.cfg, data=opt.data, batch_size=opt.batch_size, imgsz=opt.img_size, rank=-1) obtain_num_parameters = lambda model: sum( [param.nelement() for param in model.parameters()]) # This should not be commented out; it will be needed again later with torch.no_grad(): origin_model_metric = eval_model(model) origin_nparameters = obtain_num_parameters(model) CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.module_defs) # Copy the α (scale) parameters of every BN layer selected for pruning into the bn_weights list bn_weights = gather_bn_weights(model.module_list, prune_idx)
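# gather_bn_weights above collects the BatchNorm scale (gamma) parameters of the layers selected for
# pruning; sorting their magnitudes then yields a global threshold for network-slimming style channel
# pruning. A rough sketch of that gathering step, assuming Darknet-style blocks where the BN layer is
# the second element of each Sequential (an assumption, not the repo's exact helper):
import torch
import torch.nn as nn

def gather_bn_gammas(module_list, prune_idx):
    """Concatenate |gamma| of the BN layers chosen for pruning (sketch)."""
    gammas = []
    for idx in prune_idx:
        bn = module_list[idx][1]                 # assumes [Conv2d, BatchNorm2d, activation] blocks
        gammas.append(bn.weight.data.abs().clone())
    return torch.cat(gammas)

# toy usage with two conv+BN blocks
blocks = nn.ModuleList([
    nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.LeakyReLU(0.1)),
    nn.Sequential(nn.Conv2d(8, 16, 3), nn.BatchNorm2d(16), nn.LeakyReLU(0.1)),
])
gammas = gather_bn_gammas(blocks, prune_idx=[0, 1])
threshold = torch.sort(gammas)[0][int(len(gammas) * 0.5)]   # e.g. prune the 50% smallest channels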
# # **Submission Checklist** # # 1. Does my code pass all tests? # 2. Does my code implement `A*` search and not some other search algorithm? # 3. Do I use an **admissible heuristic** to direct search efforts towards the goal? # 4. Do I use data structures which avoid unnecessarily slow lookups? # # When you can answer "yes" to all of these questions, and have also answered the written questions below, submit by pressing the Submit button in the lower right! # In[29]: from test import test test(PathPlanner) # ## Questions # # **Instructions** # # Answer the following questions in your own words. We do not expect you to know all of this off the top of your head. We expect you to do research and ask questions. However, do not merely copy and paste an answer from Google or Stack Overflow. Read the information and understand it first, then use your own words to explain the answer. # --- # How would you explain A-Star to a family member (layperson)? # # **ANSWER**: A-Star is a route-finding search algorithm used to find the shortest path between two given points or locations. It combines the cost of the path travelled so far with an estimated distance from the current state to the goal in order to return an optimal shortest path. As an example, Google Maps internally uses an A-Star-style search (based on a cost and a heuristic function) to provide the shortest path between two destinations chosen on the map. # # ---
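# A compact illustration of the idea described in the answer above: grid-based A* with a
# Manhattan-distance heuristic, which is admissible for 4-directional movement. This is a generic
# sketch, not the PathPlanner implementation being submitted:
import heapq

def astar(start, goal, walls, width, height):
    """Return a shortest 4-connected path from start to goal, or None."""
    def h(p):                                       # admissible heuristic: Manhattan distance
        return abs(p[0] - goal[0]) + abs(p[1] - goal[1])

    frontier = [(h(start), 0, start, [start])]      # entries are (f = g + h, g, node, path)
    best_g = {start: 0}
    while frontier:
        f, g, node, path = heapq.heappop(frontier)  # cheapest estimated total cost first
        if node == goal:
            return path
        x, y = node
        for nxt in ((x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)):
            if 0 <= nxt[0] < width and 0 <= nxt[1] < height and nxt not in walls:
                ng = g + 1
                if ng < best_g.get(nxt, float('inf')):
                    best_g[nxt] = ng
                    heapq.heappush(frontier, (ng + h(nxt), ng, nxt, path + [nxt]))
    return None

print(astar((0, 0), (3, 3), {(1, 1), (2, 1)}, 4, 4))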
def main(args): # Ensure training, testing, and manip are not all turned off assert ( args.train or args.test or args.manip ), 'Cannot have train, test, and manip all set to 0, Nothing to do.' # Load the training, validation, and testing data try: train_list, val_list, test_list = load_data(args.data_root_dir, args.split_num) except: # Create the training and test splits if not found split_data(args.data_root_dir, num_splits=4) train_list, val_list, test_list = load_data(args.data_root_dir, args.split_num) # Get image properties from first image. Assume they are all the same. # print("train_list_0",train_list) img_shape = sitk.GetArrayFromImage( sitk.ReadImage(join(args.data_root_dir, 'imgs', train_list[0][0]))).shape net_input_shape = (img_shape[1], img_shape[2], args.slices) # print(img_shape) # Create the model for training/testing/manipulation model_list = create_model(args=args, input_shape=net_input_shape) print_summary(model=model_list[0], positions=[.38, .65, .75, 1.]) args.output_name = 'split-' + str(args.split_num) + '_batch-' + str(args.batch_size) + \ '_shuff-' + str(args.shuffle_data) + '_aug-' + str(args.aug_data) + \ '_loss-' + str(args.loss) + '_slic-' + str(args.slices) + \ '_sub-' + str(args.subsamp) + '_strid-' + str(args.stride) + \ '_lr-' + str(args.initial_lr) + '_recon-' + str(args.recon_wei) args.time = time args.check_dir = join(args.data_root_dir, 'saved_models', args.net) try: makedirs(args.check_dir) except: pass args.log_dir = join(args.data_root_dir, 'logs', args.net) try: makedirs(args.log_dir) except: pass args.tf_log_dir = join(args.log_dir, 'tf_logs') try: makedirs(args.tf_log_dir) except: pass args.output_dir = join(args.data_root_dir, 'plots', args.net) try: makedirs(args.output_dir) except: pass if args.train: from train import train # Run training train(args, train_list, val_list, model_list[0], net_input_shape) if args.test: from test import test # Run testing test(args, test_list, model_list, net_input_shape) if args.manip: from manip import manip # Run manipulation of segcaps manip(args, test_list, model_list, net_input_shape)
def worker(gpu, ngpus_per_node, args): env_device, train_device = args_initialize(gpu, ngpus_per_node, args) train_env, test_env, observation = env_initialize(args, env_device) train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize( args, train_device) model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name) model, optimizer = model_initialize(args, model, train_device) num_frames_per_iter = args.num_ales * args.num_steps total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter)) if args.verbose: print(num_frames_per_iter) print(args.world_size) print(total_steps) shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:]) states = torch.zeros(shape, device=train_device, dtype=torch.float32) states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32) if args.verbose: print(shape) shape = (args.num_steps + 1, args.num_ales) values = torch.zeros(shape, device=train_device, dtype=torch.float32) returns = torch.zeros(shape, device=train_device, dtype=torch.float32) if args.verbose: print(shape) shape = (args.num_steps, args.num_ales) rewards = torch.zeros(shape, device=train_device, dtype=torch.float32) masks = torch.zeros(shape, device=train_device, dtype=torch.float32) actions = torch.zeros(shape, device=train_device, dtype=torch.long) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) if args.use_gae: gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) maybe_npy = lambda a: a.numpy() if args.use_openai else a torch.cuda.synchronize() iterator = range(total_steps) if args.rank == 0: iterator = tqdm(iterator) total_time = 0 evaluation_offset = 0 for update in iterator: T = args.world_size * update * num_frames_per_iter if (args.rank == 0) and (T >= evaluation_offset): evaluation_offset += args.evaluation_interval eval_lengths, eval_rewards = test(args, model, test_env) lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths) rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards) length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format( lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian) reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format( rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian) print('[training time: {}] {}'.format( format_time(total_time), ' --- '.join([length_data, reward_data]))) if eval_csv_writer and eval_csv_file: eval_csv_writer.writerow([ T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd ]) eval_csv_file.flush() if args.plot: summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time) summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time) start_time = time.time() with torch.no_grad(): for step in range(args.num_steps): value, logit = model(states[step]) # store values values[step] = value.squeeze(-1) # convert actions to numpy and perform next step probs_action = F.softmax(logit, dim=1).multinomial(1).to(env_device) observation, reward, done, info = 
train_env.step( maybe_npy(probs_action)) if args.use_openai: # convert back to pytorch tensors observation = torch.from_numpy(observation) reward = torch.from_numpy(reward) done = torch.from_numpy(done.astype(np.uint8)) else: observation = observation.squeeze(-1).unsqueeze(1) # move back to training memory observation = observation.to(device=train_device) reward = reward.to(device=train_device, dtype=torch.float32) done = done.to(device=train_device, dtype=torch.bool) probs_action = probs_action.to(device=train_device, dtype=torch.long) not_done = 1.0 - done.float() # update rewards and actions actions[step].copy_(probs_action.view(-1)) masks[step].copy_(not_done) rewards[step].copy_(reward.sign()) # update next observations states[step + 1, :, :-1].copy_(states[step, :, 1:].clone()) states[step + 1] *= not_done.view( -1, *[1] * (observation.dim() - 1)) states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:])) # update episodic reward counters episode_rewards += reward final_rewards[done] = episode_rewards[done] episode_rewards *= not_done episode_lengths += not_done final_lengths[done] = episode_lengths[done] episode_lengths *= not_done returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1) if args.use_gae: gae.zero_() for step in reversed(range(args.num_steps)): delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step] gae = delta + (args.gamma * args.tau * masks[step] * gae) returns[step] = gae + values[step] else: for step in reversed(range(args.num_steps)): returns[step] = rewards[step] + ( args.gamma * returns[step + 1] * masks[step]) value, logit = model(states[:-1].view(-1, *states.size()[-3:])) log_probs = F.log_softmax(logit, dim=1) probs = F.softmax(logit, dim=1) action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1)) dist_entropy = -(log_probs * probs).sum(-1).mean() advantages = returns[:-1].view(-1).unsqueeze(-1) - value value_loss = advantages.pow(2).mean() policy_loss = -(advantages.clone().detach() * action_log_probs).mean() loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef optimizer.zero_grad() if args.cpu_train: loss.backward() master_params = model.parameters() else: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() master_params = amp.master_params(optimizer) torch.nn.utils.clip_grad_norm_(master_params, args.max_grad_norm) optimizer.step() states[0].copy_(states[-1]) torch.cuda.synchronize() if args.rank == 0: iter_time = time.time() - start_time total_time += iter_time if args.plot: summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time) summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time) summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time) summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time) summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time) progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths, value_loss.item(), policy_loss.item(), dist_entropy.item(), train_csv_writer, train_csv_file) iterator.set_postfix_str(progress_data) if args.plot and (args.rank == 0): writer.close() if args.use_openai: train_env.close() if args.use_openai_test_env: test_env.close()
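# When args.use_gae is set, the worker above computes returns with Generalized Advantage Estimation.
# A small standalone version of the same backward recursion over a single rollout, using plain 1-D
# tensors instead of the per-ALE batches used above:
import torch

def gae_returns(rewards, values, masks, gamma=0.99, tau=0.95):
    """values has one more entry than rewards (bootstrap value of the final state)."""
    T = rewards.shape[0]
    returns = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae   # masks zero the recursion at episode ends
        returns[t] = gae + values[t]
    return returns

rewards = torch.tensor([1.0, 0.0, 1.0])
values = torch.tensor([0.5, 0.4, 0.6, 0.2])          # V(s_0..s_3); last entry bootstraps
masks = torch.tensor([1.0, 1.0, 0.0])                # 0 where the episode ended
print(gae_returns(rewards, values, masks))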
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = ['model.%s.' % x for x in range(11)] # ['model.%s.' 
% x for x in range(11)] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init(config=opt, resume="allow", project='YOLOv3' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3. 
/ nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. 
# Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 3 and wandb: wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]}) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = 
['results.png', 'precision_recall_curve.png', 'confusion_matrix.png'] wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists()]}) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) # Test best.pt logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith('coco.yaml') and nc == 80: # if COCO for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]): # speed, mAP tests results, _, _ = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
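# The scheduler in the train() above is built from one_cycle(1, hyp['lrf'], epochs), a cosine ramp of
# the LR multiplier from 1 down to lrf over the whole run. The helper itself is not shown in this
# snippet; the sketch below uses a plausible equivalent together with LambdaLR (values illustrative):
import math
import torch
from torch.optim import SGD, lr_scheduler

def one_cycle(y1=1.0, y2=0.1, steps=100):
    # cosine interpolation of the LR multiplier from y1 to y2 over `steps` epochs
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1

epochs, lr0, lrf = 300, 0.01, 0.2                    # illustrative hyperparameters
optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=lr0)
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=one_cycle(1, lrf, epochs))

for epoch in range(epochs):
    # ... train one epoch ...
    scheduler.step()                                 # lr decays smoothly from lr0 to lr0 * lrf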
def testCode(): file = "../Test_files/test.exe" data = open(file, "rb").read() start_time = time.time() cr = PEHeaderReader(data=data) cr.load() total_imports, cant_librerias, promedio = cr.get_import_size_stats() real, virtual, w_e, w_real_sum, w_virtual_sum = cr.get_section_stats() elapsed = time.time() - start_time line1 = str(total_imports) + "|" + str(cant_librerias) + "|" + str( promedio) line2 = str(real) + "|" + str(virtual) + "|" + str(w_e) + "|" + str( w_real_sum) + "|" + str(w_virtual_sum) print(line1) print(line2) imp = cr.getImports() print(str(imp)) print("Elapsed time: " + str(elapsed)) #****************TEST_EXECUTE****************** from test import test test("-test_PEHeaderReader", testCode)
best_prec1 = max(prec1, best_prec1) state = { 'epoch': i, 'arch': opt.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'best_prec1': best_prec1 } save_checkpoint(state, is_best) # if not opt.no_train and not opt.no_val: # scheduler.step(validation_loss) if opt.test: spatial_transform = Compose([ Scale(opt.sample_size), CenterCrop(opt.sample_size), ToTensor(opt.norm_value), norm_method ]) temporal_transform = Compose([TemporalCenterCrop(opt.sample_duration)]) # target_transform = VideoID() target_transform = ClassLabel() test_data = get_test_set(opt, spatial_transform, temporal_transform, target_transform) test_loader = torch.utils.data.DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.n_threads, pin_memory=True) test.test(test_loader, model, opt, test_data.class_names)
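# save_checkpoint(state, is_best) above is not defined in this snippet. A common implementation,
# sketched here with assumed file names, saves the latest state every epoch and copies it to a
# separate best-model file when is_best is true:
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth', best_name='model_best.pth'):
    torch.save(state, filename)                      # always keep the latest checkpoint
    if is_best:
        shutil.copyfile(filename, best_name)         # snapshot of the best-so-far model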
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = ['', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume 
start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}') # save previous weights if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, patch_size=opt.patch_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward 
scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP if final_epoch: # replot predictions [os.remove(x) for x in glob.glob(str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x)] results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() 
- t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
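# ModelEMA in the train() above maintains an exponential moving average of the model weights and uses
# ema.ema for evaluation and checkpoints. A bare-bones version of the update rule (the real helper
# also ramps the decay with the update count and copies non-tensor attributes; this sketch omits that):
import copy
import torch
import torch.nn as nn

class SimpleEMA:
    def __init__(self, model, decay=0.9999):
        self.ema = copy.deepcopy(model).eval()       # shadow copy used for eval / checkpoints
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        with torch.no_grad():
            msd = model.state_dict()
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)

# usage: update the shadow weights after every optimizer step
model = nn.Linear(4, 2)
ema = SimpleEMA(model)
ema.update(model)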
def train(hyp, tb_writer, opt, device): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank # TODO: Init DDP logging. Only the first process is allowed to log. # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs. # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg, nc=nc).to(device) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model with torch_distributed_zero_first(rank): google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # DP mode if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and device.type != 'cpu' and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None # DDP mode if device.type != 'cpu' and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) # When in DDP mode, the generated indices will be broadcasted to synchronize dataset. if dataset.image_weights: # Generate indices. 
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx
            # Broadcast.
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # Only the first process in DDP mode is allowed to log or save checkpoints.
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(model, include=['md', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
                                                 model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
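
The scheduler in the snippet above uses a cosine decay (the Bag-of-Tricks schedule) that falls from the initial learning rate to 10% of it over the run. Below is a minimal, standalone sketch of that behaviour; the dummy parameter and the `epochs`/`lr0` values are illustrative assumptions, not values from the snippet.

import math
import torch
from torch.optim import SGD, lr_scheduler

epochs, lr0 = 10, 0.01
optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=lr0)  # dummy parameter group
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * 0.9 + 0.1  # cosine, floored at 0.1
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

for epoch in range(epochs):
    optimizer.step()                                   # dummy optimisation step
    scheduler.step()                                   # advance the cosine schedule
    print(epoch, round(optimizer.param_groups[0]['lr'], 5))  # decays from lr0 towards 0.1 * lr0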
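During the first nw integrated batches the training loop warms up the learning rate, momentum, and gradient-accumulation count with np.interp. The sketch below isolates that interpolation; nw, nbs, total_batch_size, lr0, and the momentum target are assumed example values.

import numpy as np

nw, nbs, total_batch_size, lr0 = 1000, 64, 16, 0.01    # assumed values
xi = [0, nw]                                           # x interpolation range
for ni in (0, 250, 500, 750, 1000):                    # sampled integrated-batch indices
    accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
    lr = np.interp(ni, xi, [0.0, lr0])                 # non-bias groups rise from 0.0 to lr0
    bias_lr = np.interp(ni, xi, [0.1, lr0])            # bias group falls from 0.1 to lr0
    momentum = np.interp(ni, xi, [0.9, 0.937])         # e.g. hyp['momentum'] = 0.937 (assumed)
    print(ni, int(accumulate), round(float(lr), 4), round(float(bias_lr), 4), round(float(momentum), 4))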
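torch_utils.ModelEMA keeps an exponentially averaged copy of the weights, which is then used for evaluation and checkpointing. The following is an assumed re-implementation of the idea behind ema.update(model) on a toy module, not the actual ModelEMA class (which, for instance, also ramps the decay).

import copy
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
ema_model = copy.deepcopy(model).eval()          # averaged copy, used for eval/saving
for p in ema_model.parameters():
    p.requires_grad_(False)

def ema_update(model, ema_model, decay=0.999):
    # new_ema = decay * old_ema + (1 - decay) * current weights
    with torch.no_grad():
        msd = model.state_dict()
        for k, v in ema_model.state_dict().items():
            if v.dtype.is_floating_point:
                v.mul_(decay).add_(msd[k].detach(), alpha=1 - decay)

ema_update(model, ema_model)                     # called once per optimizer step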
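Checkpoint selection relies on fitness(), a weighted combination of the validation metrics: best.pt is rewritten whenever that scalar improves. A rough sketch of such a combination is shown below; the weights here are placeholders and the real function's weighting may differ.

import numpy as np

def fitness_sketch(x):                  # x: (n, 4) rows of [P, R, mAP@0.5, mAP@0.5:0.95]
    w = np.array([0.0, 0.0, 0.1, 0.9])  # assumed weights favouring mAP@0.5:0.95
    return (x[:, :4] * w).sum(1)

results = np.array([[0.60, 0.52, 0.55, 0.35]])   # example metrics
print(fitness_sketch(results))                   # higher is better -> overwrite best.pt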