Ejemplo n.º 1
0
def test():
    import test

    pygame.init()
    tb = TextBox(text="Fishy")
    clock = pygame.time.Clock()
    group = pygame.sprite.LayeredDirty(tb, layer=0, _use_update=True)

    def testlogic(event):
        if event == None:
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            if event.key == K_BACKSPACE:
                if len(tb.text) > 0:
                    tb.text = tb.text[:-1]
                return
            if event.key >= 256:
                return
            tb.text += chr(event.key)

    def testrender(screen):
        clock.tick()
        time = clock.get_time()
        group.update(time)
        bgd = pygame.Surface((screen.get_width(), screen.get_height()))
        group.draw(screen, bgd=bgd)

    test.test(testlogic, testrender)
Ejemplo n.º 2
0
def startSouth(below, ordered = []):
    "Sort Southern cities into order"
    jyunban = sortx(below, 'ascending')             #Decide starting city
    minami = len(jyunban)
    ordered.append(jyunban[0])
    jyunban.pop(0)
    
    alldistance = getdistances(ordered[0], jyunban) #Decide last southern city
    index = alldistance.index(max(alldistance))
    furthest = jyunban[index]
    jyunban.pop(index)
    
    while len(ordered) < minami/2:                  #Route from start point
        decidenext(jyunban, ordered, 'south')
        test(ordered)
    
    second =  sortx(jyunban, 'descending')          #Route from end point
    lensecond = len(second)
    gyaku = []
    gyaku.append(furthest)
    while len(gyaku) < lensecond+1:
        decidenext(second, gyaku, 'north')
    
    for i in range(1,len(gyaku)+1):                 #Put two routes together
        ordered.append(gyaku[-i])
    
    #checkintercept(ordered)
    return ordered
Ejemplo n.º 3
0
def run(train_file, valid_file, test_file, output_file):
    '''The function to run your ML algorithm on given datasets, generate the output and save them into the provided file path

    Parameters
    ----------
    train_file: string
        the path to the training file
        valid_file: string
                the path to the validation file
        test_file: string
                the path to the testing file
    output_file: string
        the path to the output predictions to be saved
    '''

    ## your implementation here

    # read data from input
    train_samples, word2num = train_data_prepare(train_file)
    valid_samples = test_data_prepare(valid_file, word2num, 'valid')

    # your training algorithm
    model = train(train_samples, valid_samples, word2num)

    # your prediction code
    test(test_file, output_file, word2num, model)
Ejemplo n.º 4
0
def main (argv):
    logger = initLogger();
    
    if (len(argv) == 0):
        print "Missing argument. Options: init, store, list, test, get, restore";
    elif (argv[0] == "init"):
        init.init(archiveDir);
    elif (argv[0] == "store"):
        if (len(argv) < 2):
            print "Usage: mybackup store <directory>";
        else:
            store.store(archiveDir, argv[1], logger);
    elif (argv[0] == "list"):
        if (len(argv) < 2):
            listBackups.list(archiveDir)
        else:
            listBackups.list(archiveDir, argv[1])
    elif (argv[0] == "get"):
        if (len(argv) < 2):
            print "Usage: mybackup get <pattern>";
        else:
            restore.getFile(archiveDir, argv[1]);
    elif (argv[0] == "restore"):
        if (len(argv) < 2):
            restore.restoreAll(archiveDir)
        else:
            restore.restoreAll(archiveDir, argv[1])
    elif (argv[0] == "test"):
        test.test(archiveDir, logger)
    else:
        print "Unknown option: "+argv[0];
Ejemplo n.º 5
0
def main():
    print "In Main Experiment\n"
    # get the classnames from the directory structure
    directory_names = list(set(glob.glob(os.path.join("train", "*"))).difference(set(glob.glob(os.path.join("train", "*.*")))))
    # get the number of rows through image count
    numberofImages = parseImage.gestNumberofImages(directory_names)
    num_rows = numberofImages # one row for each image in the training dataset

    # We'll rescale the images to be 25x25
    maxPixel = 25
    imageSize = maxPixel * maxPixel
    num_features = imageSize + 2 + 128 # for our ratio

    X = np.zeros((num_rows, num_features), dtype=float)
    y = np.zeros((num_rows)) # numeric class label
    files = []
    namesClasses = list() #class name list

    # Get the image training data
    parseImage.readImage(True, namesClasses, directory_names,X, y, files)

    print "Training"

    # get test result
    train.train(X, y, namesClasses)

    print "Testing"
    test.test(num_rows, num_features, X, y, namesClasses = list())
Ejemplo n.º 6
0
def cli_main():
	if sys.version_info[0] < 3:
		print("STK needs Python 3.x to run. Check your execution path and file associations.")
		print("Your Python version is: ")
		print(sys.version)
		return

	if len(sys.argv) >= 2:
		if sys.argv[1] == "--test":
			import test
			test.test(sys.argv[2:])

		elif sys.argv[1] == "--markdown2html" and len(sys.argv) == 4:

			input_file = sys.argv[2]
			output_dir = sys.argv[3]

			parser = MDParser()
			parser.parse_file(input_file)

			exporter = HtmlExporter()
			exporter.export(parser.saga, output_dir)

		elif sys.argv[1] == "--generate_ep_card" and len(sys.argv) == 4:
			#generate_episode_card(sys.argv[2], sys.argv[3])
			#TODO generate_episode_card
			print("Not implemented yet...")

		else:
			print_usage()
	else:
		print_usage()
Ejemplo n.º 7
0
def mainHandler(threadNum, link, deep, key, test):
    event = threading.Event()  # 产生一个event对象,对象维护一个flag,当
    event.clear()  # 将event的flag设为false
    pool = threadPool(threadNum, event)  # 初始化一个threadNum个线程的线程池,event对象用于通知主线程继续执行
    showProgress(pool.getQueue(), deep, event)
    pool.putJob((link, deep), key)  # job是(link,deep)的一个tuple,key是关键字
    pool.wait()  # 阻塞主线程
    if test:  # 需要自测模块运行
        import test
        test.test(key, dbFile)
Ejemplo n.º 8
0
def test3():
   cost = [1,1,1]
   eqs = [[1,1,0], [2,2,2]]
   eqB = [2, 5]

   expectedCost = [1,1,1]
   expectedConstraints = [[3,-2,0], [1,1,0]]
   expectedThresholds = [2, 5]
   test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, equalities=eqs, eqThreshold=eqB))
Ejemplo n.º 9
0
def mainHandler(threadNum, link, deep, key, test):
    event = threading.Event()
    event.clear()
    pool = threadPool(threadNum, event)
    showProgress(pool.getQueue(), deep, event)
    pool.putJob((link, deep), key)
    pool.wait()
    if test:  # 需要自测模块运行
        import test
        test.test(key, dbFile)
Ejemplo n.º 10
0
def test():
    import test
    pygame.init()
    image = pygame.image.load("qbird.png")
    sprite = ManipulatableDirtySprite(image = image)
    clock = pygame.time.Clock()
    group = pygame.sprite.LayeredDirty(sprite, layer = 0, _use_update = True)
    keysused = {}
    def testlogic(event):
        if event == None:            
            time = clock.get_time()
            if K_UP in keysused and keysused[K_UP]:
                sprite.y -= 1
            if K_DOWN in keysused and keysused[K_DOWN]:
                sprite.y += 1
            if K_LEFT in keysused and keysused[K_LEFT]:
                sprite.x -= 1
            if K_RIGHT in keysused and keysused[K_RIGHT]:
                sprite.x += 1
            if K_q in keysused and keysused[K_q]:
                sprite.rotation -= 1
            if K_e in keysused and keysused[K_e]:
                sprite.rotation += 1
            if K_w in keysused and keysused[K_w]:
                sprite.ycenter -= 0.015625
            if K_s in keysused and keysused[K_s]:
                sprite.ycenter += 0.015625
            if K_a in keysused and keysused[K_a]:
                sprite.xcenter -= 0.015625
            if K_d in keysused and keysused[K_d]:
                sprite.xcenter += 0.015625
            if K_r in keysused and keysused[K_r]:
                sprite.yscale += 0.015625
            if K_f in keysused and keysused[K_f]:
                sprite.yscale -= 0.015625
            if K_t in keysused and keysused[K_t]:
                sprite.xscale += 0.015625
            if K_g in keysused and keysused[K_g]:
                sprite.xscale -= 0.015625
            if K_y in keysused and keysused[K_y]:
                sprite.opacity += 0.015625
            if K_h in keysused and keysused[K_h]:
                sprite.opacity -= 0.015625
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            keysused[event.key] = True
        if event.type == pygame.KEYUP:
            keysused[event.key] = False
    def testrender(screen):
        clock.tick()
        bgd = pygame.Surface((screen.get_width(), screen.get_height()))
        group.draw(screen, bgd = bgd)
    test.test(testlogic, testrender)
Ejemplo n.º 11
0
def test2():
   cost = [1,1,1]
   lts = [[3,-2,0]]
   ltB = [7]
   eqs = [[1,1,0]]
   eqB = [2]

   expectedCost = [1,1,1,0]
   expectedConstraints = [[3,-2,0,1], [1,1,0,0]]
   expectedThresholds = [7,2]
   test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, lessThans=lts, ltThreshold=ltB, equalities=eqs, eqThreshold=eqB))
Ejemplo n.º 12
0
def evalQuiz(user, data):
    f = open(join(quiz_requests, '%s.quiz-%s.%d' % 
                  (user, data['page'].split('-')[-1], 
                   int(time() * 1000))), 'w')
    f.write(str(data))
    f.close()

    path.insert(0, join(course_material, str(data['page'])))
    #test_mod = __import__(join(course_material, str(data['page']), 'test.py'))
    import test as test_mod
    test_mod.test(user, data)

    del test_mod
Ejemplo n.º 13
0
def trainrf(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(n_estimators=random.randint(50,5000),
                                 criterion='gini',
                                 max_depth=random.randint(10,1000),
                                 min_samples_split=random.randint(2,50),
                                 min_samples_leaf=random.randint(1,10),
                                 min_weight_fraction_leaf=random.uniform(0.0,0.5),
                                 max_features=random.uniform(0.1,1.0),
                                 max_leaf_nodes=random.randint(1,10),
                                 bootstrap=False,
                                 oob_score=False,
                                 n_jobs=30,
                                 random_state=random_state,
                                 verbose=0,
                                 warm_start=True,
                                 class_weight=None
                )

    clf.fit(train_x, train_y)

    valid_predictions1 = clf.predict_proba(valid_x)
    test_predictions1= clf.predict_proba(test_x)

    t1 = test(valid_y,valid_predictions1)

    ccv = CalibratedClassifierCV(base_estimator=clf,method="sigmoid",cv='prefit')
    ccv.fit(valid_x,valid_y)

    valid_predictions2 = ccv.predict_proba(valid_x)
    test_predictions2= ccv.predict_proba(test_x)

    t2 = test(valid_y,valid_predictions2)

    if t2<t1:
        valid_predictions=valid_predictions2
        test_predictions=test_predictions2
        t=t2
    else:
        valid_predictions=valid_predictions1
        test_predictions=test_predictions1
        t=t1

    if t < 0.450:
        data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
        data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Ejemplo n.º 14
0
def testFromPost():
   cost = [1,1,1]
   gts = [[0,1,4]]
   gtB = [10]
   lts = [[3,-2,0]]
   ltB = [7]
   eqs = [[1,1,0]]
   eqB = [2]

   expectedCost = [1,1,1,0,0]
   expectedConstraints = [[0,1,4,-1,0], [3,-2,0,0,1], [1,1,0,0,0]]
   expectedThresholds = [10,7,2]
   test((expectedCost, expectedConstraints, expectedThresholds),
         simplex.standardForm(cost, gts, gtB, lts, ltB, eqs, eqB))
Ejemplo n.º 15
0
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=29008, max_features=36,
            max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=4494, n_jobs=8,
            oob_score=False, random_state=979271, verbose=0,
            warm_start=False)

    clf.fit(train_x, train_y)

    ccv = CalibratedClassifierCV(base_estimator=clf,method="sigmoid",cv="prefit")
    ccv.fit(valid_x,valid_y)

    valid_predictions = ccv.predict_proba(valid_x)
    test_predictions= ccv.predict_proba(test_x)

    loss = test(valid_y,valid_predictions,True)
    if  loss<0.52:
        data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
        data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Ejemplo n.º 16
0
def cal(code):
	try:
		a=gc.getData(code)
	except:
		print('{} is wrong'.format(code))	
	else:	
		print('{} is running'.format(code))
		if a is not None and len(a)>60:
			global total
			total=total+1
			a.sort_index(inplace=True)
			gc.ma(a,'close',[5,10,15,20,25])
			MA_column=a.columns[-5:]
			a['Diff']= 100 * ((a[MA_column].max(axis=1) - a[MA_column].min(axis=1))/a[MA_column].max(axis=1))
			b=a[-65:-5]['Diff'].mean()
			if b<2 and a[-65:-5]['Diff'].max()<10:
				good.append(code)
				global can_try
				can_try=can_try+1
				# print('{},{}'.format(a[-2:-1].index.values[0],a[-1:].index.values[0]))
				diff=test.test(code,a[-6:-5].index.values[0],a[-1:].index.values[0])
				diff_list.append(diff)
				if diff>0:
					global test_try
					test_try=test_try+1	
Ejemplo n.º 17
0
 def k_result(k):
     train_k = random.sample(train_set,k)
     scp_k = os.path.join(tempdir,'scp_k')
     with open(scp_k,'w') as f:
         f.writelines(train_k)
     final_dir = train(outdir, config, scp_k, proto, htk_dict, words_mlf, monophones, tempdir)
     return test(outdir, final_dir, wdnet, htk_dict, monophones, scp_test, words_mlf, tempdir)
Ejemplo n.º 18
0
def run():
    args = parse_args()
    params = parser.Yaml(file_name=args.params)

    env = rave.Environment()
    env.SetViewer("qtcoin")
    env.Load(params.scene)
    env.UpdatePublishedBodies()
    robot = env.GetRobots()[0]

    time.sleep(0.1)  # give time for environment to update
    navi = navigation.Navigation(robot, params, verbose=args.verbose)

    if args.test:
        test.test(navi)
    else:
        navi.run()
Ejemplo n.º 19
0
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'wsgi/vhost1/template')
    if environ['PATH_INFO'] == '/vhost1':
        if environ['REQUEST_METHOD'] == 'POST':
            return publish.post(environ, template)
        return publish.edit(environ, template)
    elif environ['PATH_INFO'] == '/vhost/pub':
        return publish.pub(environ)
    else:
        return test.test(environ, template)
Ejemplo n.º 20
0
    def run(optim):
        progress = make_progressbar("Training with " + str(optim), 5)
        progress.start()

        model = net()
        model.training()
        for epoch in range(5):
            train(Xtrain, ytrain, model, optim, criterion, batch_size, "train")
            train(Xtrain, ytrain, model, optim, criterion, batch_size, "stats")
            progress.update(epoch + 1)

        progress.finish()

        model.evaluate()
        nll, _ = test(Xtrain, ytrain, model, batch_size)
        _, nerr = test(Xval, yval, model, batch_size)

        print("Trainset NLL: {:.2f}".format(nll))
        print("Testset errors: {}".format(nerr))
Ejemplo n.º 21
0
Archivo: urls.py Proyecto: bagel/pyblog
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'test/template')
    if environ['PATH_INFO'] == '/test/work':
        return test.work()
    elif environ['PATH_INFO'] == '/test/html':
        return test.html(environ, template)
    elif environ['PATH_INFO'] == '/test2':
        return test2.test2()
    else:
        return test.test()
Ejemplo n.º 22
0
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    if not os.path.exists(FLAGS.sample_dir):
        os.makedirs(FLAGS.sample_dir)

    random.seed(31241)
    np.random.seed(41982)
    tf.set_random_seed(1327634)

    color = True # Must change this and the dataset Flags to the correct path to use color
    if FLAGS.is_debug:
       reader = Bouncing_Balls_Data_Reader(FLAGS.dataset, FLAGS.batch_size, color=color, train_size=160*5, validation_size=8*5, test_size=8*5, num_partitions=5)
    else:
       reader = Bouncing_Balls_Data_Reader(FLAGS.dataset, FLAGS.batch_size, color=color)

    data_fn = lambda epoch, batch_index: reader.read_data(batch_index, reader.TRAIN)
    frame_shape = reader.read_data(0, reader.TRAIN).shape[2:]
    print("Frame shape: ", frame_shape)
    num_batches = reader.num_batches(reader.TRAIN)
    print("Num batches: %d" % num_batches)
    input_sequence_range = range(5, 16)
    print("Input sequence range min: %d, max: %d" % (min(input_sequence_range), max(input_sequence_range)))

    save_sample_fn = utils.gen_save_sample_fn(FLAGS.sample_dir, image_prefix="train")

    with tf.Session() as sess:
        pgn  = PGN(sess, FLAGS.dataset_name, FLAGS.epoch, num_batches, FLAGS.batch_size, input_sequence_range,
                 data_fn, frame_shape=frame_shape, save_sample_fn=save_sample_fn, checkpoint_dir=FLAGS.checkpoint_dir,
                 lambda_adv_loss= FLAGS.lambda_adv_loss)

        if FLAGS.is_train:
            pgn.train()
        else:
            print("Loading from: %s" %(FLAGS.checkpoint_dir,))
            if pgn.load(FLAGS.checkpoint_dir) :
               print(" [*] Successfully loaded")
            else:
               print(" [!] Load failed")

        if FLAGS.is_test:
           result = test.test(pgn, reader)
           result_str = pp.pformat(result)
           fid = open(os.path.join(FLAGS.sample_dir, 'test_out.txt'), mode='w')
           fid.write(unicode(result_str))
           fid.close()

        if FLAGS.is_visualize:
           for i in range(3):
               vid_seq = reader.read_data(i, data_set_type=reader.TEST, batch_size=1)[:, 0, :, :, :]
               utils.make_prediction_gif(pgn, os.path.join(FLAGS.sample_dir, 'vis_%d.gif' % i), video_sequence=vid_seq)
           utils.plot_convergence(pgn.get_MSE_history(), "MSE Convergence",
                        path=os.path.join(FLAGS.sample_dir, "vis_MSE_convergence.png"))
Ejemplo n.º 23
0
 def call_script(self):
   #Disable all the buttons
   self.button.config(state='disabled')
   self.slogan.config(state='disabled')
   #Perform the task
   message=test.test()
   self.message['text'] = message
   time.sleep(0.5)
   #Enable all the buttons
   self.button.config(state='active')
   self.slogan.config(state='active')
Ejemplo n.º 24
0
Archivo: lr.py Proyecto: hujiewang/otto
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)

    # normalization
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    valid_x = scaler.transform(valid_x)
    test_x = scaler.transform(test_x)

    random_state=random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = LogisticRegression(penalty='l2',
                             dual=False,
                             tol=0.0001,
                             C=1.0,
                             fit_intercept=True,
                             intercept_scaling=1,
                             class_weight=None,
                             random_state=None,
                             solver='lbfgs',
                             max_iter=1000,
                             multi_class='ovr',
                             verbose=True
                             )

    clf.fit(train_x, train_y)

    valid_predictions = clf.predict_proba(valid_x)
    test(valid_y,valid_predictions)

    ccv = CalibratedClassifierCV(base_estimator=clf,method="sigmoid",cv='prefit')
    ccv.fit(train_x,train_y)

    valid_predictions = ccv.predict_proba(valid_x)
    test(valid_y,valid_predictions)

    test_predictions= ccv.predict_proba(test_x)

    data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Ejemplo n.º 25
0
def test():
    import test
    import Bar
    pygame.init()
    bar = Bar.Bar(12340, 12345)
    bar2 = Bar.Bar(120, 135, color = (0, 0 ,255, 255))
    bar2.rect.top = 30
    
    clock = pygame.time.Clock()
    window = LayeredDirtySprite(width = 120, height = 100)
    window.add(bar)
    window.add(bar2)
    window.x = 100
    window.y = 200
    window.ycenter = -2.0
    group = pygame.sprite.LayeredDirty(window, layer = 0, _use_update = True)
    bgd = []
    def testlogic(event):
        if event == None:
            window.rotation += .01
            return
        if event.type == pygame.KEYDOWN:
            if event.key == 27:
                return True
            if event.key >= 256:
                return
            import random
            if event.mod & 4095 == 0:
                bar.actual -= min((random.randint(5, 500), bar.actual))
            else:
                bar.actual += min((random.randint(5, 500), bar._max - bar.actual))
    def testrender(screen):
        clock.tick()
        time = clock.get_time()
        group.update(time)
        if len(bgd) == 0:
            bgd.append(pygame.Surface((screen.get_width(), screen.get_height())))
            bgd[0].fill((0, 255, 0))
            group.clear(screen, bgd[0])
        group.draw(screen)
    test.test(testlogic, testrender)
Ejemplo n.º 26
0
def urls(environ):
    template = os.path.join(environ['DOCUMENT_ROOT'], 'wsgi/test/template')
    if environ['PATH_INFO'] == '/test/vhost':
        return test.vhost(environ, template)
    elif environ['PATH_INFO'] == '/test/post':
        return test.post(environ, template)
    elif environ['PATH_INFO'] == '/ws':
        return test.ws(environ, template)
    elif environ['PATH_INFO'] == '/foobar/':
        return test.foobar(environ, template)
    else:
        return test.test(environ, template)
def Prune(fileName, tree,Train, Validate, Predict, testFile, NeedPrune):  
    csv = csvRead(fileName)  
    columnCsv = []  
    modeList = []  
    traininfo = []  
    # print len(csv[1][0])  
    for num in range(len(csv[1][0])):  
        # columnCsv.append(columnCreate(csv[1], num))  
        columnCsv = columnCreate(csv[1], num)  
        categoryOrnumeric(columnCsv, csv, num, modeList, traininfo)  
    myTree = createTree(csv[1], csv[0],modeList)  
    if NeedPrune == True:
        if Validate:
            test.test(testFile,myTree, traininfo)
        if Predict:
            output.test(testFile,myTree,traininfo)
    # f = open("tree.txt", "w")
    # output.test('btest.csv',myTree,traininfo)
    # drawTree(myTree,0,f)
    # plotting.accurancyGet("bvalidate.csv", myTree, traininfo)  
    return myTree,csv[1],traininfo  
Ejemplo n.º 28
0
def run_for_values_comb(filename, result_file):
    print 'Data set: {}\n'.format(filename)
    for depth, trees, repl, sam_size, fold, dis in [(depth, trees, repl, sam_size, fold, dis)
                                                    for depth in depth_values
                                                    for trees in trees_values
                                                    for repl in replacement_values
                                                    for sam_size in sample_size_values
                                                    for fold in folds_values
                                                    for dis in discretization_values]:
        result = test(os.path.join(sample_data_dir, filename), depth, trees, repl, sam_size, fold, 0, dis)
        print '{} for: Depth: {}, tree no: {}, replacement: {}, sample size: {}, folds no: {}, discretization: {}' \
            .format(result, depth, trees, repl, sam_size, fold, dis)
        result_file.write('{},{},{},{},{},{},{},{}\n'
                   .format(filename, depth, trees, repl, sam_size, fold, dis, result))
Ejemplo n.º 29
0
def main(passed_args):
    args = init.getArgs(passed_args)

    # If auth arg is present, use it for 'get' calls.
    # Create the requests session to use for later 'get' calls.
    session = requests.session()
    # If there is an 'auth' arg, post the session the correct login info for the session
    if 'custom-auth' in args.keys():
        if args['custom-auth'] == 'dvwa':
            payload = {'password': '******', 'username': '******'}
            response = session.post('http://127.0.0.1/dvwa/login.php', data=payload)
            session.cookies = response.cookies

    #Call the appropriate function using the mode param.
    if args['mode'] == 'discover':
        discovered = discover.discover(args, session)
        outputResults(discovered)

    elif args['mode'] == 'test':
        test.test(args, session)

    else: # This else is somewhat redundant.  Mode input is verified in getArgs.
        print('Not a valid <mode> param.  Must be \'discover\' or \'test\'')
        sys.exit(0)
Ejemplo n.º 30
0
def full_test(name, query, train_set, test_set, method):
    '''Performs a full automated test.
    
    @param name: test and model name, arbitrary
    @param query: question to train on
    @param train_set: name of the training set of games
    @param test_set: name of testing set of games
    @param method: feature extraction method
    '''
    path = 'games/{}'.format(train_set)
    begin = time.time()
    dts.build_and_dump(name, query, method=method, path=path)
    trn.train_and_save(name, 'logistic_regression')
    print time.time()-begin
    return tst.test(name, test_set, query, method)
Ejemplo n.º 31
0
def train():
    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights
    imgsz_min, imgsz_max, imgsz_test = opt.img_size  # img sizes (min, max, test)

    # Image Sizes
    gs = 64  # (pixels) grid size
    assert math.fmod(
        imgsz_min,
        gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs)
    opt.multi_scale |= imgsz_min != imgsz_max  # multi if different (min, max)
    if opt.multi_scale:
        if imgsz_min == imgsz_max:
            imgsz_min //= 1.5
            imgsz_max //= 0.667
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = grid_min * gs, grid_max * gs
    img_size = imgsz_max  # initialize with max size

    # Configure run
    init_seeds()
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = 1 if opt.single_cls else int(
        data_dict['classes'])  # number of classes
    hyp['cls'] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # Remove previous results
    for f in glob.glob('*_batch*.png') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    # Optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        try:
            chkpt['model'] = {
                k: v
                for k, v in chkpt['model'].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(chkpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    lf = lambda x: (
        ((1 + math.cos(x * math.pi / epochs)) / 2
         )**1.0) * 0.95 + 0.05  # cosine https://arxiv.org/pdf/1812.01187.pdf
    scheduler = lr_scheduler.LambdaLR(optimizer,
                                      lr_lambda=lf,
                                      last_epoch=start_epoch - 1)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, [round(epochs * x) for x in [0.8, 0.9]], 0.1, start_epoch - 1)

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count(
    ) > 1 and torch.distributed.is_available():
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method=
            'tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.
        rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
        test_path,
        imgsz_test,
        batch_size,
        hyp=hyp,
        rect=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls),
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights

    # Model EMA
    ema = torch_utils.ModelEMA(model)

    # Start training
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb,
                 500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    print('Image sizes %g - %g train, %g test' %
          (imgsz_min, imgsz_max, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 -
                                                     maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                     'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Burn-in
            if ni <= n_burn * 2:
                model.gr = np.interp(
                    ni, [0, n_burn * 2],
                    [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                if ni == n_burn:  # burnin complete
                    print_model_biases(model)

                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, [0, n_burn],
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, [0, n_burn],
                                                  [0.9, hyp['momentum']])

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 1 == 0:  #  adjust img_size (67% - 150%) every 1 batch
                    img_size = random.randrange(grid_min, grid_max + 1) * gs
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize accumulated gradient
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() /
                             1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' %
                                               (epoch, epochs - 1), mem,
                                               *mloss, len(targets), img_size)
            pbar.set_description(s)

            # Plot images with bounding boxes
            if ni < 1:
                f = 'train_batch%g.png' % i  # filename
                plot_images(imgs=imgs, targets=targets, paths=paths, fname=f)
                if tb_writer:
                    tb_writer.add_image(f,
                                        cv2.imread(f)[:, :, ::-1],
                                        dataformats='HWC')
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            is_coco = any([
                x in data
                for x in ['coco.data', 'coco2014.data', 'coco2017.data']
            ]) and model.nc == 80
            results, maps = test.test(cfg,
                                      data,
                                      batch_size=batch_size,
                                      img_size=imgsz_test,
                                      model=ema.ema,
                                      save_json=final_epoch and is_coco,
                                      single_cls=opt.single_cls,
                                      dataloader=testloader)

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' %
                      (opt.bucket, opt.name))

        # Write Tensorboard results
        if tb_writer:
            tags = [
                'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'
            ]
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save training results
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch':
                    epoch,
                    'best_fitness':
                    best_fitness,
                    'training_results':
                    f.read(),
                    'model':
                    ema.ema.module.state_dict()
                    if hasattr(model, 'module') else ema.ema.state_dict(),
                    'optimizer':
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if (best_fitness == fi) and not final_epoch:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            # if epoch > 0 and epoch % 10 == 0:
            #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2, opt.bucket)) if opt.bucket and ispt else None  # upload

    if not opt.evolve:
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()

    return results
Ejemplo n.º 32
0
def train(
    cfg,
    data_cfg,
    weights_from="",
    weights_to="",
    save_every=10,
    img_size=(1088, 608),
    resume=False,
    epochs=100,
    batch_size=16,
    accumulated_batches=1,
    freeze_backbone=False,
    opt=None,
):
    # The function starts

    timme = strftime("%Y-%d-%m %H:%M:%S", gmtime())
    timme = timme[5:-3].replace('-', '_')
    timme = timme.replace(' ', '_')
    timme = timme.replace(':', '_')
    weights_to = osp.join(weights_to, 'run' + timme)
    mkdir_if_missing(weights_to)
    if resume:
        latest_resume = osp.join(weights_from, 'latest.pt')

    torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    f = open(data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    transforms = T.Compose([T.ToTensor()])
    # Get dataloader
    dataset = JointDataset(dataset_root,
                           trainset_paths,
                           img_size,
                           augment=True,
                           transforms=transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True,
                                             drop_last=True,
                                             collate_fn=collate_fn)
    # Initialize model
    model = Darknet(cfg, dataset.nID)
    # model = ResNet(cfg, dataset.nID)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=opt.lr,
                                    momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])

        del checkpoint  # current, saved

    else:
        # Initialize model with backbone (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'darknet53.conv.74'))
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'yolov3-tiny.conv.15'))
            cutoff = 15

        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=opt.lr,
                                    momentum=.9,
                                    weight_decay=1e-4)

    model = torch.nn.DataParallel(model)
    # Set scheduler
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(0.5 * opt.epochs),
                    int(0.75 * opt.epochs)],
        gamma=0.1)

    # An important trick for detection: freeze bn during fine-tuning
    if not opt.unfreeze_bn:
        for i, (name, p) in enumerate(model.named_parameters()):
            p.requires_grad = False if 'batch_norm' in name else True

    # model_info(model)
    t0 = time.time()
    for epoch in range(epochs):
        epoch += start_epoch
        logger.info(
            ('%8s%12s' + '%10s' * 6) % ('Epoch', 'Batch', 'box', 'conf', 'id',
                                        'total', 'nTargets', 'time'))

        # Freeze darknet53.conv.74 for first epoch
        if freeze_backbone and (epoch < 2):
            for i, (name, p) in enumerate(model.named_parameters()):
                if int(name.split('.')[2]) < cutoff:  # if layer < 75
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss
        optimizer.zero_grad()
        for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader):
            if sum([len(x) for x in targets]) < 1:  # if no targets continue
                continue

            # SGD burn-in
            burnin = min(1000, len(dataloader))
            if (epoch == 0) & (i <= burnin):
                lr = opt.lr * (i / burnin)**4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss, components = model(imgs.cuda(), targets.cuda(),
                                     targets_len.cuda())
            components = torch.mean(components.view(-1, 5), dim=0)
            loss = torch.mean(loss)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches
                    == 0) or (i == len(dataloader) - 1):
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            ui += 1

            for ii, key in enumerate(model.module.loss_names):
                rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1)

            # rloss indicates running loss values with mean updated at every epoch
            s = ('%8s%12s' + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1), '%g/%g' %
                (i, len(dataloader) - 1), rloss['box'], rloss['conf'],
                rloss['id'], rloss['loss'], rloss['nT'], time.time() - t0)
            t0 = time.time()
            if i % opt.print_interval == 0:
                logger.info(s)

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'model': model.module.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        copyfile(cfg, weights_to + '/cfg/yolo3.cfg')
        copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json')

        latest = osp.join(weights_to, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            # making the checkpoint lite
            checkpoint["optimizer"] = []
            torch.save(
                checkpoint,
                osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt"))

        # Calculate mAP
        if epoch % opt.test_interval == 0:
            with torch.no_grad():
                mAP, R, P = test.test(cfg,
                                      data_cfg,
                                      weights=latest,
                                      batch_size=batch_size,
                                      img_size=img_size,
                                      print_interval=40,
                                      nID=dataset.nID)
                test.test_emb(cfg,
                              data_cfg,
                              weights=latest,
                              batch_size=batch_size,
                              img_size=img_size,
                              print_interval=40,
                              nID=dataset.nID)

        # Call scheduler.step() after opimizer.step() with pytorch > 1.1.0
        scheduler.step()
Ejemplo n.º 33
0
 def testare(self):
     self.assertTrue(test(5, 4))
     self.assertFalse(test(3, 4))
Ejemplo n.º 34
0
def train():
    cfg = opt.cfg
    data = opt.data
    img_size, img_size_test = opt.img_size if len(
        opt.img_size) == 2 else opt.img_size * 2  # train, test sizes
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    # Initialize
    init_seeds()
    if opt.multi_scale:
        img_sz_min = round(img_size / 32 / 1.5)
        img_sz_max = round(img_size / 32 * 1.5)
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = 1 if opt.single_cls else int(
        data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.png') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, arc=opt.arc).to(device)

    # Optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    del pg0, pg1, pg2

    # https://github.com/alphadl/lookahead.pytorch
    # optimizer = torch_utils.Lookahead(optimizer, k=5, alpha=0.5)

    start_epoch = 0
    best_fitness = 0.0
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        try:
            chkpt['model'] = {
                k: v
                for k, v in chkpt['model'].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(chkpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8)  # gradual fall to 0.1*lr0
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[round(opt.epochs * x) for x in [0.8, 0.9]],
        gamma=0.1)
    scheduler.last_epoch = start_epoch - 1

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count() > 1:
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method=
            'tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_labels=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.
        rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
        test_path,
        img_size_test,
        batch_size * 2,
        hyp=hyp,
        rect=True,
        cache_labels=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls),
                                             batch_size=batch_size * 2,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Start training
    nb = len(dataloader)
    prebias = start_epoch == 0
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    torch_utils.model_info(model, report='summary')  # 'full' or 'summary'
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    for epoch in range(start_epoch,
                       epochs):  # epoch ------------------------------
        model.train()

        # Prebias
        if prebias:
            if epoch < 3:  # prebias
                ps = 0.1, 0.9  # prebias settings (lr=0.1, momentum=0.9)
            else:  # normal training
                ps = hyp['lr0'], hyp['momentum']  # normal training settings
                print_model_biases(model)
                prebias = False

            # Bias optimizer settings
            optimizer.param_groups[2]['lr'] = ps[0]
            if optimizer.param_groups[2].get(
                    'momentum') is not None:  # for SGD but not Adam
                optimizer.param_groups[2]['momentum'] = ps[1]

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 -
                                                     maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                     'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 10 == 0:  #  adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min,
                                                img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [
                        math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]
                    ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.png' % i
                plot_images(imgs=imgs,
                            targets=targets,
                            paths=paths,
                            fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname,
                                        cv2.imread(fname)[:, :, ::-1],
                                        dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model, not prebias)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available(
            ) else 0  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1),
                                               '%.3gG' % mem, *mloss,
                                               len(targets), img_size)
            pbar.set_description(s)

            # end batch ------------------------------------------------------------------------------------------------

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            is_coco = any([
                x in data
                for x in ['coco.data', 'coco2014.data', 'coco2017.data']
            ]) and model.nc == 80
            results, maps = test.test(
                cfg,
                data,
                batch_size=batch_size * 2,
                img_size=img_size_test,
                model=model,
                conf_thres=0.001
                if final_epoch and is_coco else 0.1,  # 0.1 for speed
                iou_thres=0.6,
                save_json=final_epoch and is_coco,
                single_cls=opt.single_cls,
                dataloader=testloader)

        # Update scheduler
        scheduler.step()

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' %
                      (opt.bucket, opt.name))

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = [
                'GIoU', 'Objectness', 'Classification', 'Train loss',
                'Precision', 'Recall', 'mAP', 'F1', 'val GIoU',
                'val Objectness', 'val Classification'
            ]
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save training results
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch':
                    epoch,
                    'best_fitness':
                    best_fitness,
                    'training_results':
                    f.read(),
                    'model':
                    model.module.state_dict()
                    if type(model) is nn.parallel.DistributedDataParallel else
                    model.state_dict(),
                    'optimizer':
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if best_fitness == fi:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            # if epoch > 0 and epoch % 10 == 0:
            #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n
        os.rename('results.txt', fresults)
        os.rename(wdir + 'last.pt', wdir +
                  flast) if os.path.exists(wdir + 'last.pt') else None
        os.rename(wdir + 'best.pt', wdir +
                  fbest) if os.path.exists(wdir + 'best.pt') else None
        if opt.bucket:  # save to cloud
            os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket))
            os.system('gsutil cp %s gs://%s/weights' %
                      (wdir + flast, opt.bucket))
            # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket))

    if not opt.evolve:
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()

    return results
Ejemplo n.º 35
0
def do_experiment(model_config):

    tf.reset_default_graph()
    experiment_id = ex.current_run._id
    print('Experiment ID: {eid}'.format(eid=experiment_id))

    # Prepare data
    print('Preparing dataset')
    train_data, val_data, test_data = dataset.prepare_datasets(model_config)
    print('Dataset ready')

    # Start session
    tf_config = tf.ConfigProto()
    #tf_config.gpu_options.allow_growth = True
    tf_config.gpu_options.visible_device_list = str(model_config['GPU'])
    sess = tf.Session(config=tf_config)
    #sess = tf.Session()
    #sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type="readline")

    print('Session started')

    # Create data iterators
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(handle,
                                                   train_data.output_types,
                                                   train_data.output_shapes)
    mixed_spec, voice_spec, background_spec, mixed_audio, voice_audio, background_audio = iterator.get_next(
    )

    training_iterator = train_data.make_initializable_iterator()
    validation_iterator = val_data.make_initializable_iterator()
    testing_iterator = test_data.make_initializable_iterator()

    training_handle = sess.run(training_iterator.string_handle())
    validation_handle = sess.run(validation_iterator.string_handle())
    testing_handle = sess.run(testing_iterator.string_handle())
    print('Iterators created')

    # Create variable placeholders and model
    is_training = tf.placeholder(shape=(), dtype=bool)
    mixed_phase = tf.expand_dims(mixed_spec[:, :, :-1, 3], 3)

    print('Creating model')
    # Restructure data from pipeline based on data type required
    if model_config['data_type'] == 'mag':
        mixed_input = tf.expand_dims(mixed_spec[:, :, :-1, 2], 3)
        voice_input = tf.expand_dims(voice_spec[:, :, :-1, 2], 3)
    elif model_config['data_type'] in ['mag_phase', 'mag_phase_diff']:
        mixed_input = mixed_spec[:, :, :-1, 2:4]
        voice_input = voice_spec[:, :, :-1, 2:4]
    elif model_config['data_type'] == 'real_imag':
        mixed_input = mixed_spec[:, :, :-1, 0:2]
        voice_input = voice_spec[:, :, :-1, 0:2]
    elif model_config['data_type'] in ['mag_real_imag', 'mag_phase2']:
        mixed_input = tf.concat([
            tf.expand_dims(mixed_spec[:, :, :-1, 2], 3), mixed_spec[:, :, :-1,
                                                                    0:2]
        ], 3)
        voice_input = tf.concat([
            tf.expand_dims(voice_spec[:, :, :-1, 2], 3), voice_spec[:, :, :-1,
                                                                    0:2]
        ], 3)
    elif model_config['data_type'] in [
            'mag_phase_real_imag', 'complex_to_mag_phase'
    ]:
        mixed_input = mixed_spec[:, :, :-1, :]
        voice_input = voice_spec[:, :, :-1, :]

    model = audio_models.MagnitudeModel(mixed_input,
                                        voice_input,
                                        mixed_phase,
                                        mixed_audio,
                                        voice_audio,
                                        background_audio,
                                        model_config['model_variant'],
                                        is_training,
                                        model_config['learning_rate'],
                                        model_config['data_type'],
                                        model_config['phase_weight'],
                                        name='Magnitude_Model')

    sess.run(tf.global_variables_initializer())

    if model_config['loading']:
        print('Loading checkpoint')
        checkpoint = os.path.join(model_config['model_base_dir'],
                                  model_config['checkpoint_to_load'])
        restorer = tf.train.Saver()
        restorer.restore(sess, checkpoint)

    # Summaries
    model_folder = str(experiment_id)
    writer = tf.summary.FileWriter(os.path.join(model_config["log_dir"],
                                                model_folder),
                                   graph=sess.graph)

    # Get baseline metrics at initialisation
    test_count = 0
    if model_config['initialisation_test']:
        print('Running initialisation test')
        initial_test_loss, test_count = test(sess, model, model_config, handle,
                                             testing_iterator, testing_handle,
                                             test_count, experiment_id)

    # Train the model
    model = train(sess, model, model_config, model_folder, handle,
                  training_iterator, training_handle, validation_iterator,
                  validation_handle, writer)

    # Test the trained model
    mean_test_loss, test_count = test(sess, model, model_config, handle,
                                      testing_iterator, testing_handle,
                                      test_count, experiment_id)
    print('{ts}:\n\tAll done with experiment {exid}!'.format(
        ts=datetime.datetime.now(), exid=experiment_id))
    if model_config['initialisation_test']:
        print('\tInitial test loss: {init}'.format(init=initial_test_loss))
    print('\tFinal test loss: {final}'.format(final=mean_test_loss))
    model = Darknet(opt.model_def).to(device)

    if opt.model:
        if opt.model.endswith(".pt"):
            model.load_state_dict(torch.load(opt.model, map_location=device)['model'])
        else:
            _ = load_darknet_weights(model, opt.model)



    data_config = parse_data_cfg(opt.data_config)

    valid_path = data_config["valid"]
    class_names = load_classes(data_config["names"])

    eval_model = lambda model:test(model=model,cfg=opt.model_def, data=opt.data_config)
    obtain_num_parameters = lambda model:sum([param.nelement() for param in model.parameters()])

    #这个不应该注释掉,等会要恢复
    with torch.no_grad():
        origin_model_metric = eval_model(model)
    origin_nparameters = obtain_num_parameters(model)

    '''
    module_defs是一个列表,列表的每一项都是一个字典.贮存的只是并不生效的网络结构信息
    例如{'type': 'convolutional', 'batch_normalize': '1', 'filters': '32', 'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}

    module_list是一个列表,列表的每一项都是一个列表,例如:
    Sequential(
      (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
Ejemplo n.º 37
0
        batches = datamanager.batch_iter(train, FLAGS.batch_size,
                                         FLAGS.num_epochs)
        num_batches_per_epoch = int(len(train) / FLAGS.batch_size) + 1
        print("Batch data")
        # Training loop. For each batch...
        num_batch = 1
        num_epoch = 1
        dev_x_batch = datamanager.generate_x(dev)
        dev_p1_batch, dev_p2_batch = datamanager.generate_p(dev)
        dev_y_batch = datamanager.generate_y(dev)
        for batch in batches:
            if num_batch == num_batches_per_epoch:
                num_epoch += 1
                num_batch = 1
                test(testing_data, cnn.input_x, cnn.input_p1, cnn.input_p2,
                     cnn.scores, cnn.predictions, cnn.dropout_keep_prob,
                     datamanager, sess, num_epoch)
                path = saver.save(sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
            num_batch += 1
            x_batch = datamanager.generate_x(batch)
            p1_batch, p2_batch = datamanager.generate_p(batch)
            y_batch = datamanager.generate_y(batch)
            loss = train_step(x_batch, y_batch, p1_batch, p2_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("Num_batch: {}".format(num_batch))
                print("Num_epoch: {}".format(num_epoch))
                dev_step(dev_x_batch, dev_y_batch, dev_p1_batch, dev_p2_batch)
Ejemplo n.º 38
0
def main_finetune_channels():

    # 波段选择依据
    # 最优的波段排序:
    #       [850, 870, 930, 730, 790, 910, 770, 750, 670, 950, 990, 830, 890, 810, 970, 690, 710, 650, 590, 570, 630, 610, 550]
    # 依次增加一个波段, 前一个模型进行微调

    CHANNEL_SORT = [
        850, 870, 930, 730, 790, 910, 770, 750, 670, 950, 990, 830, 890, 810,
        970, 690, 710, 650, 590, 570, 630, 610, 550
    ]

    for splitidx in range(4, 5):
        usedChannelsList = [CHANNEL_SORT[:i + 1] for i in range(23)]

        # for i_usedChannels in range(len(usedChannelsList)):
        for i_usedChannels in [4, 6]:

            usedChannels = usedChannelsList[i_usedChannels]

            print(getTime(), splitidx, len(usedChannels), '...')

            configer = EasyDict()

            configer.dsize = (64, 64)
            configer.datatype = 'Multi'
            configer.n_epoch = 300
            configer.lrbase = 0.001

            configer.n_channel = 23
            configer.n_class = 63
            configer.batchsize = 32
            configer.stepsize = 250
            configer.gamma = 0.2
            configer.cuda = True

            configer.splitmode = 'split_{}x{}_{}'.format(
                configer.dsize[0], configer.dsize[1], splitidx)
            configer.modelbase = 'recognize_vgg11_bn'

            configer.usedChannels = usedChannels
            configer.n_usedChannels = len(configer.usedChannels)
            configer.modelname = '{}_{}_{}_finetune'.\
                            format(configer.modelbase, configer.splitmode,
                                    '_'.join(list(map(str, configer.usedChannels))))


            configer.datapath = '/home/louishsu/Work/Workspace/ECUST2019_{}x{}'.\
                                            format(configer.dsize[0], configer.dsize[1])
            configer.logspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/logs/{}_{}_{}subjects_logs'.\
                                            format(configer.modelbase, configer.splitmode, configer.n_class)
            configer.mdlspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/modelfiles/{}_{}_{}subjects_models'.\
                                            format(configer.modelbase, configer.splitmode, configer.n_class)

            ## datasets
            trainset = RecognizeDataset(configer.datapath, configer.datatype,
                                        configer.splitmode, 'train',
                                        configer.usedChannels)
            validset = RecognizeDataset(configer.datapath, configer.datatype,
                                        configer.splitmode, 'valid',
                                        configer.usedChannels)
            trainloader = DataLoader(trainset,
                                     configer.batchsize,
                                     shuffle=True)
            validloader = DataLoader(validset,
                                     configer.batchsize,
                                     shuffle=False)

            ## ============================================================================================
            ## model
            modelpath = os.path.join(configer.mdlspath,
                                     configer.modelname) + '.pkl'
            modeldir = '/'.join(modelpath.split('/')[:-1])
            if not os.path.exists(modeldir): os.makedirs(modeldir)

            if i_usedChannels == 0:
                model = modeldict[configer.modelbase](configer.n_usedChannels,
                                                      configer.n_class,
                                                      configer.dsize[0])
                params = model.parameters()
                torch.save(model, modelpath)
            else:
                modelpath_pretrain = os.path.join(
                    modeldir, '{}_{}_{}_finetune.pkl'.format(
                        configer.modelbase, configer.splitmode, '_'.join(
                            list(map(str,
                                     usedChannelsList[i_usedChannels - 1])))))
                model = torch.load(modelpath_pretrain)
                model.features[0] = nn.Conv2d(len(usedChannels),
                                              64,
                                              3,
                                              stride=1,
                                              padding=1)
                params = [{
                    'params': model.features[1:].parameters(),
                    'lr': configer.lrbase * 0.01,
                }, {
                    'params': model.features[0].parameters(),
                }]
                torch.save(model, modelpath)
            if configer.cuda and is_available(): model.cuda()
            ## ============================================================================================

            ## optimizer
            optimizer = optim.Adam(params, configer.lrbase, weight_decay=1e-3)

            ## loss
            loss = nn.CrossEntropyLoss()

            ## learning rate scheduler
            scheduler = lr_scheduler.StepLR(optimizer, configer.stepsize,
                                            configer.gamma)

            ## log
            logpath = os.path.join(configer.logspath, configer.modelname)
            if not os.path.exists(logpath): os.makedirs(logpath)
            logger = SummaryWriter(logpath)

            ## initialize
            acc_train = 0.
            acc_valid = 0.
            loss_train = float('inf')
            loss_valid = float('inf')
            loss_valid_last = float('inf')

            ## start training
            for i_epoch in range(configer.n_epoch):

                if configer.cuda and is_available(): empty_cache()
                scheduler.step(i_epoch)
                acc_train = []
                acc_valid = []
                loss_train = []
                loss_valid = []

                model.train()
                for i_batch, (X, y) in enumerate(trainloader):

                    # get batch
                    X = Variable(X.float())
                    y = Variable(y)
                    if configer.cuda and is_available():
                        X = X.cuda()
                        y = y.cuda()

                    # forward
                    y_pred_prob = model(X)
                    loss_i = loss(y_pred_prob, y)
                    acc_i = accuracy(y_pred_prob, y)

                    # backward
                    optimizer.zero_grad()
                    loss_i.backward()
                    optimizer.step()

                    loss_train += [loss_i.detach().cpu().numpy()]
                    acc_train += [acc_i.cpu().numpy()]

                model.eval()
                for i_batch, (X, y) in enumerate(validloader):

                    # get batch
                    X = Variable(X.float())
                    y = Variable(y)
                    if configer.cuda and is_available():
                        X = X.cuda()
                        y = y.cuda()

                    # forward
                    y_pred_prob = model(X)
                    loss_i = loss(y_pred_prob, y)
                    acc_i = accuracy(y_pred_prob, y)

                    loss_valid += [loss_i.detach().cpu().numpy()]
                    acc_valid += [acc_i.cpu().numpy()]

                loss_train = np.mean(np.array(loss_train))
                acc_train = np.mean(np.array(acc_train))
                loss_valid = np.mean(np.array(loss_valid))
                acc_valid = np.mean(np.array(acc_valid))

                logger.add_scalars('accuracy', {
                    'train': acc_train,
                    'valid': acc_valid
                }, i_epoch)
                logger.add_scalars('logloss', {
                    'train': loss_train,
                    'valid': loss_valid
                }, i_epoch)
                logger.add_scalar('lr', scheduler.get_lr()[-1], i_epoch)

                if loss_valid_last > loss_valid:

                    loss_valid_last = loss_valid
                    torch.save(model, modelpath)

            test(configer)
Ejemplo n.º 39
0
def train(
    cfg,
    data_cfg,
    img_size=416,
    resume=False,  # 是否加载已训练得到的权重
    epochs=273,  # 跑几遍数据集
    batch_size=16,
    accumulate=1,  # 积累accumulate个梯度然后再进行反向传播
    multi_scale=False,  # 是否进行多尺度训练
    freeze_backbone=False,  # 冻结基础网络 
    transfer=False  # 是否进行迁移学习Transfer learning (train only YOLO layers)
):
    init_seeds()  # 初始化种子,包括numpy和torch的种子
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 608  # initiate with maximum multi_scale size
        opt.num_workers = 0  # bug https://github.com/ultralytics/yolov3/issues/174
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    data_dict = parse_data_cfg(data_cfg)  # 处理voc.data or coco.data
    train_path = data_dict['train']
    nc = int(data_dict['classes'])  # nc = number of classes

    # Initialize model 整个darknet模型
    model = Darknet(cfg, img_size).to(device)

    # pytorch 最常用的优化器,SGD
    optimizer = optim.SGD(model.parameters(),
                          lr=hyp['lr0'],
                          momentum=hyp['momentum'],
                          weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] - 1]
             ['filters'])  # yolo layer size (i.e. 255) 18 = (class+5) * 3
    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            model.load_state_dict(
                {
                    k: v
                    for k, v in chkpt['model'].items()
                    if v.numel() > 1 and v.shape[0] != 255
                },
                strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False

        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt

    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model,
                                          weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    lf = lambda x: 1 - 10**(hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                            lr_lambda=lf,
                                            last_epoch=start_epoch - 1)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[218, 245], gamma=0.1, last_epoch=start_epoch-1)

    # Plot lr schedule
    y = []
    for _ in range(epochs):
        scheduler.step()
        y.append(optimizer.param_groups[0]['lr'])
    plt.plot(y, label='LambdaLR')
    plt.xlabel('epoch')
    plt.xlabel('LR')
    plt.tight_layout()
    plt.savefig('LR.png', dpi=300)

    # Dataset img_size = 608
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  rect=False,
                                  image_weights=False)

    # Initialize distributed training 多GPU分布式训练
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)

    # Dataloader 数据加载比较复杂,需要查看API结合进行理解
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=opt.num_workers,
        shuffle=False,  # disable rectangular training if True
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # 开始训练
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels,
                                                  nc).to(device)
    # attach class weights 不太明白以上那一行的作用
    model_info(model, 'full')

    nb = len(dataloader)  # 数据集中图片的个数
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    # for f in glob.glob('train_batch*.jpg') + glob.glob('test_batch*.jpg'):
    #     os.remove(f)
    t, t0 = time.time(), time.time()
    for epoch in range(start_epoch, epochs):
        model.train()  # 将模型的模式改为train模式
        print(
            ('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf',
                                          'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        w = model.class_weights.cpu().numpy() * (1 - maps)  # class weights
        image_weights = labels_to_image_weights(dataset.labels,
                                                nc=nc,
                                                class_weights=w)  # 不清楚这部分作用,同上
        dataset.indices = random.choices(range(dataset.n),
                                         weights=image_weights,
                                         k=dataset.n)  # random weighted index
        # indices:目录

        mloss = torch.zeros(5).to(
            device)  # mean losses 进行初始化,分别记录:xy, wh, conf, cls, total
        for i, (imgs, targets, _, _) in enumerate(dataloader):
            # i and data(imgs, targets, _ , _ ) 在dataset类中,torch.from_numpy(img), labels_out, img_path, (h, w)
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)  # num of targets 目标的个数

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs,
                            targets=targets,
                            fname='train_batch0.jpg')

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin)**4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # 运行模型,输入为图片,输出为向量
            pred = model(imgs)

            # 计算预测结果和标注结果之间的loss, yolo论文中的计算loss那一部分就是在这里
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:  #混合精度
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Print batch results
            s = ('%8s%12s' +
                 '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1), '%g/%g' %
                                  (i, nb - 1), *mloss, nt, time.time() - t)
            t = time.time()
            print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % dataset.img_size)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or
                (opt.nosave and epoch < 10)) or epoch == epochs - 1:
            with torch.no_grad():
                results, maps = test.test(cfg,
                                          data_cfg,
                                          batch_size=batch_size,
                                          img_size=img_size,
                                          model=model,
                                          conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results +
                       '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = (not opt.nosave) or (epoch == epochs - 1)
        if save:
            # Create checkpoint
            chkpt = {
                'epoch':
                epoch,
                'best_loss':
                best_loss,
                'model':
                model.module.state_dict()
                if type(model) is nn.parallel.DistributedDataParallel else
                model.state_dict(),
                'optimizer':
                optimizer.state_dict()
            }

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    dt = (time.time() - t0) / 3600
    print('%g epochs completed in %.3f hours.' % (epoch - start_epoch, dt))
    return results
Ejemplo n.º 40
0
def train(hyp, opt, device, tb_writer=None, wandb=None):
    logger.info(
        colorstr("hyperparameters: ") + ", ".join(f"{k}={v}"
                                                  for k, v in hyp.items()))
    save_dir, epochs, batch_size, total_batch_size, weights, rank = (
        Path(opt.save_dir),
        opt.epochs,
        opt.batch_size,
        opt.total_batch_size,
        opt.weights,
        opt.global_rank,
    )

    # Directories
    wdir = save_dir / "weights"
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / "last.pt"
    best = wdir / "best.pt"
    results_file = save_dir / "results.txt"

    # Save run settings
    with open(save_dir / "hyp.yaml", "w") as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / "opt.yaml", "w") as f:
        # yaml.dump(vars(opt), f, sort_keys=False)  # opt 実行パラメータ
        yaml.dump(str(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = device.type != "cpu"
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict["train"]
    test_path = data_dict["val"]
    nc = 1 if opt.single_cls else int(data_dict["nc"])  # number of classes
    names = (["item"] if opt.single_cls and len(data_dict["names"]) != 1 else
             data_dict["names"])  # class names
    assert len(names) == nc, "%g names found for nc=%g dataset in %s" % (
        len(names),
        nc,
        opt.data,
    )  # check

    # Model
    pretrained = weights.endswith(".pt")
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get("anchors"):
            ckpt["model"].yaml["anchors"] = round(
                hyp["anchors"])  # force autoanchor
        model = Model(opt.cfg or ckpt["model"].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ["anchor"] if opt.cfg or hyp.get("anchors") else [
        ]  # exclude keys
        state_dict = ckpt["model"].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            "Transferred %g/%g items from %s" %
            (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = []  # parameter names to freeze (full or partial)
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print("freezing %s" % k)
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp["weight_decay"] *= total_batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp["lr0"],
                               betas=(hyp["momentum"],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp["lr0"],
                              momentum=hyp["momentum"],
                              nesterov=True)

    optimizer.add_param_group({
        "params": pg1,
        "weight_decay": hyp["weight_decay"]
    })  # add pg1 with weight_decay
    optimizer.add_param_group({"params": pg2})  # add pg2 (biases)
    logger.info("Optimizer groups: %g .bias, %g conv.weight, %g other" %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    if opt.linear_lr:
        lf = (lambda x: (1 - x / (epochs - 1)) *
              (1.0 - hyp["lrf"]) + hyp["lrf"])  # linear
    else:
        lf = one_cycle(1, hyp["lrf"], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(
            config=opt,
            resume="allow",
            project="YOLOv5"
            if opt.project == "runs/train" else Path(opt.project).stem,
            name=save_dir.stem,
            id=ckpt.get("wandb_id") if "ckpt" in locals() else None,
        )
    loggers = {"wandb": wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt["optimizer"] is not None:
            optimizer.load_state_dict(ckpt["optimizer"])
            best_fitness = ckpt["best_fitness"]

        # Results
        if ckpt.get("training_results") is not None:
            with open(results_file, "w") as file:
                file.write(ckpt["training_results"])  # write results.txt

        # Epochs
        start_epoch = ckpt["epoch"] + 1
        if opt.resume:
            assert (
                start_epoch > 0
            ), "%s training to %g epochs is finished, nothing to resume." % (
                weights,
                epochs,
            )
        if epochs < start_epoch:
            logger.info(
                "%s has been trained for %g epochs. Fine-tuning for %g additional epochs."
                % (weights, ckpt["epoch"], epochs))
            epochs += ckpt["epoch"]  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[
        -1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info("Using SyncBatchNorm()")

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(
        train_path,
        imgsz,
        batch_size,
        gs,
        opt,
        hyp=hyp,
        augment=True,
        cache=opt.cache_images,
        rect=opt.rect,
        rank=rank,
        world_size=opt.world_size,
        workers=opt.workers,
        image_weights=opt.image_weights,
        quad=opt.quad,
        prefix=colorstr("train: "),
    )
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert (
        mlc < nc
    ), "Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g" % (
        mlc,
        nc,
        opt.data,
        nc - 1,
    )

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(
            test_path,
            imgsz_test,
            batch_size * 2,
            gs,
            opt,  # testloader
            hyp=hyp,
            cache=opt.cache_images and not opt.notest,
            rect=True,
            rank=-1,
            world_size=opt.world_size,
            workers=opt.workers,
            pad=0.5,
            prefix=colorstr("val: "),
        )[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, save_dir, loggers)
                if tb_writer:
                    tb_writer.add_histogram("classes", c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp["anchor_t"],
                              imgsz=imgsz)

    # Model parameters
    hyp["box"] *= 3.0 / nl  # scale to layers
    hyp["cls"] *= nc / 80.0 * 3.0 / nl  # scale to classes and layers
    hyp["obj"] *= (imgsz / 640)**2 * 3.0 / nl  # scale to image size and layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = (
        labels_to_class_weights(dataset.labels, nc).to(device) * nc
    )  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp["warmup_epochs"] * nb),
             1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0
               )  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    compute_loss = ComputeLoss(model)  # init loss class
    logger.info(f"Image sizes {imgsz} train, {imgsz_test} test\n"
                f"Using {dataloader.num_workers} dataloader workers\n"
                f"Logging results to {save_dir}\n"
                f"Starting training for {epochs} epochs...")
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = (model.class_weights.cpu().numpy() * (1 - maps)**2 / nc
                      )  # class weights
                iw = labels_to_image_weights(dataset.labels,
                                             nc=nc,
                                             class_weights=cw)  # image weights
                dataset.indices = random.choices(
                    range(dataset.n), weights=iw,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices)
                           if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ("\n" + "%10s" * 8) % ("Epoch", "gpu_mem", "box", "obj", "cls",
                                   "total", "targets", "img_size"))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs,
                targets,
                paths,
                _,
        ) in (
                pbar
        ):  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = (imgs.to(device, non_blocking=True).float() / 255.0
                    )  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x["lr"] = np.interp(
                        ni,
                        xi,
                        [
                            hyp["warmup_bias_lr"] if j == 2 else 0.0,
                            x["initial_lr"] * lf(epoch),
                        ],
                    )
                    if "momentum" in x:
                        x["momentum"] = np.interp(
                            ni, xi, [hyp["warmup_momentum"], hyp["momentum"]])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode="bilinear",
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device))  # loss scaled by batch_size
                if rank != -1:
                    loss *= (opt.world_size
                             )  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.0

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = "%.3gG" % (torch.cuda.memory_reserved() / 1e9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ("%10s" * 2 + "%10.4g" * 6) % (
                    "%g/%g" % (epoch, epochs - 1),
                    mem,
                    *mloss,
                    targets.shape[0],
                    imgs.shape[-1],
                )
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f"train_batch{ni}.jpg"  # filename
                    Thread(target=plot_images,
                           args=(imgs, targets, paths, f),
                           daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 10 and wandb:
                    wandb.log(
                        {
                            "Mosaics": [
                                wandb.Image(str(x), caption=x.name)
                                for x in save_dir.glob("train*.jpg")
                                if x.exists()
                            ]
                        },
                        commit=False,
                    )

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x["lr"] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(
                    model,
                    include=[
                        "yaml",
                        "nc",
                        "hyp",
                        "gr",
                        "names",
                        "stride",
                        "class_weights",
                    ],
                )
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(
                    opt.data,
                    batch_size=batch_size * 2,
                    imgsz=imgsz_test,
                    model=ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=save_dir,
                    verbose=nc < 50 and final_epoch,
                    plots=plots and final_epoch,
                    log_imgs=opt.log_imgs if wandb else 0,
                    compute_loss=compute_loss,
                )

            # Write
            with open(results_file, "a") as f:
                f.write(
                    s + "%10.4g" * 7 % results +
                    "\n")  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system("gsutil cp %s gs://%s/results/results%s.txt" %
                          (results_file, opt.bucket, opt.name))

            # Log
            tags = [
                "train/box_loss",
                "train/obj_loss",
                "train/cls_loss",  # train loss
                "metrics/precision",
                "metrics/recall",
                "metrics/mAP_0.5",
                "metrics/mAP_0.5:0.95",
                "val/box_loss",
                "val/obj_loss",
                "val/cls_loss",  # val loss
                "x/lr0",
                "x/lr1",
                "x/lr2",
            ]  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x}, step=epoch,
                              commit=tag == tags[-1])  # W&B

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # weighted combination of [P, R, [email protected], [email protected]]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, "r") as f:  # create checkpoint
                    ckpt = {
                        "epoch":
                        epoch,
                        "best_fitness":
                        best_fitness,
                        "training_results":
                        f.read(),
                        "model":
                        ema.ema,
                        "optimizer":
                        None if final_epoch else optimizer.state_dict(),
                        "wandb_id":
                        wandb_run.id if wandb else None,
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f"gsutil cp {final} gs://{opt.bucket}/weights")  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = [
                    "results.png",
                    "confusion_matrix.png",
                    *[f"{x}_curve.png" for x in ("F1", "PR", "P", "R")],
                ]
                wandb.log({
                    "Results": [
                        wandb.Image(str(save_dir / f), caption=f)
                        for f in files if (save_dir / f).exists()
                    ]
                })
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final),
                                       type="model",
                                       name=save_dir.stem)

        # Test best.pt
        logger.info("%g epochs completed in %.3f hours.\n" %
                    (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith("coco.yaml") and nc == 80:  # if COCO
            for conf, iou, save_json in (
                [0.25, 0.45, False],
                [0.001, 0.65, True],
            ):  # speed, mAP tests
                results, _, _ = test.test(
                    opt.data,
                    batch_size=batch_size * 2,
                    imgsz=imgsz_test,
                    conf_thres=conf,
                    iou_thres=iou,
                    model=attempt_load(final, device).half(),
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=save_dir,
                    save_json=save_json,
                    plots=False,
                )

    else:
        dist.destroy_process_group()

    wandb.run.finish() if wandb and wandb.run else None
    torch.cuda.empty_cache()

    # mlflow
    with mlflow.start_run() as run:
        # Log args into mlflow
        for key, value in hyp.items():
            mlflow.log_param(key, value)

        for key, value in vars(opt).items():
            mlflow.log_param(key, value)

        # Log results into mlflow
        for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
            # xがtorch.Tensorだったらfloatに直す
            if torch.is_tensor(x):
                x = x.item()

            # tag名に特殊記号があれば削除する
            if ":" in tag:
                tag = re.sub(r":", " ", tag)

            mlflow.log_metric(tag, x)

        # Log model
        mlflow.pytorch.log_model(model, "model")

    return results
Ejemplo n.º 41
0
def main_best_channels():

    # 波段选择依据
    # 以最佳的划分方式:
    # 依次选择每个波段进行实验

    for splitidx in range(46, 51):
        for datatype in ['Multi', 'RGB']:

            if datatype == 'Multi':
                # usedChannelsList = [[i] for i in range(23)]
                usedChannelsList = [[550 + 20 * i for i in range(23)]]
            else:
                # usedChannelsList = ['R', 'G', 'B']
                usedChannelsList = [
                    'RGB',
                ]

            for usedChannels in usedChannelsList:

                print(getTime(), splitidx, datatype, usedChannels, '...')

                configer = EasyDict()

                configer.dsize = (64, 64)
                configer.datatype = datatype
                configer.n_epoch = 300 if datatype == 'Multi' else 350
                configer.lrbase = 0.001 if datatype == 'Multi' else 0.0005

                configer.n_channel = 23
                configer.n_class = 63
                configer.batchsize = 32
                configer.stepsize = 250
                configer.gamma = 0.2
                configer.cuda = True

                configer.splitmode = 'split_{}x{}_{}'.format(
                    configer.dsize[0], configer.dsize[1], splitidx)
                configer.modelbase = 'recognize_vgg11_bn'

                if configer.datatype == 'Multi':
                    configer.usedChannels = usedChannels
                    configer.n_usedChannels = len(configer.usedChannels)
                    configer.modelname = '{}_{}_{}'.\
                                    format(configer.modelbase, configer.splitmode,
                                            '_'.join(list(map(str, configer.usedChannels))))
                elif configer.datatype == 'RGB':
                    configer.usedChannels = usedChannels
                    configer.n_usedChannels = len(configer.usedChannels)
                    configer.modelname = '{}_{}_{}'.\
                                    format(configer.modelbase, configer.splitmode, configer.usedChannels)


                configer.datapath = '/datasets/ECUST2019_{}x{}'.\
                                                format(configer.dsize[0], configer.dsize[1])
                configer.logspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/logs/{}_{}_{}subjects_logs'.\
                                                format(configer.modelbase, configer.splitmode, configer.n_class)
                configer.mdlspath = '/home/louishsu/Work/Workspace/HUAWEI/pytorch/modelfiles/{}_{}_{}subjects_models'.\
                                                format(configer.modelbase, configer.splitmode, configer.n_class)

                train(configer)
                test(configer)
Ejemplo n.º 42
0
def train():
    cfg = opt.cfg
    data = opt.data
    img_size = opt.img_size
    epochs = 1 if opt.prebias else opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    if 'pw' not in opt.arc:  # remove BCELoss positive weights
        hyp['cls_pw'] = 1.
        hyp['obj_pw'] = 1.

    # Initialize
    init_seeds()
    wdir = 'weights' + os.sep  # weights dir
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    multi_scale = opt.multi_scale

    if multi_scale:
        img_sz_min = round(img_size / 32 / 1.5) + 1
        img_sz_max = round(img_size / 32 * 1.5) - 1
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    nc = int(data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, arc=opt.arc).to(device)

    # Optimizer
    pg0, pg1 = [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if 'Conv2d.weight' in k:
            pg1 += [v]  # parameter group 1 (apply weight_decay)
        else:
            pg0 += [v]  # parameter group 0

    # optimizer = optim.Adam(pg0, lr=hyp['lr0'])
    # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    del pg0, pg1

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_fitness = 0.
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are 'last.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        if opt.bucket:
            os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last))  # download from bucket
        chkpt = torch.load(weights, map_location=device)

        # load model
        if opt.transfer:
            chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()}
            model.load_state_dict(chkpt['model'], strict=False)
        else:
            model.load_state_dict(chkpt['model'])

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open('results.txt', 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are 'yolov3.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
        cutoff = load_darknet_weights(model, weights)

    if opt.transfer or opt.prebias:  # transfer learning edge (yolo) layers
        nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255)

        for p in optimizer.param_groups:
            # lower param count allows more aggressive training settings: i.e. SGD ~0.1 lr0, ~0.9 momentum
            p['lr'] *= 100
            if p.get('momentum') is not None:  # for SGD but not Adam
                p['momentum'] *= 0.9

        for p in model.parameters():
            if opt.prebias and p.numel() == nf:  # train (yolo biases)
                p.requires_grad = True
            elif opt.transfer and p.shape[0] == nf:  # train (yolo biases+weights)
                p.requires_grad = True
            else:  # freeze layer
                p.requires_grad = False

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1)
    scheduler.last_epoch = start_epoch - 1

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=opt.rect,  # rectangular training
                                  image_weights=opt.img_weights,
                                  cache_images=False if opt.prebias else opt.cache_images)

    # Dataloader
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=min(os.cpu_count(), batch_size),
                                             shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Start training
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model_info(model, report='summary')  # 'full' or 'summary'
    nb = len(dataloader)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    print('Starting %s for %g epochs...' % ('prebias' if opt.prebias else 'training', epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        freeze_backbone = False
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device)
            targets = targets.to(device)

            # Multi-Scale training
            if multi_scale:
                if ni / accumulate % 10 == 0:  #  adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.jpg' % i
                plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size)
            pbar.set_description(s)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if opt.prebias:
            print_model_biases(model)
        else:
            # Calculate mAP (always test final epoch, skip first 10 if opt.nosave)
            if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch:
                with torch.no_grad():
                    results, maps = test.test(cfg,
                                              data,
                                              batch_size=batch_size,
                                              img_size=opt.img_size,
                                              model=model,
                                              conf_thres=0.001 if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                                              save_json=final_epoch and epoch > 0 and 'coco.data' in data)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%10.3g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                      'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification']
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP
        fitness = results[2]  # mAP
        if fitness > best_fitness:
            best_fitness = fitness

        # Save training results
        save = (not opt.nosave) or ((not opt.evolve) and final_epoch)
        if save:
            with open('results.txt', 'r') as file:
                # Create checkpoint
                chkpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
                         'training_results': file.read(),
                         'model': model.module.state_dict() if type(
                             model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                         'optimizer': None if final_epoch else optimizer.state_dict()}

            # Save last checkpoint
            torch.save(chkpt, last)
            if opt.bucket:
                os.system('gsutil cp %s gs://%s' % (last, opt.bucket))  # upload to bucket

            # Save best checkpoint
            if best_fitness == fitness:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    if len(opt.name):
        os.rename('results.txt', 'results_%s.txt' % opt.name)
    plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
Ejemplo n.º 43
0
    img_size = opt.img_size
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Darknet(opt.cfg, (img_size, img_size)).to(device)

    if opt.weights.endswith('.pt'):
        model.load_state_dict(torch.load(opt.weights)['model'])
    else:
        load_darknet_weights(model, opt.weights)
    print('\nloaded weights from ', opt.weights)

    eval_model = lambda model: test(opt.cfg,
                                    opt.data,
                                    weights=opt.weights,
                                    batch_size=16,
                                    img_size=img_size,
                                    iou_thres=0.5,
                                    conf_thres=0.001,
                                    nms_thres=0.5,
                                    save_json=False,
                                    model=model)
    obtain_num_parameters = lambda model: sum(
        [param.nelement() for param in model.parameters()])

    print("\nlet's test the original model first:")
    with torch.no_grad():
        origin_model_metric = eval_model(model)

    origin_nparameters = obtain_num_parameters(model)

    CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.module_defs)
    0.7, 0.8, 0.9, 1
]
MODELS = {}

net = Net(HN1, HN2)
init_state = copy.deepcopy(net.state_dict())
for lr in LR:
    net.load_state_dict(init_state)
    training_inputs = training_data[:, 0:5]
    training_add = training_data[:, 8:]
    training_inputs = np.append(training_inputs, training_add, axis=1)
    training_labels = training_data[:, 5:8]

    test_inputs = testing_data[:, 0:5]
    test_add = testing_data[:, 8:]
    test_inputs = np.append(test_inputs, test_add, axis=1)
    test_labels = testing_data[:, 5:8]

    train(net, training_inputs, training_labels, EPOCHS, lr, BATCH_SIZE)
    avg_mse = test(test_inputs, test_labels, net)

    MODELS['{a}_{x}-{y}_{z}_{b}'.format(a=HL, x=HN1, y=HN2, z=EPOCHS,
                                        b=lr)] = avg_mse

with open('Data/Search/manual_search_results_{x}HL_lr.csv'.format(x=HL),
          'w') as f:
    for key in MODELS.keys():
        f.write("%s: %s\n" % (key, MODELS[key]))

print(MODELS)
Ejemplo n.º 45
0
    test_dataset = LoadImagesAndLabels(valid_path, img_size, batch_size,
                                       hyp=hyp,
                                       rect=True,
                                       cache_images=False)
    testloader = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=batch_size,
                                             num_workers=min([os.cpu_count(), batch_size, 8]),
                                             shuffle=False,
                                             pin_memory=True,
                                             collate_fn=test_dataset.collate_fn)
    with torch.no_grad():
        origin_model_metric = test.test(opt.cfg,
                                        opt.data,
                                        batch_size=batch_size,
                                        imgsz=img_size,
                                        model=model,
                                        dataloader=testloader,
                                        rank=-1,
                                        plot=False)
    origin_nparameters = obtain_num_parameters(model)
    origin_macs = performance_summary(model)

    CBL_idx, Conv_idx, prune_idx, _, _ = parse_module_defs2(model.module_defs)

    print("-------------------------------------------------------")

    max_mAP = 0
    for i in range(number):
        compact_module_defs, current_parameters, compact_model = rand_prune_and_eval(model, 0, 1)
        with torch.no_grad():
            # 防止随机生成的较差的模型撑爆显存,增大nmsconf阈值
Ejemplo n.º 46
0
# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size - args.history_length + 1:
    if done:
        state, done = env.reset(), False
        val_mem.preappend()  # Set up memory for beginning of episode

    val_mem.append(state, None, None)
    state, _, done = env.step(random.randint(0, action_space - 1))
    T += 1
    # No need to postappend on done in validation memory

if args.evaluate:
    dqn.eval()  # Set DQN (policy network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    while T < args.T_max:
        if done:
            state, done = Variable(env.reset()), False
            dqn.reset_noise(
            )  # Draw a new set of noisy weights for each episode (better for before learning starts)
            mem.preappend()  # Set up memory for beginning of episode

        action = dqn.act(
            state)  # Choose an action greedily (with noisy weights)
Ejemplo n.º 47
0
def train(hyp):
    epochs = opt.epochs  # 300
    batch_size = opt.batch_size  # 64
    weights = opt.weights  # initial training weights

    # Configure
    init_seeds(1)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Create model
    model = Model(opt.cfg).to(device)
    assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (
        opt.data, nc, opt.cfg, model.md['nc'])

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \
        optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    google_utils.attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model
        try:
            ckpt['model'] = {
                k: v
                for k, v in ckpt['model'].float().state_dict().items()
                if model.state_dict()[k].shape == v.shape
            }  # to FP32, filter
            model.load_state_dict(ckpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s." \
                % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        start_epoch = ckpt['epoch'] + 1
        del ckpt

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch - 1  # do not move
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count(
    ) > 1 and torch.distributed.is_available():
        dist.init_process_group(
            backend='nccl',  # distributed backend
            init_method='tcp://127.0.0.1:9999',  # init method
            world_size=1,  # number of nodes
            rank=0)  # node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        imgsz,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Correct your labels or your model.' % (
        mlc, nc, opt.cfg)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.
        rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
        test_path,
        imgsz_test,
        batch_size,
        hyp=hyp,
        rect=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls),
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = data_dict['names']

    # Class frequency
    labels = np.concatenate(dataset.labels, 0)
    c = torch.tensor(labels[:, 0])  # classes
    # cf = torch.bincount(c.long(), minlength=nc) + 1.
    # model._initialize_biases(cf.to(device))
    if tb_writer:
        plot_labels(labels)
        tb_writer.add_histogram('classes', c, 0)

    # Check anchors
    if not opt.noautoanchor:
        check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Exponential moving average
    ema = torch_utils.ModelEMA(model)

    # Start training
    t0 = time.time()
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb,
                 1e3)  # burn-in iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 -
                                                     maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4, device=device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                     'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Burn-in
            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device), model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() /
                             1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1),
                                               mem, *mloss, targets.shape[0],
                                               imgs.shape[-1])
            pbar.set_description(s)

            # Plot
            if ni < 3:
                f = 'train_batch%g.jpg' % i  # filename
                res = plot_images(images=imgs,
                                  targets=targets,
                                  paths=paths,
                                  fname=f)
                if tb_writer:
                    tb_writer.add_image(f,
                                        res,
                                        dataformats='HWC',
                                        global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # mAP
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            results, maps, times = test.test(
                opt.data,
                batch_size=batch_size,
                imgsz=imgsz_test,
                save_json=final_epoch
                and opt.data.endswith(os.sep + 'coco.yaml'),
                model=ema.ema,
                single_cls=opt.single_cls,
                dataloader=testloader)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.4g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' %
                      (opt.bucket, opt.name))

        # Tensorboard
        if tb_writer:
            tags = [
                'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'
            ]
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:  # create checkpoint
                ckpt = {
                    'epoch': epoch,
                    'best_fitness': best_fitness,
                    'training_results': f.read(),
                    'model':
                    ema.ema.module if hasattr(model, 'module') else ema.ema,
                    'optimizer':
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2, opt.bucket)) if opt.bucket and ispt else None  # upload

    if not opt.evolve:
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    dist.destroy_process_group(
    ) if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
Ejemplo n.º 48
0
def train(
        cfg,
        data_cfg,
        img_size=416,
        resume=False,
        epochs=100,  # 500200 batches at bs 4, 117263 images = 68 epochs
        batch_size=16,
        accumulate=4,  # effective bs = 64 = batch_size * accumulate
        freeze_backbone=False,
        transfer=False  # Transfer learning (train only YOLO layers)
):
    init_seeds()
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()
    torch.backends.cudnn.benchmark = True  # possibly unsuitable for multiscale
    img_size_test = img_size  # image size for testing

    if opt.multi_scale:
        img_size_min = round(img_size / 32 / 1.5)
        img_size_max = round(img_size / 32 * 1.5)
        img_size = img_size_max * 32  # initiate with maximum multi_scale size

    # Configure run
    data_dict = parse_data_cfg(data_cfg)
    train_path = data_dict['train']
    nc = int(data_dict['classes'])  # number of classes

    # Initialize model
    model = Darknet(cfg).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255)
    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255},
                                  strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False

        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt

    else:  # Initialize model with backbone (optional)
        if '-tiny' in cfg:
            cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in (0.8, 0.9)], gamma=0.1)
    scheduler.last_epoch = start_epoch - 1

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.xlabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  rect=False)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        # sampler = torch.project.data.distributed.DistributedSampler(dataset)

    # Dataloader
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=opt.num_workers,
                            shuffle=True,  # disable rectangular training if True
                            pin_memory=True,
                            collate_fn=dataset.collate_fn)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Remove old results
    for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'):
        os.remove(f)

    # Start training
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model_info(model)
    nb = len(dataloader)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    t, t0 = time.time(), time.time()
    for epoch in range(start_epoch, epochs):
        print(epoch)
        model.train()
        print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'targets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # # Update image weights (optional)
        # w = model.class_weights.cpu().numpy() * (1 - maps)  # class weights
        # image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
        # dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # random weighted index

        mloss = torch.zeros(5).to(device)  # mean losses
        for i, (imgs, targets, _, _) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if (i + 1 + nb * epoch) % 10 == 0:  #  adjust (67% - 150%) every 10 batches
                    img_size = random.choice(range(img_size_min, img_size_max + 1)) * 32
                    print('img_size = %g' % img_size)
                scale_factor = img_size / max(imgs.shape[-2:])
                imgs = F.interpolate(imgs, scale_factor=scale_factor, mode='bilinear', align_corners=False)

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs, targets=targets, fname='train_batch%g.jpg' % i)

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, nb - 1), *mloss, len(targets), time.time() - t)
            t = time.time()
            print(s)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or (opt.nosave and epoch < 10)) or epoch == epochs - 1:
            with torch.no_grad():
                results, maps = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size_test, model=model,
                                          conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results + '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = (not opt.nosave) or (epoch == epochs - 1)
        if save:
            # Create checkpoint
            chkpt = {'epoch': epoch,
                     'best_loss': best_loss,
                     'model': model.module.state_dict() if type(
                         model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                     'optimizer': optimizer.state_dict()}

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    dt = (time.time() - t0) / 3600
    print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, dt))
    return results
Ejemplo n.º 49
0
        if not os.path.exists(backup): os.makedirs(backup)
        model_name = os.path.splitext(os.path.split(
            opt.cfg)[-1])[0] + '_best.pth'
        self.save_path = os.path.join(backup, model_name + '_best.pth')
        self.best_map = 0

    def save(self, mAP):
        if mAP > self.best_map: self.best_map = mAP
        if os.path.exists(self.save_path): os.remove(self.save_path)
        torch.save(model.state_dict(), self.save_path)


if __name__ == "__main__":
    args = create_args()
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    model, model_cfg = create_model(args)
    optimizer, lr = create_optimizer(model, model_cfg)
    class_names, train_loader, valid_loader, test_loader = create_dataset(
        args, model_cfg)
    save = save_model(args)
    #import pdb;pdb.set_trace()
    mAP = 0
    for epoch in range(1, args.epochs + 1):
        train(args, model, train_loader, optimizer, epoch)
        if epoch % 3 == 2 or epoch == args.epochs:
            mAP = test(args, model, model_cfg, test_loader, class_names)
        optimizer, lr = adjust_learning_rate(optimizer, lr, epoch)
        save.save(mAP)
Ejemplo n.º 50
0
def train(
        cfg,
        data_cfg,
        img_size=416,
        resume=False,
        epochs=273,  # 500200 batches at bs 64, dataset length 117263
        batch_size=16,
        accumulate=1,
        multi_scale=False,
        freeze_backbone=False,
        transfer=False  # Transfer learning (train only YOLO layers)
):
    init_seeds()
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 608  # initiate with maximum multi_scale size
        opt.num_workers = 0  # bug https://github.com/ultralytics/yolov3/issues/174
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    train_path = parse_data_cfg(data_cfg)['train']

    # Initialize model
    model = Darknet(cfg, img_size).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255)
    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255},
                                  strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False

        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt

    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            #cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
            pass
        else:
            #cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')
            pass

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[218, 245], gamma=0.1, last_epoch=start_epoch-1)

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.xlabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Dataset
    dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)

    # Dataloader
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=opt.num_workers,
                            shuffle=True,
                            pin_memory=True,
                            collate_fn=dataset.collate_fn)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Start training
    t = time.time()
    model.hyp = hyp  # attach hyperparameters to model
    model_info(model)
    nb = len(dataloader)
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    os.remove('train_batch0.jpg') if os.path.exists('train_batch0.jpg') else None
    os.remove('test_batch0.jpg') if os.path.exists('test_batch0.jpg') else None
    for epoch in range(start_epoch, epochs):
        model.train()
        print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        mloss = torch.zeros(5).to(device)  # mean losses
        for i, (imgs, targets, _, _) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Print batch results
            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, nb - 1), *mloss, nt, time.time() - t)
            t = time.time()
            print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % dataset.img_size)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or (opt.nosave and epoch < 5)) or epoch == epochs - 1:
            with torch.no_grad():
                results = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size, model=model,
                                    conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results + '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = True and not opt.nosave
        if save:
            # Create checkpoint
            chkpt = {'epoch': epoch,
                     'best_loss': best_loss,
                     'model': model.module.state_dict() if type(
                         model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                     'optimizer': optimizer.state_dict()}

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    return results
Ejemplo n.º 51
0
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()
    
    shared_stepcount = SharedCounter()
    
    if not args.debug:
        processes = []

        p = mp.Process(target=test, args=(args.num_processes, args, 
            shared_model, Model, make_env, shared_stepcount))
        p.start()
        processes.append(p)
        for rank in range(0, args.num_processes):
            p = mp.Process(target=train, args=(rank, args, shared_model, 
                Model, make_env, shared_stepcount, optimizer))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else: ## debug is enabled
        # run only one process in a main, easier to debug
        args.num_test_episodes = 1
        args.descr = 'debug'
        args.max_step_count = 1000 # test both train and debug
        train(0, args, shared_model, Model, make_env, shared_stepcount, optimizer)
        args.max_step_count += 1000 # needed to perform test
        test(args.num_processes, args, shared_model, Model, make_env, shared_stepcount)

Ejemplo n.º 52
0
    if opt.weights:
        if opt.weights.endswith(".pt"):
            model.load_state_dict(
                torch.load(opt.weights, map_location=device)['model'])
        else:
            _ = load_darknet_weights(model, opt.weights)

    data_config = parse_data_cfg(opt.data)

    valid_path = data_config["valid"]
    class_names = load_classes(data_config["names"])

    eval_model = lambda model: test(model=model,
                                    cfg=opt.cfg,
                                    data=opt.data,
                                    batch_size=opt.batch_size,
                                    imgsz=opt.img_size,
                                    rank=-1)

    obtain_num_parameters = lambda model: sum(
        [param.nelement() for param in model.parameters()])

    # 这个不应该注释掉,等会要恢复
    with torch.no_grad():
        origin_model_metric = eval_model(model)
    origin_nparameters = obtain_num_parameters(model)

    CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.module_defs)

    # 将所有要剪枝的BN层的α参数,拷贝到bn_weights列表
    bn_weights = gather_bn_weights(model.module_list, prune_idx)
Ejemplo n.º 53
0
# 
# **Submission Checklist**
# 
# 1. Does my code pass all tests?
# 2. Does my code implement `A*` search and not some other search algorithm?
# 3. Do I use an **admissible heuristic** to direct search efforts towards the goal?
# 4. Do I use data structures which avoid unnecessarily slow lookups?
# 
# When you can answer "yes" to all of these questions, and also have answered the written questions below, submit by pressing the Submit button in the lower right!

# In[29]:


from test import test

test(PathPlanner)


# ## Questions
# 
# **Instructions**  
# 
# Answer the following questions in your own words. We do not you expect you to know all of this knowledge on the top of your head. We expect you to do research and ask question. However do not merely copy and paste the answer from a google or stackoverflow. Read the information and understand it first. Then use your own words to explain the answer.

# ---
# How would you explain A-Star to a family member(layman)?
# 
# **ANSWER**: A-Star is basically a route finding search algorithm which is used to find the shortest path between two given points or locations. It uses path cost and estimated distance between the start and goal state to provide the optimal shortest path. As and example, Google maps internally uses A-Star search algorithm (which is based on cost and heuristic function) to provide the shortest path between two destinations as choosen on the map.
# 

# ---
Ejemplo n.º 54
0
def main(args):
    # Ensure training, testing, and manip are not all turned off
    assert (
        args.train or args.test or args.manip
    ), 'Cannot have train, test, and manip all set to 0, Nothing to do.'

    # Load the training, validation, and testing data
    try:
        train_list, val_list, test_list = load_data(args.data_root_dir,
                                                    args.split_num)
    except:
        # Create the training and test splits if not found
        split_data(args.data_root_dir, num_splits=4)
        train_list, val_list, test_list = load_data(args.data_root_dir,
                                                    args.split_num)

    # Get image properties from first image. Assume they are all the same.
    # print("train_list_0",train_list)
    img_shape = sitk.GetArrayFromImage(
        sitk.ReadImage(join(args.data_root_dir, 'imgs',
                            train_list[0][0]))).shape
    net_input_shape = (img_shape[1], img_shape[2], args.slices)

    # print(img_shape)

    # Create the model for training/testing/manipulation
    model_list = create_model(args=args, input_shape=net_input_shape)
    print_summary(model=model_list[0], positions=[.38, .65, .75, 1.])

    args.output_name = 'split-' + str(args.split_num) + '_batch-' + str(args.batch_size) + \
                       '_shuff-' + str(args.shuffle_data) + '_aug-' + str(args.aug_data) + \
                       '_loss-' + str(args.loss) + '_slic-' + str(args.slices) + \
                       '_sub-' + str(args.subsamp) + '_strid-' + str(args.stride) + \
                       '_lr-' + str(args.initial_lr) + '_recon-' + str(args.recon_wei)
    args.time = time

    args.check_dir = join(args.data_root_dir, 'saved_models', args.net)
    try:
        makedirs(args.check_dir)
    except:
        pass

    args.log_dir = join(args.data_root_dir, 'logs', args.net)
    try:
        makedirs(args.log_dir)
    except:
        pass

    args.tf_log_dir = join(args.log_dir, 'tf_logs')
    try:
        makedirs(args.tf_log_dir)
    except:
        pass

    args.output_dir = join(args.data_root_dir, 'plots', args.net)
    try:
        makedirs(args.output_dir)
    except:
        pass

    if args.train:
        from train import train
        # Run training
        train(args, train_list, val_list, model_list[0], net_input_shape)

    if args.test:
        from test import test
        # Run testing
        test(args, test_list, model_list, net_input_shape)

    if args.manip:
        from manip import manip
        # Run manipulation of segcaps
        manip(args, test_list, model_list, net_input_shape)
Ejemplo n.º 55
0
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_env, test_env, observation = env_initialize(args, env_device)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(
        args, train_device)

    model = ActorCritic(args.num_stack,
                        train_env.action_space,
                        normalize=args.normalize,
                        name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    num_frames_per_iter = args.num_ales * args.num_steps
    total_steps = math.ceil(args.t_max /
                            (args.world_size * num_frames_per_iter))

    if args.verbose:
        print(num_frames_per_iter)
        print(args.world_size)
        print(total_steps)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    if args.verbose:
        print(shape)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    if args.verbose:
        print(shape)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales,
                          device=train_device,
                          dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(
                lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(
                rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(
                format_time(total_time),
                ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([
                    T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean,
                    lmedian, lmin, lmax, lstd
                ])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean',
                                          rmean,
                                          T,
                                          walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean',
                                          lmean,
                                          T,
                                          walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps):
                value, logit = model(states[step])

                # store values
                values[step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs_action = F.softmax(logit,
                                         dim=1).multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(
                    maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device,
                                               dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:].clone())
                states[step + 1] *= not_done.view(
                    -1, *[1] * (observation.dim() - 1))
                states[step + 1, :,
                       -1].copy_(observation.view(-1,
                                                  *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] *
                                             masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (
                        args.gamma * returns[step + 1] * masks[step])

        value, logit = model(states[:-1].view(-1, *states.size()[-3:]))

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1].view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages.clone().detach() * action_log_probs).mean()

        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()

        if args.cpu_train:
            loss.backward()
            master_params = model.parameters()
        else:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            master_params = amp.master_params(optimizer)

        torch.nn.utils.clip_grad_norm_(master_params, args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean',
                                          final_rewards.mean().item(),
                                          T,
                                          walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean',
                                          final_lengths.mean().item(),
                                          T,
                                          walltime=total_time)
                summary_writer.add_scalar('train/value_loss',
                                          value_loss,
                                          T,
                                          walltime=total_time)
                summary_writer.add_scalar('train/policy_loss',
                                          policy_loss,
                                          T,
                                          walltime=total_time)
                summary_writer.add_scalar('train/entropy',
                                          dist_entropy,
                                          T,
                                          walltime=total_time)

            progress_data = callback(args, model, T, iter_time,
                                     final_rewards, final_lengths,
                                     value_loss.item(), policy_loss.item(),
                                     dist_entropy.item(), train_csv_writer,
                                     train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
Ejemplo n.º 56
0
def train(hyp, opt, device, tb_writer=None, wandb=None):
    logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['model.%s.' % x for x in range(11)] # ['model.%s.' % x for x in range(11)]  # parameter names to freeze (full or partial)
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print('freezing %s' % k)
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(config=opt, resume="allow",
                               project='YOLOv3' if opt.project == 'runs/train' else Path(opt.project).stem,
                               name=save_dir.stem,
                               id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[-1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
                                            world_size=opt.world_size, workers=opt.workers,
                                            image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '))
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,  # testloader
                                       hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1,
                                       world_size=opt.world_size, workers=opt.workers,
                                       pad=0.5, prefix=colorstr('val: '))[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, save_dir, loggers)
                if tb_writer:
                    tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl  # scale to image size and layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n'
                f'Using {dataloader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 3 and wandb:
                    wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]})

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=save_dir,
                                                 plots=plots and final_epoch,
                                                 log_imgs=opt.log_imgs if wandb else 0)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Log
            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                    'x/lr0', 'x/lr1', 'x/lr2']  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x})  # W&B

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, [email protected], [email protected]]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict(),
                            'wandb_id': wandb_run.id if wandb else None}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png']
                wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files
                                       if (save_dir / f).exists()]})
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem)

        # Test best.pt
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
                results, _, _ = test.test(opt.data,
                                          batch_size=total_batch_size,
                                          imgsz=imgsz_test,
                                          conf_thres=conf,
                                          iou_thres=iou,
                                          model=attempt_load(final, device).half(),
                                          single_cls=opt.single_cls,
                                          dataloader=testloader,
                                          save_dir=save_dir,
                                          save_json=save_json,
                                          plots=False)

    else:
        dist.destroy_process_group()

    wandb.run.finish() if wandb and wandb.run else None
    torch.cuda.empty_cache()
    return results
Ejemplo n.º 57
0

def testCode():

    file = "../Test_files/test.exe"
    data = open(file, "rb").read()

    start_time = time.time()
    cr = PEHeaderReader(data=data)
    cr.load()
    total_imports, cant_librerias, promedio = cr.get_import_size_stats()
    real, virtual, w_e, w_real_sum, w_virtual_sum = cr.get_section_stats()
    elapsed = time.time() - start_time

    line1 = str(total_imports) + "|" + str(cant_librerias) + "|" + str(
        promedio)
    line2 = str(real) + "|" + str(virtual) + "|" + str(w_e) + "|" + str(
        w_real_sum) + "|" + str(w_virtual_sum)

    print(line1)
    print(line2)

    imp = cr.getImports()
    print(str(imp))
    print("Elapsed time: " + str(elapsed))


#****************TEST_EXECUTE******************
from test import test
test("-test_PEHeaderReader", testCode)
Ejemplo n.º 58
0
            best_prec1 = max(prec1, best_prec1)
            state = {
                'epoch': i,
                'arch': opt.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_prec1': best_prec1
            }
            save_checkpoint(state, is_best)
        # if not opt.no_train and not opt.no_val:
        #     scheduler.step(validation_loss)

    if opt.test:
        spatial_transform = Compose([
            Scale(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor(opt.norm_value), norm_method
        ])
        temporal_transform = Compose([TemporalCenterCrop(opt.sample_duration)])
        # target_transform = VideoID()
        target_transform = ClassLabel()

        test_data = get_test_set(opt, spatial_transform, temporal_transform,
                                 target_transform)
        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)
        test.test(test_loader, model, opt, test_data.class_names)
Ejemplo n.º 59
0
def train(hyp, opt, device, tb_writer=None):
    logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['', ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
            shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}')  # save previous weights
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect,
                                            rank=rank, world_size=opt.world_size, workers=opt.workers,
                                            patch_size=opt.patch_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,
                                       hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True,
                                       rank=-1, world_size=opt.world_size, workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [os.remove(x) for x in glob.glob(str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x)]
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0', 'x/lr1', 'x/lr2']  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
Ejemplo n.º 60
0
def train(hyp, tb_writer, opt, device):
    print(f'Hyperparameters {hyp}')
    log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution'  # run directory
    wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = log_dir + os.sep + 'results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank
    # TODO: Init DDP logging. Only the first process is allowed to log.
    # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs.

    # Save run settings
    with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(Path(log_dir) / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Remove previous results
    if rank in [-1, 0]:
        for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
            os.remove(f)

    # Create model
    model = Model(opt.cfg, nc=nc).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html
    # all-reduce operation is carried out during loss.backward().
    # Thus, there would be redundant all-reduce communications in a accumulation procedure,
    # which means, the result is still right but the training speed gets slower.
    # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation
    # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    if hyp['optimizer'] == 'adam':  # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    with torch_distributed_zero_first(rank):
        google_utils.attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model
        try:
            exclude = ['anchor']  # exclude keys
            ckpt['model'] = {
                k: v
                for k, v in ckpt['model'].float().state_dict().items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(ckpt['model'], strict=False)
            print('Transferred %g/%g items from %s' %
                  (len(ckpt['model']), len(model.state_dict()), weights))
        except KeyError as e:
            s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \
                "Please delete or update %s and try again, or use --weights '' to train from scratch." \
                % (weights, opt.cfg, weights, weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # DP mode
    if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and device.type != 'cpu' and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if device.type != 'cpu' and rank != -1:
        model = DDP(model, device_ids=[rank], output_device=rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path,
                                       imgsz_test,
                                       total_batch_size,
                                       gs,
                                       opt,
                                       hyp=hyp,
                                       augment=False,
                                       cache=opt.cache_images,
                                       rect=True,
                                       local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset,
                          model=model,
                          thr=hyp['anchor_t'],
                          imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        # When in DDP mode, the generated indices will be broadcasted to synchronize dataset.
        if dataset.image_weights:
            # Generate indices.
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                image_weights = labels_to_image_weights(dataset.labels,
                                                        nc=nc,
                                                        class_weights=w)
                dataset.indices = random.choices(
                    range(dataset.n), weights=image_weights,
                    k=dataset.n)  # rand weighted idx
            # Broadcast.
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.from_tensor(dataset.indices,
                                                   dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(
                ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj',
                                       'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device),
                                            model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) /
                            ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f,
                                            result,
                                            dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # Only the first process in DDP mode is allowed to log or save checkpoints.
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(
                    model,
                    include=['md', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    save_json=final_epoch
                    and opt.data.endswith(os.sep + 'coco.yaml'),
                    model=ema.ema.module
                    if hasattr(ema.ema, 'module') else ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=log_dir)

                # Write
                with open(results_file, 'a') as f:
                    f.write(
                        s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                if len(opt.name) and opt.bucket:
                    os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                              (results_file, opt.bucket, opt.name))

                # Tensorboard
                if tb_writer:
                    tags = [
                        'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall',
                        'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss'
                    ]
                    for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                        tb_writer.add_scalar(tag, x, epoch)

                # Update best mAP
                fi = fitness(np.array(results).reshape(
                    1,
                    -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
                if fi > best_fitness:
                    best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch':
                        epoch,
                        'best_fitness':
                        best_fitness,
                        'training_results':
                        f.read(),
                        'model':
                        ema.ema.module if hasattr(ema, 'module') else ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_'
             if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2, opt.bucket)) if opt.bucket and ispt else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' %
              (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results