def main():
    """Convert a Darknet YOLO(v2) ``.weights`` file into a TensorFlow checkpoint.

    Builds the inference graph described by the module-level ``config``, streams
    every conv/BatchNorm parameter out of the binary weight file named by
    ``args.file``, assigns them to the graph variables, and saves a checkpoint
    into the configured log directory.

    NOTE(review): depends on module-level globals ``config``, ``args``,
    ``utils``, ``inference`` and ``transpose`` — confirm against the rest of
    the file.
    """
    model = config.get('config', 'model')
    cachedir = utils.get_cachedir(config)
    # One class name per line in the cached ``names`` file.
    with open(os.path.join(cachedir, 'names'), 'r') as f:
        names = [line.strip() for line in f]
    # Input resolution: 13 grid cells times the network downsampling factor.
    width, height = np.array(utils.get_downsampling(config)) * 13
    # Tab-separated anchor-box table, one anchor per row.
    anchors = pd.read_csv(os.path.expanduser(os.path.expandvars(config.get(model, 'anchors'))), sep='\t').values
    func = getattr(inference, config.get(model, 'inference'))
    with tf.Session() as sess:
        image = tf.placeholder(tf.float32, [1, height, width, 3], name='image')
        func(image, len(names), len(anchors))
        tf.contrib.framework.get_or_create_global_step()
        tf.global_variables_initializer().run()
        # Match conv-layer variables; group(1) is the numeric layer suffix
        # (empty string for the single unnumbered conv layer).
        prog = re.compile(r'[_\w\d]+\/conv(\d*)\/(weights|biases|(BatchNorm\/(gamma|beta|moving_mean|moving_variance)))$')
        variables = [(prog.match(v.op.name).group(1), v) for v in tf.global_variables() if prog.match(v.op.name)]
        # Group the variables per layer index; the suffix-less layer maps to -1
        # so it sorts first.
        variables = sorted([[int(k) if k else -1, [v for _, v in g]] for k, g in itertools.groupby(variables, operator.itemgetter(0))], key=operator.itemgetter(0))
        assert variables[0][0] == -1
        # Re-label the unnumbered layer as the last one and move it to the end.
        variables[0][0] = len(variables) - 1
        variables.insert(len(variables), variables.pop(0))
        with tf.name_scope('assign'):
            with open(os.path.expanduser(os.path.expandvars(args.file)), 'rb') as f:
                # Darknet weight files begin with a 16-byte header of four
                # int32s: major, minor, revision, images-seen counter.
                major, minor, revision, seen = struct.unpack('4i', f.read(16))
                tf.logging.info('major=%d, minor=%d, revision=%d, seen=%d' % (major, minor, revision, seen))
                for i, layer in variables:
                    tf.logging.info('processing layer %d' % i)
                    total = 0
                    # Fixed per-layer serialization order in the weight file.
                    for suffix in ['biases', 'beta', 'gamma', 'moving_mean', 'moving_variance', 'weights']:
                        try:
                            v = next(filter(lambda v: v.op.name.endswith(suffix), layer))
                        except StopIteration:
                            # This layer has no such parameter (e.g. biases vs BatchNorm).
                            continue
                        shape = v.get_shape().as_list()
                        cnt = np.multiply.reduce(shape)
                        total += cnt
                        tf.logging.info('%s: %s=%d' % (v.op.name, str(shape), cnt))
                        # Read exactly cnt float32 values for this tensor.
                        p = struct.unpack('%df' % cnt, f.read(4 * cnt))
                        if suffix == 'weights':
                            ksize1, ksize2, channels_in, channels_out = shape
                            p = np.reshape(p, [channels_out, channels_in, ksize1, ksize2])  # Darknet format
                            p = np.transpose(p, [2, 3, 1, 0])  # TensorFlow format (ksize1, ksize2, channels_in, channels_out)
                        sess.run(v.assign(p))
                    tf.logging.info('%d parameters assigned' % total)
                # Unread bytes indicate a graph/weight-file mismatch (warned below).
                remaining = os.fstat(f.fileno()).st_size - f.tell()
        # ``layer`` deliberately leaks from the loop above: it is the final
        # (reordered) layer. Presumably ``transpose`` permutes its outputs per
        # anchor — TODO confirm against the transpose() definition.
        transpose(sess, layer, len(anchors))
        saver = tf.train.Saver()
        logdir = utils.get_logdir(config)
        if args.delete:
            tf.logging.warn('delete logging directory: ' + logdir)
            shutil.rmtree(logdir, ignore_errors=True)
        os.makedirs(logdir, exist_ok=True)
        model_path = os.path.join(logdir, 'model.ckpt')
        tf.logging.info('save model into ' + model_path)
        saver.save(sess, model_path)
        if args.summary:
            path = os.path.join(logdir, args.logname)
            summary_writer = tf.summary.FileWriter(path)
            summary_writer.add_graph(sess.graph)
            tf.logging.info('tensorboard --logdir ' + logdir)
    if remaining > 0:
        tf.logging.warn('%d bytes remaining' % remaining)
def get_instance():
    """Create the default environment, a DQN agent sized to it, and a logdir.

    Returns:
        tuple: ``(env, agent, logdir)``.
    """
    env = utils.get_env()
    # Print every attribute of the environment for quick inspection.
    for k, v in env.__dict__.items():
        print(f'{k}: {v}')
    agent = DQN(env.n_states, env.n_actions)
    logdir = utils.get_logdir()
    return env, agent, logdir
def get_instance(diffs):
    """Build env/agent/logdir from the default YAML config overridden by *diffs*.

    Args:
        diffs: nested mapping of overrides merged recursively onto the
            default configuration.

    Returns:
        tuple: ``(env, agent, logdir)``.
    """
    default_conf = utils.get_config('default_config.yaml')
    # Deep-copy so the pristine defaults survive the in-place merge.
    conf = deepcopy(default_conf)
    recursive_merge(conf, diffs)
    env = utils.get_env(**conf['env'])
    # Unlike pa_main, the agent here also receives the merged agent settings.
    agent = DQN(env.n_states, env.n_actions, **conf['agent'])
    logdir = utils.get_logdir(conf, default_conf)
    return env, agent, logdir
def train():
    """Five-fold training driver.

    For each fold: warm-start the model from a previous run's checkpoint,
    train for ``cfg.epoch`` epochs on that fold's train split, evaluate on
    its test split, log metrics through ``logx`` and keep the best model.

    NOTE(review): depends on module-level globals ``model``, ``criterion``,
    ``optimizer``, ``scheduler``, ``device``, ``cfg``, ``train_transform``,
    ``logx``, ``get_logdir``, ``TrainDataset`` and ``DataLoader`` — confirm
    against the rest of the file.
    """
    # Loop variable renamed from `time`, which shadowed the stdlib module name.
    for fold in range(5):
        logx.initialize(get_logdir("../runs"), tensorboard=True, coolname=False)
        # Warm start. The original path "..\\runs\exp10\last_checkpoint_ep0.pth"
        # mixed separators and contained invalid escape sequences (\e, \l) —
        # a SyntaxWarning on Python 3.12+ and broken off Windows. Use a
        # portable forward-slash path to the same file.
        checkpoint = torch.load("../runs/exp10/last_checkpoint_ep0.pth")
        model.load_state_dict(checkpoint['state_dict'])

        data_root = '../' + cfg.root_folder
        dataset_train = TrainDataset(
            data_root + '/five_fold/train_kfold_{}.csv'.format(fold),
            data_root + '/train/',
            train_transform)
        train_loader = DataLoader(dataset_train, batch_size=cfg.bs, shuffle=True)
        test_data = TrainDataset(
            data_root + '/five_fold/test_kfold_{}.csv'.format(fold),
            data_root + '/train/',
        )
        test_load = DataLoader(test_data, batch_size=cfg.bs, shuffle=False)

        for epoch in range(cfg.epoch):
            # Fix: explicitly (re-)enter train mode — validation below
            # switches to eval mode.
            model.train()
            loss_epoch = 0
            total = 0
            correct = 0
            for i, (x, y) in enumerate(train_loader, 1):
                x, y = x.to(device), y.to(device)
                y_hat = model(x)
                # Running accuracy.
                total += x.size(0)
                _, predict = torch.max(y_hat.data, dim=1)
                correct += (predict == y).sum().item()
                # Loss and backward pass.
                loss = criterion(y_hat, y)
                loss_epoch += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # Periodic progress report.
                if i % 30 == 0:
                    print('epoch:%d, enumerate:%d, loss_avg:%f, now_acc:%f' %
                          (epoch, i, loss_epoch / i, correct / total))
            # Epoch-level training metrics (`i` leaks: number of train batches).
            train_loss = loss_epoch / i
            train_acc = (correct / total) * 100
            logx.metric('train', {'loss': train_loss, 'acc': train_acc}, epoch)

            # Validation on the held-out fold.
            # Fix: the original validated in train mode, which skews
            # BatchNorm statistics and keeps Dropout active.
            model.eval()
            correct = 0
            total = 0
            val_loss = 0
            with torch.no_grad():
                for i, (img, label) in enumerate(test_load, 1):
                    img, label = img.to(device), label.to(device)
                    output = model(img)
                    loss = criterion(output, label)
                    val_loss += loss.cpu().item()
                    _, predicted = torch.max(output.data, dim=1)  # (max value, argmax)
                    total += img.size(0)
                    correct += (predicted == label).sum().item()
            val_acc = (100 * correct / total)
            val_loss /= i  # `i` leaks: number of validation batches
            logx.metric('val', {'loss': val_loss, 'acc': val_acc}, epoch)
            # End-of-epoch summary.
            print('epoch over; train_loss:%f, val_loss:%f, train_acc=%f, val_acc:%f' %
                  (train_loss, val_loss, train_acc, val_acc))
            logx.save_model({
                'state_dict': model.state_dict(),
                'epoch': epoch
            }, val_acc, higher_better=True, epoch=epoch, delete_old=True)
            scheduler.step()
def main():
    """Convert a Darknet YOLO(v2) ``.weights`` file into a TensorFlow checkpoint.

    Builds the inference graph described by the module-level ``config``, streams
    every conv/BatchNorm parameter out of the binary weight file named by
    ``args.file``, assigns them to the graph variables, and saves a checkpoint
    into the configured log directory.

    NOTE(review): depends on module-level globals ``config``, ``args``,
    ``utils``, ``inference`` and ``transpose`` — confirm against the rest of
    the file.
    """
    model = config.get('config', 'model')
    cachedir = utils.get_cachedir(config)
    # One class name per line in the cached ``names`` file.
    with open(os.path.join(cachedir, 'names'), 'r') as f:
        names = [line.strip() for line in f]
    # Input resolution: 13 grid cells times the network downsampling factor.
    width, height = np.array(utils.get_downsampling(config)) * 13
    # Tab-separated anchor-box table, one anchor per row.
    anchors = pd.read_csv(os.path.expanduser(
        os.path.expandvars(config.get(model, 'anchors'))), sep='\t').values
    func = getattr(inference, config.get(model, 'inference'))
    with tf.Session() as sess:
        image = tf.placeholder(tf.float32, [1, height, width, 3], name='image')
        func(image, len(names), len(anchors))
        tf.contrib.framework.get_or_create_global_step()
        tf.global_variables_initializer().run()
        # Match conv-layer variables; group(1) is the numeric layer suffix
        # (empty string for the single unnumbered conv layer).
        prog = re.compile(
            r'[_\w\d]+\/conv(\d*)\/(weights|biases|(BatchNorm\/(gamma|beta|moving_mean|moving_variance)))$'
        )
        variables = [(prog.match(v.op.name).group(1), v)
                     for v in tf.global_variables() if prog.match(v.op.name)]
        # Group the variables per layer index; the suffix-less layer maps to -1
        # so it sorts first.
        variables = sorted(
            [[int(k) if k else -1, [v for _, v in g]]
             for k, g in itertools.groupby(variables, operator.itemgetter(0))],
            key=operator.itemgetter(0))
        assert variables[0][0] == -1
        # Re-label the unnumbered layer as the last one and move it to the end.
        variables[0][0] = len(variables) - 1
        variables.insert(len(variables), variables.pop(0))
        with tf.name_scope('assign'):
            with open(os.path.expanduser(os.path.expandvars(args.file)),
                      'rb') as f:
                # Darknet weight files begin with a 16-byte header of four
                # int32s: major, minor, revision, images-seen counter.
                major, minor, revision, seen = struct.unpack('4i', f.read(16))
                tf.logging.info('major=%d, minor=%d, revision=%d, seen=%d' %
                                (major, minor, revision, seen))
                for i, layer in variables:
                    tf.logging.info('processing layer %d' % i)
                    total = 0
                    # Fixed per-layer serialization order in the weight file.
                    for suffix in [
                            'biases', 'beta', 'gamma', 'moving_mean',
                            'moving_variance', 'weights'
                    ]:
                        try:
                            v = next(
                                filter(lambda v: v.op.name.endswith(suffix),
                                       layer))
                        except StopIteration:
                            # Layer has no such parameter (biases vs BatchNorm).
                            continue
                        shape = v.get_shape().as_list()
                        cnt = np.multiply.reduce(shape)
                        total += cnt
                        tf.logging.info('%s: %s=%d' %
                                        (v.op.name, str(shape), cnt))
                        # Read exactly cnt float32 values for this tensor.
                        p = struct.unpack('%df' % cnt, f.read(4 * cnt))
                        if suffix == 'weights':
                            ksize1, ksize2, channels_in, channels_out = shape
                            p = np.reshape(
                                p, [channels_out, channels_in, ksize1, ksize2
                                    ])  # Darknet format
                            p = np.transpose(
                                p, [2, 3, 1, 0]
                            )  # TensorFlow format (ksize1, ksize2, channels_in, channels_out)
                        sess.run(v.assign(p))
                    tf.logging.info('%d parameters assigned' % total)
                # Unread bytes indicate a graph/weight-file mismatch (warned below).
                remaining = os.fstat(f.fileno()).st_size - f.tell()
        # ``layer`` deliberately leaks from the loop above: it is the final
        # (reordered) layer. Presumably ``transpose`` permutes its outputs per
        # anchor — TODO confirm against the transpose() definition.
        transpose(sess, layer, len(anchors))
        saver = tf.train.Saver()
        logdir = utils.get_logdir(config)
        if args.delete:
            tf.logging.warn('delete logging directory: ' + logdir)
            shutil.rmtree(logdir, ignore_errors=True)
        os.makedirs(logdir, exist_ok=True)
        model_path = os.path.join(logdir, 'model.ckpt')
        tf.logging.info('save model into ' + model_path)
        saver.save(sess, model_path)
        if args.summary:
            path = os.path.join(logdir, args.logname)
            summary_writer = tf.summary.FileWriter(path)
            summary_writer.add_graph(sess.graph)
            tf.logging.info('tensorboard --logdir ' + logdir)
    if remaining > 0:
        tf.logging.warn('%d bytes remaining' % remaining)
def run(args):
    """Train/validate driver for a MovingMNIST sequence model.

    Builds data loaders, model, loss/optimizer/scheduler from ``args``,
    optionally resumes from a checkpoint, then trains for ``args.epochs``
    epochs while logging scalars and image grids to TensorBoard and
    checkpointing the best (lowest validation loss) model.

    NOTE(review): relies on helpers defined elsewhere in the file/module
    (``get_logdir``, ``get_logger``, ``train``, ``validate``,
    ``save_checkpoint``, ``_get_images``, ``models``) — confirm signatures
    against their definitions.
    """
    start_epoch = 1
    best_loss = 1e+9  # sentinel: any real validation loss will be smaller
    # logs
    args.logdir = get_logdir(args)
    logger = get_logger(os.path.join(args.logdir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.logdir)
    # data
    train_set = MovingMNIST(root='./data', train=True, download=True)
    valid_set = MovingMNIST(root='./data',
                            train=False,
                            download=True,
                            split=args.test_size)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)
    # network: model class looked up by name in the models module
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(args.device)
    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            # Resume *after* the saved epoch.
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best/{}'.format(args.loss)]
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))
    for epoch_i in range(start_epoch, args.epochs + 1):
        training = train(train_loader,
                         model,
                         criterion,
                         optimizer,
                         logger=logger,
                         args=args)
        validation = validate(valid_loader,
                              model,
                              criterion,
                              logger=logger,
                              args=args)
        # Scalar loss curves.
        writer.add_scalar('Train/{}'.format(args.loss), training[args.loss],
                          epoch_i)
        writer.add_scalar('Valid/{}'.format(args.loss), validation[args.loss],
                          epoch_i)
        # Qualitative image grids: predictions vs. targets.
        writer.add_image('Train/Predict', _get_images(training['output'],
                                                      args), epoch_i)
        writer.add_image('Train/Target', _get_images(training['target'],
                                                     args), epoch_i)
        writer.add_image('Valid/Predict',
                         _get_images(validation['output'], args), epoch_i)
        writer.add_image('Valid/Target',
                         _get_images(validation['target'], args), epoch_i)
        message = '[{}] Epoch {} Train/{} {:.4f} Valid/{} {:.4f} '
        message = message.format(
            args.expid,
            epoch_i,
            args.loss,
            training[args.loss],
            args.loss,
            validation[args.loss],
        )
        # Checkpoint; track best-so-far validation loss.
        is_best = validation[args.loss] < best_loss
        if is_best:
            best_loss = validation[args.loss]
            message += '(Best)'
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/{}'.format(args.loss): validation[args.loss],
                'best/{}'.format(args.loss): best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.logdir)
        if scheduler is not None:
            scheduler.step(epoch=epoch_i)
            logger.debug('Scheduler stepped.')
            for param_group in optimizer.param_groups:
                logger.debug(param_group['lr'])
        logger.info(message)