def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1']
    )
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed network training will not be
    # deterministic in general. There are sources of non-determinism that cannot
    # be removed with a reasonable execution-speed tradeoff (such as certain
    # non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)
    # Execute the training run
    checkpoints = train_model()
    # Test the trained model
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
import threading

from django.conf import settings
from cassandra.cluster import Cluster
# from celery import shared_task
from celery.signals import worker_process_init, worker_process_shutdown

from hydroview.celeryconfig import app
from .management.commands import run_update
from utils import logging

thread_local = threading.local()

logging.setup_logging()


@worker_process_init.connect
def open_cassandra_session(*args, **kwargs):
    cluster = Cluster([settings.DATABASES["cassandra"]["HOST"], ],
                      protocol_version=3)
    session = cluster.connect(settings.DATABASES["cassandra"]["NAME"])
    thread_local.cassandra_session = session


@worker_process_shutdown.connect
def close_cassandra_session(*args, **kwargs):
    session = thread_local.cassandra_session
    session.shutdown()
    thread_local.cassandra_session = None


@app.task(name='logs.tasks.init_run_update')
def init_run_update():
    print("init run update!")
    run_update.run_update()
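# --- Added illustration (not part of the original module): a minimal sketch of how a
# task could use the per-worker Cassandra session that open_cassandra_session() stores
# on thread_local. The task name, table and query below are hypothetical.
@app.task(name='logs.tasks.example_count_logs')
def example_count_logs():
    session = thread_local.cassandra_session
    rows = session.execute('SELECT COUNT(*) FROM logs')  # hypothetical table
    return rows.one()[0]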
import argparse
import logging
import resource

import _init_paths  # pylint: disable=unused-import
import nn as mynn
import utils.net as net_utils
import utils.misc as misc_utils
from core.config import cfg, cfg_from_file, cfg_from_list, assert_and_infer_cfg
from datasets.roidb import combined_roidb_for_training
from roi_data.loader import RoiDataLoader, MinibatchSampler, collate_minibatch
from modeling.model_builder import Generalized_RCNN
from utils.detectron_weight_helper import load_detectron_weight
from utils.logging import setup_logging
from utils.timer import Timer
from utils.training_stats import TrainingStats

# Set up logging and load config options
logger = setup_logging(__name__)
logging.getLogger('roi_data.loader').setLevel(logging.INFO)

# RuntimeError: received 0 items of ancdata. Issue: pytorch/pytorch#973
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))


def parse_args():
    """Parse input arguments"""
    parser = argparse.ArgumentParser(description='Train a X-RCNN network')
    parser.add_argument('--dataset', dest='dataset', required=True,
                        help='Dataset to use')
import argparse
import logging
import resource

import _init_paths  # pylint: disable=unused-import
import nn as mynn
import utils.net as net_utils
import utils.misc as misc_utils
from core.config import cfg, cfg_from_file, cfg_from_list, assert_and_infer_cfg
from datasets.roidb import combined_roidb_for_training
from roi_data.loader import RoiDataLoader, MinibatchSampler, BatchSampler, collate_minibatch
from modeling.model_builder import Generalized_RCNN
from utils.detectron_weight_helper import load_detectron_weight
from utils.logging import setup_logging
from utils.timer import Timer
from utils.training_stats import TrainingStats

# Set up logging and load config options
logger = setup_logging(__name__)
logging.getLogger('roi_data.loader').setLevel(logging.INFO)

# RuntimeError: received 0 items of ancdata. Issue: pytorch/pytorch#973
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))


def parse_args():
    """Parse input arguments"""
    parser = argparse.ArgumentParser(description='Train a X-RCNN network')
    parser.add_argument(
        '--dataset', dest='dataset', required=True,
        help='Dataset to use')
    parser.add_argument(
        '--cfg', dest='cfg_file', required=True,
def main():
    parser = argparse.ArgumentParser(prog='Fashion MNIST Models Evaluator')
    setup_argparse_logging_level(parser)
    parser.add_argument('--model-type',
                        choices=['vgg', 'two_conv', 'five_conv'],
                        required=True, help='')
    parser.add_argument('-t', '--test-model-path', default=None, type=str,
                        help='model path')
    parser.add_argument('-r', '--resume-model-path', default=None, type=str,
                        help='model path')
    parser.add_argument('--train-batch-size', default=50, type=int,
                        help='batch size for training with Adam')
    parser.add_argument('--lr', default=0.005, type=float, help='learning rate')
    parser.add_argument('--train-epoch', default=60, type=int,
                        help='number of training epochs')
    parser.add_argument('--seed', default=42, type=int, help='seed')
    parser.add_argument('--save-dir', default='./data', help='saving metrics dir')
    parser.add_argument('--optimizer', choices=['adam', 'sgd'], default='adam',
                        help='')
    parser.add_argument('--dump-metrics-frequency', metavar='Batch_n',
                        default=600, type=int,
                        help='Dump metrics every Batch_n batches')
    parser.add_argument(
        '--threshold-validation-accuracy', default=0.95, type=float,
        help='Threshold validation accuracy to reach for stopping training')
    parser.add_argument(
        '--num-threads', default=0, type=int,
        help='Number of CPU threads to use for processing mini batches')
    parser.add_argument('--scale', action='store_true',
                        help='scale input in [0-1] range')
    parser.add_argument(
        '--standardize', action='store_true',
        help='Subtract each instance by mean of data and divide by std')
    parser.add_argument('--augment', action='store_true',
                        help='Use data augmentation')
    parser.add_argument('--pretrained', action='store_true',
                        help='Use pretrained weights for VGG')
    parser.add_argument('--batch-norm', action='store_true',
                        help='Use batch norm')

    args = parser.parse_args()
    args = vars(args)

    model_cfg_keys = ('pretrained', 'batch_norm')
    model_cfg = {k: args[k] for k in model_cfg_keys if k in args}
    args['model_cfg'] = model_cfg
    for key in model_cfg_keys:
        args.pop(key)

    setup_logging(args.pop('logging_level'))

    evaluator = FMModelsEvaluator(**args)
    evaluator.run()
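# --- Added illustration (not from the original script): a minimal sketch of what the
# setup_argparse_logging_level / setup_logging helpers used above could look like.
# The real implementations in this project may differ.
import argparse
import logging


def setup_argparse_logging_level(parser: argparse.ArgumentParser) -> None:
    # Adds the option that main() later pops from the parsed args as 'logging_level'.
    parser.add_argument('--logging-level', dest='logging_level',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        default='INFO', help='root logger level')


def setup_logging(level: str) -> None:
    # Configures the root logger with the level chosen on the command line.
    logging.basicConfig(
        level=getattr(logging, level),
        format='%(asctime)s %(name)s %(levelname)s %(message)s')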
import logging
import os

import sentry_sdk  # noqa: I001; pylint: disable=ungrouped-imports; conflicts with Flake8
from sentry_sdk.integrations.logging import LoggingIntegration  # noqa: I001
from flask import Flask
from flask_jwt_oidc import JwtManager
import requests

import config
from colin_api.models.filing import Filing
from registry_schemas import validate
from utils.logging import setup_logging

setup_logging(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), 'logging.conf'))  # important to do this first

# lower case name as used by convention in most Flask apps
jwt = JwtManager()  # pylint: disable=invalid-name

SENTRY_LOGGING = LoggingIntegration(
    event_level=logging.ERROR  # send errors as events
)


def create_app(run_mode=os.getenv('FLASK_ENV', 'production')):
    """Return a configured Flask App using the Factory method."""
    app = Flask(__name__)
    app.config.from_object(config.CONFIGURATION[run_mode])

    # Configure Sentry
def eval_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging("stdout.log", 'w')

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    torch.cuda.set_device(args.gpu)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # build the supernet
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)
    model = comm.get_parallel_model(model, args.gpu)  # local rank

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda()

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)

    assert args.resume
    # reloading model
    model.module.load_weights_from_pretrained_models(args.resume)

    if train_sampler:
        train_sampler.set_epoch(0)

    targeted_min_flops = args.evo_search.targeted_min_flops
    targeted_max_flops = args.evo_search.targeted_max_flops

    # run evolutionary search
    parent_popu = []
    for idx in range(args.evo_search.parent_popu_size):
        if idx == 0:
            cfg = model.module.sample_min_subnet()
        else:
            cfg = model.module.sample_active_subnet_within_range(
                targeted_min_flops, targeted_max_flops)
        cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}'
        parent_popu.append(cfg)

    pareto_global = {}
    for evo in range(args.evo_search.evo_iter):
        # partition the set of candidate sub-networks
        # and send them to each GPU for parallel evaluation

        # sub-networks to be evaluated on GPU {args.rank}
        my_subnets_to_be_evaluated = {}
        n_evaluated = len(parent_popu) // args.world_size * args.world_size
        for cfg in parent_popu[:n_evaluated]:
            if cfg['net_id'].startswith(f'net_{args.rank}_'):
                my_subnets_to_be_evaluated[cfg['net_id']] = cfg

        # aggregating all evaluation results
        eval_results = attentive_nas_eval.validate(
            my_subnets_to_be_evaluated,
            train_loader,
            val_loader,
            model,
            criterion,
            args,
            logger,
        )

        # update the Pareto frontier
        # in this case, we search the best FLOPs vs. accuracy trade-offs
        for cfg in eval_results:
            f = round(cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []

        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(
                    old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
def main():
    ''' main '''
    setup_logging()
    logger = logging.getLogger(__name__)

    host = '192.95.32.117'
    vdbname = 'ds-wizards'
    vuser = '******'
    vpassword = '******'
    tdbname = 'ds-content-tags'
    tuser = '******'
    tpassword = '******'

    vres = fetch(host, vdbname, vuser, vpassword, VQUERY)
    vres = [(post_id.split("_")[1], url) for post_id, url in vres]
    tres = fetch(host, tdbname, tuser, tpassword, TQUERY)

    videos = inner_join(tres, vres)
    filtered, t2i, i2t = filter_videos(videos, MIN_TAGS)
    logger.info("Found %d videos with %d unique tags" % (len(filtered), len(t2i)))

    # we will need this eventually
    tags = {
        tag_id: (name, path)
        for (tag_id, name, path) in fetch(host, tdbname, tuser, tpassword, TAGS)
    }

    # dump tags to file
    with open(os.path.join(DATA_PATH, "tags.pickle"), 'wb') as handle:
        pickle.dump(tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # check if we have processed some of those items
    processed = set([
        os.path.basename(f).split(".")[0]
        for f in glob.glob(os.path.join(DATA_PATH, "*.pickle"))
    ])
    logger.info("Found %d processed videos" % len(processed))

    filtered = [(video_id, tags, url) for (video_id, tags, url) in filtered
                if video_id not in processed]
    logger.info("After removing processed videos, %d items left" % len(filtered))

    fsq = lambda: sq.fetch(filtered[:LIMIT],
                           model_path=MODEL_PATH,
                           data_path=DATA_PATH,
                           logging_step=LOGGING_INTERVAL)
    fmp = lambda: mp.fetch(filtered[:LIMIT],
                           nprod=NPROD,
                           model_path=MODEL_PATH,
                           data_path=DATA_PATH,
                           logging_step=LOGGING_INTERVAL)

    # Use only the first GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    run_and_measure(fsq, len(filtered[:LIMIT]))
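# --- Added illustration (not from the original script): a hypothetical sketch of the
# run_and_measure helper used above, assumed to time the callable and log throughput.
# The real implementation may differ.
import logging
import time


def run_and_measure(fn, n_items):
    start = time.time()
    fn()
    elapsed = time.time() - start
    logging.getLogger(__name__).info(
        "Processed %d items in %.1f s (%.2f items/s)",
        n_items, elapsed, n_items / max(elapsed, 1e-9))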
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # rescale base lr
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(
        1, args.batch_size_total // 256))

    # set random seed, make sure all random subgraph generated would be the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpu per node {ngpus_per_node}, "
        f"world size {args.world_size}"
    )

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # build arch sampler
    arch_sampler = None
    if getattr(args, 'sampler', None):
        arch_sampler = ArchSampler(args.sampler.arch_to_flops_map_file_path,
                                   args.sampler.discretize_step, model, None)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  # local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(args.gpu)
    soft_criterion = loss_ops.KLLossSoft().cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(
        f'building optimizer and lr scheduler, local rank {args.gpu}, '
        f'global rank {args.rank}, world_size {args.world_size}')

    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args,
                                 arch_sampler=arch_sampler,
                                 soft_criterion=soft_criterion,
                                 lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
import csv
import logging

from matplotlib import pyplot

from utils.logging import setup_logging, get_module_by_name

logger = setup_logging([get_module_by_name(__name__)],
                       level=logging.INFO,
                       module_logger_to_return=get_module_by_name(__name__))


def setup_font_sizes():
    SMALL_SIZE = 14
    MEDIUM_SIZE = 16
    BIG_SIZE = 20

    pyplot.rc('font', size=SMALL_SIZE)         # controls default text sizes
    pyplot.rc('axes', titlesize=SMALL_SIZE)    # fontsize of the axes title
    pyplot.rc('axes', labelsize=MEDIUM_SIZE)   # fontsize of the x and y labels
    pyplot.rc('xtick', labelsize=SMALL_SIZE)   # fontsize of the tick labels
    pyplot.rc('ytick', labelsize=SMALL_SIZE)   # fontsize of the tick labels
    pyplot.rc('legend', fontsize=SMALL_SIZE)   # legend fontsize
    pyplot.rc('figure', titlesize=BIG_SIZE)    # fontsize of the figure title


if __name__ == '__main__':
    logger.info('start.')
    csv_fn = '../SOResults/tag_vs_questioncount.csv'
    with open(csv_fn, newline='') as csvfile:
        next(csvfile)  # skip header
        csv_reader = csv.reader(csvfile, delimiter=',')
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    # model = x3d.MyModel()
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        logger.info("Load from last checkpoint, {}.".format(last_checkpoint))
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(
        cfg, "train", is_precise_bn=True
    )

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
                )

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
        )

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
            )
        _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
def train(cfg):
    """
    Train function.
    Args:
        cfg (CfgNode): configs. Details can be found in config.py
    """
    # Set random seed from configs.
    if cfg.RNG_SEED != -1:
        random.seed(cfg.RNG_SEED)
        np.random.seed(cfg.RNG_SEED)
        torch.manual_seed(cfg.RNG_SEED)
        torch.cuda.manual_seed_all(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.NUM_GPUS, os.path.join(cfg.LOG_DIR, "log.txt"))

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model for training.
    model = build_model(cfg)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Print model statistics.
    if du.is_master_proc(cfg.NUM_GPUS):
        misc.log_model_info(model, cfg, use_train_input=True)

    # Create dataloaders.
    train_loader = loader.construct_loader(cfg, 'train')
    val_loader = loader.construct_loader(cfg, 'val')

    if cfg.SOLVER.MAX_EPOCH != -1:
        max_epoch = cfg.SOLVER.MAX_EPOCH * cfg.SOLVER.GRADIENT_ACCUMULATION_STEPS
        num_steps = max_epoch * len(train_loader)
        cfg.SOLVER.NUM_STEPS = cfg.SOLVER.MAX_EPOCH * len(train_loader)
        cfg.SOLVER.WARMUP_PROPORTION = cfg.SOLVER.WARMUP_EPOCHS / cfg.SOLVER.MAX_EPOCH
    else:
        num_steps = cfg.SOLVER.NUM_STEPS * cfg.SOLVER.GRADIENT_ACCUMULATION_STEPS
        max_epoch = math.ceil(num_steps / len(train_loader))
        cfg.SOLVER.MAX_EPOCH = cfg.SOLVER.NUM_STEPS / len(train_loader)
        cfg.SOLVER.WARMUP_EPOCHS = cfg.SOLVER.MAX_EPOCH * cfg.SOLVER.WARMUP_PROPORTION

    start_epoch = 0
    global_step = 0
    if cfg.TRAIN.CHECKPOINT_FILE_PATH:
        if os.path.isfile(cfg.TRAIN.CHECKPOINT_FILE_PATH):
            logger.info(
                "=> loading checkpoint '{}'".format(
                    cfg.TRAIN.CHECKPOINT_FILE_PATH
                )
            )
            ms = model.module if cfg.NUM_GPUS > 1 else model
            # Load the checkpoint on CPU to avoid GPU mem spike.
            checkpoint = torch.load(
                cfg.TRAIN.CHECKPOINT_FILE_PATH, map_location='cpu'
            )
            start_epoch = checkpoint['epoch']
            ms.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            global_step = checkpoint['epoch'] * len(train_loader)
            logger.info(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    cfg.TRAIN.CHECKPOINT_FILE_PATH, checkpoint['epoch']
                )
            )
    else:
        logger.info("Training with random initialization.")

    # Create meters.
    train_meter = TrainMeter(len(train_loader), num_steps, max_epoch, cfg)
    val_meter = ValMeter(len(val_loader), max_epoch, cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    cudnn.benchmark = True
    best_epoch, best_top1_err, top5_err, best_map = 0, 100.0, 100.0, 0.0

    for cur_epoch in range(start_epoch, max_epoch):
        is_best_epoch = False
        # Shuffle the dataset.
        # loader.shuffle_dataset(train_loader, cur_epoch)
        # Pretrain for one epoch.
        global_step = train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch,
            global_step, num_steps, cfg
        )

        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
            )

        if misc.is_eval_epoch(cfg, cur_epoch, max_epoch):
            stats = eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
            if cfg.DATA.MULTI_LABEL:
                if best_map < float(stats["map"]):
                    best_epoch = cur_epoch + 1
                    best_map = float(stats["map"])
                    is_best_epoch = True
                logger.info(
                    "BEST: epoch: {}, best_map: {:.2f}".format(
                        best_epoch, best_map,
                    )
                )
            else:
                if best_top1_err > float(stats["top1_err"]):
                    best_epoch = cur_epoch + 1
                    best_top1_err = float(stats["top1_err"])
                    top5_err = float(stats["top5_err"])
                    is_best_epoch = True
                logger.info(
                    "BEST: epoch: {}, best_top1_err: {:.2f}, top5_err: {:.2f}".format(
                        best_epoch, best_top1_err, top5_err
                    )
                )

        sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
        ckpt = {
            'epoch': cur_epoch + 1,
            'model_arch': cfg.MODEL.DOWNSTREAM_ARCH,
            'state_dict': sd,
            'optimizer': optimizer.state_dict(),
        }
        if (cur_epoch + 1) % cfg.SAVE_EVERY_EPOCH == 0 and du.get_rank() == 0:
            save_checkpoint(
                ckpt,
                filename=os.path.join(cfg.SAVE_DIR, f'epoch{cur_epoch + 1}.pyth')
            )
        if is_best_epoch and du.get_rank() == 0:
            save_checkpoint(
                ckpt,
                filename=os.path.join(cfg.SAVE_DIR, 'epoch_best.pyth')
            )
def main():
    setup_logging()
    create_folder(daemon_folder)
    server = daemon_uds_server.DaemonUDSServer(
        daemon_main_uds_filename,
        client_handler.create,
        client_handler.libc_call_callback)
import logging
import os

import requests
import sentry_sdk  # noqa: I001, E501; pylint: disable=ungrouped-imports; conflicts with Flake8
from colin_api.models.filing import Filing
from flask import Flask
from legal_api.services.bootstrap import AccountService
from legal_api.services.queue import QueueService
from sentry_sdk import capture_message
from sentry_sdk.integrations.logging import LoggingIntegration  # noqa: I001

import config  # pylint: disable=import-error
from utils.logging import setup_logging  # pylint: disable=import-error # noqa: I003

setup_logging(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), 'logging.conf'))

SENTRY_LOGGING = LoggingIntegration(
    event_level=logging.ERROR  # send errors as events
)

SET_EVENTS_MANUALLY = False


def create_app(run_mode=os.getenv('FLASK_ENV', 'production')):
    """Return a configured Flask App using the Factory method."""
    app = Flask(__name__)
    app.config.from_object(config.CONFIGURATION[run_mode])

    # Configure Sentry
    if app.config.get('SENTRY_DSN', None):
        sentry_sdk.init(dsn=app.config.get('SENTRY_DSN'),
                        integrations=[SENTRY_LOGGING])
def main(argv):
    args = parser.parse_args(argv)

    setup_logging(os.path.dirname(args.checkpoint), 'eval',
                  args.verbose, args.dry)

    logging.info('Commandline arguments: {}'.format(' '.join(argv)))

    if args.cuda != '':
        try:
            args.cuda = utils.set_cuda_env(args.cuda)
        except Exception:
            logging.critical('No free GPU on this machine. Aborting run.')
            return
        logging.info('Running on GPU {}'.format(args.cuda))

    # Load configuration
    conf = Configuration.from_json(args.config)
    conf.args = args
    if args.conf:
        new_conf_entries = {}
        for arg in args.conf:
            key, value = arg.split('=')
            new_conf_entries[key] = value
        conf.update(new_conf_entries)

    if args.verbose:
        logging.debug(conf)

    utils.set_random_seeds(conf.seed)

    if args.raw:
        # This is a hack to suppress the output transform when we request raw data
        conf.application = 'none'
        if conf.has_attr('tasks'):
            for name, task in conf.tasks.items():
                if 'application' in task:
                    logging.debug(('Changing output transform in task {} '
                                   'from {} to none').format(name,
                                                             task['application']))
                    task['application'] = 'none'

    # Setup model
    runner = build_runner(conf, conf.runner_type, args.cuda, mode='test')

    # Handle resuming from checkpoint
    if args.checkpoint != 'NONE':
        if os.path.exists(args.checkpoint):
            _ = restore_checkpoint(args.checkpoint, runner, cuda=args.cuda)
            logging.info('Restored checkpoint from {}'.format(args.checkpoint))
        else:
            logging.critical(('Checkpoint {} to restore '
                              'from not found').format(args.checkpoint))
            return

    # Load datasets
    mode = 'dataset'
    if len(args.files_or_dirs) == 0:
        datasets = [load_dataset(conf, args.data_dir,
                                 conf.validation_dataset, args.fold)]
    else:
        datasets = []
        for f in args.files_or_dirs:
            if is_dataset(f):
                dataset = load_dataset(conf, args.data_dir, f, args.fold)
                datasets.append(dataset)

    if args.raw:
        mode = 'raw'

    num_samples = conf.get_attr('num_validation_subset_samples', default=None)

    # Evaluate all datasets
    for dataset in datasets:
        logging.info('Evaluating dataset {}'.format(dataset.name))

        sampler = maybe_get_subset_sampler(num_samples, dataset)
        loader = DataLoader(dataset=dataset,
                            num_workers=DEFAULT_NUM_WORKERS,
                            batch_size=1,
                            sampler=sampler,
                            shuffle=False)

        if mode == 'dataset':
            data, _, val_metrics = runner.validate(loader, len(loader))
            res_str = 'Average metrics for {}\n'.format(dataset.name)
            for metric_name, metric in val_metrics.items():
                res_str += '  {}: {}\n'.format(metric_name, metric)
            logging.info(res_str)
        else:
            data = runner.infer(loader)

        if not args.dry and (args.infer or args.dump):
            if mode == 'dataset' or mode == 'raw':
                conf_name = os.path.splitext(os.path.basename(conf.file))[0]
                output_dir = get_run_dir(args.out_dir,
                                         '{}_{}'.format(dataset.name, conf_name))
                if not os.path.isdir(output_dir):
                    os.mkdir(output_dir)

            logging.info('Writing images to {}'.format(output_dir))

            file_idx = 0
            for batch in data:
                if mode == 'image':
                    output_dir = os.path.dirname(dataset.images[file_idx])

                named_batch = runner.get_named_outputs(batch)
                inp = named_batch['input']

                if 'prediction' in named_batch:
                    batch_size = named_batch['prediction'].shape[0]
                    filenames = [dataset.get_filename(idx)
                                 for idx in range(file_idx, file_idx + batch_size)]
                    save_output_images(dataset, inp, named_batch['prediction'],
                                       named_batch['target'], output_dir,
                                       filenames, 'default', args.dump, args.raw)
                    file_idx += len(filenames)

            logging.info(('Finished writing images for '
                          'dataset {}').format(dataset.name))