Example #1
def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1']
    )
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed, network training will not
    # be deterministic in general. There are sources of non-determinism that
    # cannot be removed with a reasonable execution-speed tradeoff (such as
    # certain non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)
    # Execute the training run
    checkpoints = train_model()
    # Test the trained model
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
Example #2
import threading
from django.conf import settings
from cassandra.cluster import Cluster
#from celery import shared_task
from celery.signals import worker_process_init, worker_process_shutdown
from hydroview.celeryconfig import app
from .management.commands import run_update

from utils import logging

thread_local = threading.local()
logging.setup_logging()

@worker_process_init.connect
def open_cassandra_session(*args, **kwargs):
    cluster = Cluster([settings.DATABASES["cassandra"]["HOST"],], protocol_version=3)
    session = cluster.connect(settings.DATABASES["cassandra"]["NAME"])
    thread_local.cassandra_session = session

@worker_process_shutdown.connect
def close_cassandra_session(*args, **kwargs):
    session = thread_local.cassandra_session
    session.shutdown()
    thread_local.cassandra_session = None

@app.task(name='logs.tasks.init_run_update')
def init_run_update():
    print("init run update!")
    run_update.run_update()
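
A minimal sketch (not part of the original snippet) of how another task in this
module could reuse the per-worker Cassandra session opened by
open_cassandra_session; the task name, table and query are hypothetical.

@app.task(name='logs.tasks.count_log_rows')
def count_log_rows():
    # Reuse the session stored on thread_local by open_cassandra_session above.
    session = thread_local.cassandra_session
    rows = session.execute('SELECT COUNT(*) FROM logs')  # hypothetical table
    return rows.one()[0]
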
Example #3
import argparse
import logging
import resource

import _init_paths  # pylint: disable=unused-import
import nn as mynn
import utils.net as net_utils
import utils.misc as misc_utils
from core.config import cfg, cfg_from_file, cfg_from_list, assert_and_infer_cfg
from datasets.roidb import combined_roidb_for_training
from roi_data.loader import RoiDataLoader, MinibatchSampler, collate_minibatch
from modeling.model_builder import Generalized_RCNN
from utils.detectron_weight_helper import load_detectron_weight
from utils.logging import setup_logging
from utils.timer import Timer
from utils.training_stats import TrainingStats

# Set up logging and load config options
logger = setup_logging(__name__)
logging.getLogger('roi_data.loader').setLevel(logging.INFO)

# RuntimeError: received 0 items of ancdata. Issue: pytorch/pytorch#973
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))


def parse_args():
    """Parse input arguments"""
    parser = argparse.ArgumentParser(description='Train a X-RCNN network')

    parser.add_argument('--dataset',
                        dest='dataset',
                        required=True,
                        help='Dataset to use')
Example #4
import argparse
import logging
import resource

import _init_paths  # pylint: disable=unused-import
import nn as mynn
import utils.net as net_utils
import utils.misc as misc_utils
from core.config import cfg, cfg_from_file, cfg_from_list, assert_and_infer_cfg
from datasets.roidb import combined_roidb_for_training
from roi_data.loader import RoiDataLoader, MinibatchSampler, BatchSampler, collate_minibatch
from modeling.model_builder import Generalized_RCNN
from utils.detectron_weight_helper import load_detectron_weight
from utils.logging import setup_logging
from utils.timer import Timer
from utils.training_stats import TrainingStats

# Set up logging and load config options
logger = setup_logging(__name__)
logging.getLogger('roi_data.loader').setLevel(logging.INFO)

# RuntimeError: received 0 items of ancdata. Issue: pytorch/pytorch#973
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))

def parse_args():
    """Parse input arguments"""
    parser = argparse.ArgumentParser(description='Train a X-RCNN network')

    parser.add_argument(
        '--dataset', dest='dataset', required=True,
        help='Dataset to use')
    parser.add_argument(
        '--cfg', dest='cfg_file', required=True,
Example #5
def main():
    parser = argparse.ArgumentParser(prog='Fashion MNIST Models Evaluator')
    setup_argparse_logging_level(parser)

    parser.add_argument('--model-type',
                        choices=['vgg', 'two_conv', 'five_conv'],
                        required=True,
                        help='model architecture to train and evaluate')

    parser.add_argument('-t',
                        '--test-model-path',
                        default=None,
                        type=str,
                        help='path to a saved model to evaluate')

    parser.add_argument('-r',
                        '--resume-model-path',
                        default=None,
                        type=str,
                        help='path to a saved model to resume training from')

    parser.add_argument('--train-batch-size',
                        default=50,
                        type=int,
                        help='batch size for training')

    parser.add_argument('--lr',
                        default=0.005,
                        type=float,
                        help='learning rate')

    parser.add_argument('--train-epoch',
                        default=60,
                        type=int,
                        help='number of training epochs')

    parser.add_argument('--seed', default=42, type=int, help='random seed')

    parser.add_argument('--save-dir',
                        default='./data',
                        help='directory in which to save metrics')

    parser.add_argument('--optimizer',
                        choices=['adam', 'sgd'],
                        default='adam',
                        help='optimizer to use')

    parser.add_argument('--dump-metrics-frequency',
                        metavar='Batch_n',
                        default='600',
                        type=int,
                        help='Dump metrics every Batch_n batches')

    parser.add_argument(
        '--threshold-validation-accuracy',
        default=0.95,
        type=float,
        help='validation accuracy at which training stops early')

    parser.add_argument(
        '--num-threads',
        default=0,
        type=int,
        help='number of CPU workers used to process mini batches')

    parser.add_argument('--scale',
                        action='store_true',
                        help='scale inputs to the [0, 1] range')

    parser.add_argument(
        '--standardize',
        action='store_true',
        help='subtract the dataset mean from each instance and divide by the std')

    parser.add_argument('--augment',
                        action='store_true',
                        help='Use data augmentation')

    parser.add_argument('--pretrained',
                        action='store_true',
                        help='Use pretrained weights for VGG')

    parser.add_argument('--batch-norm',
                        action='store_true',
                        help='Use batch norm')

    args = parser.parse_args()
    args = vars(args)

    model_cfg_keys = ('pretrained', 'batch_norm')
    model_cfg = {k: args[k] for k in model_cfg_keys if k in args}
    args['model_cfg'] = model_cfg
    for key in model_cfg_keys:
        args.pop(key)

    setup_logging(args.pop('logging_level'))
    evaluator = FMModelsEvaluator(**args)
    evaluator.run()
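
The setup_argparse_logging_level and setup_logging helpers are not shown in this
example; below is a hypothetical sketch, assuming they simply register and apply a
--logging-level option (the option name, choices and defaults are assumptions).

import argparse
import logging


def setup_argparse_logging_level(parser: argparse.ArgumentParser) -> None:
    # Registers the option that main() later pops as args['logging_level'].
    parser.add_argument('--logging-level',
                        default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        help='root logging level')


def setup_logging(level: str) -> None:
    # Applies the chosen level to the root logger.
    logging.basicConfig(level=getattr(logging, level))
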
Example #6
import logging
import os

import sentry_sdk  # noqa: I001; pylint: disable=ungrouped-imports; conflicts with Flake8
from sentry_sdk.integrations.logging import LoggingIntegration  # noqa: I001
from flask import Flask
from flask_jwt_oidc import JwtManager

import requests
import config

from colin_api.models.filing import Filing
from registry_schemas import validate
from utils.logging import setup_logging

setup_logging(
    os.path.join(os.path.abspath(os.path.dirname(__file__)),
                 'logging.conf'))  # important to do this first

# lower case name as used by convention in most Flask apps
jwt = JwtManager()  # pylint: disable=invalid-name

SENTRY_LOGGING = LoggingIntegration(
    event_level=logging.ERROR  # send errors as events
)


def create_app(run_mode=os.getenv('FLASK_ENV', 'production')):
    """Return a configured Flask App using the Factory method."""
    app = Flask(__name__)
    app.config.from_object(config.CONFIGURATION[run_mode])
    # Configure Sentry
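    # The example is cut off here. A sketch of the missing Sentry setup, mirroring
    # the identical pattern in Example #14 below (the SENTRY_DSN config key and the
    # trailing `return app` are assumptions):
    if app.config.get('SENTRY_DSN', None):
        sentry_sdk.init(dsn=app.config.get('SENTRY_DSN'),
                        integrations=[SENTRY_LOGGING])
    return app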
Example #7
def eval_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging("stdout.log", 'w')

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    torch.cuda.set_device(args.gpu)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # build the supernet
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)
    model = comm.get_parallel_model(model, args.gpu)  #local rank

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda()

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)

    assert args.resume
    # reload supernet weights from the pretrained checkpoint
    model.module.load_weights_from_pretrained_models(args.resume)

    if train_sampler:
        train_sampler.set_epoch(0)

    targeted_min_flops = args.evo_search.targeted_min_flops
    targeted_max_flops = args.evo_search.targeted_max_flops

    # run evolutionary search
    parent_popu = []
    for idx in range(args.evo_search.parent_popu_size):
        if idx == 0:
            cfg = model.module.sample_min_subnet()
        else:
            cfg = model.module.sample_active_subnet_within_range(
                targeted_min_flops, targeted_max_flops)
        cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}'
        parent_popu.append(cfg)

    pareto_global = {}
    for evo in range(args.evo_search.evo_iter):
        # partition the set of candidate sub-networks
        # and send them to each GPU for parallel evaluation

        # sub-networks to be evaluated on GPU {args.rank}
        my_subnets_to_be_evaluated = {}
        n_evaluated = len(parent_popu) // args.world_size * args.world_size
        for cfg in parent_popu[:n_evaluated]:
            if cfg['net_id'].startswith(f'net_{args.rank}_'):
                my_subnets_to_be_evaluated[cfg['net_id']] = cfg

        # aggregating all evaluation results
        eval_results = attentive_nas_eval.validate(
            my_subnets_to_be_evaluated,
            train_loader,
            val_loader,
            model,
            criterion,
            args,
            logger,
        )

        # update the Pareto frontier
        # in this case, we search the best FLOPs vs. accuracy trade-offs
        for cfg in eval_results:
            f = round(
                cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []
        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(
                    old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
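
A standalone illustration (not from the original code) of the Pareto-frontier
update above: evaluated sub-networks are bucketed by FLOPs rounded to the nearest
step, and each bucket keeps only the configuration with the best top-1 accuracy.
The numbers are made up.

step = 50  # plays the role of args.evo_search.step
pareto_global = {}
eval_results = [
    {'net_id': 'a', 'flops': 412, 'acc1': 75.1},
    {'net_id': 'b', 'flops': 438, 'acc1': 75.6},
    {'net_id': 'c', 'flops': 503, 'acc1': 76.2},
]
for cfg in eval_results:
    f = round(cfg['flops'] / step) * step
    if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
        pareto_global[f] = cfg
print(sorted(pareto_global))  # [400, 450, 500]
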
Example #8
def main():
    ''' main '''
    setup_logging()
    logger = logging.getLogger(__name__)

    host = '192.95.32.117'

    vdbname = 'ds-wizards'
    vuser = '******'
    vpassword = '******'

    tdbname = 'ds-content-tags'
    tuser = '******'
    tpassword = '******'

    vres = fetch(host, vdbname, vuser, vpassword, VQUERY)
    vres = [(post_id.split("_")[1], url) for post_id, url in vres]

    tres = fetch(host, tdbname, tuser, tpassword, TQUERY)
    videos = inner_join(tres, vres)

    filtered, t2i, i2t = filter_videos(videos, MIN_TAGS)
    logger.info("Found %d videos with %d unique tags" %
                (len(filtered), len(t2i)))

    # we will need this eventually
    tags = {
        tag_id: (name, path)
        for (tag_id, name,
             path) in fetch(host, tdbname, tuser, tpassword, TAGS)
    }

    # dump tags to file
    with open(os.path.join(DATA_PATH, "tags.pickle"), 'wb') as handle:
        pickle.dump(tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # check if we have processed some of those items
    processed = set([
        os.path.basename(f).split(".")[0]
        for f in glob.glob(os.path.join(DATA_PATH, "*.pickle"))
    ])

    logger.info("Found %d processed videos" % len(processed))

    filtered = [(video_id, tags, url) for (video_id, tags, url) in filtered
                if video_id not in processed]

    logger.info("After removing processed videos, %d items left" %
                len(filtered))

    fsq = lambda: sq.fetch(filtered[:LIMIT],
                           model_path=MODEL_PATH,
                           data_path=DATA_PATH,
                           logging_step=LOGGING_INTERVAL)

    fmp = lambda: mp.fetch(filtered[:LIMIT],
                           nprod=NPROD,
                           model_path=MODEL_PATH,
                           data_path=DATA_PATH,
                           logging_step=LOGGING_INTERVAL)

    # Use only the first GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    run_and_measure(fsq, len(filtered[:LIMIT]))
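
run_and_measure is not defined in this example; below is a hypothetical sketch
consistent with how it is called above: run the zero-argument callable and report
simple throughput for the given number of items.

import logging
import time


def run_and_measure(fn, n_items):
    start = time.time()
    fn()
    elapsed = time.time() - start
    logging.getLogger(__name__).info(
        "Processed %d items in %.1fs (%.2f items/s)",
        n_items, elapsed, n_items / elapsed if elapsed else 0.0)
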
Example #9
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # rescale the base lr linearly with the total batch size (256 as the reference)
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(
        1, args.batch_size_total // 256))

    # set the random seed so that all processes sample the same random subgraphs
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:  # seed CUDA on every local rank, including rank 0
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpus per node {ngpus_per_node}, "
        f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    #build arch sampler
    arch_sampler = None
    if getattr(args, 'sampler', None):
        arch_sampler = ArchSampler(args.sampler.arch_to_flops_map_file_path,
                                   args.sampler.discretize_step, model, None)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.KLLossSoft().cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(
        f'building optimizer and lr scheduler, local rank {args.gpu}, '
        f'global rank {args.rank}, world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \
                arch_sampler=arch_sampler, soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
Example #10
import csv
import logging

from matplotlib import pyplot

from utils.logging import setup_logging, get_module_by_name

logger = setup_logging([get_module_by_name(__name__)],
                       level=logging.INFO,
                       module_logger_to_return=get_module_by_name(__name__))


def setup_font_sizes():
    SMALL_SIZE = 14
    MEDIUM_SIZE = 16
    BIG_SIZE = 20
    pyplot.rc('font', size=SMALL_SIZE)  # controls default text sizes
    pyplot.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
    pyplot.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
    pyplot.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
    pyplot.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
    pyplot.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
    pyplot.rc('figure', titlesize=BIG_SIZE)  # fontsize of the figure title


if __name__ == '__main__':
    logger.info('start.')
    csv_fn = '../SOResults/tag_vs_questioncount.csv'
    with open(csv_fn, newline='') as csvfile:
        next(csvfile)  # skip header
        csv_reader = csv.reader(csvfile, delimiter=',')
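        # The example is cut off here. A hypothetical continuation, assuming the CSV
        # has two columns (tag, question count): collect the counts and plot their
        # distribution with the font sizes configured above.
        counts = [int(row[1]) for row in csv_reader]
    setup_font_sizes()
    pyplot.figure()
    pyplot.hist(counts, bins=50)
    pyplot.xlabel('questions per tag')
    pyplot.ylabel('number of tags')
    pyplot.show()
    logger.info('done.')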
Example #11
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    # model = x3d.MyModel()
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        logger.info("Load from last checkpoint, {}.".format(last_checkpoint))
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(
        cfg, "train", is_precise_bn=True
    )

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Set up writer for logging to TensorBoard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
                )

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
        )

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
            )
        _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model,
                               optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
Example #12
def train(cfg):
    """
    Train function.
    Args:
        cfg (CfgNode) : configs. Details can be found in
            config.py
    """
    # Set random seed from configs.
    if cfg.RNG_SEED != -1:
        random.seed(cfg.RNG_SEED)
        np.random.seed(cfg.RNG_SEED)
        torch.manual_seed(cfg.RNG_SEED)
        torch.cuda.manual_seed_all(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.NUM_GPUS, os.path.join(cfg.LOG_DIR, "log.txt"))

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Model for training.
    model = build_model(cfg)
    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Print model statistics.
    if du.is_master_proc(cfg.NUM_GPUS):
        misc.log_model_info(model, cfg, use_train_input=True)

    # Create dataloaders.
    train_loader = loader.construct_loader(cfg, 'train')
    val_loader = loader.construct_loader(cfg, 'val')

    if cfg.SOLVER.MAX_EPOCH != -1:
        max_epoch = cfg.SOLVER.MAX_EPOCH * cfg.SOLVER.GRADIENT_ACCUMULATION_STEPS
        num_steps = max_epoch * len(train_loader)
        cfg.SOLVER.NUM_STEPS = cfg.SOLVER.MAX_EPOCH * len(train_loader)
        cfg.SOLVER.WARMUP_PROPORTION = cfg.SOLVER.WARMUP_EPOCHS / cfg.SOLVER.MAX_EPOCH
    else:
        num_steps = cfg.SOLVER.NUM_STEPS * cfg.SOLVER.GRADIENT_ACCUMULATION_STEPS
        max_epoch = math.ceil(num_steps / len(train_loader))
        cfg.SOLVER.MAX_EPOCH = cfg.SOLVER.NUM_STEPS / len(train_loader)
        cfg.SOLVER.WARMUP_EPOCHS = cfg.SOLVER.MAX_EPOCH * cfg.SOLVER.WARMUP_PROPORTION

    start_epoch = 0
    global_step = 0
    if cfg.TRAIN.CHECKPOINT_FILE_PATH:
        if os.path.isfile(cfg.TRAIN.CHECKPOINT_FILE_PATH):
            logger.info(
                "=> loading checkpoint '{}'".format(
                    cfg.TRAIN.CHECKPOINT_FILE_PATH
                )
            )
            ms = model.module if cfg.NUM_GPUS > 1 else model
            # Load the checkpoint on CPU to avoid GPU mem spike.
            checkpoint = torch.load(
                cfg.TRAIN.CHECKPOINT_FILE_PATH, map_location='cpu'
            )
            start_epoch = checkpoint['epoch']
            ms.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            global_step = checkpoint['epoch'] * len(train_loader)
            logger.info(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    cfg.TRAIN.CHECKPOINT_FILE_PATH,
                    checkpoint['epoch']
                )
            )
    else:
        logger.info("Training with random initialization.")

    # Create meters.
    train_meter = TrainMeter(
        len(train_loader),
        num_steps,
        max_epoch,
        cfg
    )
    val_meter = ValMeter(
        len(val_loader),
        max_epoch,
        cfg
    )

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch+1))

    cudnn.benchmark = True

    best_epoch, best_top1_err, top5_err, best_map = 0, 100.0, 100.0, 0.0

    for cur_epoch in range(start_epoch, max_epoch):
        is_best_epoch = False
        # Shuffle the dataset.
        # loader.shuffle_dataset(train_loader, cur_epoch)
        # Pretrain for one epoch.
        global_step = train_epoch(
            train_loader,
            model,
            optimizer,
            train_meter,
            cur_epoch,
            global_step,
            num_steps,
            cfg
        )

        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
            )

        if misc.is_eval_epoch(cfg, cur_epoch, max_epoch):
            stats = eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
            if cfg.DATA.MULTI_LABEL:
                if best_map < float(stats["map"]):
                    best_epoch = cur_epoch + 1
                    best_map = float(stats["map"])
                    is_best_epoch = True
                logger.info(
                    "BEST: epoch: {}, best_map: {:.2f}".format(
                        best_epoch, best_map,
                    )
                )
            else:
                if best_top1_err > float(stats["top1_err"]):
                    best_epoch = cur_epoch + 1
                    best_top1_err = float(stats["top1_err"])
                    top5_err = float(stats["top5_err"])
                    is_best_epoch = True
                logger.info(
                    "BEST: epoch: {}, best_top1_err: {:.2f}, top5_err: {:.2f}".format(
                        best_epoch, best_top1_err, top5_err
                    )
                )

        sd = \
            model.module.state_dict() if cfg.NUM_GPUS > 1 else \
            model.state_dict()

        ckpt = {
            'epoch': cur_epoch + 1,
            'model_arch': cfg.MODEL.DOWNSTREAM_ARCH,
            'state_dict': sd,
            'optimizer': optimizer.state_dict(),
        }

        if (cur_epoch + 1) % cfg.SAVE_EVERY_EPOCH == 0 and du.get_rank() == 0:
            # `ckpt` already holds the state dict computed above; no need to rebuild it.
            save_checkpoint(
                ckpt,
                filename=os.path.join(cfg.SAVE_DIR, f'epoch{cur_epoch+1}.pyth')
            )

        if is_best_epoch and du.get_rank() == 0:
            save_checkpoint(
                ckpt,
                filename=os.path.join(cfg.SAVE_DIR, "epoch_best.pyth")
            )
Example #13
def main():
    setup_logging()
    create_folder(daemon_folder)
    server = daemon_uds_server.DaemonUDSServer(
        daemon_main_uds_filename, client_handler.create,
        client_handler.libc_call_callback)
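
DaemonUDSServer and the handler callbacks are not shown in this example; for
orientation only, here is a minimal Unix-domain-socket server built with just the
standard library (the socket path and echo behaviour are placeholders, not the
daemon's real protocol).

import os
import socketserver

SOCKET_PATH = '/tmp/daemon_main.sock'  # placeholder path


class EchoHandler(socketserver.StreamRequestHandler):
    def handle(self):
        # Echo each line received from the client back to it.
        for line in self.rfile:
            self.wfile.write(line)


if os.path.exists(SOCKET_PATH):
    os.unlink(SOCKET_PATH)
with socketserver.UnixStreamServer(SOCKET_PATH, EchoHandler) as server:
    server.serve_forever()
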
Example #14
import logging
import os

import requests
import sentry_sdk  # noqa: I001, E501; pylint: disable=ungrouped-imports; conflicts with Flake8
from colin_api.models.filing import Filing
from flask import Flask
from legal_api.services.bootstrap import AccountService
from legal_api.services.queue import QueueService
from sentry_sdk import capture_message
from sentry_sdk.integrations.logging import LoggingIntegration  # noqa: I001

import config  # pylint: disable=import-error
from utils.logging import setup_logging  # pylint: disable=import-error

# noqa: I003

setup_logging(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), 'logging.conf'))

SENTRY_LOGGING = LoggingIntegration(
    event_level=logging.ERROR  # send errors as events
)
SET_EVENTS_MANUALLY = False


def create_app(run_mode=os.getenv('FLASK_ENV', 'production')):
    """Return a configured Flask App using the Factory method."""
    app = Flask(__name__)
    app.config.from_object(config.CONFIGURATION[run_mode])
    # Configure Sentry
    if app.config.get('SENTRY_DSN', None):
        sentry_sdk.init(dsn=app.config.get('SENTRY_DSN'),
                        integrations=[SENTRY_LOGGING])
Example #15
def main(argv):
  args = parser.parse_args(argv)

  setup_logging(os.path.dirname(args.checkpoint), 'eval',
                args.verbose, args.dry)

  logging.info('Commandline arguments: {}'.format(' '.join(argv)))

  if args.cuda != '':
    try:
      args.cuda = utils.set_cuda_env(args.cuda)
    except Exception:
      logging.critical('No free GPU on this machine. Aborting run.')
      return
    logging.info('Running on GPU {}'.format(args.cuda))

  # Load configuration
  conf = Configuration.from_json(args.config)
  conf.args = args
  if args.conf:
    new_conf_entries = {}
    for arg in args.conf:
      key, value = arg.split('=')
      new_conf_entries[key] = value
    conf.update(new_conf_entries)

  if args.verbose:
    logging.debug(conf)

  utils.set_random_seeds(conf.seed)

  if args.raw:
    # This is a hack to suppress the output transform when we request raw data
    conf.application = 'none'
    if conf.has_attr('tasks'):
      for name, task in conf.tasks.items():
        if 'application' in task:
          logging.debug(('Changing output transform in task {} '
                         'from {} to none').format(name,
                                                   task['application']))
          task['application'] = 'none'

  # Setup model
  runner = build_runner(conf, conf.runner_type, args.cuda, mode='test')

  # Handle resuming from checkpoint
  if args.checkpoint != 'NONE':
    if os.path.exists(args.checkpoint):
      _ = restore_checkpoint(args.checkpoint, runner, cuda=args.cuda)
      logging.info('Restored checkpoint from {}'.format(args.checkpoint))
    else:
      logging.critical(('Checkpoint {} to restore '
                       'from not found').format(args.checkpoint))
      return

  # Load datasets
  mode = 'dataset'
  if len(args.files_or_dirs) == 0:
    datasets = [load_dataset(conf, args.data_dir,
                             conf.validation_dataset, args.fold)]
  else:
    datasets = []
    for f in args.files_or_dirs:
      if is_dataset(f):
        dataset = load_dataset(conf, args.data_dir, f, args.fold)
        datasets.append(dataset)

  if args.raw:
    mode = 'raw'

  num_samples = conf.get_attr('num_validation_subset_samples',
                              default=None)

  # Evaluate all datasets
  for dataset in datasets:
    logging.info('Evaluating dataset {}'.format(dataset.name))

    sampler = maybe_get_subset_sampler(num_samples, dataset)
    loader = DataLoader(dataset=dataset,
                        num_workers=DEFAULT_NUM_WORKERS,
                        batch_size=1,
                        sampler=sampler,
                        shuffle=False)

    if mode == 'dataset':
      data, _, val_metrics = runner.validate(loader, len(loader))

      res_str = 'Average metrics for {}\n'.format(dataset.name)
      for metric_name, metric in val_metrics.items():
        res_str += '     {}: {}\n'.format(metric_name, metric)
      logging.info(res_str)
    else:
      data = runner.infer(loader)

    if not args.dry and (args.infer or args.dump):
      if mode == 'dataset' or mode == 'raw':
        conf_name = os.path.splitext(os.path.basename(conf.file))[0]
        output_dir = get_run_dir(args.out_dir, '{}_{}'.format(dataset.name,
                                                              conf_name))
        if not os.path.isdir(output_dir):
          os.mkdir(output_dir)

      logging.info('Writing images to {}'.format(output_dir))

      file_idx = 0
      for batch in data:
        if mode == 'image':
          output_dir = os.path.dirname(dataset.images[file_idx])

        named_batch = runner.get_named_outputs(batch)
        inp = named_batch['input']

        if 'prediction' in named_batch:
          batch_size = named_batch['prediction'].shape[0]
          filenames = [dataset.get_filename(idx)
                       for idx in range(file_idx, file_idx + batch_size)]
          save_output_images(dataset, inp, named_batch['prediction'],
                             named_batch['target'], output_dir,
                             filenames, 'default', args.dump, args.raw)

        file_idx += len(filenames)

      logging.info(('Finished writing images for '
                   'dataset {}').format(dataset.name))