Example #1
0
    def __init__(self, bm_ws_market, eventQueue, apiKey, apiSecret, symbols):
        """Set up the order-management system (OMS).

        Wires up logging, the target-position event queue, and the BitMEX
        trading websocket / REST clients for the given symbols, then blocks
        until the websocket has delivered its initial account snapshot.

        :param bm_ws_market: externally owned market-data websocket
        :param eventQueue: queue carrying only TARGET_POSITION_EVENT items
        :param apiKey: BitMEX API key
        :param apiSecret: BitMEX API secret
        :param symbols: instruments to subscribe to and trade
        """

        # Logger
        self.logger = generate_logger('OMS')

        # Bind the event queue. It only ever carries TARGET_POSITION_EVENT;
        # a separate thread pushes target positions onto it.
        self.eventQueue = eventQueue

        # Target positions, keyed as {symbol: pos}
        self.target_position = {}

        # Instruments
        self.symbols = symbols

        # websocket-market
        self.bm_ws_market = bm_ws_market  # external: the DataHandler also uses it, or it *is* the DataHandler

        # websocket-trading
        self.bm_ws_trading = bitmexWSTrading(apiKey, apiSecret)
        self.bm_ws_trading.connect()
        self.bm_ws_trading.subscribe(self.symbols)
        self.bm_ws_trading.wait_for_initial_status()  # block until the initial status snapshot arrives

        self.actual_position = self.bm_ws_trading.actual_position  # actual positions computed from websocket `position` messages
        self.unfilled_qty = self.bm_ws_trading.unfilled_qty  # unfilled order quantities computed from websocket `order` messages

        # REST client (order placement etc.)
        self.bm_rest = bitmexREST(apiKey, apiSecret)
Example #2
0
def main_worker(gpu, args):
    """
    Per-GPU worker: train/test a model, convert it to TorchScript (JIT),
    or produce knowledge-distillation / curriculum / visualization outputs,
    depending on the flags carried in ``args``.

    :param gpu: id of the GPU this worker runs on
    :param args: run hyper-parameters / configuration namespace
    """
    args.gpu = gpu
    utils.generate_logger(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{gpu}.log")
    logging.info(f'args: {args}')

    # Reproducibility
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        logging.warning('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.cuda:
        logging.info(f"Use GPU: {args.gpu} ~")
        if args.distributed:
            # Global rank = node rank * GPUs per node + local GPU id
            args.rank = args.rank * args.gpus + gpu
            dist.init_process_group(backend='nccl', init_method=args.init_method,
                                    world_size=args.world_size, rank=args.rank)
    else:
        logging.info(f"Use CPU ~")

    # Create/load the model. When using a pretrained model, download it
    # yourself beforehand into the "pretrained" folder, named after the network.
    logging.info(f"=> creating model '{args.arch}'")
    model = my_models.get_model(args.arch, args.pretrained, num_classes=args.num_classes)

    # Reload a previously trained checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            acc = model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info(f'missing keys of models: {acc.missing_keys}')
            del checkpoint
        else:
            raise Exception(f"No checkpoint found at '{args.resume}' to be resumed")

    # Model info
    image_height, image_width = args.image_size
    logging.info(f'Model {args.arch} input size: ({image_height}, {image_width})')
    utils.summary(size=(image_height, image_width), channel=3, model=model)

    # Model conversion: to torch.jit.script
    if args.jit:
        if not args.resume:
            raise Exception('Option --resume must specified!')
        applications.convert_to_jit(model, args=args)
        return

    if args.criterion == 'softmax':
        criterion = criterions.HybridCELoss(args=args)  # hybrid-strategy multi-class loss
    elif args.criterion == 'bce':
        criterion = criterions.HybridBCELoss(args=args)  # hybrid-strategy multi-label binary loss
    else:
        raise NotImplementedError(f'Not loss function {args.criterion}')

    if args.cuda:
        if args.distributed and args.sync_bn:
            model = apex.parallel.convert_syncbn_model(model)
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        criterion = criterion.cuda(args.gpu)

    # Knowledge-distillation file production (early exit)
    if args.knowledge in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        distill_loader = dataloader.load(args, name=args.knowledge)
        applications.distill(distill_loader, model, criterion, args, is_confuse_matrix=True)
        return

    # Curriculum file production (early exit)
    if args.make_curriculum in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        curriculum_loader = dataloader.load(args, name=args.make_curriculum)
        applications.make_curriculum(curriculum_loader, model, criterion, args, is_confuse_matrix=True)
        return

    # Data visualization (early exit)
    if args.visual_data in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name=args.visual_data)
        applications.Visualize.visualize(test_loader, model, args)
        return

    # Optimizer
    opt_set = {
        'sgd': partial(torch.optim.SGD, momentum=args.momentum),
        'adam': torch.optim.Adam, 'adamw': AdamW,
        'radam': RAdam, 'ranger': Ranger, 'lookaheadadam': LookaheadAdam,
        'ralamb': Ralamb, 'rangerlars': RangerLars,
        'novograd': Novograd,
    }
    optimizer = opt_set[args.opt](model.parameters(), lr=args.lr)  # weight decay is applied in train instead
    # Stochastic Weight Averaging optimizer (optional)
    # from optim.swa import SWA
    # optimizer = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05)

    # Mixed-precision training
    if args.cuda:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.distributed:
        model = apex.parallel.DistributedDataParallel(model)
    else:
        model = torch.nn.DataParallel(model)

    if args.train:
        train_loader = dataloader.load(args, 'train')
        val_loader = dataloader.load(args, 'val')
        scheduler = LambdaLR(optimizer,
                             lambda epoch: adjust_learning_rate(epoch, args=args))
        applications.train(train_loader, val_loader, model, criterion, optimizer, scheduler, args)
        args.evaluate = True

    if args.evaluate:
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name='test')
        acc, loss, paths_targets_preds_probs = applications.test(test_loader, model,
                                                                 criterion, args, is_confuse_matrix=True)
        logging.info(f'Evaluation: * Acc@1 {acc:.3f} and loss {loss:.3f}.')
        logging.info(f'Evaluation Result:\n')
        for path, target, pred, prob in paths_targets_preds_probs:
            logging.info(path + ' ' + str(target) + ' ' + str(pred) + ' ' + ','.join([f'{num:.2f}' for num in prob]))
        logging.info('Evaluation Over~')
def main():
    """CCA-based VQA pipeline entry point.

    Builds the vocabulary, loads pre-trained word embeddings, extracts
    train/eval features, fits CCA on the training views, and evaluates
    candidate-answer recall. All options come from ``args.getopt()``;
    logs and results are written under
    ``<opt.resultsdir>/experiment_id<opt.id>``.
    """
    opt = args.getopt()

    # set-up file structures and create logger.
    # BUG FIX: the original `opt.batch_size == 1 if opt.save_ranks else
    # opt.batch_size` was a no-op comparison; saving per-example ranks
    # requires batch size 1, so actually assign it.
    if opt.save_ranks:
        opt.batch_size = 1
    resultsdir = os.path.join(opt.resultsdir, 'experiment_id' + str(opt.id))
    os.makedirs(resultsdir, exist_ok=True)  # portable replacement for os.system('mkdir -p ...')
    log = utils.generate_logger(opt, resultsdir)

    # basic/cuda set-up
    torch.manual_seed(opt.seed)
    if opt.gpu >= 0:
        # GPU indices are 0-based, so a valid index is strictly less than
        # the device count (the original `<=` accepted an out-of-range id).
        assert opt.gpu < torch.cuda.device_count()
        torch.cuda.manual_seed(opt.seed)
        torch.cuda.set_device(opt.gpu)
        cudnn.enabled = True
        cudnn.benchmark = True
    else:
        log.warning('on cpu. you should probably use gpus with --gpu=GPU_idx')

    # print & save arguments
    log.info(opt)
    torch.save(opt, os.path.join(resultsdir, 'exp' + str(opt.id) + '_opt.pt'))

    ####################################### build vocab  #####################################
    log.info('-' * 100)
    dictionary = dataset.build_vocabulary(opt, log)
    ntokens = len(dictionary)
    log.info('dictionary loaded successfully! vocabulary size: ' +
             str(ntokens))
    log.info('-' * 100)

    # get pre-trained word embeddings (from downloaded binary file saved at opt.wordmodel)
    word_vec_file = os.path.join(
        opt.datasetdir, str(opt.datasetversion),
        os.path.basename(opt.wordmodel) + '_vocab_vecs.pt')
    dictionary.loadwordmodel(opt.wordmodel, word_vec_file, opt.emsize, log,
                             opt.gpu)
    log.info('-' * 100)

    ####################################### load data #####################################

    train_loader, flat_train_features = dataset.get_features(
        dictionary, opt, log, 'train')
    test_loader, flat_test_features = dataset.get_features(
        dictionary, opt, log, opt.evalset)

    # filter views by input_vars and condition_vars (sorted alphabetically)
    condition_vars = sorted(opt.condition_vars.split('_'))
    train_views, train_mus = dataset.filter_features(flat_train_features,
                                                     opt.input_vars,
                                                     condition_vars)
    test_views, _ = dataset.filter_features(flat_test_features, opt.input_vars,
                                            condition_vars)

    # do CCA
    lambdas, proj_mtxs = cca_utils.cca(train_views, log, k=opt.k)
    #lambdas, proj_mtxs = cca_utils.cca_mardia(train_views, log, opt.k, opt.eps, opt.r)

    # get (train) projections using learned weights
    train_projections = [
        cca_utils.get_projection(v, mtx, lambdas, opt.p)
        for (v, mtx) in zip(train_views, proj_mtxs)
    ]
    proj_train_mus = [
        proj.mean(dim=0).view(-1, 1) for proj in train_projections
    ]

    if len(condition_vars) == 2:  # three-view CCA

        # select answer & question proj_mtxs (rather than image)
        qa_proj_mtxs = [proj_mtxs[0], proj_mtxs[2]]
        qa_train_projections = [train_projections[0], train_projections[2]]
        qa_proj_train_mus = [proj_train_mus[0], proj_train_mus[2]]

        test_meters = evals.candidate_answers_recall(
            test_loader, lambdas, qa_proj_mtxs, qa_train_projections,
            qa_proj_train_mus, dictionary, opt, log, opt.evalset, train_loader)

    elif len(condition_vars) == 1:  # two-view CCA
        test_meters = evals.candidate_answers_recall(
            test_loader, lambdas, proj_mtxs, train_projections, proj_train_mus,
            dictionary, opt, log, opt.evalset, train_loader)

    else:
        print(
            'cannot handle this CCA architecture - check --input_vars and --condition_vars!'
        )
        return

    log.info('-' * 100)
    log.info(opt)
    log.info('-' * 100)
    s = utils.stringify_meters(test_meters)
    log.info(s)

    # append ranking performance for this (p, k) setting to a grid-search log;
    # `with` guarantees the handle is closed even on error
    with open(os.path.join(resultsdir, 'p_k_gridsearch.txt'), 'a') as fh:
        fh.write('k={k}\tp={p}\tranks: {ranks}\r\n'.format(k=str(opt.k),
                                                           p=str(opt.p),
                                                           ranks=s))
Example #4
0
def main(config, mode, weights):
    """Build and run training or evaluation of a vehicle ReID model.

    :param config: path to a YAML configuration file (parsed with kaptan)
    :param mode: 'train' or 'test'
    :param weights: path to model weights; loaded fully for 'test',
        partially loaded as a starting base for 'train' (empty string skips)
    """
    cfg = kaptan.Kaptan(handler='yaml')
    config = cfg.import_config(config)

    # Derive save/checkpoint names from the config and set up logging.
    MODEL_SAVE_NAME, MODEL_SAVE_FOLDER, LOGGER_SAVE_NAME, CHECKPOINT_DIRECTORY = utils.generate_save_names(
        config)
    os.makedirs(MODEL_SAVE_FOLDER, exist_ok=True)
    logger = utils.generate_logger(MODEL_SAVE_FOLDER, LOGGER_SAVE_NAME)

    logger.info("*" * 40)
    logger.info("")
    logger.info("")
    logger.info("Using the following configuration:")
    logger.info(config.export("yaml", indent=4))
    logger.info("")
    logger.info("")
    logger.info("*" * 40)

    NORMALIZATION_MEAN, NORMALIZATION_STD, RANDOM_ERASE_VALUE = utils.fix_generator_arguments(
        config)
    TRAINDATA_KWARGS = {
        "rea_value": config.get("TRANSFORMATION.RANDOM_ERASE_VALUE")
    }
    """ MODEL PARAMS """
    from utils import model_weights

    # Resolve base-model weights; download them in train mode if missing.
    MODEL_WEIGHTS = None
    if config.get("MODEL.MODEL_BASE") in model_weights:
        if mode == "train":
            if os.path.exists(
                    model_weights[config.get("MODEL.MODEL_BASE")][1]):
                pass
            else:
                logger.info(
                    "Model weights file {} does not exist. Downloading.".
                    format(model_weights[config.get("MODEL.MODEL_BASE")][1]))
                utils.web.download(
                    model_weights[config.get("MODEL.MODEL_BASE")][1],
                    model_weights[config.get("MODEL.MODEL_BASE")][0])
            MODEL_WEIGHTS = model_weights[config.get("MODEL.MODEL_BASE")][1]
    else:
        raise NotImplementedError(
            "Model %s is not available. Please choose one of the following: %s"
            % (config.get("MODEL.MODEL_BASE"), str(model_weights.keys())))

    # ------------------ LOAD SAVED LOGGER IF EXISTS ----------------------------
    DRIVE_BACKUP = config.get("SAVE.DRIVE_BACKUP")
    if DRIVE_BACKUP:
        backup_logger = os.path.join(CHECKPOINT_DIRECTORY, LOGGER_SAVE_NAME)
        if os.path.exists(backup_logger):
            shutil.copy2(backup_logger, ".")
    else:
        backup_logger = None

    NUM_GPUS = torch.cuda.device_count()
    if NUM_GPUS > 1:
        raise RuntimeError(
            "Not built for multi-GPU. Please start with single-GPU.")
    logger.info("Found %i GPUs" % NUM_GPUS)

    # --------------------- BUILD GENERATORS ------------------------
    # Crawler class is resolved dynamically from the config by name.
    data_crawler_ = config.get("EXECUTION.CRAWLER", "VeRiDataCrawler")
    data_crawler = __import__("crawlers." + data_crawler_,
                              fromlist=[data_crawler_])
    data_crawler = getattr(data_crawler, data_crawler_)

    from generators import SequencedGenerator
    logger.info("Crawling data folder %s" %
                config.get("DATASET.ROOT_DATA_FOLDER"))
    crawler = data_crawler(data_folder=config.get("DATASET.ROOT_DATA_FOLDER"),
                           train_folder=config.get("DATASET.TRAIN_FOLDER"),
                           test_folder=config.get("DATASET.TEST_FOLDER"),
                           query_folder=config.get("DATASET.QUERY_FOLDER"),
                           **{"logger": logger})
    train_generator = SequencedGenerator(gpus=NUM_GPUS, i_shape=config.get("DATASET.SHAPE"), \
                                normalization_mean=NORMALIZATION_MEAN, normalization_std=NORMALIZATION_STD, normalization_scale=1./config.get("TRANSFORMATION.NORMALIZATION_SCALE"), \
                                h_flip = config.get("TRANSFORMATION.H_FLIP"), t_crop=config.get("TRANSFORMATION.T_CROP"), rea=config.get("TRANSFORMATION.RANDOM_ERASE"),
                                **TRAINDATA_KWARGS)
    train_generator.setup(crawler,
                          mode='train',
                          batch_size=config.get("TRANSFORMATION.BATCH_SIZE"),
                          instance=config.get("TRANSFORMATION.INSTANCES"),
                          workers=config.get("TRANSFORMATION.WORKERS"))
    logger.info("Generated training data generator")
    TRAIN_CLASSES = config.get("MODEL.SOFTMAX_DIM",
                               train_generator.num_entities)
    # Test generator: no flips/crops/random-erase so evaluation is deterministic.
    test_generator = SequencedGenerator(
        gpus=NUM_GPUS,
        i_shape=config.get("DATASET.SHAPE"),
        normalization_mean=NORMALIZATION_MEAN,
        normalization_std=NORMALIZATION_STD,
        normalization_scale=1. /
        config.get("TRANSFORMATION.NORMALIZATION_SCALE"),
        h_flip=0,
        t_crop=False,
        rea=False)
    test_generator.setup(crawler,
                         mode='test',
                         batch_size=config.get("TRANSFORMATION.BATCH_SIZE"),
                         instance=config.get("TRANSFORMATION.INSTANCES"),
                         workers=config.get("TRANSFORMATION.WORKERS"))
    QUERY_CLASSES = test_generator.num_entities
    logger.info("Generated validation data/query generator")

    # --------------------- INSTANTIATE MODEL ------------------------
    model_builder = __import__("models", fromlist=["*"])
    model_builder = getattr(
        model_builder,
        config.get("EXECUTION.MODEL_BUILDER", "veri_model_builder"))
    logger.info("Loaded {} from {} to build ReID model".format(
        config.get("EXECUTION.MODEL_BUILDER", "veri_model_builder"), "models"))

    if type(
            config.get("MODEL.MODEL_KWARGS")
    ) is dict:  # Compatibility with old configs. TODO fix all old configs.
        model_kwargs_dict = config.get("MODEL.MODEL_KWARGS")
    else:
        model_kwargs_dict = json.loads(config.get("MODEL.MODEL_KWARGS"))

    reid_model = model_builder( arch = config.get("MODEL.MODEL_ARCH"), \
                                base=config.get("MODEL.MODEL_BASE"), \
                                weights=MODEL_WEIGHTS, \
                                soft_dimensions = config.get("MODEL.SOFTMAX",TRAIN_CLASSES), \
                                embedding_dimensions = config.get("MODEL.EMB_DIM"), \
                                normalization = config.get("MODEL.MODEL_NORMALIZATION"), \
                                **model_kwargs_dict)
    logger.info("Finished instantiating model with {} architecture".format(
        config.get("MODEL.MODEL_ARCH")))

    if mode == "test":
        reid_model.load_state_dict(torch.load(weights))
        reid_model.cuda()
        reid_model.eval()
    else:
        if weights != "":  # Load weights if train and starting from a another model base...
            logger.info(
                "Commencing partial model load from {}".format(weights))
            reid_model.partial_load(weights)
            logger.info("Completed partial model load from {}".format(weights))
        reid_model.cuda()
        logger.info(
            torchsummary.summary(reid_model,
                                 input_size=(3, *config.get("DATASET.SHAPE"))))
    # --------------------- INSTANTIATE LOSS ------------------------
    from loss import ReIDLossBuilder
    loss_function = ReIDLossBuilder(
        loss_functions=config.get("LOSS.LOSSES"),
        loss_lambda=config.get("LOSS.LOSS_LAMBDAS"),
        loss_kwargs=config.get("LOSS.LOSS_KWARGS"),
        **{"logger": logger})
    logger.info("Built loss function")

    # --------------------- INSTANTIATE LOSS OPTIMIZER --------------
    # LOSS_OPTIMIZER.* settings fall back to the corresponding OPTIMIZER.* values.
    from optimizer.StandardLossOptimizer import StandardLossOptimizer as loss_optimizer

    LOSS_OPT = loss_optimizer(
        base_lr=config.get("LOSS_OPTIMIZER.BASE_LR",
                           config.get("OPTIMIZER.BASE_LR")),
        lr_bias=config.get("LOSS_OPTIMIZER.LR_BIAS_FACTOR",
                           config.get("OPTIMIZER.LR_BIAS_FACTOR")),
        weight_decay=config.get("LOSS_OPTIMIZER.WEIGHT_DECAY",
                                config.get("OPTIMIZER.WEIGHT_DECAY")),
        weight_bias=config.get("LOSS_OPTIMIZER.WEIGHT_BIAS_FACTOR",
                               config.get("OPTIMIZER.WEIGHT_BIAS_FACTOR")),
        gpus=NUM_GPUS)
    loss_optimizer = LOSS_OPT.build(
        loss_builder=loss_function,
        name=config.get("LOSS_OPTIMIZER.OPTIMIZER_NAME",
                        config.get("OPTIMIZER.OPTIMIZER_NAME")),
        **json.loads(
            config.get("LOSS_OPTIMIZER.OPTIMIZER_KWARGS",
                       config.get("OPTIMIZER.OPTIMIZER_KWARGS"))))
    logger.info("Built loss optimizer")
    # --------------------- INSTANTIATE OPTIMIZER ------------------------
    optimizer_builder = __import__("optimizer", fromlist=["*"])
    optimizer_builder = getattr(
        optimizer_builder,
        config.get("EXECUTION.OPTIMIZER_BUILDER", "OptimizerBuilder"))
    logger.info("Loaded {} from {} to build Optimizer model".format(
        config.get("EXECUTION.OPTIMIZER_BUILDER", "OptimizerBuilder"),
        "optimizer"))

    OPT = optimizer_builder(
        base_lr=config.get("OPTIMIZER.BASE_LR"),
        lr_bias=config.get("OPTIMIZER.LR_BIAS_FACTOR"),
        weight_decay=config.get("OPTIMIZER.WEIGHT_DECAY"),
        weight_bias=config.get("OPTIMIZER.WEIGHT_BIAS_FACTOR"),
        gpus=NUM_GPUS)
    optimizer = OPT.build(
        reid_model, config.get("OPTIMIZER.OPTIMIZER_NAME"),
        **json.loads(config.get("OPTIMIZER.OPTIMIZER_KWARGS")))
    logger.info("Built optimizer")
    # --------------------- INSTANTIATE SCHEDULER ------------------------
    try:  # We first check if scheduler is part of torch's provided schedulers.
        scheduler = __import__('torch.optim.lr_scheduler',
                               fromlist=['lr_scheduler'])
        scheduler = getattr(scheduler, config.get("SCHEDULER.LR_SCHEDULER"))
    except (
            ModuleNotFoundError, AttributeError
    ):  # If it fails, then we try to import from schedulers implemented in scheduler/ folder
        scheduler_ = config.get("SCHEDULER.LR_SCHEDULER")
        scheduler = __import__("scheduler." + scheduler_,
                               fromlist=[scheduler_])
        scheduler = getattr(scheduler, scheduler_)
    scheduler = scheduler(optimizer,
                          last_epoch=-1,
                          **json.loads(config.get("SCHEDULER.LR_KWARGS")))
    logger.info("Built scheduler")

    # ------------------- INSTANTIATE LOSS SCHEEDULER ---------------------
    loss_scheduler = None
    if loss_optimizer is not None:  # In case loss has no differentiable paramters
        try:
            loss_scheduler = __import__('torch.optim.lr_scheduler',
                                        fromlist=['lr_scheduler'])
            loss_scheduler = getattr(
                loss_scheduler,
                config.get("LOSS_SCHEDULER.LR_SCHEDULER",
                           config.get("SCHEDULER.LR_SCHEDULER")))
        except (ModuleNotFoundError, AttributeError):
            loss_scheduler_ = config.get("LOSS_SCHEDULER.LR_SCHEDULER",
                                         config.get("SCHEDULER.LR_SCHEDULER"))
            loss_scheduler = __import__("scheduler." + loss_scheduler_,
                                        fromlist=[loss_scheduler_])
            loss_scheduler = getattr(loss_scheduler, loss_scheduler_)
        loss_scheduler = loss_scheduler(
            loss_optimizer,
            last_epoch=-1,
            **json.loads(
                config.get("LOSS_SCHEDULER.LR_KWARGS",
                           config.get("SCHEDULER.LR_KWARGS"))))
        logger.info("Built loss scheduler")
    else:
        loss_scheduler = None

    # ---------------------------- SETUP BACKUP PATH -------------------------
    if DRIVE_BACKUP:
        fl_list = glob.glob(os.path.join(CHECKPOINT_DIRECTORY, "*.pth"))
    else:
        fl_list = glob.glob(os.path.join(MODEL_SAVE_FOLDER, "*.pth"))
    # Resume from the highest epoch number found among saved "*epochN.pth" files.
    _re = re.compile(r'.*epoch([0-9]+)\.pth')
    previous_stop = [
        int(item[1]) for item in [_re.search(item) for item in fl_list]
        if item is not None
    ]
    if len(previous_stop) == 0:
        previous_stop = 0
        logger.info("No previous stop detected. Will start from epoch 0")
    else:
        previous_stop = max(previous_stop) + 1
        logger.info(
            "Previous stop detected. Will attempt to resume from epoch %i" %
            previous_stop)

    # --------------------- PERFORM TRAINING ------------------------
    trainer = __import__("trainer", fromlist=["*"])
    trainer = getattr(trainer, config.get("EXECUTION.TRAINER",
                                          "SimpleTrainer"))
    logger.info("Loaded {} from {} to build Trainer".format(
        config.get("EXECUTION.TRAINER", "SimpleTrainer"), "trainer"))

    loss_stepper = trainer(model=reid_model,
                           loss_fn=loss_function,
                           optimizer=optimizer,
                           loss_optimizer=loss_optimizer,
                           scheduler=scheduler,
                           loss_scheduler=loss_scheduler,
                           train_loader=train_generator.dataloader,
                           test_loader=test_generator.dataloader,
                           queries=QUERY_CLASSES,
                           epochs=config.get("EXECUTION.EPOCHS"),
                           logger=logger,
                           crawler=crawler)
    loss_stepper.setup(step_verbose=config.get("LOGGING.STEP_VERBOSE"),
                       save_frequency=config.get("SAVE.SAVE_FREQUENCY"),
                       test_frequency=config.get("EXECUTION.TEST_FREQUENCY"),
                       save_directory=MODEL_SAVE_FOLDER,
                       save_backup=DRIVE_BACKUP,
                       backup_directory=CHECKPOINT_DIRECTORY,
                       gpus=NUM_GPUS,
                       fp16=config.get("OPTIMIZER.FP16"),
                       model_save_name=MODEL_SAVE_NAME,
                       logger_file=LOGGER_SAVE_NAME)
    if mode == 'train':
        loss_stepper.train(continue_epoch=previous_stop)
    elif mode == 'test':
        loss_stepper.evaluate()
    else:
        raise NotImplementedError()
 def __init__(self, *args, **kwargs):
     """Forward all arguments to the base websocket class and attach a
     dedicated logger for the market-data channel."""
     super().__init__(*args, **kwargs)
     self.logger = generate_logger('bitmexWS_Market')
Example #6
0
def main_worker(gpu, args):
    """Per-GPU worker: train/test a model or convert it to TorchScript (JIT),
    depending on the flags carried in ``args``.

    :param gpu: id of the GPU this worker runs on
    :param args: run hyper-parameters / configuration namespace
    """
    args.gpu = gpu
    utils.generate_logger(
        f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{gpu}.log")
    logging.info(f'args: {args}')

    # Reproducibility
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        logging.warning('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.cuda:
        logging.info(f"Use GPU: {args.gpu} ~")
        if args.distributed:
            # Global rank = node rank * GPUs per node + local GPU id
            args.rank = args.rank * args.gpus + gpu
            dist.init_process_group(backend='nccl',
                                    init_method=args.init_method,
                                    world_size=args.world_size,
                                    rank=args.rank)
    else:
        logging.info(f"Use CPU ~")

    # Create/load the model. When using a pretrained model, download it
    # yourself beforehand into the "pretrained" folder, named after the network.
    logging.info(f"=> creating model '{args.arch}'")
    model = my_models.get_model(args.arch,
                                args.pretrained,
                                num_classes=args.num_classes)

    # Reload a previously trained checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            acc = model.load_state_dict(checkpoint['state_dict'])
            logging.info(f'missing keys of models: {acc.missing_keys}')
            del checkpoint
        else:
            raise Exception(
                f"No checkpoint found at '{args.resume}' to be resumed")

    # Model info
    image_height, image_width = args.image_size
    logging.info(
        f'Model {args.arch} input size: ({image_height}, {image_width})')
    utils.summary(size=(image_height, image_width), channel=3, model=model)

    # Model conversion: to torch.jit.script
    if args.jit:
        if not args.resume:
            raise Exception('Option --resume must specified!')
        applications.convert_to_jit(model, args=args)
        return

    if args.criterion == 'rank':
        criterion = criterions.RankingLoss(args=args)  # pairwise ranking loss
    elif args.criterion == 'emd':
        criterion = criterions.EMDLoss()  # earth mover's distance loss
    elif args.criterion == 'regress':
        criterion = criterions.RegressionLoss()  # MSE regression loss
    else:
        raise NotImplementedError(
            f'Not loss function {args.criterion},only (rank, emd, regress)!')

    if args.cuda:
        if args.distributed and args.sync_bn:
            model = apex.parallel.convert_syncbn_model(model)
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        criterion = criterion.cuda(args.gpu)

    # Optimizer: empirically Adam > SGD > SWA(SGD > Adam)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    # Alternative optimizers worth trying
    # optimizer = torch.optim.SGD(model.parameters(),
    #                             args.lr, momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    # from optim.torchtools.optim import RangerLars, Ralamb, Novograd, LookaheadAdam, Ranger, RAdam, AdamW
    # optimizer = RangerLars(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Ralamb(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Novograd(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = LookaheadAdam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Ranger(model_params, lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # Stochastic Weight Averaging optimizer (optional)
    # from optim.swa import SWA
    # optimizer = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05)

    # Mixed-precision training
    if args.cuda:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        model = DDP(model)
    else:
        model = torch.nn.DataParallel(model)

    if args.train:
        train_loader = dataloader.load(args, 'train')
        val_loader = dataloader.load(args, 'val')
        scheduler = LambdaLR(
            optimizer, lambda epoch: adjust_learning_rate(epoch, args=args))
        applications.train(train_loader, val_loader, model, criterion,
                           optimizer, scheduler, args)
        args.evaluate = True

    if args.evaluate:
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name='test')
        acc, loss, test_results = applications.test(test_loader, model,
                                                    criterion, args)
        logging.info(f'Evaluation: * Acc@1 {acc:.3f} and loss {loss:.3f}.')
        logging.info(f'Evaluation results:')
        for result in test_results:
            logging.info(' '.join([str(r) for r in result]))
        logging.info('Evaluation Over~')
def main():
    """CCA clustering pipeline entry point.

    Sets up logging and CUDA, loads the train/val dataloaders, flattens QA
    features according to ``args.cca``, fits CCA on the training views,
    projects the training features, and computes clusters for both splits.

    All options come from ``config.get_args()``; logs go to
    ``args.results_dir``.

    :raises ValueError: if ``args.cca`` matches neither the 'QA_full*' nor
        the 'QA_human*' family of options.
    """
    args = config.get_args()

    # set-up file structures and create logger
    log, args.results_dir = generate_logger(args, args.results_dir)
    print_and_log(log, args)

    # basic/cuda set-up
    torch.manual_seed(SEED)
    if args.gpu >= 0:
        # GPU indices are 0-based, so a valid index is strictly less than
        # the device count (the original `<=` accepted an out-of-range id).
        assert args.gpu < torch.cuda.device_count()
        torch.cuda.manual_seed(SEED)
        torch.cuda.set_device(args.gpu)
        cudnn.enabled = True
        cudnn.benchmark = True
    else:
        print_and_log(
            log, 'on cpu. you should probably use gpus with --gpu=GPU_idx')

    ####################################### load datasets  #####################################
    print_and_log(log, '-' * 100)
    train_loader = get_dataloader('train', args)
    val_loader = get_dataloader(
        'val', args, shared_dictionary=train_loader.dataset.dictionary)
    print_and_log(log, '-' * 100)

    ####################################### do CCA #####################################

    # flatten features for CCA; '*_trainval' variants also fold in val features
    if 'QA_full' in args.cca:  # CCA on full dataset
        flat_train_features = get_flat_features(train_loader, args)
        if args.cca == 'QA_full_trainval':
            flat_val_features = get_flat_features(val_loader, args)
            flat_train_features = [
                torch.cat((flat_train_features[i], flat_val_features[i]),
                          dim=0) for i in range(len(flat_val_features))
            ]
    elif 'QA_human' in args.cca:  # CCA on H_t subset (i.e. subset with human relevance scores)
        flat_train_features = get_flat_features(train_loader,
                                                args,
                                                human_set_only=True)
        if args.cca == 'QA_human_trainval':
            flat_val_features = get_flat_features(val_loader,
                                                  args,
                                                  human_set_only=True)
            flat_train_features = [
                torch.cat((flat_train_features[i], flat_val_features[i]),
                          dim=0) for i in range(len(flat_val_features))
            ]
    else:
        # ROBUSTNESS FIX: an unrecognized --cca previously fell through and
        # crashed later with a NameError on flat_train_features; fail fast
        # with a clear message instead.
        raise ValueError(f'unrecognized --cca option: {args.cca!r}')

    # do CCA
    lambdas, proj_mtxs = compute_cca(flat_train_features, k=args.k)

    # get train projections using learned weights
    train_projections = [
        get_projection(v, mtx, lambdas, args.p)
        for (v, mtx) in zip(flat_train_features, proj_mtxs)
    ]
    proj_train_mus = [
        proj.mean(dim=0).view(-1, 1) for proj in train_projections
    ]
    # release the large intermediates before clustering
    del flat_train_features, train_projections

    # compute clusters for both splits
    compute_clusters(train_loader, lambdas, proj_mtxs, proj_train_mus, args,
                     log)
    compute_clusters(val_loader, lambdas, proj_mtxs, proj_train_mus, args, log)
Example #8
0
def main(config, mode, weights):
    """Build a VAEGAN model from a YAML config and either train or evaluate it.

    Args:
        config: Path to a YAML configuration file (parsed via kaptan); the
            name is rebound below to the parsed configuration object.
        mode: ``'train'`` to (optionally resume and) train, ``'test'`` to load
            ``weights`` and run evaluation; any other value raises
            NotImplementedError at the end of the function.
        weights: Path to a saved ``state_dict`` (``.pth``); only read when
            ``mode == 'test'``.

    Raises:
        RuntimeError: If more than one GPU is visible (single-GPU only).
        NotImplementedError: For unsupported datasets or an unknown ``mode``.
    """
    # Generate configuration
    cfg = kaptan.Kaptan(handler='yaml')
    # NOTE: `config` is rebound from the file path to the parsed config object;
    # every subsequent `config.get(...)` goes through kaptan.
    config = cfg.import_config(config)

    # Generate logger
    MODEL_SAVE_NAME, MODEL_SAVE_FOLDER, LOGGER_SAVE_NAME, CHECKPOINT_DIRECTORY = utils.generate_save_names(
        config)
    logger = utils.generate_logger(MODEL_SAVE_FOLDER, LOGGER_SAVE_NAME)

    # Echo the full configuration into the log for reproducibility.
    logger.info("*" * 40)
    logger.info("")
    logger.info("")
    logger.info("Using the following configuration:")
    logger.info(config.export("yaml", indent=4))
    logger.info("")
    logger.info("")
    logger.info("*" * 40)
    """ SETUP IMPORTS """
    #from crawlers import ReidDataCrawler
    #from generators import SequencedGenerator

    #from loss import LossBuilder

    # RANDOM_ERASE_VALUE is unpacked but never used here; the same config key
    # is re-read directly into TRAINDATA_KWARGS below.
    NORMALIZATION_MEAN, NORMALIZATION_STD, RANDOM_ERASE_VALUE = utils.fix_generator_arguments(
        config)
    TRAINDATA_KWARGS = {
        "rea_value": config.get("TRANSFORMATION.RANDOM_ERASE_VALUE")
    }
    """ Load previousely saved logger, if it exists """
    DRIVE_BACKUP = config.get("SAVE.DRIVE_BACKUP")
    if DRIVE_BACKUP:
        # Restore a previous run's log file from the backup checkpoint
        # directory into the current directory so new log lines append to it.
        backup_logger = os.path.join(CHECKPOINT_DIRECTORY, LOGGER_SAVE_NAME)
        if os.path.exists(backup_logger):
            shutil.copy2(backup_logger, ".")
    else:
        backup_logger = None

    # Single-GPU only: fail fast on multi-GPU machines.
    NUM_GPUS = torch.cuda.device_count()
    if NUM_GPUS > 1:
        raise RuntimeError(
            "Not built for multi-GPU. Please start with single-GPU.")
    logger.info("Found %i GPUs" % NUM_GPUS)

    # --------------------- BUILD GENERATORS ------------------------
    # Supported integrated data sources --> MNIST, CIFAR
    # For BDD or others need a crawler and stuff...but we;ll deal with it later
    from generators import ClassedGenerator

    load_dataset = config.get("EXECUTION.DATASET_PRELOAD")
    if load_dataset in ["MNIST", "CIFAR10", "CIFAR100"]:
        # For the built-in datasets the "crawler" is just the dataset name
        # string; ClassedGenerator.setup is expected to resolve it.
        crawler = load_dataset
        #dataset = torchvision.datasets.MNIST(root="./MNIST", train=True,)
        logger.info("No crawler necessary when using %s dataset" % crawler)
    else:
        raise NotImplementedError()


    # Train and test generators share the same transformation settings
    # (including TRAINDATA_KWARGS); they differ only in mode and preload
    # class selection.
    train_generator = ClassedGenerator( gpus=NUM_GPUS, i_shape=config.get("DATASET.SHAPE"), \
                                        normalization_mean=NORMALIZATION_MEAN, normalization_std=NORMALIZATION_STD, normalization_scale=1./config.get("TRANSFORMATION.NORMALIZATION_SCALE"), \
                                        h_flip = config.get("TRANSFORMATION.H_FLIP"), t_crop=config.get("TRANSFORMATION.T_CROP"), rea=config.get("TRANSFORMATION.RANDOM_ERASE"),
                                        **TRAINDATA_KWARGS)
    train_generator.setup(  crawler, preload_classes = config.get("EXECUTION.DATASET_PRELOAD_CLASS"), \
                            mode='train',batch_size=config.get("TRANSFORMATION.BATCH_SIZE"), \
                            workers = config.get("TRANSFORMATION.WORKERS"))
    logger.info("Generated training data generator")

    test_generator = ClassedGenerator( gpus=NUM_GPUS, i_shape=config.get("DATASET.SHAPE"), \
                                        normalization_mean=NORMALIZATION_MEAN, normalization_std=NORMALIZATION_STD, normalization_scale=1./config.get("TRANSFORMATION.NORMALIZATION_SCALE"), \
                                        h_flip = config.get("TRANSFORMATION.H_FLIP"), t_crop=config.get("TRANSFORMATION.T_CROP"), rea=config.get("TRANSFORMATION.RANDOM_ERASE"),
                                        **TRAINDATA_KWARGS)
    test_generator.setup(  crawler, preload_classes = config.get("EXECUTION.DATASET_TEST_PRELOAD_CLASS"), \
                            mode='test',batch_size=config.get("TRANSFORMATION.BATCH_SIZE"), \
                            workers = config.get("TRANSFORMATION.WORKERS"))
    logger.info("Generated testing data generator")

    # --------------------- INSTANTIATE MODEL ------------------------
    # The model-builder callable is looked up by name from the project's
    # `models` package, so the architecture is fully config-driven.
    model_builder = __import__("models", fromlist=["*"])
    model_builder = getattr(model_builder,
                            config.get("EXECUTION.MODEL_BUILDER"))
    logger.info("Loaded {} from {} to build VAEGAN model".format(
        config.get("EXECUTION.MODEL_BUILDER"), "models"))

    vaegan_model = model_builder(   arch=config.get("MODEL.ARCH"), base=config.get("MODEL.BASE"), \
                                    latent_dimensions = config.get("MODEL.LATENT_DIMENSIONS"), \
                                    **json.loads(config.get("MODEL.MODEL_KWARGS")))
    logger.info("Finished instantiating model")

    if mode == "test":
        # Evaluation path: restore weights before moving to GPU / eval mode.
        vaegan_model.load_state_dict(torch.load(weights))
        vaegan_model.cuda()
        vaegan_model.eval()
    else:
        vaegan_model.cuda()
        #logger.info(torchsummary.summary(vaegan_model, input_size=(config.get("TRANSFORMATION.CHANNELS"), *config.get("DATASET.SHAPE"))))
        # Log a parameter/shape summary for each VAEGAN submodule separately
        # (summarizing the whole composite model is commented out above).
        logger.info(
            torchsummary.summary(
                vaegan_model.Encoder,
                input_size=(config.get("TRANSFORMATION.CHANNELS"),
                            *config.get("DATASET.SHAPE"))))
        logger.info(
            torchsummary.summary(
                vaegan_model.Decoder,
                input_size=(config.get("MODEL.LATENT_DIMENSIONS"), 1)))
        logger.info(
            torchsummary.summary(
                vaegan_model.LatentDiscriminator,
                input_size=(config.get("MODEL.LATENT_DIMENSIONS"), 1)))
        logger.info(
            torchsummary.summary(
                vaegan_model.Discriminator,
                input_size=(config.get("TRANSFORMATION.CHANNELS"),
                            *config.get("DATASET.SHAPE"))))

    # --------------------- INSTANTIATE LOSS ------------------------
    # ----------- NOT NEEDED. VAEGAN WILL USE BCE LOSS THROUGHOUT
    # loss_function = LossBuilder(loss_functions=config.get("LOSS.LOSSES"), loss_lambda=config.get("LOSS.LOSS_LAMBDAS"), loss_kwargs=config.get("LOSS.LOSS_KWARGS"), **{"logger":logger})
    # logger.info("Built loss function")
    # --------------------- INSTANTIATE OPTIMIZER ------------------------
    optimizer_builder = __import__("optimizer", fromlist=["*"])
    optimizer_builder = getattr(optimizer_builder,
                                config.get("EXECUTION.OPTIMIZER_BUILDER"))
    # NOTE(review): log text says "to build VAEGAN model" — copy-paste from the
    # model-builder message above; message only, behavior unaffected.
    logger.info("Loaded {} from {} to build VAEGAN model".format(
        config.get("EXECUTION.OPTIMIZER_BUILDER"), "optimizer"))

    # `optimizer` is indexed per submodel below, so OPT.build presumably
    # returns a dict keyed by submodel name — TODO confirm against builder.
    OPT = optimizer_builder(base_lr=config.get("OPTIMIZER.BASE_LR"))
    optimizer = OPT.build(
        vaegan_model, config.get("OPTIMIZER.OPTIMIZER_NAME"),
        **json.loads(config.get("OPTIMIZER.OPTIMIZER_KWARGS")))
    logger.info("Built optimizer")
    # --------------------- INSTANTIATE SCHEDULER ------------------------
    try:
        # First look for the named scheduler among torch's built-in
        # lr_scheduler classes ...
        scheduler = __import__('torch.optim.lr_scheduler',
                               fromlist=['lr_scheduler'])
        scheduler_ = getattr(scheduler, config.get("SCHEDULER.LR_SCHEDULER"))
    except (ModuleNotFoundError, AttributeError):
        # ... otherwise fall back to a project-local `scheduler.<name>` module
        # exposing a class of the same name.
        scheduler_ = config.get("SCHEDULER.LR_SCHEDULER")
        scheduler = __import__("scheduler." + scheduler_,
                               fromlist=[scheduler_])
        scheduler_ = getattr(scheduler, scheduler_)
    # `scheduler` is rebound here from the imported module to a dict of
    # per-submodel scheduler instances (one per optimizer entry).
    scheduler = {}
    for base_model in [
            "Encoder", "Decoder", "Discriminator", "Autoencoder",
            "LatentDiscriminator"
    ]:
        scheduler[base_model] = scheduler_(
            optimizer[base_model],
            last_epoch=-1,
            **json.loads(config.get("SCHEDULER.LR_KWARGS")))
        logger.info("Built scheduler for {}".format(base_model))

    # --------------------- SETUP CONTINUATION  ------------------------
    # Scan saved checkpoints named "...epoch<N>.pth" to find where a previous
    # run stopped, so training can resume from the following epoch.
    if DRIVE_BACKUP:
        fl_list = glob.glob(os.path.join(CHECKPOINT_DIRECTORY, "*.pth"))
    else:
        fl_list = glob.glob(os.path.join(MODEL_SAVE_FOLDER, "*.pth"))
    _re = re.compile(r'.*epoch([0-9]+)\.pth')
    # item[1] is the captured epoch number of each matching checkpoint file.
    previous_stop = [
        int(item[1]) for item in [_re.search(item) for item in fl_list]
        if item is not None
    ]
    if len(previous_stop) == 0:
        # No checkpoints found: start from epoch 0.
        previous_stop = 0
    else:
        previous_stop = max(previous_stop) + 1
        logger.info(
            "Previous stop detected. Will attempt to resume from epoch %i" %
            previous_stop)

    # --------------------- INSTANTIATE TRAINER  ------------------------
    Trainer = __import__("trainer", fromlist=["*"])
    Trainer = getattr(Trainer, config.get("EXECUTION.TRAINER"))
    # NOTE(review): same copy-pasted "to build VAEGAN model" log text as the
    # optimizer message above; message only, behavior unaffected.
    logger.info("Loaded {} from {} to build VAEGAN model".format(
        config.get("EXECUTION.TRAINER"), "trainer"))

    # loss_fn is None: per the comment in the loss section, the trainer uses
    # BCE loss internally.
    loss_stepper = Trainer(model=vaegan_model,
                           loss_fn=None,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           train_loader=train_generator.dataloader,
                           test_loader=test_generator.dataloader,
                           epochs=config.get("EXECUTION.EPOCHS"),
                           batch_size=config.get("TRANSFORMATION.BATCH_SIZE"),
                           latent_size=config.get("MODEL.LATENT_DIMENSIONS"),
                           logger=logger)
    loss_stepper.setup(step_verbose=config.get("LOGGING.STEP_VERBOSE"),
                       save_frequency=config.get("SAVE.SAVE_FREQUENCY"),
                       test_frequency=config.get("EXECUTION.TEST_FREQUENCY"),
                       save_directory=MODEL_SAVE_FOLDER,
                       save_backup=DRIVE_BACKUP,
                       backup_directory=CHECKPOINT_DIRECTORY,
                       gpus=NUM_GPUS,
                       fp16=config.get("OPTIMIZER.FP16"),
                       model_save_name=MODEL_SAVE_NAME,
                       logger_file=LOGGER_SAVE_NAME)
    # Dispatch on mode: resume-aware training, evaluation, or reject.
    if mode == 'train':
        loss_stepper.train(continue_epoch=previous_stop)
    elif mode == 'test':
        loss_stepper.evaluate()
    else:
        raise NotImplementedError()