def train(config) -> None:
    setup_logging('train')
    logger = logging.getLogger()
    logger.info(f'Training: {config}')
    seed_everything(config['SEED'])

    # setup data_loader instances
    data_loader = eval(config["DATA_LOADER"]["TYPE"])(**config["DATA_LOADER"]["ARGS"])
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = create_model(config["MODEL"]["TYPE"])(**config["MODEL"]["ARGS"])
    logger.info(model)

    # prepare for (multi-device) GPU training
    device, device_ids = prepare_device(config['N_GPU'])
    model = model.to(device)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # get function handles of loss and metrics
    criterion = eval(config['LOSS']).to(device)
    metrics = [eval(met) for met in config['METRICS']]

    # build optimizer and learning rate scheduler; delete every line
    # containing lr_scheduler to disable the scheduler
    optimizer = create_optimizer(config["OPTIMIZER"]["TYPE"])(
        **config["OPTIMIZER"]["ARGS"], model=model)
    lr_scheduler, num_epochs = create_scheduler(config["LR_SCHEDULER"]["TYPE"])(
        **config["LR_SCHEDULER"]["ARGS"], optimizer=optimizer)

    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      device=device,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
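
# A hedged usage sketch for train(): only the key structure below is implied
# by the lookups in the function body; every concrete value (class names,
# argument sets, metric names) is an illustrative assumption, not taken from
# the source.
example_config = {
    'SEED': 42,
    'N_GPU': 1,
    'DATA_LOADER': {'TYPE': 'MnistDataLoader',           # hypothetical loader class
                    'ARGS': {'data_dir': 'data/', 'batch_size': 64}},
    'MODEL': {'TYPE': 'MnistModel', 'ARGS': {}},         # hypothetical model
    'LOSS': 'torch.nn.CrossEntropyLoss()',               # eval'd, then moved to device
    'METRICS': ['accuracy'],                             # must name callables in scope
    'OPTIMIZER': {'TYPE': 'adam', 'ARGS': {'lr': 1e-3}},
    'LR_SCHEDULER': {'TYPE': 'cosine', 'ARGS': {}},
}
train(example_config)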
def __init__(
    self,
    args: Dict,
    options: Optional[List["CustomArgs"]] = None,
    timestamp: bool = True,
):
    if "device" in args and args["device"] is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args["device"])
        self.device = [int(device) for device in args["device"]]

    if args["resume"]:
        self.resume: Union[Path, None] = Path(args["resume"])
        self.cfg_fname = self.resume.parent / DEFAULT_CONFIG_FILE_NAME
    elif args["config"]:
        self.resume = None
        self.cfg_fname = Path(args["config"])
    else:
        msg_no_cfg = ("A configuration file needs to be specified;\n"
                      "add '-c config.json', for example.")
        raise ValueError(msg_no_cfg)

    self._config = self.load(self.cfg_fname)
    self._apply_options(args, options)

    # set save_dir where the trained model and log will be saved
    save_dir = Path(self._config["trainer"]["save_dir"])
    expr_name = self._config["name"]
    if self.resume:
        time_stamp = self.resume.parent.stem if timestamp else ""
    else:
        time_stamp = datetime.now().strftime(r"%m%d_%H%M%S") if timestamp else ""
    self._save_dir = save_dir / "models" / expr_name / time_stamp
    self._log_dir = save_dir / "log" / expr_name / time_stamp
    self._test_dir = save_dir / "test" / expr_name / time_stamp

    self._save_dir.mkdir(parents=True, exist_ok=True)
    self._log_dir.mkdir(parents=True, exist_ok=True)
    self._test_dir.mkdir(parents=True, exist_ok=True)

    self.save(self._save_dir / DEFAULT_CONFIG_FILE_NAME)

    # configuration for the logging module
    setup_logging(self.log_dir)
    self._log_level = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
def __init__(self, args, options='', timestamp=True):
    # parse default and custom cli options
    for opt in options:
        args.add_argument(*opt.flags, default=None, type=opt.type)
    args = args.parse_args()

    self.resume = None
    self.cfg_fname = None  # guard so the assert below fails cleanly
    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    if args.resume:
        self.resume = Path(args.resume)
        self.cfg_fname = self.resume.parent / 'config.json'
    if args.config:
        self.cfg_fname = Path(args.config)
    msg_no_cfg = ("A configuration file needs to be specified. "
                  "Add '-c config.json', for example.")
    assert self.cfg_fname is not None, msg_no_cfg

    # load config file and apply custom cli options
    config = read_json(self.cfg_fname)
    self.__config = _update_config(config, options, args)
    self.__raw = copy.deepcopy(self.__config)

    # set save_dir where the trained model and log will be saved.
    save_dir = Path(parse_value(self.config['trainer']['extra_args']['save_dir']))
    timestamp = datetime.now().strftime(r'%m%d_%H%M%S') if timestamp else ''
    exper_name = self.config['name']
    self.__save_dir = save_dir / 'models' / exper_name / timestamp
    self.__log_dir = save_dir / 'log' / exper_name / timestamp
    self.save_dir.mkdir(parents=True, exist_ok=True)
    self.log_dir.mkdir(parents=True, exist_ok=True)

    # save the updated config file to the checkpoint dir
    if get_global_rank() == 0:
        write_json(self.config, self.save_dir / 'config.json')

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}

    logger = self.get_logger('config')
    logger.info(f"Experiment name: {exper_name}")
def __init__(self, args: Dict, timestamp: bool = True):
    if "device" in args:
        os.environ["CUDA_VISIBLE_DEVICES"] = args['device']
        self.device = [int(device) for device in args['device'].split(',')]

    if 'resume' in args:
        self.resume: Union[Path, None] = Path(args['resume'])
        self.cfg_fname = self.resume.parent / DEFAULT_CONFIG_FILE_NAME
    elif 'config' in args:
        self.resume = None
        self.cfg_fname = Path(args['config'])
    else:
        msg_no_cfg = ("A configuration file needs to be specified; "
                      "add '-c config.json', for example.")
        raise ValueError(msg_no_cfg)

    self._config = self.load(self.cfg_fname)
    self._update(args)

    # set save_dir where the trained model and log will be saved
    save_dir = Path(self._config["trainer"]["save_dir"])
    expr_name = self._config["name"]
    if self.resume:
        time_stamp = self.resume.parent.stem if timestamp else ''
    else:
        time_stamp = datetime.now().strftime(r"%m%d_%H%M%S") if timestamp else ''
    self._save_dir = save_dir / "models" / expr_name / time_stamp
    self._log_dir = save_dir / "log" / expr_name / time_stamp
    self._test_dir = save_dir / "test" / expr_name / time_stamp

    self._save_dir.mkdir(parents=True, exist_ok=True)
    self._log_dir.mkdir(parents=True, exist_ok=True)
    self._test_dir.mkdir(parents=True, exist_ok=True)

    self.save(self._save_dir / DEFAULT_CONFIG_FILE_NAME)

    # configuration for the logging module
    setup_logging(self.log_dir)
    self._log_level = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
def __init__(self, dataset, batch_size, shuffle, validation_split,
             num_workers, collate_fn=default_collate, drop_last=False):
    """
    Initializes the DataLoader with the given parameters and initializes
    the superclass.

    :param dataset (Dataset): dataset from which to load the data.
    :param batch_size (int): how many samples per batch to load.
    :param shuffle (bool): set to ``True`` to have the data reshuffled
        at every epoch.
    :param validation_split (int or float): if an integer, treated as the
        number of validation examples; if a float, treated as the fraction
        of validation examples.
    :param num_workers (int): how many subprocesses to use for data loading.
    :param collate_fn (Callable): merges a list of samples to form a
        mini-batch of Tensor(s).
    :param drop_last (bool): determines the behavior of the last
        non-full batch.
    """
    # Define logger
    if not LOGGER_SETUP:
        setup_logging()
    self.logger = logging.getLogger('BaseDataLoader')
    self.logger.setLevel(logging.INFO)

    self.validation_split = validation_split
    self.shuffle = shuffle
    self.batch_idx = 0
    self.n_samples = len(dataset)

    # Define samplers for the training split and the validation split.
    # NOTE: torch's DataLoader rejects shuffle=True together with a sampler,
    # so _split_sampler is expected to reset self.shuffle accordingly.
    self.sampler, self.valid_sampler = self._split_sampler(self.validation_split)
    self.drop_last = drop_last

    # Define arguments to initialize the superclass.
    self.init_kwargs = {
        'dataset': dataset,
        'batch_size': batch_size,
        'shuffle': self.shuffle,
        'collate_fn': collate_fn,
        'num_workers': num_workers,
        'drop_last': self.drop_last
    }
    super().__init__(sampler=self.sampler, **self.init_kwargs)
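
# A hedged usage sketch: BaseDataLoader follows the pytorch-template pattern
# and is typically subclassed. The TensorDataset below is illustrative, and
# split_validation() is assumed to return a loader over the held-out split
# (as it is used elsewhere in this section).
import torch
from torch.utils.data import TensorDataset

toy_dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
loader = BaseDataLoader(toy_dataset, batch_size=16, shuffle=True,
                        validation_split=0.1, num_workers=0)
valid_loader = loader.split_validation()  # loader over the held-out 10%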
def __init__(self, config, resume=None, modification=None, run_id=None):
    """
    Class to parse a configuration json file. Handles hyper-parameters for
    training, initialization of modules, checkpoint saving, and the logging
    module.

    :param config: Dict containing configurations and hyper-parameters for
        training, e.g. the contents of a `config.json` file.
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values to
        be replaced in the config dict.
    :param run_id: Unique identifier for training processes, used to save
        checkpoints and the training log. Defaults to a timestamp.
    """
    # load config file and apply modification
    self._config = _update_config(config, modification)
    self.resume = resume

    # set save_dir where the trained model and log will be saved.
    save_dir = Path(self.config['trainer']['save_dir'])
    exper_name = self.config['name']
    if run_id is None:  # use timestamp as default run-id
        run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    self._save_dir = save_dir / 'models' / exper_name / run_id
    self._log_dir = save_dir / 'log' / exper_name / run_id
    self._predictions_dir = save_dir / 'predictions' / exper_name / run_id
    self.predictions_file_name = self._predictions_dir / 'predictions.csv'

    # make directories for saving checkpoints and logs.
    exist_ok = run_id == ''
    self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.log_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.predictions_dir.mkdir(parents=True, exist_ok=exist_ok)

    # save the updated config file to the checkpoint dir
    write_json(self.config, self.save_dir / 'config.json')

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
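
# A minimal instantiation sketch for the parser above. The class name
# `ConfigParser` and the ';'-separated keychain syntax for `modification`
# are assumptions borrowed from the common pytorch-template convention,
# not confirmed by the source.
config = read_json(Path('config.json'))
modification = {'optimizer;args;lr': 5e-4}   # hypothetical override
parser = ConfigParser(config, resume=None, modification=modification,
                      run_id='debug_run')
print(parser.save_dir)   # e.g. saved/models/<name>/debug_run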
def init_params(cls, config_fname=None):
    """
    Initialize and set up params from the config file.

    Parameters
    ----------
    config_fname : str, optional
        Path to the configuration file; defaults to `config.json` in the
        project root.

    Returns
    -------
    An instance of this class built from the parsed configuration.
    """
    if config_fname is None:
        config_fname = os.path.join(Path(__file__).parent.parent, 'config.json')
        logging.info('Using default configuration file: %s', config_fname)

    # parse the config file
    _config = read_json(config_fname)

    # specify the saving directory
    proj_name = _config['project_name']
    save_root_dir = Path(_config['resparams']['save_dir'])
    run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    save_model_dir = save_root_dir / proj_name / 'models'
    save_log_dir = save_root_dir / proj_name / 'logs'
    save_model_dir.mkdir(parents=True, exist_ok=True)
    save_log_dir.mkdir(parents=True, exist_ok=True)

    _config['resparams']['run_id'] = run_id
    _config['resparams']['save_model_dir'] = str(save_model_dir)
    _config['resparams']['save_log_dir'] = str(save_log_dir)

    # set up logging
    setup_logging(save_log_dir, run_id=run_id)

    return cls(_config)
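
# The keys read by init_params imply a config.json shaped roughly like the
# dict below; everything except 'project_name' and 'resparams'/'save_dir'
# is an illustrative assumption, not taken from the source.
EXAMPLE_CONFIG = {
    "project_name": "my_experiment",   # hypothetical project name
    "resparams": {
        "save_dir": "saved/"           # hypothetical root directory
    }
}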
import sys
from datetime import datetime
from logging import CRITICAL, getLogger
from multiprocessing import Process
from shutil import rmtree
from time import sleep

from algthm.utils.file import dir_empty
from cfg.loader import cfg
from dex.core.db import MongoConnection
from dex.core.exceptions.indexer import IndexerBootFailure
from elasticsearch import Elasticsearch, ElasticsearchException

import worker
from logger import logger

logger.setup_logging()
logger = logger.get_logger('dex')
pika_logger = getLogger('pika')
pika_logger.setLevel(CRITICAL)


def initialize_workers(num_workers, target, daemon=True):
    """
    Initializes the worker processes.
    """
    workers = []
    process = None
    print('> initializing {} workers ..'.format(num_workers), end=' ')
    for i in range(num_workers):
criterion = eval("loss_module." + trainer_loss)

# metrics
metrics = [eval("metric_module." + met) for met in eval(trainer_metrics)]

# optimizer
optimizer = eval("optimizer_module." + trainer_optimizer)(
    **sessions.trainer_params[trainer_id]["optimizer_params"],
    params=model.parameters())

# lr_scheduler
lr_scheduler = eval("scheduler_module." + trainer_scheduler)(
    **sessions.trainer_params[trainer_id]['scheduler_params'],
    optimizer=optimizer)

setup_logging(trainer_save_path)
trainer = Trainer(
    model=model,
    criterion=criterion,
    metrics=metrics,
    optimizer=optimizer,
    epoch=trainer_epoch,
    device=device,
    data_loader=data_loader,
    valid_data_loader=valid_data_loader,
    lr_scheduler=lr_scheduler,
    sts=[trainer_stop_btn, trainer_chart, trainer_save_path])
trainer.train()

if cur == "Eval":
    eval_container = st.sidebar.beta_container()
def __init__(self, config, resume=None, modification=None, run_id=None):
    """
    Class to parse a configuration json file. Handles hyperparameters for
    training, initialization of modules, checkpoint saving, and the logging
    module.

    :param config: Dict containing configurations and hyperparameters for
        training, e.g. the contents of a `rnn_config.json` file.
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values to
        be replaced in the config dict.
    :param run_id: Unique identifier for training processes, used to save
        checkpoints and the training log. Defaults to a timestamp.
    """
    # unique run identifier; use a timestamp when none is given
    if run_id is None:
        run_id = datetime.now().strftime(r'%m%d_%H%M%S')

    # load the config file and apply the modification dict
    # (this must happen first, before the config is read below)
    self._config = _update_config(config, modification)

    # experiment name
    exper_name = self.config['name']
    # path to the checkpoint to resume from
    self.resume = resume

    # root directory for saved models, model records, and logs
    save_dir = Path(self.config['trainer']['saved'])
    # pathlib's `/` operator joins path components
    self._save_dir = save_dir / 'models' / run_id
    self._log_dir = save_dir / 'log' / run_id

    # create the model and log directories:
    # parents=True creates missing parent directories;
    # exist_ok=True suppresses the error if the directory already exists
    # (only allowed here when run_id is the empty string)
    exist_ok = run_id == ''
    self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.log_dir.mkdir(parents=True, exist_ok=exist_ok)

    # save the updated config for this run_id to the checkpoint dir
    # NOTE: the 'config.json' filename is hardcoded and should be changed
    write_json(self.config, self.save_dir / 'config.json')

    # configure logging and its verbosity levels
    setup_logging(self.log_dir)
    self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
def __init__(self, data_dir, gold_type, domain_split, length_split,
             train=True, valid=False, debug=False):
    """
    Initializes the Break dataset.

    :param data_dir (str): path to the directory in which to save/from
        which to read the dataset.
    :param train: True to load the train data, False to load the test data.
    :param valid: ignored if train is False. If train is True, valid=True
        loads the validation data and valid=False loads the train data.
    :param debug: True to use a small subset of the dataset.
    """
    # Define logger
    if not LOGGER_SETUP:
        setup_logging()
    self.logger = logging.getLogger('BREAKLogical')
    self.logger.setLevel(logging.INFO)
    self.logger.info("Preparing dataset")

    super(BREAKLogical, self).__init__()
    self.gold_type = gold_type
    self.domain_split = domain_split
    self.length_split = length_split

    # Load dataset and lexicon
    self.dataset_split = 'test'
    if train:
        self.dataset_split = 'train'
        if valid:
            self.dataset_split = 'validation'

    self.logger.info('loading data split: ' + self.dataset_split)
    self.logger.info('loading vanilla dataset')
    self.dataset_logical = self.load_dataset(data_dir, 'logical-forms', self.logger)

    if self.domain_split:
        self.logger.info('loading domain split dataset')
        self.dataset_logical = self.load_domain_split_dataset(data_dir, self.logger)
    elif self.length_split:
        self.logger.info('loading length split dataset')
        self.dataset_logical = self.load_length_split_dataset(data_dir, self.logger)
    self.logger.info('dataset ready.')

    # Download the spacy language model if it is missing
    if not spacy.util.is_package("en_core_web_sm"):
        self.logger.info('Downloading spacy english core...')
        run(['python', '-m', 'spacy', 'download', 'en'])

    # Prepare the data parts
    self.ids = self.dataset_logical[self.dataset_split]['question_id']
    self.questions = self.dataset_logical[self.dataset_split]['question_text']

    # the lexicon is based on the vanilla/domain_split variant of dataset_logical
    self.lexicon_str = self.get_lexicon()[self.dataset_split]
    self.logger.info('dataset and lexicon ready.')

    # uses QDMR
    self.qdmrs = [format_qdmr(decomp)
                  for decomp in self.dataset_logical[self.dataset_split]["decomposition"]]
    self.programs = self.get_programs()

    if debug:
        self.ids = self.ids[:DEBUG_EXAMPLES_AMOUNT]
        self.questions = self.questions[:DEBUG_EXAMPLES_AMOUNT]
        self.qdmrs = self.qdmrs[:DEBUG_EXAMPLES_AMOUNT]
        self.lexicon_str = self.lexicon_str[:DEBUG_EXAMPLES_AMOUNT]
        self.programs = self.programs[:DEBUG_EXAMPLES_AMOUNT]
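
# A hedged instantiation sketch; the data directory and the gold_type value
# are illustrative assumptions, not taken from the source.
dataset = BREAKLogical(data_dir='data/break/', gold_type='qdmr',
                       domain_split=False, length_split=False,
                       train=True, valid=False, debug=True)
print(len(dataset.questions), 'debug examples loaded')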
def __init__(self, args, options='', timestamp=True):
    # parse default and custom cli options
    for opt in options:
        args.add_argument(*opt.flags, default=None, type=opt.type)
    args = args.parse_args()
    self.args = args

    # set config file from arguments (dataset, lr scheduler, loss fn)
    cfg_fname = None
    if args.dataset and args.lr_scheduler and args.loss_fn:
        cfg_fname = ('./hyperparams/' + args.lr_scheduler + '/config_'
                     + args.dataset + '_' + args.loss_fn + '_'
                     + args.arch + '.json')

    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    if args.resume is None:
        msg_no_cfg = ("A configuration file needs to be specified. "
                      "Add '-c config.json', for example.")
        if cfg_fname is not None:
            self.cfg_fname = Path(cfg_fname)
        else:
            assert args.config is not None, msg_no_cfg
            self.cfg_fname = Path(args.config)
        config = read_json(self.cfg_fname)
        self.resume = None
    else:
        self.resume = Path(args.resume)
        resume_cfg_fname = self.resume.parent / 'config.json'
        config = read_json(resume_cfg_fname)
        if args.config is not None:
            config.update(read_json(Path(args.config)))

    # load config file and apply custom cli options
    self._config = _update_config(config, options, args)

    # set save_dir where the trained model and log will be saved.
    save_dir = Path(self.config['trainer']['save_dir'])
    dataset_name = self.config['name'].split('_')[0]
    model_type = self.config['arch']['type']
    lr_scheduler = self.config['lr_scheduler']['type']
    loss_fn = self.config['train_loss']['type']
    sym_setting = 'sym' if not self.config['trainer']['asym'] else 'asym'
    percent = str(int(self.config['trainer']['percent'] * 100))

    if args.distillation:
        distill_mode = args.distill_mode
        seed = args.dataseed
        sub_dir = (Path(dataset_name) / model_type / lr_scheduler / loss_fn
                   / sym_setting / percent / distill_mode / str(int(seed)))
    else:
        sub_dir = (Path(dataset_name) / model_type / lr_scheduler / loss_fn
                   / sym_setting / percent)
    self._save_dir = save_dir / 'models' / sub_dir
    self._log_dir = save_dir / 'log' / sub_dir

    self.save_dir.mkdir(parents=True, exist_ok=True)
    self.log_dir.mkdir(parents=True, exist_ok=True)

    # save the updated config file to the checkpoint dir
    config_name = 'config_' + str(self.config['seed']) + '.json'
    write_json(self.config, self.save_dir / config_name)

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}