def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.output, args.run_id, args.config_file_name)
    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    # train_ds, val_ds
    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl
    train_dl, val_dl = create_data_loaders(
        train_ds, val_ds,
        num_workers=config.data.num_workers,
        batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    model = make_model(**asdict(config.model, recurse=False), n_features=n_features)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    optimizer = getattr(optim, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():
        # run training
        result = fit(
            **asdict(config.training),
            model=model,
            loss_func=loss_func,
            optimizer=optimizer,
            scheduler=scheduler,
            train_dl=train_dl,
            valid_dl=val_dl,
            config=config,
            device=dev,
            output_dir=paths.output_dir,
            tensorboard_output_path=paths.tensorboard_output_path)

    dump_experiment_result(args, config, paths.output_dir, result)
    assert_expected_metrics(result, config.expected_metrics)
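# A minimal usage sketch for the training entry point above. The flag names are
# assumptions inferred from the PathsContainer.from_args call (args.output,
# args.run_id, args.config_file_name); the real parse_args may differ:
#
#   python main.py --output /tmp/out --run-id my_run --config-file-name config.json
#
if __name__ == "__main__":
    run()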
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)
    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    datasets = {role: load_libsvm_dataset_role(role, config.data.path, config.data.slate_length)
                for role in args.roles}

    n_features = [ds.shape[-1] for ds in datasets.values()]
    assert all_equal(n_features), f"Last dimensions of datasets must match but got {n_features}"

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features[0], **asdict(config.model, recurse=False))
    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    assert config.click_model is not None, "click_model must be defined in config for this run"
    click_model = instantiate_from_recursive_name_args(name_args=config.click_model)

    ranked_slates = rank_slates(datasets, model, config)

    clicked_slates = {role: click_on_slates(slates, click_model, include_empty=False)
                      for role, slates in ranked_slates.items()}

    # save clickthrough datasets
    for role, slates in clicked_slates.items():
        write_to_libsvm_without_masked(os.path.join(paths.output_dir, f"{role}.txt"), *slates)

    # calculate metrics
    metered_slates = {role: metrics_on_clicked_slates(slates) for role, slates in clicked_slates.items()}

    for role, metrics in metered_slates.items():
        metrics_df = pd.DataFrame(metrics)
        logger.info(f"{role} metrics summary:")
        logger.info(metrics_df.mean())
        metrics_df.to_csv(os.path.join(paths.output_dir, f"{role}_metrics.csv"), index=False)
        pd.DataFrame(metrics_df.mean()).T.to_csv(
            os.path.join(paths.output_dir, f"{role}_metrics_mean.csv"), index=False)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)
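# `all_equal` above is assumed to be a small project utility; a minimal sketch
# of one possible implementation (hypothetical, the real helper may differ):
def all_equal(iterable):
    # True when every element equals the first; vacuously True for empty input
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        return True
    return all(x == first for x in iterator)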
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)
    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    # load dstore and use as feature func
    dstore = Dstore(**config.dstore)
    n_features = train_ds.shape[-1]
    n_features = dstore.get_n_features(n_features, config)

    train_dl, val_dl = create_data_loaders(
        train_ds, val_ds,
        num_workers=config.data.num_workers,
        batch_size=config.data.batch_size,
        dstore=dstore)

    if dstore.prefetch:
        dstore.run_prefetch([train_dl, val_dl])

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features, dstore=dstore, **asdict(config.model, recurse=False))
    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    datasets = {'vali': val_dl}
    ranked_slates = rank_slates(datasets, model, dstore, config)

    # save output
    for role, out in ranked_slates.items():
        write_out_dir(paths.output_dir, role, out, dstore)

    print('DONE')
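# The Dstore above is built from config and consulted at several call sites.
# A hypothetical interface stub inferred purely from that usage (attribute and
# method names mirror the calls in run(); the real class may differ):
from typing import Protocol, Sequence

from torch.utils.data import DataLoader


class DstoreLike(Protocol):
    prefetch: bool  # when True, run() warms the store via run_prefetch

    def get_n_features(self, n_features: int, config: "Config") -> int:
        # return the feature width after datastore features are added
        ...

    def run_prefetch(self, data_loaders: Sequence[DataLoader]) -> None:
        # pre-load datastore entries for every batch in the given loaders
        ...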
def run(args):
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info(f"created paths container {paths}")

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    logger.info("Shared in main: {}".format(config.data.shared))

    # train_ds, val_ds, test_ds
    train_ds, val_ds, test_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
        test_ds_role=config.data.test_ds_role,
        sigma=config.data.noise,
        shared=config.data.shared)

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl, test_dl
    train_dl, val_dl, test_dl = create_data_loaders(
        train_ds, val_ds, test_ds,
        num_workers=config.data.num_workers,
        batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    use_distillation = bool(config.distillation_loss)
    full_pipeline = use_distillation and "full" in config.distillation_loss.name
    # when teacher and student heads differ in width, project the student
    # hidden states to the teacher size via fit_size
    fit_size = None
    if full_pipeline and config.teacher_model.fc_model['sizes'][-1] != config.model.fc_model['sizes'][-1]:
        fit_size = config.teacher_model.fc_model['sizes'][-1]
    logger.info("Fit size: {}".format(fit_size))

    model = make_model(n_features=n_features,
                       **asdict(config.model, recurse=False),
                       fit_size=fit_size,
                       distillation=full_pipeline,
                       seq_len=config.data.slate_length)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    if hasattr(optim, config.optimizer.name):
        optimizer = getattr(optim, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)
    else:
        # a lookup in a custom `optimizers` module used to live here (commented
        # out in the original); fail loudly instead of leaving `optimizer` unbound
        raise ValueError("Optimizer {} not found in torch.optim".format(config.optimizer.name))

    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)

    if args.evaluate:
        test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
        logger.info(test_metrics)
        sys.exit()

    if use_distillation:
        if full_pipeline:
            assert config.teacher_model.transformer.h == config.model.transformer.h, \
                "teacher and student transformer.h must match"
        teacher_model = make_model(n_features=n_features,
                                   **asdict(config.teacher_model, recurse=False),
                                   distillation=full_pipeline,
                                   fit_size=None)
        if torch.cuda.device_count() > 1:
            teacher_model = CustomDataParallel(teacher_model)
            logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
        teacher_model.to(dev)

        loss_func = partial(getattr(losses, config.distillation_loss.name),
                            gt_loss_func=loss_func,
                            **config.distillation_loss.args)

        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            result, model = fit_with_distillation(
                student_model=model,
                teacher_model=teacher_model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                full=full_pipeline,
                **asdict(config.training))
    else:
        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            # run training
            result, model = fit(
                model=model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                **asdict(config.training))

    # reload the best checkpoint and evaluate on the test set
    sd = torch.load(os.path.join(paths.output_dir, "best_model.pkl"))
    model.load_state_dict(sd)
    test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
    result['test_metrics'] = test_metrics
    logger.info(result)

    dump_experiment_result(args, config, paths.output_dir, result)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)

    assert_expected_metrics(result, config.expected_metrics)
    return test_metrics['ndcg_10']
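# `dummy_context_mgr` is used in all of the run() variants above as a no-op
# stand-in when anomaly detection is disabled. A minimal sketch of such a
# helper (the project's own utility may be implemented differently):
import contextlib


@contextlib.contextmanager
def dummy_context_mgr():
    # nothing happens on enter or exit, so the `with` block runs unwrapped
    yield None

# On Python 3.7+, contextlib.nullcontext() provides the same behavior out of the box.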