Example #1
def copy_file_to_local(uri: str) -> str:
    # Download the GCS object into a fresh temporary directory and return its local path.
    temp_dir = tempfile.mkdtemp()
    local_path = os.path.join(temp_dir, "local_file")
    command = "gsutil cp {gs_uri} {local_path}".format(gs_uri=uri,
                                                       local_path=local_path)
    execute_command(command)
    return local_path
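Every example on this page delegates shell commands to an execute_command helper that is not reproduced here. A minimal sketch of what such a helper could look like, assuming it simply shells out via subprocess and fails loudly on a non-zero exit code (an illustration, not the library's actual implementation):

import subprocess

def execute_command(command: str) -> None:
    # Hypothetical stand-in for the execute_command helper used in these examples:
    # run the command through the shell and raise CalledProcessError on failure.
    subprocess.run(command, shell=True, check=True)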
Example #2
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.output, args.run_id,
                                     args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(
        output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    # train_ds, val_ds
    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], \
        "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl
    train_dl, val_dl = create_data_loaders(train_ds,
                                           val_ds,
                                           num_workers=config.data.num_workers,
                                           batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    model = make_model(**asdict(config.model, recurse=False),
                       n_features=n_features)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    optimizer = getattr(optim,
                        config.optimizer.name)(params=model.parameters(),
                                               **config.optimizer.args)
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(
            optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():
        # run training
        result = fit(**asdict(config.training),
                     model=model,
                     loss_func=loss_func,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     train_dl=train_dl,
                     valid_dl=val_dl,
                     config=config,
                     device=dev,
                     output_dir=paths.output_dir,
                     tensorboard_output_path=paths.tensorboard_output_path)

    dump_experiment_result(args, config, paths.output_dir, result)

    assert_expected_metrics(result, config.expected_metrics)
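The dummy_context_mgr used above as the no-op alternative to torch.autograd.detect_anomaly() is another external helper. A minimal sketch of a null context manager that would serve this purpose (an assumption, not necessarily the library's own code):

from contextlib import contextmanager

@contextmanager
def dummy_context_mgr():
    # No-op context manager: no setup or teardown, used when anomaly detection is disabled.
    yield None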
Example #3
def clean_up(path):
    rm_command = "rm -rf {path}".format(path=path)
    execute_command(rm_command)
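copy_file_to_local (Example #1) and clean_up form a natural download/cleanup pair. A short, hypothetical usage sketch (the function name below is a placeholder, not part of the library):

def read_remote_file(uri: str) -> str:
    # Fetch the remote object, read it, then remove the temporary directory.
    local_path = copy_file_to_local(uri)
    try:
        with open(local_path) as f:
            return f.read()
    finally:
        clean_up(os.path.dirname(local_path))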
Example #4
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id,
                                     args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(
        output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    # load dstore and use as feature func
    dstore = Dstore(**config.dstore)
    n_features = train_ds.shape[-1]
    n_features = dstore.get_n_features(n_features, config)

    train_dl, val_dl = create_data_loaders(train_ds,
                                           val_ds,
                                           num_workers=config.data.num_workers,
                                           batch_size=config.data.batch_size,
                                           dstore=dstore)

    if dstore.prefetch:
        dstore.run_prefetch([train_dl, val_dl])

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features,
                       dstore=dstore,
                       **asdict(config.model, recurse=False))

    model.load_state_dict(load_state_dict_from_file(args.input_model_path,
                                                    dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    datasets = {'vali': val_dl}

    ranked_slates = rank_slates(datasets, model, dstore, config)

    # save output
    for role, out in ranked_slates.items():
        write_out_dir(paths.output_dir, role, out, dstore)

    print('DONE')
Example #5
def copy_local_to_gs(source_local: str, destination_uri: str) -> None:
    command = "gsutil cp -r {source_local}/* {destination_uri}".format(
        source_local=source_local, destination_uri=destination_uri)
    execute_command(command)
Example #6
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    datasets = {role: load_libsvm_dataset_role(role, config.data.path, config.data.slate_length) for role in args.roles}

    n_features = [ds.shape[-1] for ds in datasets.values()]
    assert all_equal(n_features), f"Last dimensions of datasets must match but got {n_features}"

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features[0], **asdict(config.model, recurse=False))

    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    assert config.click_model is not None, "click_model must be defined in config for this run"
    click_model = instantiate_from_recursive_name_args(name_args=config.click_model)

    ranked_slates = rank_slates(datasets, model, config)

    clicked_slates = {role: click_on_slates(slates, click_model, include_empty=False) for role, slates in ranked_slates.items()}

    # save clickthrough datasets
    for role, slates in clicked_slates.items():
        write_to_libsvm_without_masked(os.path.join(paths.output_dir, f"{role}.txt"), *slates)

    # calculate metrics
    metered_slates = {role: metrics_on_clicked_slates(slates) for role, slates in clicked_slates.items()}

    for role, metrics in metered_slates.items():
        metrics_df = pd.DataFrame(metrics)
        logger.info(f"{role} metrics summary:")
        logger.info(metrics_df.mean())
        metrics_df.to_csv(os.path.join(paths.output_dir, f"{role}_metrics.csv"), index=False)
        pd.DataFrame(metrics_df.mean()).T.to_csv(os.path.join(paths.output_dir, f"{role}_metrics_mean.csv"), index=False)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)
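The all_equal check in the assertion above is another small helper that is not shown on this page. A plausible implementation, included here only for illustration:

def all_equal(values) -> bool:
    # Hypothetical helper: True when every element of the iterable is identical.
    values = list(values)
    return all(v == values[0] for v in values)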
Example #7
def run(args):
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    paths = PathsContainer.from_args(args.job_dir, args.run_id,
                                     args.config_file_name)

    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info(f"created paths container {paths}")

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    print("Shared in main", config.data.shared)
    # train_ds, val_ds, test_ds
    train_ds, val_ds, test_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
        test_ds_role=config.data.test_ds_role,
        sigma=config.data.noise,
        shared=config.data.shared)

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], \
        "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl, test_dl
    train_dl, val_dl, test_dl = create_data_loaders(
        train_ds,
        val_ds,
        test_ds,
        num_workers=config.data.num_workers,
        batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model

    use_distillation = bool(config.distillation_loss)
    full_pipeline = use_distillation and "full" in config.distillation_loss.name
    # When running the full distillation pipeline, pass the teacher's last FC layer size
    # as fit_size (so the student can be projected onto it), but only if the sizes differ.
    fit_size = None
    if full_pipeline:
        teacher_out_size = config.teacher_model.fc_model['sizes'][-1]
        student_out_size = config.model.fc_model['sizes'][-1]
        if teacher_out_size != student_out_size:
            fit_size = teacher_out_size
    print("Fit size", fit_size)
    model = make_model(n_features=n_features,
                       **asdict(config.model, recurse=False),
                       fit_size=fit_size,
                       distillation=full_pipeline,
                       seq_len=config.data.slate_length)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    if hasattr(optim, config.optimizer.name):
        optimizer = getattr(optim,
                            config.optimizer.name)(params=model.parameters(),
                                                   **config.optimizer.args)
    #if hasattr(optimizers, config.optimizer.name):
    #    optimizer = getattr(optimizers, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(
            optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)

    if args.evaluate:
        test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
        print(test_metrics)
        sys.exit()

    if use_distillation:
        if full_pipeline:
            assert config.teacher_model.transformer.h == config.model.transformer.h
        teacher_model = make_model(n_features=n_features,
                                   **asdict(config.teacher_model,
                                            recurse=False),
                                   distillation=full_pipeline,
                                   fit_size=None)
        if torch.cuda.device_count() > 1:
            teacher_model = CustomDataParallel(teacher_model)
            logger.info(
                "Model training will be distributed to {} GPUs.".format(
                    torch.cuda.device_count()))
        teacher_model.to(dev)
        loss_func = partial(getattr(losses, config.distillation_loss.name),
                            gt_loss_func=loss_func,
                            **config.distillation_loss.args)
        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            result, model = fit_with_distillation(
                student_model=model,
                teacher_model=teacher_model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                full=full_pipeline,
                **asdict(config.training))

    else:
        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            # run training
            result, model = fit(
                model=model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                **asdict(config.training))
    # Reload the best model checkpoint before computing test metrics
    sd = torch.load(os.path.join(paths.output_dir, "best_model.pkl"))
    model.load_state_dict(sd)
    test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
    result['test_metrics'] = test_metrics
    print(result)
    dump_experiment_result(args, config, paths.output_dir, result)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)

    assert_expected_metrics(result, config.expected_metrics)
    return test_metrics['ndcg_10']