Example #1
def train_eval_lightning(
    train_dataset,
    eval_dataset,
    trainer_module,
    num_epochs,
    use_gpu,
    batch_preprocessor=None,
    reader_options: Optional[ReaderOptions] = None,
    checkpoint_path: Optional[str] = None,
) -> pl.Trainer:
    reader_options = reader_options or ReaderOptions()
    datamodule = PetastormLightningDataModule(train_dataset, eval_dataset,
                                              batch_preprocessor,
                                              reader_options)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    # max_epochs is intentionally inflated; StoppingEpochCallback(num_epochs)
    # stops training once the requested number of epochs has run.
    trainer = pl.Trainer(
        max_epochs=num_epochs * 1000,
        gpus=int(use_gpu),
        reload_dataloaders_every_epoch=True,
        resume_from_checkpoint=checkpoint_path,
        callbacks=[StoppingEpochCallback(num_epochs)],
    )
    trainer.fit(trainer_module, datamodule=datamodule)
    # TODO: evaluate
    return trainer
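
A minimal, hypothetical call sketch for Example #1. The dataset objects, the LightningModule-based trainer_module, and the ReaderOptions import path are assumptions that depend on the surrounding ReAgent version; they are not part of the example above.

# Hedged usage sketch; every name below is a placeholder, not part of the example.
from reagent.workflow.types import ReaderOptions  # assumed import path; may differ by version

trainer = train_eval_lightning(
    train_dataset=train_dataset,      # Petastorm-backed training dataset
    eval_dataset=eval_dataset,        # optional evaluation dataset
    trainer_module=trainer_module,    # a pl.LightningModule built elsewhere
    num_epochs=10,
    use_gpu=False,                    # set True when CUDA is available
    reader_options=ReaderOptions(),   # default reader settings
)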
Example #2
def identify_and_train_network(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: Optional[bool] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
) -> RLTrainingOutput:
    if use_gpu is None:
        use_gpu = torch.cuda.is_available()

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()

    manager = model.value

    normalization_data_map = None
    setup_data = None

    data_module = manager.get_data_module(
        input_table_spec=input_table_spec,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
    )
    if data_module is not None:
        data_module.prepare_data()
        setup_data = data_module.setup_data
    else:
        normalization_data_map = manager.run_feature_identification(
            input_table_spec)

    return query_and_train(
        input_table_spec,
        model,
        num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
        validator=validator,
        publisher=publisher,
    )
Example #3
    def train_workflow(
        self,
        train_dataset: Dataset,
        eval_dataset: Optional[Dataset],
        normalization_data_map: Dict[str, NormalizationData],
        num_epochs: int,
        use_gpu: bool,
        named_model_ids: ModuleNameToEntityId,
        child_workflow_id: int,
        reward_options: Optional[RewardOptions] = None,
        reader_options: Optional[ReaderOptions] = None,
        resource_options: Optional[ResourceOptions] = None,
        warmstart_path: Optional[str] = None,
    ) -> RLTrainingOutput:
        writer = SummaryWriter()
        logger.info("TensorBoard logging location is: {}".format(
            writer.log_dir))

        warmstart_input_path = warmstart_path or None
        self.initialize_trainer(
            use_gpu=use_gpu,
            # pyre-fixme[6]: Expected `RewardOptions` for 2nd param but got
            #  `Optional[RewardOptions]`.
            reward_options=reward_options,
            normalization_data_map=normalization_data_map,
            warmstart_path=warmstart_input_path,
        )

        if not reader_options:
            reader_options = ReaderOptions()

        with summary_writer_context(writer):
            train_output = self.train(train_dataset, eval_dataset, num_epochs,
                                      reader_options)

        output_paths = {}
        for module_name, serving_module in self.build_serving_modules().items():
            # TODO: make the output path a parameter
            torchscript_output_path = f"model_{round(time.time())}.torchscript"
            torch.jit.save(serving_module, torchscript_output_path)
            logger.info(f"Saved {module_name} to {torchscript_output_path}")
            output_paths[module_name] = torchscript_output_path
        return dataclasses.replace(train_output, output_paths=output_paths)
Example #4
    def __init__(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        reader_options: Optional[ReaderOptions] = None,
        model_manager=None,
    ):
        super().__init__()
        self.input_table_spec = input_table_spec
        self.reward_options = reward_options or RewardOptions()
        self.reader_options = reader_options or ReaderOptions()
        self._model_manager = model_manager
        self.setup_data = setup_data
        self.saved_setup_data = saved_setup_data or {}

        self._setup_done = False
Example #5
def train_and_evaluate_generic(
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    trainer: RLTrainer,
    num_epochs: int,
    use_gpu: bool,
    batch_preprocessor: BatchPreprocessor,
    reporter: Observer,
    evaluator: Evaluator,
    reader_options: Optional[ReaderOptions] = None,
) -> None:
    reader_options = reader_options or ReaderOptions()
    epoch_iterator = EpochIterator(num_epochs=num_epochs)
    # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
    train_dataset_size = get_table_row_count(train_dataset.parquet_url)
    # pyre-fixme[16]: `EpochIterator` has no attribute `add_observer`.
    for epoch in epoch_iterator.add_observer(reporter):
        logger.info(f"Starting training epoch {epoch}.")
        dataloader = get_petastorm_dataloader(
            dataset=train_dataset,
            # pyre-fixme[6]: Expected `int` for 2nd param but got `Optional[int]`.
            batch_size=trainer.minibatch_size,
            batch_preprocessor=batch_preprocessor,
            use_gpu=use_gpu,
            reader_options=reader_options,
        )
        dataloader_wrapper = DataLoaderWrapper(
            dataloader=dataloader, dataloader_size=train_dataset_size)
        for batch in dataloader_wrapper:
            trainer.train(batch)

        if eval_dataset is not None:
            eval_data = gather_eval_data(
                trainer=trainer,
                eval_dataset=eval_dataset,
                batch_preprocessor=batch_preprocessor,
                use_gpu=use_gpu,
                reader_options=reader_options,
            )
            # evaluator passes cpe_details to reporter via notify_observers
            evaluator.evaluate_post_training(eval_data)
Example #6
    def __init__(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        reader_options: Optional[ReaderOptions] = None,
        resource_options: Optional[ResourceOptions] = None,
        model_manager=None,
    ):
        super().__init__()
        self.input_table_spec = input_table_spec
        self.reward_options = reward_options or RewardOptions()
        self.reader_options = reader_options or ReaderOptions()
        self.resource_options = resource_options or ResourceOptions(gpu=0)
        self._model_manager = model_manager
        self.setup_data = setup_data
        self.saved_setup_data = saved_setup_data or {}

        self._setup_done = False
        self._num_train_data_loader_calls = 0
        self._num_val_data_loader_calls = 0
        self._num_test_data_loader_calls = 0
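
Examples #4 and #6 show constructors of Lightning-style data modules that default their option objects. A hedged instantiation sketch follows; `MyReAgentDataModule` is a stand-in name for whichever class defines the __init__ above, and the import path is an assumption that may differ across ReAgent versions.

# Hedged sketch; MyReAgentDataModule is a placeholder class name.
from reagent.workflow.types import ReaderOptions, ResourceOptions, RewardOptions

dm = MyReAgentDataModule(
    input_table_spec=None,                   # or a TableSpec describing the input table
    reward_options=RewardOptions(),
    reader_options=ReaderOptions(),
    resource_options=ResourceOptions(gpu=0),  # CPU-only, mirroring the default above
    setup_data=None,
)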
Example #7
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: bool,
    *,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    normalization_data_map: Optional[Dict[str, NormalizationData]] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    named_model_ids: Optional[ModuleNameToEntityId] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    child_workflow_id = get_workflow_id()
    if named_model_ids is None:
        named_model_ids = get_new_named_entity_ids(model.value.serving_module_names())

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value

    if saved_setup_data is not None:

        def _maybe_get_bytes(v) -> bytes:
            if isinstance(v, bytes):
                return v

            # HACK: FBLearner sometimes packs bytes into a Blob
            return v.data

        saved_setup_data = {k: _maybe_get_bytes(v) for k, v in saved_setup_data.items()}

    if setup_data is None:
        data_module = manager.get_data_module(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            reader_options=reader_options,
            saved_setup_data=saved_setup_data,
        )
        if data_module is not None:
            setup_data = data_module.prepare_data()
            # Throw away existing normalization data map
            normalization_data_map = None

    if sum([int(setup_data is not None), int(normalization_data_map is not None)]) != 1:
        raise ValueError(
            "Exactly one of setup_data and normalization_data_map must be provided"
        )

    train_dataset = None
    eval_dataset = None
    if normalization_data_map is not None:
        calc_cpe_in_training = manager.should_generate_eval_dataset
        sample_range_output = get_sample_range(input_table_spec, calc_cpe_in_training)
        train_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.train_sample_range,
            reward_options=reward_options,
        )
        eval_dataset = None
        if calc_cpe_in_training:
            eval_dataset = manager.query_data(
                input_table_spec=input_table_spec,
                sample_range=sample_range_output.eval_sample_range,
                reward_options=reward_options,
            )

    logger.info("Starting training")
    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        num_epochs=num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        named_model_ids=named_model_ids,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            named_model_ids,
            child_workflow_id,
            recurring_period,
        )

    return results
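
A hedged end-to-end sketch of Example #7's query_and_train. The table name, the model union object, and the import path are assumptions and may not match a given ReAgent release.

# Hedged usage sketch; `model` is a ModelManager__Union assumed to be built elsewhere.
from reagent.workflow.types import (  # assumed import path; may differ by version
    ReaderOptions,
    ResourceOptions,
    RewardOptions,
    TableSpec,
)

results = query_and_train(
    input_table_spec=TableSpec(table_name="my_training_table"),  # hypothetical table
    model=model,
    num_epochs=5,
    use_gpu=False,
    reward_options=RewardOptions(),
    reader_options=ReaderOptions(),
    resource_options=ResourceOptions(),
)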
Example #8
def train_workflow(
    model_manager: ModelManager,
    train_dataset: Optional[Dataset],
    eval_dataset: Optional[Dataset],
    *,
    num_epochs: int,
    use_gpu: bool,
    named_model_ids: ModuleNameToEntityId,
    child_workflow_id: int,
    setup_data: Optional[Dict[str, bytes]] = None,
    normalization_data_map: Optional[Dict[str, NormalizationData]] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
) -> RLTrainingOutput:
    writer = SummaryWriter()
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    if setup_data is not None:
        data_module = model_manager.get_data_module(
            setup_data=setup_data,
            reward_options=reward_options,
            reader_options=reader_options,
            resource_options=resource_options,
        )
        assert data_module is not None
        data_module.setup()
    else:
        data_module = None

    if normalization_data_map is None:
        assert data_module is not None
        normalization_data_map = data_module.get_normalization_data_map()

    warmstart_input_path = warmstart_path or None
    trainer_module = model_manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=reward_options,
        normalization_data_map=normalization_data_map,
    )

    if not reader_options:
        reader_options = ReaderOptions()

    if not resource_options:
        resource_options = ResourceOptions()

    with summary_writer_context(writer):
        train_output, lightning_trainer = model_manager.train(
            trainer_module,
            train_dataset,
            eval_dataset,
            None,
            data_module,
            num_epochs,
            reader_options,
            resource_options,
            checkpoint_path=warmstart_input_path,
        )

    output_paths = {}
    for module_name, serving_module in model_manager.build_serving_modules(
            trainer_module, normalization_data_map).items():
        torchscript_output_path = f"{model_manager.__class__.__name__}_{module_name}_{round(time.time())}.torchscript"
        torch.jit.save(serving_module, torchscript_output_path)
        logger.info(f"Saved {module_name} to {torchscript_output_path}")
        output_paths[module_name] = torchscript_output_path
    return dataclasses.replace(train_output, output_paths=output_paths)
Example #9
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    normalization_data_map: Dict[str, NormalizationData],
    num_epochs: int,
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    parent_workflow_id: Optional[int] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    child_workflow_id = get_workflow_id()
    if parent_workflow_id is None:
        parent_workflow_id = child_workflow_id

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value

    calc_cpe_in_training = manager.should_generate_eval_dataset
    sample_range_output = get_sample_range(input_table_spec,
                                           calc_cpe_in_training)
    train_dataset = manager.query_data(
        input_table_spec=input_table_spec,
        sample_range=sample_range_output.train_sample_range,
        reward_options=reward_options,
    )
    eval_dataset = None
    if calc_cpe_in_training:
        eval_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.eval_sample_range,
            reward_options=reward_options,
        )

    logger.info("Starting training")
    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        normalization_data_map,
        num_epochs,
        use_gpu,
        parent_workflow_id=parent_workflow_id,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            parent_workflow_id,
            child_workflow_id,
            recurring_period,
        )

    return results
Example #10
def train_and_evaluate_generic(
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    trainer: RLTrainer,
    num_epochs: int,
    use_gpu: bool,
    batch_preprocessor: BatchPreprocessor,
    train_page_handler: TrainingPageHandler,
    eval_page_handler: EvaluationPageHandler,
    reader_options: Optional[ReaderOptions] = None,
):
    reader_options = reader_options or ReaderOptions()

    train_dataset_num_rows = get_table_row_count(train_dataset.parquet_url)
    eval_dataset_num_rows = None
    if eval_dataset is not None:
        eval_dataset_num_rows = get_table_row_count(eval_dataset.parquet_url)

    logger.info(f"train_data_num: {train_dataset_num_rows}, "
                f"eval_data_num: {eval_dataset_num_rows}")

    for epoch in range(num_epochs):
        logger.info(f"Epoch {epoch} start feeding training data")
        data_reader = make_batch_reader(
            train_dataset.parquet_url,
            num_epochs=1,
            reader_pool_type=reader_options.petastorm_reader_pool_type,
        )
        with DataLoader(
                data_reader,
                batch_size=trainer.minibatch_size,
                collate_fn=collate_and_preprocess(batch_preprocessor),
        ) as data_loader:
            feed_pages(
                data_loader,
                train_dataset_num_rows,
                epoch,
                trainer.minibatch_size,
                use_gpu,
                train_page_handler,
            )

        if not eval_dataset:
            continue

        logger.info(f"Epoch {epoch} start feeding evaluation data")
        eval_data_reader = make_batch_reader(
            eval_dataset.parquet_url,
            num_epochs=1,
            reader_pool_type=reader_options.petastorm_reader_pool_type,
        )
        with DataLoader(
                eval_data_reader,
                batch_size=trainer.minibatch_size,
                collate_fn=collate_and_preprocess(batch_preprocessor),
        ) as eval_data_loader:
            feed_pages(
                eval_data_loader,
                eval_dataset_num_rows,
                epoch,
                trainer.minibatch_size,
                use_gpu,
                eval_page_handler,
            )
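
A hedged call sketch for Example #10. The trainer, preprocessor, and page handlers are placeholders that would come from the surrounding ReAgent workflow; passing eval_dataset=None skips the evaluation pass, and reader_options=None falls back to ReaderOptions() inside the function.

# Hedged usage sketch; all objects below are assumed to exist already.
train_and_evaluate_generic(
    train_dataset=train_dataset,            # Dataset exposing a parquet_url
    eval_dataset=None,                      # no CPE evaluation in this sketch
    trainer=rl_trainer,                     # an RLTrainer with minibatch_size set
    num_epochs=3,
    use_gpu=False,
    batch_preprocessor=batch_preprocessor,
    train_page_handler=train_page_handler,
    eval_page_handler=eval_page_handler,
    reader_options=None,
)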