Beispiel #1
0
def main(args):
    """
    Retrain and/or evaluate an address parser, depending on ``args.mode``.

    ``"train"`` only retrains, ``"test"`` only evaluates and ``"both"`` does
    the two in sequence. Any other mode only builds the parser.

    Args:
        args: Parsed options exposing ``model_type`` and ``mode``, plus
            ``train_dataset_path`` / ``test_dataset_path`` for the modes
            that read them.
    """
    parser = AddressParser(model_type=args.model_type, device=0)

    # Retraining and evaluation both log under the same directory.
    # NOTE(review): the directory is spelled "chekpoints"; kept as-is since
    # it is a runtime path.
    run_directory = f"./chekpoints/{args.model_type}"

    if args.mode in ("train", "both"):
        training_set = PickleDatasetContainer(args.train_dataset_path)
        retrain_options = {
            "epochs": 100,
            "batch_size": 1024,
            "num_workers": 6,
            "learning_rate": 0.001,
            "callbacks": [StepLR(step_size=20)],
            "logging_path": run_directory,
        }
        # 0.8 is presumably the train/validation split ratio -- TODO confirm
        # against ``AddressParser.retrain``.
        parser.retrain(training_set, 0.8, **retrain_options)

    if args.mode in ("test", "both"):
        evaluation_set = PickleDatasetContainer(args.test_dataset_path)

        # A test-only run resolves its checkpoint through the pre-trained
        # helper; after a retrain the "best" checkpoint is requested instead.
        checkpoint = (handle_pre_trained_checkpoint(args.model_type)
                      if args.mode == "test" else "best")

        parser.test(evaluation_set,
                    batch_size=2048,
                    num_workers=4,
                    logging_path=run_directory,
                    checkpoint=checkpoint)
Beispiel #2
0
def main(args):
    """
    Retrain an address parser, then evaluate it with the "best" checkpoint.

    Args:
        args: Parsed options exposing ``model_type``, ``train_dataset_path``,
            ``test_dataset_path``, ``epochs``, ``batch_size`` and
            ``learning_rate``.
    """
    parser = AddressParser(model_type=args.model_type, device=0)

    training_set = PickleDatasetContainer(args.train_dataset_path)
    step_scheduler = StepLR(step_size=20)

    # Both phases log under the same directory.
    # NOTE(review): the directory is spelled "chekpoints"; kept as-is since
    # it is a runtime path.
    run_directory = f"./chekpoints/{args.model_type}"

    parser.retrain(
        training_set,
        0.8,  # presumably the train/validation split ratio -- TODO confirm
        epochs=args.epochs,
        batch_size=args.batch_size,
        num_workers=6,
        learning_rate=args.learning_rate,
        callbacks=[step_scheduler],
        logging_path=run_directory,
    )

    evaluation_set = PickleDatasetContainer(args.test_dataset_path)

    parser.test(
        evaluation_set,
        batch_size=args.batch_size,
        num_workers=4,
        logging_path=run_directory,
        checkpoint="best",
    )
Beispiel #3
0
    def test_givenABPEmbAddressParser_whenTestWithConfigWithCallbacks_thenCallbackAreUse(
            self):
        # Run the suite's training helper on a BPEmb parser, evaluate it with
        # a mocked callback and check that the test-begin and test-end hooks
        # were invoked on that callback.
        parser = AddressParser(model_type=self.a_bpemb_model_type,
                               device=self.a_torch_device,
                               verbose=self.verbose)
        self.training(parser)

        hook_recorder = MagicMock()
        test_metrics = parser.test(self.test_container,
                                   batch_size=self.a_batch_size,
                                   num_workers=self.a_number_of_workers,
                                   callbacks=[hook_recorder],
                                   logging_path=self.a_checkpoints_saving_dir)

        self.assertIsNotNone(test_metrics)

        hook_recorder.assert_has_calls([call.on_test_begin({})])

        # The elapsed time cannot be predicted, hence ``ANY``.
        expected_end_logs = {
            "time": ANY,
            "test_loss": test_metrics["test_loss"],
            "test_accuracy": test_metrics["test_accuracy"],
        }
        hook_recorder.assert_has_calls([call.on_test_end(expected_end_logs)])

        # Only the hook attributes may be invoked: the callback object itself
        # must never be called directly.
        hook_recorder.assert_not_called()
Beispiel #4
0
    def test_givenAFasttextAddressParser_whenTestWithConfigWithCallbacks_thenCallbackAreUse(
        self,
    ):
        # Run the suite's training helper on a FastText parser (CPU device),
        # evaluate it with a mocked callback and check that the test-begin and
        # test-end hooks were invoked on that callback.
        parser = AddressParser(
            model_type=self.a_fasttext_model_type,
            device=self.a_cpu_device,
            verbose=self.verbose,
        )
        self.training(parser, self.training_container, self.a_number_of_workers)

        spy_callback = MagicMock()
        outcome = parser.test(
            self.test_container,
            batch_size=self.a_batch_size,
            num_workers=self.a_number_of_workers,
            callbacks=[spy_callback],
        )

        self.assertIsNotNone(outcome)

        spy_callback.assert_has_calls([call.on_test_begin({})])

        # The elapsed time cannot be predicted, hence ``ANY``.
        end_of_test_logs = {
            "time": ANY,
            "test_loss": outcome["test_loss"],
            "test_accuracy": outcome["test_accuracy"],
        }
        spy_callback.assert_has_calls([call.on_test_end(end_of_test_logs)])

        # Only the hook attributes may be invoked: the callback object itself
        # must never be called directly.
        spy_callback.assert_not_called()
Beispiel #5
0
    def test_givenABPEmbAddressParser_whenTest_thenTestOccur(self):
        # Run the suite's training helper on a BPEmb parser, then make sure
        # ``test`` hands back a result.
        parser = AddressParser(model_type=self.a_bpemb_model_type,
                               device=self.a_torch_device,
                               verbose=self.verbose)
        self.training(parser)

        test_metrics = parser.test(self.test_container,
                                   batch_size=self.a_batch_size,
                                   num_workers=self.a_number_of_workers,
                                   logging_path=self.a_checkpoints_saving_dir)

        self.assertIsNotNone(test_metrics)
Beispiel #6
0
    def test_givenAFasttextAddressParser_whenTestWithStrCkpt_thenTestOccur(
            self):
        # No training step here: the FastText parser is evaluated with the
        # checkpoint referenced by ``self.fasttext_local_path``.
        parser = AddressParser(model_type=self.a_fasttext_model_type,
                               device=self.a_torch_device,
                               verbose=self.verbose)

        test_metrics = parser.test(self.test_container,
                                   batch_size=self.a_batch_size,
                                   num_workers=self.a_number_of_workers,
                                   logging_path=self.a_checkpoints_saving_dir,
                                   checkpoint=self.fasttext_local_path)

        self.assertIsNotNone(test_metrics)
Beispiel #7
0
    def test_givenAFasttextAddressParser_whenTestWithNumWorkerAt0_thenTestOccur(self):
        # The training helper receives the zero-worker fixture.
        # NOTE(review): ``test`` itself is called with
        # ``self.a_number_of_workers``, not the zero-worker value -- confirm
        # this matches the intent expressed by the test name.
        parser = AddressParser(
            model_type=self.a_fasttext_model_type,
            device=self.a_cpu_device,
            verbose=self.verbose,
        )
        self.training(parser, self.training_container, self.a_zero_number_of_workers)

        outcome = parser.test(
            self.test_container,
            batch_size=self.a_batch_size,
            num_workers=self.a_number_of_workers,
        )

        self.assertIsNotNone(outcome)
Beispiel #8
0
    def test_givenABPEmbAddressParser_whenTestWithStrCkpt_thenTestOccur(self):
        # Run the suite's training helper on a BPEmb parser (CPU device), then
        # make sure ``test`` hands back a result.
        # NOTE(review): the name mentions a string checkpoint, but no
        # ``checkpoint`` argument is passed to ``test`` here -- confirm.
        parser = AddressParser(
            model_type=self.a_bpemb_model_type,
            device=self.a_cpu_device,
            verbose=self.verbose,
        )
        self.training(parser, self.training_container, self.a_number_of_workers)

        outcome = parser.test(
            self.test_container,
            batch_size=self.a_batch_size,
            num_workers=self.a_number_of_workers,
        )

        self.assertIsNotNone(outcome)
Beispiel #9
0
    def test_givenAFasttextAddressParser_whenTestMultipleEpochs_thenTestOccurCorrectly(
        self,
    ):
        # Run the suite's training helper on a FastText parser (CPU device),
        # then make sure ``test`` hands back a result.
        # NOTE(review): no epoch count is set in this method; whether several
        # epochs are run depends on ``self.training`` -- confirm.
        parser = AddressParser(
            model_type=self.a_fasttext_model_type,
            device=self.a_cpu_device,
            verbose=self.verbose,
        )
        self.training(parser, self.training_container, self.a_number_of_workers)

        outcome = parser.test(
            self.test_container,
            batch_size=self.a_batch_size,
            num_workers=self.a_number_of_workers,
        )

        self.assertIsNotNone(outcome)
Beispiel #10
0
def test_on_country_data(address_parser: AddressParser, file: str,
                         directory_path: str, args) -> tuple:
    """
    Evaluate the parser on the test file of a single country.

    Args:
        address_parser: Parser whose ``test`` method is run.
        file: File name; with ``".p"`` removed and upper-cased it is used as
            the ISO alpha-2 code looked up in ``pycountry``.
        directory_path: Directory that contains ``file``.
        args: Options exposing ``model_type`` and ``model_path``.

    Returns:
        A ``(results, country)`` tuple: the value returned by
        ``address_parser.test`` and the cleaned-up country name.
    """
    alpha_2_code = file.replace(".p", "").upper()
    country_entry = pycountry.countries.get(alpha_2=alpha_2_code)
    country = clean_up_name(country_entry.name)

    print(f"Testing on test files {country}")

    test_container = PickleDatasetContainer(os.path.join(directory_path, file))

    # NOTE(review): the directory is spelled "chekpoints"; kept as-is since
    # it is a runtime path.
    results = address_parser.test(test_container,
                                  batch_size=4096,
                                  num_workers=4,
                                  logging_path=f"./chekpoints/{args.model_type}",
                                  checkpoint=args.model_path)
    return results, country
    def test_givenABPEmbAddressParser_whenTestWithBPEmbCkptNewTags_thenTestOccur(self):
        # Run the suite's training helper with the new-prediction-tags
        # fixtures, then evaluate on the same data container and make sure
        # ``test`` hands back a result.
        parser = AddressParser(
            model_type=self.a_bpemb_model_type,
            device=self.a_cpu_device,
            verbose=self.verbose,
        )

        new_tags_container = self.new_prediction_data_container
        self.training(
            parser,
            new_tags_container,
            self.a_number_of_workers,
            prediction_tags=self.with_new_prediction_tags,
        )

        outcome = parser.test(
            self.new_prediction_data_container,
            batch_size=self.a_batch_size,
            num_workers=self.a_number_of_workers,
        )

        self.assertIsNotNone(outcome)
Beispiel #12
0
def test_on_country_data(address_parser: AddressParser, file: str,
                         directory_path: str, args) -> tuple:
    """
    Evaluate the parser on the test file of a single country.

    Args:
        address_parser: Parser whose ``test`` method is run.
        file: File name, converted into a country name by
            ``convert_2_letters_name_into_country_name``.
        directory_path: Directory that contains ``file``.
        args: Options exposing ``batch_size``, ``model_type`` and
            ``model_path``.

    Returns:
        A ``(results, country)`` tuple: the value returned by
        ``address_parser.test`` and the country name.
    """
    country = convert_2_letters_name_into_country_name(file)
    print(f"Testing on test files {country}")

    # The container is explicitly flagged as a non-training one.
    test_container = PickleDatasetContainer(
        os.path.join(directory_path, file),
        is_training_container=False,
    )

    evaluation_options = {
        "batch_size": args.batch_size,
        "num_workers": 4,
        "logging_path": f"./checkpoints/{args.model_type}",
        "checkpoint": args.model_path,
    }
    results = address_parser.test(test_container, **evaluation_options)
    return results, country
    step_size=1, gamma=0.1)  # reduce LR by a factor of 10 each epoch

# The checkpoints (ckpt) are saved in the default "./checkpoints" directory, so if you wish to retrain
# another model (let's say BPEmb), you need to change the `logging_path` directory; otherwise, you will get
# an error when retraining since Poutyne will try to use the last checkpoint.
address_parser.retrain(
    training_container,
    0.8,
    epochs=5,
    batch_size=8,
    num_workers=2,
    callbacks=[lr_scheduler],
)

# Now, let's test our fine-tuned model using the best checkpoint (default parameter).
address_parser.test(test_container, batch_size=256)

# Now let's retrain the fasttext version but with an attention mechanism.
address_parser = AddressParser(model_type="fasttext",
                               device=0,
                               attention_mechanism=True)

# Since the previous checkpoints were saved in the default "./checkpoints" directory, we need to use a new one.
# Otherwise, Poutyne will try to reload the previous checkpoints, which no longer match our modified model.
address_parser.retrain(
    training_container,
    0.8,
    epochs=5,
    batch_size=8,
    num_workers=2,
    callbacks=[lr_scheduler],
Beispiel #14
0
class AddressParserRetrainTest(AddressParserPredictTestCase):
    # pylint: disable=too-many-public-methods
    @classmethod
    def setUpClass(cls):
        """Create the fixtures shared by every test of the class."""
        super().setUpClass()

        # Devices: a CPU one, and the first CUDA device for the GPU tests.
        cls.a_device = torch.device("cpu")
        cls.a_torch_device = torch.device("cuda:0")

        # Values handed to the parser methods under test.
        cls.a_train_ratio = 0.8
        cls.a_batch_size = BATCH_SIZE
        cls.a_epoch_number = 1
        cls.a_number_of_workers = 1
        cls.a_learning_rate = 0.01
        cls.a_callbacks_list = []
        cls.a_seed = 42
        cls.verbose = False

        cls.mocked_data_container = ADataContainer()

        cls.a_fasttext_path = "fasttext"
        cls.a_bpemb_path = "bpemb"

    def address_parser_test_call(self):
        """Call ``self.address_parser.test`` with the class-level fixtures."""
        run_test = self.address_parser.test
        keyword_fixtures = {
            "num_workers": self.a_number_of_workers,
            "callbacks": self.a_callbacks_list,
            "seed": self.a_seed,
        }
        # The data container and the batch size are passed positionally.
        run_test(self.mocked_data_container, self.a_batch_size, **keyword_fixtures)

    def assert_experiment_test(self, experiment_mock, model_mock, device):
        """
        Check the arguments of the most recent ``Experiment`` instantiation.

        Args:
            experiment_mock: Mock patched in place of ``Experiment``.
            model_mock: Mock patched in place of the model class; its return
                value is the expected model argument.
            device: Device the experiment is expected to receive.
        """
        expected_model = model_mock()
        expected_options = {
            "device": device,
            # ``nll_loss`` is referenced directly instead of through an
            # attribute set in the class setup: the original author reports
            # that ``self.nll_loss`` came back as a bound method, while the
            # same approach worked for the accuracy.
            "loss_function": nll_loss,
            "batch_metrics": [accuracy],
            "logging": False,
        }
        # "./checkpoint" is always used as the default logging directory.
        experiment_mock.assert_called_with("./checkpoint", expected_model, **expected_options)

    def assert_experiment_test_method_is_call(self, data_loader_mock,
                                              experiment_mock, verbose):
        """
        Check that the experiment's ``test`` method was called as expected.

        Args:
            data_loader_mock: Mock patched in place of ``DataLoader``; its
                return value is the expected data argument.
            experiment_mock: Mock patched in place of ``Experiment``.
            verbose: Verbosity flag expected in the ``test`` call.
        """
        # The expected callbacks come from the same fixture that
        # ``address_parser_test_call`` passes to the parser, so the assertion
        # cannot silently drift from the call it verifies.
        test_call = [
            call().test(data_loader_mock(),
                        seed=self.a_seed,
                        callbacks=self.a_callbacks_list,
                        verbose=verbose)
        ]
        experiment_mock.assert_has_calls(test_call)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel")
    @patch("deepparse.parser.address_parser.fasttext_data_padding")
    @patch("deepparse.parser.address_parser.FastTextVectorizer")
    @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel")
    @patch("deepparse.parser.address_parser.download_fasttext_embeddings")
    def test_givenAFasttextModel_whenTestCPU_thenInstantiateExperimentProperly(
        self,
        download_weights_mock,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # FastText parser on the CPU device: the experiment must be created
        # with that same device.
        expected_device = self.a_device
        parser_options = {
            "model_type": self.a_fasttext_model_type,
            "device": expected_device,
            "verbose": self.verbose,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test(experiment_mock, model_mock, device=expected_device)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel")
    @patch("deepparse.parser.address_parser.fasttext_data_padding")
    @patch("deepparse.parser.address_parser.FastTextVectorizer")
    @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel")
    @patch("deepparse.parser.address_parser.download_fasttext_embeddings")
    @skipIf(not torch.cuda.is_available(), "no gpu available")
    def test_givenAFasttextModel_whenTestGPU_thenInstantiateExperimentProperly(
        self,
        download_weights_mock,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # FastText parser on the CUDA device: the experiment must be created
        # with that same device.
        expected_device = self.a_torch_device
        parser_options = {
            "model_type": self.a_fasttext_model_type,
            "device": expected_device,
            "verbose": self.verbose,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test(experiment_mock, model_mock, device=expected_device)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel")
    @patch("deepparse.parser.address_parser.fasttext_data_padding")
    @patch("deepparse.parser.address_parser.FastTextVectorizer")
    @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel")
    @patch("deepparse.parser.address_parser.download_fasttext_embeddings")
    def test_givenAFasttextModel_whenTest_thenInstantiateDataLoaderAndTestProperly(
        self,
        download_weights_mock,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # FastText parser with the class-level verbosity: the experiment's
        # ``test`` method must be called with the data loader output and that
        # verbosity.
        expected_verbosity = self.verbose
        parser_options = {
            "model_type": self.a_fasttext_model_type,
            "device": self.a_device,
            "verbose": expected_verbosity,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test_method_is_call(
            data_loader_mock, experiment_mock, verbose=expected_verbosity)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel")
    @patch("deepparse.parser.address_parser.fasttext_data_padding")
    @patch("deepparse.parser.address_parser.FastTextVectorizer")
    @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel")
    @patch("deepparse.parser.address_parser.download_fasttext_embeddings")
    def test_givenAFasttextModel_whenTestVerbose_thenInstantiateWithVerbose(
        self,
        download_weights_mock,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # FastText parser built with verbosity switched on: the flag must
        # reach the experiment's ``test`` call.
        expected_verbosity = True
        parser_options = {
            "model_type": self.a_fasttext_model_type,
            "device": self.a_device,
            "verbose": expected_verbosity,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test_method_is_call(
            data_loader_mock, experiment_mock, verbose=expected_verbosity)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel")
    @patch("deepparse.parser.address_parser.bpemb_data_padding")
    @patch("deepparse.parser.address_parser.BPEmbVectorizer")
    @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel")
    def test_givenABPEmbModel_whenTestCPU_thenInstantiateExperimentProperly(
        self,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # BPEmb parser on the CPU device: the experiment must be created with
        # that same device.
        expected_device = self.a_device
        parser_options = {
            "model_type": self.a_bpemb_model_type,
            "device": expected_device,
            "verbose": self.verbose,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test(experiment_mock, model_mock, device=expected_device)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel")
    @patch("deepparse.parser.address_parser.bpemb_data_padding")
    @patch("deepparse.parser.address_parser.BPEmbVectorizer")
    @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel")
    @skipIf(not torch.cuda.is_available(), "no gpu available")
    def test_givenABPEmbModel_whenTestGPU_thenInstantiateExperimentProperly(
        self,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # BPEmb parser on the CUDA device: the experiment must be created
        # with that same device.
        expected_device = self.a_torch_device
        parser_options = {
            "model_type": self.a_bpemb_model_type,
            "device": expected_device,
            "verbose": self.verbose,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test(experiment_mock, model_mock, device=expected_device)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel")
    @patch("deepparse.parser.address_parser.bpemb_data_padding")
    @patch("deepparse.parser.address_parser.BPEmbVectorizer")
    @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel")
    def test_givenABPEmbModel_whenTest_thenInstantiateDataLoaderAndTestProperly(
        self,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # BPEmb parser with the class-level verbosity: the experiment's
        # ``test`` method must be called with the data loader output and that
        # verbosity.
        expected_verbosity = self.verbose
        parser_options = {
            "model_type": self.a_bpemb_model_type,
            "device": self.a_device,
            "verbose": expected_verbosity,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test_method_is_call(
            data_loader_mock, experiment_mock, verbose=expected_verbosity)

    @patch("deepparse.parser.address_parser.DataLoader")
    @patch("deepparse.parser.address_parser.Experiment")
    @patch("deepparse.parser.address_parser.SGD")
    @patch("deepparse.parser.address_parser.DataTransform")
    @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel")
    @patch("deepparse.parser.address_parser.bpemb_data_padding")
    @patch("deepparse.parser.address_parser.BPEmbVectorizer")
    @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel")
    def test_givenABPEmbModel_whenTestVerboseTrue_thenInstantiateWithVerbose(
        self,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_mock,
        data_transform_mock,
        optimizer_mock,
        experiment_mock,
        data_loader_mock,
    ):
        # BPEmb parser built with verbosity switched on: the flag must reach
        # the experiment's ``test`` call.
        expected_verbosity = True
        parser_options = {
            "model_type": self.a_bpemb_model_type,
            "device": self.a_device,
            "verbose": expected_verbosity,
        }
        self.address_parser = AddressParser(**parser_options)

        self.address_parser_test_call()

        self.assert_experiment_test_method_is_call(
            data_loader_mock, experiment_mock, verbose=expected_verbosity)

    @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel")
    @patch("deepparse.parser.address_parser.bpemb_data_padding")
    @patch("deepparse.parser.address_parser.BPEmbVectorizer")
    @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel")
    def test_givenNotTrainingDataContainer_thenRaiseValueError(
        self,
        embeddings_model_mock,
        vectorizer_model_mock,
        data_padding_mock,
        model_patch,
    ):
        # ``test`` must raise ``ValueError`` when handed a data container
        # created with ``is_training_container=False``.
        parser_options = {
            "model_type": self.a_bpemb_model_type,
            "device": self.a_device,
            "verbose": self.verbose,
        }
        self.address_parser = AddressParser(**parser_options)

        non_training_container = ADataContainer(is_training_container=False)

        self.assertRaises(
            ValueError,
            self.address_parser.test,
            non_training_container,
            self.a_batch_size,
            num_workers=self.a_number_of_workers,
            callbacks=self.a_callbacks_list,
            seed=self.a_seed,
        )