Example #1
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser
from poutyne import StepLR


def main(args):
    address_parser = AddressParser(model_type=args.model_type, device=0)

    train_container = PickleDatasetContainer(args.train_dataset_path)

    lr_scheduler = StepLR(step_size=20)

    address_parser.retrain(train_container,
                           0.8,
                           epochs=args.epochs,
                           batch_size=args.batch_size,
                           num_workers=6,
                           learning_rate=args.learning_rate,
                           callbacks=[lr_scheduler],
                           logging_path=f"./chekpoints/{args.model_type}")

    test_container = PickleDatasetContainer(args.test_dataset_path)

    checkpoint = "best"

    address_parser.test(test_container,
                        batch_size=args.batch_size,
                        num_workers=4,
                        logging_path=f"./chekpoints/{args.model_type}",
                        checkpoint=checkpoint)
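Examples #1 and #2 assume an args namespace built elsewhere. Below is a minimal sketch of the argument parser they imply; the attribute names mirror the args.* usages above, but the flags and defaults are illustrative assumptions, not the project's actual CLI.

import argparse


# Hypothetical CLI wiring for the main() functions above; the attribute names
# match the args.* usages, the defaults are assumptions.
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("model_type", choices=["fasttext", "bpemb"])
    parser.add_argument("train_dataset_path")
    parser.add_argument("test_dataset_path")
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=1024)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    return parser.parse_args()


if __name__ == "__main__":
    main(get_args())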
Example #2
def main(args):
    address_parser = AddressParser(model_type=args.model_type, device=0)

    if args.mode in ("train", "both"):
        train_container = PickleDatasetContainer(args.train_dataset_path)

        lr_scheduler = StepLR(step_size=20)

        address_parser.retrain(train_container,
                               0.8,
                               epochs=100,
                               batch_size=1024,
                               num_workers=6,
                               learning_rate=0.001,
                               callbacks=[lr_scheduler],
                               logging_path=f"./chekpoints/{args.model_type}")

    if args.mode in ("test", "both"):
        test_container = PickleDatasetContainer(args.test_dataset_path)

        if args.mode == "test":
            checkpoint = handle_pre_trained_checkpoint(args.model_type)
        else:
            checkpoint = "best"

        address_parser.test(test_container,
                            batch_size=2048,
                            num_workers=4,
                            logging_path=f"./chekpoints/{args.model_type}",
                            checkpoint=checkpoint)
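Example #2 calls a project-local helper, handle_pre_trained_checkpoint, whose definition is not shown. Here is a hypothetical sketch of what it could return, inferred from the cached checkpoint paths used in the test fixtures further down (fasttext.ckpt and bpemb.ckpt under CACHE_PATH); the real helper may differ.

import os

# Assumed cache location; deepparse stores its downloaded models under
# ~/.cache/deepparse, but treat this path as an assumption.
CACHE_PATH = os.path.join(os.path.expanduser("~"), ".cache", "deepparse")


# Hypothetical implementation of the helper used in Example #2: point the
# test run at the cached pretrained weights for the given model type.
def handle_pre_trained_checkpoint(model_type: str) -> str:
    return os.path.join(CACHE_PATH, f"{model_type}.ckpt")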
Example #3
    def test_givenAPickleDatasetContainer_whenGetOneItem_thenReturnTheCorrectItem(
            self):
        create_pickle_file(self.a_pickle_data_container_path)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)

        # first data point
        idx = 0
        expected_address = base_string.format(idx)
        expected_tags_idx = a_tags_sequence

        actual_address, actual_tags_idx = pickle_dataset_container[idx]
        self.assertEqual(expected_address, actual_address)
        self.assertListEqual(expected_tags_idx, actual_tags_idx)

        # second data point
        idx = 1
        expected_address = base_string.format(idx)
        expected_tags_idx = a_tags_sequence

        actual_address, actual_tags_idx = pickle_dataset_container[idx]
        self.assertEqual(expected_address, actual_address)
        self.assertListEqual(expected_tags_idx, actual_tags_idx)

        # third data point
        idx = 2
        expected_address = base_string.format(idx)
        expected_tags_idx = a_tags_sequence

        actual_address, actual_tags_idx = pickle_dataset_container[idx]
        self.assertEqual(expected_address, actual_address)
        self.assertListEqual(expected_tags_idx, actual_tags_idx)
Example #4
    @classmethod
    def setUpClass(cls):
        cls.temp_dir_obj = TemporaryDirectory()
        cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
        os.makedirs(cls.a_data_saving_dir, exist_ok=True)
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        test_dataset_name = "test_sample_data"
        download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)
        download_from_url(test_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
        )
        cls.test_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, test_dataset_name + "." + file_extension)
        )

        cls.a_fasttext_model_type = "fasttext"
        cls.a_fasttext_light_model_type = "fasttext-light"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constant
        cls.a_single_epoch = 1
        cls.a_three_epoch = 3
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001

        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = "cpu"

        cls.a_zero_number_of_workers = 0

        cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
        cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")

        cls.with_new_prediction_tags = {
            "ALastTag": 0,
            "ATag": 1,
            "AnotherTag": 2,
            "EOS": 3,
        }
Example #5
    def test_given_list_of_tuple_data_when_predict_container_raise_data_error(
            self):
        number_of_data_points = 4
        create_pickle_file(self.a_pickle_data_container_path,
                           number_of_data_points=number_of_data_points,
                           predict_container=False)

        with self.assertRaises(DataError):
            PickleDatasetContainer(self.a_pickle_data_container_path,
                                   is_training_container=False)
Example #6
    @classmethod
    def setUpClass(cls):
        super(AddressParserIntegrationTestNewTags, cls).setUpClass()

        file_extension = "p"
        training_dataset_name = "test_sample_data_new_prediction_tags"
        download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

        cls.new_prediction_data_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
        )
Example #7
    def test_givenAPickleDatasetContainer_whenInstantiate_thenDataIsPickleContent(
            self):
        number_of_data_points = 4
        create_pickle_file(self.a_pickle_data_container_path,
                           number_of_data_points=number_of_data_points)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))

        number_of_data_points = 5
        create_pickle_file(self.a_pickle_data_container_path,
                           number_of_data_points=number_of_data_points)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
Example #8
    def test_integration_predict_container(self):
        number_of_data_points = 4
        create_pickle_file(self.a_pickle_data_container_path,
                           number_of_data_points=number_of_data_points,
                           predict_container=True)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path, is_training_container=False)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))

        number_of_data_points = 5
        create_pickle_file(self.a_pickle_data_container_path,
                           number_of_data_points=number_of_data_points,
                           predict_container=True)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path, is_training_container=False)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
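These tests rely on a create_pickle_file fixture that is not shown. Below is a plausible sketch inferred from the assertions in Examples #3 and #15: a training container pickles (address, tags) tuples built from the module-level base_string and a_tags_sequence constants, while a predict container pickles bare address strings. Note that the fixture behind Examples #11 and #12 pickles numeric ranges instead, so this sketch only matches one of the test modules.

import pickle

# Assumed stand-ins for the module-level constants of the original test module.
base_string = "an address with the number {}"
a_tags_sequence = [0, 1, 2, 3]


# Plausible sketch of the create_pickle_file test fixture used above.
def create_pickle_file(path, number_of_data_points=4, predict_container=False):
    if predict_container:
        data = [base_string.format(idx) for idx in range(number_of_data_points)]
    else:
        data = [(base_string.format(idx), a_tags_sequence)
                for idx in range(number_of_data_points)]
    with open(path, "wb") as file:
        pickle.dump(data, file)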
Example #9
    def test_integration(self):
        number_of_data_points = 4
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=number_of_data_points,
        )

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))

        number_of_data_points = 5
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=number_of_data_points,
        )

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)
        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
Example #10
    @classmethod
    def setUpClass(cls):
        cls.a_data_saving_dir = "./data"
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        test_dataset_name = "test_sample_data"
        download_from_url(training_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)
        download_from_url(test_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         training_dataset_name + "." + file_extension))
        cls.test_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         test_dataset_name + "." + file_extension))

        cls.a_fasttext_model_type = "fasttext"
        cls.a_fasttext_light_model_type = "fasttext-light"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constant
        cls.a_single_epoch = 1
        cls.a_three_epoch = 3
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001
        cls.a_checkpoints_saving_dir = "./checkpoints"

        cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
        cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")
Example #11
    def test_givenAPickleDatasetContainer_whenGetSlice_thenReturnTheCorrectItems(
            self):
        create_pickle_file(self.a_pickle_data_container_path)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)

        expected = [np.array(range(0, 10)),
                    np.array(range(10, 20))]  # first and second data points
        actual = pickle_dataset_container[0:2]
        self.assertListOfArraysEqual(expected, actual)

        expected = [np.array(range(20, 30)),
                    np.array(range(30, 40))]  # third and fourth data points
        actual = pickle_dataset_container[2:4]
        self.assertListOfArraysEqual(expected, actual)
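assertListOfArraysEqual is a custom assertion defined on the test case; here is a minimal sketch of what it presumably checks, using numpy's array_equal (the real test base class may implement it differently).

import numpy as np


# Hypothetical helper matching the calls above: element-wise comparison of
# two lists of NumPy arrays, meant to live on the TestCase class.
def assertListOfArraysEqual(self, expected_arrays, actual_arrays):
    self.assertEqual(len(expected_arrays), len(actual_arrays))
    for expected, actual in zip(expected_arrays, actual_arrays):
        self.assertTrue(np.array_equal(expected, actual))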
Example #12
    def test_givenAPickleDatasetContainer_whenGetOneItem_thenReturnTheCorrectItem(
            self):
        create_pickle_file(self.a_pickle_data_container_path)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)

        expected = list(range(0, 10))  # first data point
        actual = pickle_dataset_container[0]
        self.assertEqual(expected, actual.tolist())

        expected = list(range(10, 20))  # second data point
        actual = pickle_dataset_container[1]
        self.assertEqual(expected, actual.tolist())

        expected = list(range(20, 30))  # third data point
        actual = pickle_dataset_container[2]
        self.assertEqual(expected, actual.tolist())
Example #13
def test_on_country_data(address_parser: AddressParser, file: str,
                         directory_path: str, args) -> tuple:
    """
    Compute the results over a country's data.
    """
    country = pycountry.countries.get(
        alpha_2=file.replace(".p", "").upper()).name
    country = clean_up_name(country)

    print(f"Testing on test files {country}")

    test_file_path = os.path.join(directory_path, file)
    test_container = PickleDatasetContainer(test_file_path)

    results = address_parser.test(
        test_container,
        batch_size=4096,
        num_workers=4,
        logging_path=f"./chekpoints/{args.model_type}",
        checkpoint=args.model_path)
    return results, country
Example #14
def test_on_country_data(address_parser: AddressParser, file: str,
                         directory_path: str, args) -> tuple:
    """
    Compute the results over a country's data.
    """
    country = convert_2_letters_name_into_country_name(file)

    print(f"Testing on test files {country}")

    test_file_path = os.path.join(directory_path, file)
    test_container = PickleDatasetContainer(test_file_path,
                                            is_training_container=False)

    results = address_parser.test(
        test_container,
        batch_size=args.batch_size,
        num_workers=4,
        logging_path=f"./checkpoints/{args.model_type}",
        checkpoint=args.model_path,
    )
    return results, country
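Example #13 inlines the ISO-code lookup that Example #14 delegates to convert_2_letters_name_into_country_name. Below is a sketch of that helper based on the pycountry call from Example #13; clean_up_name is another project-local helper, assumed here to keep the short form of the country name.

import pycountry


def clean_up_name(name: str) -> str:
    # Assumed normalization: "Korea, Republic of" -> "Korea".
    return name.split(",")[0].strip()


# Sketch of the helper used in Example #14, mirroring the inline lookup of
# Example #13 (file is e.g. "ca.p" -> "CA" -> "Canada").
def convert_2_letters_name_into_country_name(file: str) -> str:
    country = pycountry.countries.get(alpha_2=file.replace(".p", "").upper()).name
    return clean_up_name(country)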
Example #15
    def test_givenAPickleDatasetContainer_whenGetSlice_thenReturnTheCorrectItems(
            self):
        create_pickle_file(self.a_pickle_data_container_path)

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path)

        start_idx = 0
        end_idx = 2
        expected_addresses = [
            base_string.format(idx) for idx in range(start_idx, end_idx)
        ]
        expected_tags_idxs = [a_tags_sequence] * (end_idx - start_idx)

        sliced_addresses = pickle_dataset_container[start_idx:end_idx]
        self.assertIsInstance(sliced_addresses, list)
        for actual_address_tuple, expected_address, expected_tags_idx in zip(
                sliced_addresses, expected_addresses, expected_tags_idxs):
            actual_address, actual_tags_idx = actual_address_tuple
            self.assertEqual(expected_address, actual_address)
            self.assertListEqual(expected_tags_idx, actual_tags_idx)

        start_idx = 2
        end_idx = 4
        expected_addresses = [
            base_string.format(idx) for idx in range(start_idx, end_idx)
        ]
        expected_tags_idxs = [a_tags_sequence] * (end_idx - start_idx)

        sliced_addresses = pickle_dataset_container[start_idx:end_idx]
        self.assertIsInstance(sliced_addresses, list)
        for actual_address_tuple, expected_address, expected_tags_idx in zip(
                sliced_addresses, expected_addresses, expected_tags_idxs):
            actual_address, actual_tags_idx = actual_address_tuple
            self.assertEqual(expected_address, actual_address)
            self.assertListEqual(expected_tags_idx, actual_tags_idx)
Example #16
    @classmethod
    def setUpClass(cls):
        cls.an_address_to_parse = "350 rue des lilas o"
        cls.temp_dir_obj = TemporaryDirectory()
        cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
        os.makedirs(cls.a_data_saving_dir, exist_ok=True)
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        download_from_url(training_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         training_dataset_name + "." + file_extension))

        cls.a_fasttext_model_type = "fasttext"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constant
        cls.a_single_epoch = 1
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001

        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = "cpu"

        cls.seq2seq_params = {
            "encoder_hidden_size": 512,
            "decoder_hidden_size": 512
        }

        cls.retrain_file_name_format = "retrained_{}_address_parser"
Example #17
import os

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# Here is an example of how to parse multiple addresses.
# First, let's download the train and test data from the public repository.
saving_dir = "./data"
file_extension = "p"
test_dataset_name = "predict"
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now let's load the dataset using one of our dataset containers.
addresses_to_parse = PickleDatasetContainer("./data/predict.p", is_training_container=False)

# Let's download a BPEmb retrained model created just for this example, but you can also use one of yours.
retrained_model_name = "retrained_light_bpemb_address_parser"
model_file_extension = "ckpt"
download_from_url(retrained_model_name, saving_dir, file_extension=model_file_extension)

address_parser = AddressParser(
    model_type="bpemb",
    device=0,
    path_to_retrained_model=os.path.join(saving_dir, retrained_model_name + "." + model_file_extension),
)

# We can now parse some addresses
parsed_addresses = address_parser(addresses_to_parse[0:300])
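The call above returns a list of parsed address objects; a short follow-up to inspect a few results, relying only on their string representation:

# Peek at the first few parsed addresses; each one pretty-prints its
# tagged components.
for parsed_address in parsed_addresses[0:5]:
    print(parsed_address)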
Example #18
def main(args=None) -> None:
    # pylint: disable=too-many-locals, too-many-branches
    """
    CLI function to rapidly parse an address dataset and output it in another file.

    Examples of usage:

    .. code-block:: sh

        parse fasttext ./dataset_path.csv parsed_address.pickle

    Using a gpu device

    .. code-block:: sh

        parse fasttext ./dataset_path.csv parsed_address.pickle --device 0

    Using a retrained model

    .. code-block:: sh

        parse fasttext ./dataset.csv parsed_address.pickle --path_to_retrained_model ./path

    """
    if args is None:  # pragma: no cover
        args = sys.argv[1:]

    parsed_args = get_args(args)

    dataset_path = parsed_args.dataset_path
    if is_csv_path(dataset_path):
        csv_column_name = parsed_args.csv_column_name
        if csv_column_name is None:
            raise ValueError(
                "For a CSV dataset path, you need to specify the 'csv_column_name' argument to provide the"
                " name of the column from which to extract addresses.")
        csv_column_separator = parsed_args.csv_column_separator
        addresses_to_parse = CSVDatasetContainer(
            dataset_path,
            column_names=[csv_column_name],
            separator=csv_column_separator,
            is_training_container=False)
    elif is_pickle_path(dataset_path):
        addresses_to_parse = PickleDatasetContainer(
            dataset_path, is_training_container=False)
    else:
        raise ValueError(
            "The dataset path argument is not a CSV or pickle file.")

    export_filename = parsed_args.export_filename
    export_path = generate_export_path(dataset_path, export_filename)

    if is_csv_path(export_filename):
        export_fn = partial(to_csv,
                            export_path=export_path,
                            sep=csv_column_separator)
    elif is_pickle_path(export_filename):
        export_fn = partial(to_pickle, export_path=export_path)
    elif is_json_path(export_filename):
        export_fn = partial(to_json, export_path=export_path)
    else:
        raise ValueError("We do not support this type of export.")

    parsing_model = parsed_args.parsing_model
    device = parsed_args.device
    path_to_retrained_model = parsed_args.path_to_retrained_model

    if "cpu" not in device:
        device = int(device)
    parser_args = {"device": device}
    if "-attention" in parsing_model:
        parser_args.update({"attention_mechanism": True})
        parsing_model = parsing_model.replace("-attention", "")  # e.g. "fasttext-attention" -> "fasttext"
    parser_args.update({"model_type": parsing_model})

    if path_to_retrained_model is not None:
        parser_args.update(
            {"path_to_retrained_model": path_to_retrained_model})

    address_parser = AddressParser(**parser_args)

    if parsed_args.log:
        logging_export_path = replace_path_extension(export_path, ".log")
        logging.basicConfig(filename=logging_export_path,
                            format="%(asctime)s : %(levelname)s : %(message)s",
                            level=logging.INFO)

        text_to_log = f"Parsing dataset file {dataset_path} using the parser {address_parser}"
        logging.info(text_to_log)

    parsed_address = address_parser(addresses_to_parse)

    export_fn(parsed_address)

    print(f"{len(addresses_to_parse)} addresses have been parsed.")

    if parsed_args.log:
        text_to_log = (
            f"{len(addresses_to_parse)} addresses have been parsed.\n"
            f"The parsed addresses are outputted here: {export_path}")
        logging.info(text_to_log)
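Example #18 also depends on small path helpers, generate_export_path and replace_path_extension, whose definitions are not shown. Below are hypothetical sketches consistent with how they are called; the real deepparse implementations may differ.

import os


# Hypothetical: write the export file next to the input dataset.
def generate_export_path(dataset_path: str, export_filename: str) -> str:
    return os.path.join(os.path.dirname(os.path.abspath(dataset_path)), export_filename)


# Hypothetical: swap the file extension, e.g. "parsed.pickle" -> "parsed.log".
def replace_path_extension(path: str, extension: str) -> str:
    root, _ = os.path.splitext(path)
    return root + extension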
Example #19
import os

import poutyne

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# First, let's download the train and test data from the public repository.
saving_dir = "./data"
file_extension = "p"
training_dataset_name = "sample_noisy_data"
test_dataset_name = "test_sample_data"
download_from_url(training_dataset_name,
                  saving_dir,
                  file_extension=file_extension)
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now let's create a training and test container.
training_container = PickleDatasetContainer(
    os.path.join(saving_dir, training_dataset_name + "." + file_extension))
test_container = PickleDatasetContainer(
    os.path.join(saving_dir, test_dataset_name + "." + file_extension))

# We will retrain the fasttext version of our pretrained model.
address_parser = AddressParser(model_type="fasttext", device=0)

# Now let's retrain for 5 epochs using a batch size of 8 since the data is really small for the example.
# Let's start with the default learning rate of 0.01 and use a learning rate scheduler to lower the learning rate
# as we progress.
lr_scheduler = poutyne.StepLR(
    step_size=1, gamma=0.1)  # reduce LR by a factor of 10 each epoch

# The checkpoints (ckpt) are saved in the "./checkpoints" directory.
address_parser.retrain(training_container,
                       0.8,
                       epochs=5,
                       batch_size=8,
                       callbacks=[lr_scheduler],
                       logging_path="./checkpoints")
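A plausible final step, mirroring Example #1, is to evaluate the retrained model on the test container; this step is an assumption, since the original example ends with the retrain call.

# Plausible continuation (not part of the original snippet): evaluate on the
# held-out test set using the best checkpoint from retraining.
address_parser.test(test_container,
                    batch_size=8,
                    logging_path="./checkpoints",
                    checkpoint="best")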