Example no. 1
    def test_givenAFasttextEmbeddingsToLoad_whenLoad_thenLoadProperly(self):
        download_from_url("fake_embeddings_cc.fr.300", self.a_directory_path,
                          "bin")
        embeddings_path = self.a_fake_embeddings_path

        embeddings = load_fasttext_embeddings(embeddings_path)

        self.assertIsInstance(embeddings, _FastText)
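The same download-then-load flow works outside a test harness. A minimal standalone sketch; the import path of load_fasttext_embeddings is an assumption inferred from the test above:

import os
from tempfile import TemporaryDirectory

from deepparse import download_from_url
from deepparse.fasttext_tools import load_fasttext_embeddings  # assumed import path

# Download the embeddings binary, then load it with the same helper the test exercises.
temp_dir = TemporaryDirectory()
download_from_url("fake_embeddings_cc.fr.300", temp_dir.name, "bin")
embeddings = load_fasttext_embeddings(os.path.join(temp_dir.name, "fake_embeddings_cc.fr.300.bin"))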
Example no. 2
    def models_setup(cls, model: str) -> None:
        # We download the "normal" model
        download_from_url(file_name=model, saving_dir=cls.path, file_extension="ckpt")

        # We download the "pre_trained" model
        model = cls.retrain_file_name_format.format(model)
        download_from_url(file_name=model, saving_dir=cls.path, file_extension="ckpt")
        cls.re_trained_output_dim = 3
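The "pre_trained" file name is built with the retrain_file_name_format string seen in Examples 9 and 11 ("retrained_{}_address_parser"). A short illustration; the concrete file names and saving directory are only illustrative:

from deepparse import download_from_url

retrain_file_name_format = "retrained_{}_address_parser"
model = "fasttext"

normal_name = model                                      # "fasttext"
retrained_name = retrain_file_name_format.format(model)  # "retrained_fasttext_address_parser"

# Both checkpoints share the "ckpt" extension; only the file name differs.
download_from_url(file_name=normal_name, saving_dir="./models", file_extension="ckpt")
download_from_url(file_name=retrained_name, saving_dir="./models", file_extension="ckpt")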
Example no. 3
    def test_givenBPEmbVersion_whenDownloadOk_thenDownloadIt(self):
        file_name = "bpemb"

        download_from_url(file_name, self.fake_cache_path,
                          self.a_file_extension)

        self.assertTrue(
            os.path.exists(
                os.path.join(self.fake_cache_path,
                             f"{file_name}.{self.a_file_extension}")))
Example no. 4
    def setUpClass(cls):
        super(AddressParserIntegrationTestNewTags, cls).setUpClass()

        file_extension = "p"
        training_dataset_name = "test_sample_data_new_prediction_tags"
        download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

        cls.new_prediction_data_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
        )
Example no. 5
    def setUpClass(cls):
        super(FastTextEmbeddingsModelIntegrationTest, cls).setUpClass()
        cls.file_name = "fake_embeddings_cc.fr.300"
        cls.temp_dir_obj = TemporaryDirectory()
        cls.fake_cache_path = os.path.join(cls.temp_dir_obj.name, "fake_cache")
        download_from_url(cls.file_name, cls.fake_cache_path, "bin")

        cls.a_fasttext_model_path = os.path.join(cls.fake_cache_path, cls.file_name + ".bin")

        cls.verbose = False
Example no. 6
    def setUpClass(cls):
        cls.temp_dir_obj = TemporaryDirectory()
        cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")

        download_from_url(file_name="decoder_hidden", saving_dir=cls.weights_dir, file_extension="p")

        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = torch.device("cpu")

        cls.input_size_dim = 1
        cls.hidden_size = 1024
        cls.num_layers = 1
        cls.a_batch_size = 2
        cls.sequence_len = 1
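The decoder_hidden fixture downloaded here is a .p (pickle) file. A sketch of loading it, assuming (as Example 15 does for its dataset) that .p files are plain pickles:

import os
import pickle

weights_dir = "./weights"  # the directory the setup downloads into
with open(os.path.join(weights_dir, "decoder_hidden.p"), "rb") as file:
    decoder_hidden = pickle.load(file)  # assumed to be a pickled tensor fixture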
Example no. 7
    def setUpClass(cls):
        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = torch.device("cpu")

        cls.input_size_dim = 300
        cls.hidden_size = 1024
        cls.num_layers = 1
        cls.a_batch_size = 2

        cls.temp_dir_obj = TemporaryDirectory()
        cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")
        download_from_url(
            file_name="to_predict_fasttext",
            saving_dir=cls.weights_dir,
            file_extension="p",
        )
Example no. 8
    def setUpClass(cls):
        cls.temp_dir_obj = TemporaryDirectory()
        cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
        os.makedirs(cls.a_data_saving_dir, exist_ok=True)
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        test_dataset_name = "test_sample_data"
        download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)
        download_from_url(test_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
        )
        cls.test_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir, test_dataset_name + "." + file_extension)
        )

        cls.a_fasttext_model_type = "fasttext"
        cls.a_fasttext_light_model_type = "fasttext-light"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constants
        cls.a_single_epoch = 1
        cls.a_three_epoch = 3
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001

        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = "cpu"

        cls.a_zero_number_of_workers = 0

        cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
        cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")

        cls.with_new_prediction_tags = {
            "ALastTag": 0,
            "ATag": 1,
            "AnotherTag": 2,
            "EOS": 3,
        }
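These constants feed a retraining run with a user-defined tag space. A sketch of how they could plug into AddressParser.retrain; the keyword names (train_ratio, epochs, batch_size, num_workers, learning_rate, prediction_tags) are assumptions about the retrain API:

address_parser = AddressParser(model_type="fasttext", device=0)

# prediction_tags maps each new tag to an index, with EOS closing the tag
# space, mirroring cls.with_new_prediction_tags above.
address_parser.retrain(
    training_container,  # the PickleDatasetContainer built above
    train_ratio=0.8,
    epochs=1,
    batch_size=128,
    num_workers=2,
    learning_rate=0.001,
    prediction_tags={"ALastTag": 0, "ATag": 1, "AnotherTag": 2, "EOS": 3},
)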
Example no. 9
    def setUpClass(cls):
        cls.verbose = False
        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = torch.device("cpu")
        cls.begin_of_sequence_idx = -1  # BOS
        cls.encoder_hidden_size = 1024
        cls.decoder_hidden_size = 1024
        cls.input_size = 300
        cls.num_layers = 1

        cls.number_of_tags = 9  # default tag space of our models
        cls.a_target_vector = torch.tensor([[0, 1, 1, 4, 5, 8], [1, 0, 3, 8, 0, 0]], device=cls.a_torch_device)

        cls.output_size = 9

        cls.temp_dir_obj = TemporaryDirectory()
        cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")

        download_from_url(file_name="to_predict_bpemb", saving_dir=cls.weights_dir, file_extension="p")
        download_from_url(
            file_name="to_predict_fasttext",
            saving_dir=cls.weights_dir,
            file_extension="p",
        )
        download_from_url(file_name="decoder_hidden", saving_dir=cls.weights_dir, file_extension="p")

        cls.path = os.path.join(cls.temp_dir_obj.name, ".cache", "deepparse")
        cls.retrain_file_name_format = "retrained_{}_address_parser"
Example no. 10
    def setUpClass(cls):
        cls.a_data_saving_dir = "./data"
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        test_dataset_name = "test_sample_data"
        download_from_url(training_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)
        download_from_url(test_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         training_dataset_name + "." + file_extension))
        cls.test_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         test_dataset_name + "." + file_extension))

        cls.a_fasttext_model_type = "fasttext"
        cls.a_fasttext_light_model_type = "fasttext-light"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constants
        cls.a_single_epoch = 1
        cls.a_three_epoch = 3
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001
        cls.a_checkpoints_saving_dir = "./checkpoints"

        cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
        cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")
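fasttext_local_path and bpemb_local_path point into deepparse's cache. A small guard, assuming CACHE_PATH resolves to the user-level deepparse cache directory and is importable as shown:

import os

from deepparse import CACHE_PATH, download_from_url  # CACHE_PATH import path is an assumption

# Skip the download when the checkpoint is already cached locally.
if not os.path.exists(os.path.join(CACHE_PATH, "fasttext.ckpt")):
    download_from_url("fasttext", CACHE_PATH, file_extension="ckpt")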
Example no. 11
    def setUpClass(cls):
        cls.an_address_to_parse = "350 rue des lilas o"
        cls.temp_dir_obj = TemporaryDirectory()
        cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
        os.makedirs(cls.a_data_saving_dir, exist_ok=True)
        file_extension = "p"
        training_dataset_name = "sample_incomplete_data"
        download_from_url(training_dataset_name,
                          cls.a_data_saving_dir,
                          file_extension=file_extension)

        cls.training_container = PickleDatasetContainer(
            os.path.join(cls.a_data_saving_dir,
                         training_dataset_name + "." + file_extension))

        cls.a_fasttext_model_type = "fasttext"
        cls.a_bpemb_model_type = "bpemb"

        cls.verbose = False

        # training constants
        cls.a_single_epoch = 1
        cls.a_train_ratio = 0.8
        cls.a_batch_size = 128
        cls.a_number_of_workers = 2
        cls.a_learning_rate = 0.001

        cls.a_torch_device = torch.device("cuda:0")
        cls.a_cpu_device = "cpu"

        cls.seq2seq_params = {
            "encoder_hidden_size": 512,
            "decoder_hidden_size": 512
        }

        cls.retrain_file_name_format = "retrained_{}_address_parser"
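seq2seq_params shrinks the encoder/decoder hidden sizes for a faster retraining run. As in the sketch after Example 8, the retrain keyword names, including seq2seq_params itself, are assumptions about the API:

address_parser = AddressParser(model_type="fasttext", device=0)
address_parser.retrain(
    training_container,
    train_ratio=0.8,
    epochs=1,
    batch_size=128,
    num_workers=2,
    learning_rate=0.001,
    seq2seq_params={"encoder_hidden_size": 512, "decoder_hidden_size": 512},
)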
Example no. 12
import os

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# Here is an example of how to parse multiple addresses.
# First, let's download the train and test data from the public repository.
saving_dir = "./data"
file_extension = "p"
test_dataset_name = "predict"
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now, let's load the dataset using one of our dataset containers.
addresses_to_parse = PickleDatasetContainer("./data/predict.p", is_training_container=False)

# Let's download a retrained BPEmb model created just for this example, but you can also use one of your own.
retrained_model_name = "retrained_light_bpemb_address_parser"
model_file_extension = "ckpt"
download_from_url(retrained_model_name, saving_dir, file_extension=model_file_extension)

address_parser = AddressParser(
    model_type="bpemb",
    device=0,
    path_to_retrained_model=os.path.join(saving_dir, retrained_model_name + "." + model_file_extension),
)

# We can now parse some addresses
parsed_addresses = address_parser(addresses_to_parse[0:300])
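Each element of parsed_addresses is a parsed-address object; printing a few is the simplest way to inspect the results:

# Print the first few results; str() of a parsed address shows its tagged components.
for parsed_address in parsed_addresses[0:5]:
    print(parsed_address)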
Example no. 13
import os

import poutyne

from deepparse import download_from_url
from deepparse.dataset_container import CSVDatasetContainer
from deepparse.parser import AddressParser

# First, let's download the train and test data from the public repository, this time in CSV format.
saving_dir = "./data"
file_extension = "csv"
training_dataset_name = "sample_incomplete_data"
test_dataset_name = "test_sample_data"
download_from_url(training_dataset_name,
                  saving_dir,
                  file_extension=file_extension)
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now, let's create the training and test containers.
training_container = CSVDatasetContainer(
    os.path.join(saving_dir, training_dataset_name + "." + file_extension),
    column_names=['Address', 'Tags'],
    separator=',',
)
test_container = CSVDatasetContainer(
    os.path.join(saving_dir, test_dataset_name + "." + file_extension),
    column_names=['Address', 'Tags'],
    separator=',',
)

# We will retrain the fasttext version of our pretrained model.
address_parser = AddressParser(model_type="fasttext", device=0)
Example no. 14
    def test_givenBPEmbVersion_whenDownload404_thenHTTPError(self):
        wrong_file_name = "wrong_bpemb"

        with self.assertRaises(requests.exceptions.HTTPError):
            download_from_url(wrong_file_name, self.fake_cache_path,
                              self.a_file_extension)
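User code can catch the same HTTPError to fail gracefully on an unknown file name; a minimal sketch:

import requests

from deepparse import download_from_url

try:
    download_from_url("wrong_bpemb", "./data", "bin")
except requests.exceptions.HTTPError as error:
    # A 404 from the public repository surfaces as requests' HTTPError.
    print(f"Download failed: {error}")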
Example no. 15
import os.path
import pickle
from statistics import mean

from deepparse import download_from_url
from deepparse.parser import AddressParser
from models_evaluation.timer.timer import Timer

download_from_url("speed_test_dataset", "./data", "p")

with open("./data/speed_test_dataset.p", "rb") as file:
    addresses = pickle.load(file)
addresses, tags = zip(*addresses)

speed_test_directory = "results/speed_test_results"
os.makedirs(speed_test_directory, exist_ok=True)

for model in ["fasttext", "bpemb"]:
    for attention_mechanism in [True, False]:
        for device in [0, "cpu"]:
            with open(
                    os.path.join(
                        speed_test_directory,
                        f"speed_test_results_on_{device}_with_{model}_attention-{attention_mechanism}.txt",
                    ),
                    "w",
            ) as file:
                times = []
                for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]:
                    address_parser = AddressParser(
                        model_type=model,
                        device=device,