def test_givenAFasttextEmbeddingsToLoad_whenLoad_thenLoadProperly(self):
    """Once downloaded, the fake fastText embeddings file loads as a ``_FastText`` model."""
    download_from_url("fake_embeddings_cc.fr.300", self.a_directory_path, "bin")

    loaded_embeddings = load_fasttext_embeddings(self.a_fake_embeddings_path)

    self.assertIsInstance(loaded_embeddings, _FastText)
def models_setup(cls, model: str) -> None:
    """Download both the base and the retrained checkpoints for ``model`` into ``cls.path``."""
    # Grab the "normal" model checkpoint first.
    download_from_url(file_name=model, saving_dir=cls.path, file_extension="ckpt")

    # Then the "pre_trained" (retrained) flavour of the same model.
    retrained_model_name = cls.retrain_file_name_format.format(model)
    download_from_url(file_name=retrained_model_name, saving_dir=cls.path, file_extension="ckpt")

    cls.re_trained_output_dim = 3
def test_givenBPEmbVersion_whenDownloadOk_thenDownloadIt(self):
    """A successful download places the bpemb file inside the fake cache directory."""
    file_name = "bpemb"

    download_from_url(file_name, self.fake_cache_path, self.a_file_extension)

    expected_path = os.path.join(self.fake_cache_path, f"{file_name}.{self.a_file_extension}")
    self.assertTrue(os.path.exists(expected_path))
def setUpClass(cls):
    """Download the new-prediction-tags sample dataset and wrap it in a pickle container."""
    super(AddressParserIntegrationTestNewTags, cls).setUpClass()

    file_extension = "p"
    dataset_name = "test_sample_data_new_prediction_tags"
    download_from_url(dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    dataset_path = os.path.join(cls.a_data_saving_dir, f"{dataset_name}.{file_extension}")
    cls.new_prediction_data_container = PickleDatasetContainer(dataset_path)
def setUpClass(cls):
    """Download the fake fastText embeddings into a temporary cache directory."""
    super(FastTextEmbeddingsModelIntegrationTest, cls).setUpClass()

    cls.file_name = "fake_embeddings_cc.fr.300"
    cls.temp_dir_obj = TemporaryDirectory()
    cls.fake_cache_path = os.path.join(cls.temp_dir_obj.name, "fake_cache")

    download_from_url(cls.file_name, cls.fake_cache_path, "bin")
    cls.a_fasttext_model_path = os.path.join(cls.fake_cache_path, f"{cls.file_name}.bin")

    cls.verbose = False
def setUpClass(cls):
    """Download the decoder hidden-state fixture and define the device/size constants.

    Fix: join with "weights" instead of "./weights" — ``os.path.join`` kept the
    redundant "./" segment, producing a non-normalized path; the sibling fixture
    setup uses plain "weights" for the same directory.
    """
    cls.temp_dir_obj = TemporaryDirectory()
    cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")
    download_from_url(file_name="decoder_hidden", saving_dir=cls.weights_dir, file_extension="p")

    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = torch.device("cpu")

    # Decoder dimensions shared by the tests.
    cls.input_size_dim = 1
    cls.hidden_size = 1024
    cls.num_layers = 1
    cls.a_batch_size = 2
    cls.sequence_len = 1
def setUpClass(cls):
    """Set device constants and model dimensions, then download the fastText prediction fixture."""
    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = torch.device("cpu")

    # Encoder dimensions shared by the tests.
    cls.input_size_dim = 300
    cls.hidden_size = 1024
    cls.num_layers = 1
    cls.a_batch_size = 2

    cls.temp_dir_obj = TemporaryDirectory()
    cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")
    download_from_url(file_name="to_predict_fasttext", saving_dir=cls.weights_dir, file_extension="p")
def setUpClass(cls):
    """Download the training/test pickle datasets and define the shared test constants."""
    cls.temp_dir_obj = TemporaryDirectory()
    cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
    os.makedirs(cls.a_data_saving_dir, exist_ok=True)

    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    test_dataset_name = "test_sample_data"
    for dataset_name in (training_dataset_name, test_dataset_name):
        download_from_url(dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, f"{training_dataset_name}.{file_extension}")
    )
    cls.test_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, f"{test_dataset_name}.{file_extension}")
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_fasttext_light_model_type = "fasttext-light"
    cls.a_bpemb_model_type = "bpemb"
    cls.verbose = False

    # Training constants.
    cls.a_single_epoch = 1
    cls.a_three_epoch = 3
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    cls.a_torch_device = torch.device("cuda:0")
    # NOTE(review): plain string, unlike a_torch_device above — presumably accepted downstream; confirm.
    cls.a_cpu_device = "cpu"
    cls.a_zero_number_of_workers = 0

    cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
    cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")

    cls.with_new_prediction_tags = {
        "ALastTag": 0,
        "ATag": 1,
        "AnotherTag": 2,
        "EOS": 3,
    }
def setUpClass(cls):
    """Set devices and seq2seq dimensions, then download the prediction fixtures.

    Fix: join with "weights" instead of "./weights" — ``os.path.join`` kept the
    redundant "./" segment, producing a non-normalized path; the sibling fixture
    setup uses plain "weights" for the same directory.
    """
    cls.verbose = False
    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = torch.device("cpu")

    cls.begin_of_sequence_idx = -1  # BOS
    cls.encoder_hidden_size = 1024
    cls.decoder_hidden_size = 1024
    cls.input_size = 300
    cls.num_layers = 1
    cls.number_of_tags = 9  # default tag space of our models

    cls.a_target_vector = torch.tensor([[0, 1, 1, 4, 5, 8], [1, 0, 3, 8, 0, 0]], device=cls.a_torch_device)
    cls.output_size = 9

    cls.temp_dir_obj = TemporaryDirectory()
    cls.weights_dir = os.path.join(cls.temp_dir_obj.name, "weights")
    download_from_url(file_name="to_predict_bpemb", saving_dir=cls.weights_dir, file_extension="p")
    download_from_url(file_name="to_predict_fasttext", saving_dir=cls.weights_dir, file_extension="p")
    download_from_url(file_name="decoder_hidden", saving_dir=cls.weights_dir, file_extension="p")

    cls.path = os.path.join(cls.temp_dir_obj.name, ".cache", "deepparse")
    cls.retrain_file_name_format = "retrained_{}_address_parser"
def setUpClass(cls):
    """Download the datasets into ./data and define the shared test constants."""
    cls.a_data_saving_dir = "./data"
    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    test_dataset_name = "test_sample_data"
    for dataset_name in (training_dataset_name, test_dataset_name):
        download_from_url(dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, f"{training_dataset_name}.{file_extension}")
    )
    cls.test_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, f"{test_dataset_name}.{file_extension}")
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_fasttext_light_model_type = "fasttext-light"
    cls.a_bpemb_model_type = "bpemb"
    cls.verbose = False

    # Training constants.
    cls.a_single_epoch = 1
    cls.a_three_epoch = 3
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    # NOTE(review): "chekpoints" is misspelled but kept byte-identical — other code may reference this exact path.
    cls.a_checkpoints_saving_dir = "./chekpoints"

    cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
    cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")
def setUpClass(cls):
    """Download the retrain sample dataset and define the constants for the retrain tests."""
    cls.an_address_to_parse = "350 rue des lilas o"

    cls.temp_dir_obj = TemporaryDirectory()
    cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
    os.makedirs(cls.a_data_saving_dir, exist_ok=True)

    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)
    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, f"{training_dataset_name}.{file_extension}")
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_bpemb_model_type = "bpemb"
    cls.verbose = False

    # Training constants.
    cls.a_single_epoch = 1
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = "cpu"

    cls.seq2seq_params = {"encoder_hidden_size": 512, "decoder_hidden_size": 512}
    cls.retrain_file_name_format = "retrained_{}_address_parser"
import os

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# Example: parsing multiple addresses with a retrained model.

# First, download the data to parse from the public repository.
saving_dir = "./data"
file_extension = "p"
test_dataset_name = "predict"
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Load the dataset using one of our dataset containers.
addresses_to_parse = PickleDatasetContainer("./data/predict.p", is_training_container=False)

# Download a BPEmb retrained model created just for this example (you can also use one of yours).
retrained_model_name = "retrained_light_bpemb_address_parser"
model_file_extension = "ckpt"
download_from_url(retrained_model_name, saving_dir, file_extension=model_file_extension)

retrained_model_path = os.path.join(saving_dir, retrained_model_name + "." + model_file_extension)
address_parser = AddressParser(
    model_type="bpemb",
    device=0,
    path_to_retrained_model=retrained_model_path,
)

# We can now parse some addresses.
parsed_addresses = address_parser(addresses_to_parse[0:300])
import os

import poutyne

from deepparse import download_from_url
from deepparse.dataset_container import CSVDatasetContainer
from deepparse.parser import AddressParser

# Download the train and test data (CSV format) from the public repository.
saving_dir = "./data"
file_extension = "csv"
training_dataset_name = "sample_incomplete_data"
test_dataset_name = "test_sample_data"
for dataset_name in (training_dataset_name, test_dataset_name):
    download_from_url(dataset_name, saving_dir, file_extension=file_extension)

# Create the training and test containers.
training_container = CSVDatasetContainer(
    os.path.join(saving_dir, training_dataset_name + "." + file_extension),
    column_names=['Address', 'Tags'],
    separator=',',
)
test_container = CSVDatasetContainer(
    os.path.join(saving_dir, test_dataset_name + "." + file_extension),
    column_names=['Address', 'Tags'],
    separator=',',
)

# We will retrain the fasttext version of our pretrained model.
address_parser = AddressParser(model_type="fasttext", device=0)
def test_givenBPEmbVersion_whenDownload404_thenHTTPError(self):
    """Requesting a nonexistent remote file raises ``requests.exceptions.HTTPError``."""
    wrong_file_name = "wrong_bpemb"

    with self.assertRaises(requests.exceptions.HTTPError):
        download_from_url(wrong_file_name, self.fake_cache_path, self.a_file_extension)
import os.path import pickle from statistics import mean from deepparse import download_from_url from deepparse.parser import AddressParser from models_evaluation.timer.timer import Timer download_from_url("speed_test_dataset", "./data", "p") addresses = pickle.load(open("./data/speed_test_dataset.p", "rb")) addresses, tags = zip(*addresses) speed_test_directory = "results/speed_test_results" os.makedirs(speed_test_directory, exist_ok=True) for model in ["fasttext", "bpemb"]: for attention_mechanism in [True, False]: for device in [0, "cpu"]: with open( os.path.join( speed_test_directory, f"speed_test_results_on_{device}_with_{model}_attention-{attention_mechanism}.txt", ), "w", ) as file: times = [] for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]: address_parser = AddressParser( model_type=model, device=device,