def test_givenAPickleDatasetContainer_whenGetOneItem_thenReturnTheCorrectItem(
        self):
    """Index the container at positions 0, 1 and 2 and check each item.

    Every lookup is unpacked into an (address, tags) pair. The address must
    equal ``base_string`` formatted with the position, and the tags must
    equal ``a_tags_sequence``.
    """
    create_pickle_file(self.a_pickle_data_container_path)
    container = PickleDatasetContainer(self.a_pickle_data_container_path)

    # First, second and third data points, checked in order.
    for position in range(3):
        wanted_address = base_string.format(position)
        wanted_tags = a_tags_sequence

        got_address, got_tags = container[position]

        self.assertEqual(wanted_address, got_address)
        self.assertListEqual(wanted_tags, got_tags)
def test_given_list_of_tuple_data_when_predict_container_raise_data_error(
        self):
    """Loading a non-predict pickle file as a predict container must fail.

    The pickle file is written with ``predict_container=False`` and the
    container is then built with ``is_training_container=False``; the
    constructor is expected to raise ``DataError``.
    """
    data_point_count = 4
    create_pickle_file(
        self.a_pickle_data_container_path,
        number_of_data_points=data_point_count,
        predict_container=False,
    )

    # Callable form of assertRaises: the constructor is invoked by the
    # assertion helper itself with the arguments given below.
    self.assertRaises(
        DataError,
        PickleDatasetContainer,
        self.a_pickle_data_container_path,
        is_training_container=False,
    )
def test_integration_cpu(self):
    """Run the parse entry point on the CPU device and check the export file.

    After ``parse.main`` returns, the path produced by
    ``generate_export_path`` for the data file and the export filename must
    point to an existing file.
    """
    create_pickle_file(self.fake_data_path_pickle, predict_container=True)

    cli_arguments = [
        self.a_fasttext_model_type,
        self.fake_data_path_pickle,
        self.pickle_p_export_filename,
        "--device",
        self.cpu_device,
    ]
    parse.main(cli_arguments)

    exported_file = generate_export_path(self.fake_data_path_pickle,
                                         self.pickle_p_export_filename)
    file_was_written = os.path.isfile(exported_file)
    self.assertTrue(file_was_written)
def test_integration_no_logging(self):
    """Run the parse entry point with ``--log False`` and expect no records.

    Log capture is set to INFO for the duration of the run; afterwards the
    captured record list must be empty.
    """
    cli_arguments = [
        self.a_fasttext_model_type,
        self.fake_data_path_pickle,
        self.pickle_p_export_filename,
        "--device",
        self.cpu_device,
        "--log",
        "False",
    ]

    with self._caplog.at_level(logging.INFO):
        create_pickle_file(self.fake_data_path_pickle,
                           predict_container=True)
        parse.main(cli_arguments)

    captured_count = len(self._caplog.records)
    self.assertEqual(0, captured_count)
def test_integration_predict_container(self):
    """Check the predict container length for pickle files of 4 and 5 items.

    For each size, the pickle file is (re)written at the same path with
    ``predict_container=True`` and loaded with
    ``is_training_container=False``; ``len()`` of the container must equal
    the number of data points written.
    """
    for data_point_count in (4, 5):
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=data_point_count,
            predict_container=True,
        )

        container = PickleDatasetContainer(
            self.a_pickle_data_container_path,
            is_training_container=False,
        )

        self.assertEqual(data_point_count, len(container))
def test_ifPathToFastTextRetrainModel_thenUseFastTextRetrainModel(self):
    """Pass a retrained fastText model path and check the first log message.

    The entry point is run with ``--path_to_retrained_model`` set to
    ``self.path_to_retrain_fasttext``; the first captured INFO record must
    announce that the dataset is parsed with ``FastTextAddressParser``.
    """
    with self._caplog.at_level(logging.INFO):
        retrained_model_path = self.path_to_retrain_fasttext
        create_pickle_file(self.fake_data_path_pickle,
                           predict_container=True)

        cli_arguments = [
            self.a_fasttext_model_type,
            self.fake_data_path_pickle,
            self.pickle_p_export_filename,
            "--device",
            self.cpu_device,
            "--path_to_retrained_model",
            retrained_model_path,
        ]
        parse.main(cli_arguments)

    wanted_message = (
        f"Parsing dataset file {self.fake_data_path_pickle} using the parser "
        f"FastTextAddressParser")
    first_record = self._caplog.records[0]
    self.assertEqual(wanted_message, first_record.message)
def test_integration(self):
    """Check the container length for pickle files of 4 and 5 data points.

    For each size, the pickle file is (re)written at the same path and
    loaded with the container's default arguments; ``len()`` of the
    container must equal the number of data points written.
    """
    for data_point_count in (4, 5):
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=data_point_count,
        )

        container = PickleDatasetContainer(
            self.a_pickle_data_container_path)

        self.assertEqual(data_point_count, len(container))
def test_givenAPickleDatasetContainer_whenGetSlice_thenReturnTheCorrectItems(
        self):
    """Slice the container as [0:2] and [2:4] and check every returned item.

    For each slice, the result must be a ``list`` holding exactly
    ``end_idx - start_idx`` items. Element 0 of each item must equal
    ``base_string`` formatted with its index, and element 1 must equal
    ``a_tags_sequence``.
    """
    create_pickle_file(self.a_pickle_data_container_path)
    pickle_dataset_container = PickleDatasetContainer(
        self.a_pickle_data_container_path)

    for start_idx, end_idx in ((0, 2), (2, 4)):
        expected_addresses = [
            base_string.format(idx) for idx in range(start_idx, end_idx)
        ]
        expected_tags_idxs = [a_tags_sequence] * (end_idx - start_idx)

        sliced_addresses = pickle_dataset_container[start_idx:end_idx]

        self.assertIsInstance(sliced_addresses, list)
        # zip() stops at the shortest input, so without this check a slice
        # that is too short (or empty) would pass the loop below vacuously.
        self.assertEqual(len(expected_addresses), len(sliced_addresses))

        for actual_address_tuple, expected_address, expected_tags_idx in zip(
                sliced_addresses, expected_addresses, expected_tags_idxs):
            actual_address = actual_address_tuple[0]
            actual_tags_idx = actual_address_tuple[1]

            self.assertEqual(expected_address, actual_address)
            self.assertListEqual(expected_tags_idx, actual_tags_idx)
def test_ifPathToFakeRetrainModel_thenUseFakeRetrainModel(self):
    """Pass a stand-in retrained model path and check the first log message.

    The path given to ``--path_to_retrained_model`` is
    ``~/.cache/deepparse/fasttext.ckpt``; the first captured INFO record
    must announce that the dataset is parsed with ``FastTextAddressParser``.
    """
    with self._caplog.at_level(logging.INFO):
        # We use the default path to fasttext model as a "retrain model path"
        home_directory = os.path.expanduser("~")
        retrained_model_path = os.path.join(home_directory, ".cache",
                                            "deepparse", "fasttext.ckpt")
        create_pickle_file(self.fake_data_path_pickle,
                           predict_container=True)

        cli_arguments = [
            self.a_fasttext_model_type,
            self.fake_data_path_pickle,
            self.pickle_p_export_filename,
            "--device",
            self.cpu_device,
            "--path_to_retrained_model",
            retrained_model_path,
        ]
        parse.main(cli_arguments)

    wanted_message = (
        f"Parsing dataset file {self.fake_data_path_pickle} using the parser "
        f"FastTextAddressParser")
    first_record = self._caplog.records[0]
    self.assertEqual(wanted_message, first_record.message)
def test_integration_logging(self):
    """Run the parse entry point with logging on and check both messages.

    The first captured INFO record must announce the dataset file and the
    ``FastTextAddressParser`` parser. The second must report that 4
    addresses were parsed and give the export path.
    """
    with self._caplog.at_level(logging.INFO):
        create_pickle_file(self.fake_data_path_pickle,
                           predict_container=True)

        parse.main([
            self.a_fasttext_model_type,
            self.fake_data_path_pickle,
            self.pickle_p_export_filename,
            "--device",
            self.cpu_device,
        ])

    expected_first_message = (
        f"Parsing dataset file {self.fake_data_path_pickle} using the parser "
        f"FastTextAddressParser")
    actual_first_message = self._caplog.records[0].message
    self.assertEqual(expected_first_message, actual_first_message)

    # Build the expected path from the same export filename that was handed
    # to parse.main above, rather than a duplicated literal, so the
    # expectation cannot drift from the fixture value.
    export_path = generate_export_path(self.fake_data_path_pickle,
                                       self.pickle_p_export_filename)
    expected_second_message = (
        f"4 addresses have been parsed.\n"
        f"The parsed addresses are outputted here: {export_path}")
    actual_second_message = self._caplog.records[1].message
    self.assertEqual(expected_second_message, actual_second_message)