def test_should_be_able_to_inverse_transform_label(self):
    preprocessor = Preprocessor()
    X = [['Word1']]
    y = [['label1']]
    _, y_transformed = preprocessor.fit_transform(X, y)
    y_inverse = preprocessor.inverse_transform(y_transformed[0])
    assert y_inverse == y[0]
def test_should_transform_unseen_label(self):
    preprocessor = Preprocessor(return_lengths=False, padding=False)
    X_train = [['Word1']]
    y_train = [['label1']]
    X_test = [['Word1', 'Word1']]
    y_test = [['label1', 'label2']]
    p = preprocessor.fit(X_train, y_train)
    _, y_transformed = p.transform(X_test, y_test)
    assert y_transformed == [[1, 0]]
def load(self, dir_path='data/models/sequenceLabelling/', weight_file=DEFAULT_WEIGHT_FILE_NAME):
    model_path = os.path.join(dir_path, self.model_config.model_name)

    # load the saved model configuration
    self.model_config = ModelConfig.load(
        os.path.join(model_path, CONFIG_FILE_NAME))

    if self.model_config.embeddings_name is not None:
        # load embeddings
        # Do not use cache in 'prediction/production' mode
        self.embeddings = Embeddings(
            self.model_config.embeddings_name,
            resource_registry=self.registry,
            use_ELMo=self.model_config.use_ELMo,
            use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size
    else:
        self.embeddings = None
        self.model_config.word_embedding_size = 0

    # restore the fitted preprocessor (vocabularies, feature preprocessor, ...)
    self.p = Preprocessor.load(
        os.path.join(model_path, PROCESSOR_FILE_NAME))

    # rebuild the model architecture and load the trained weights
    self.model = get_model(
        self.model_config,
        self.p,
        ntags=len(self.p.vocab_tag),
        load_pretrained_weights=False,
        local_path=model_path)
    print("load weights from", os.path.join(model_path, weight_file))
    self.model.load(filepath=os.path.join(model_path, weight_file))
    self.model.print_summary()
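# Hypothetical usage sketch (not part of the source): assuming load() lives on a
# Sequence-style wrapper class exposing a tag() method, restoring and applying a
# previously trained model might look roughly like the following. The model name
# 'my-model', the token input and the 'json' output format are placeholders.
#
#     model = Sequence('my-model')
#     model.load(dir_path='data/models/sequenceLabelling/')
#     tags = model.tag([['Token1', 'Token2']], output_format='json')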
def test_should_fit_single_word_dataset(self):
    preprocessor = Preprocessor()
    X = [['Word1']]
    y = [['label1']]
    X_transformed, y_transformed = preprocessor.fit_transform(X, y)
    LOGGER.debug('vocab_char: %s', preprocessor.vocab_char)
    LOGGER.debug('vocab_case: %s', preprocessor.vocab_case)
    LOGGER.debug('vocab_tag: %s', preprocessor.vocab_tag)
    LOGGER.debug('X_transformed: %s', X_transformed)
    LOGGER.debug('y_transformed: %s', y_transformed)
    for c in 'Word1':
        assert c in preprocessor.vocab_char
    for case in {'numeric', 'allLower', 'allUpper', 'initialUpper'}:
        assert case in preprocessor.vocab_case
    assert 'label1' in preprocessor.vocab_tag
    assert len(X_transformed) == 1
    assert len(y_transformed) == 1
def test_serialize_to_json(self, tmp_path):
    preprocessor = FeaturesPreprocessor(features_indices=[1])
    features_batch = [[
        [FEATURE_VALUE_1, FEATURE_VALUE_2],
        [FEATURE_VALUE_1, FEATURE_VALUE_3],
        [FEATURE_VALUE_1, FEATURE_VALUE_4]
    ]]
    X_train = [['Word1']]
    y_train = [['label1']]
    preprocessor.fit(features_batch)
    word_preprocessor = Preprocessor(feature_preprocessor=preprocessor)
    word_preprocessor.fit(X_train, y_train)
    serialised_file_path = os.path.join(str(tmp_path), "serialised.json")
    word_preprocessor.save(file_path=serialised_file_path)
    back = Preprocessor.load(serialised_file_path)
    assert back is not None
    assert back.feature_preprocessor is not None
    original_as_dict = word_preprocessor.__dict__
    back_as_dict = back.__dict__
    for key in back_as_dict.keys():
        if key == 'feature_preprocessor':
            for sub_key in back_as_dict[key].__dict__.keys():
                assert back_as_dict[key].__dict__[sub_key] == \
                    original_as_dict[key].__dict__[sub_key]
        else:
            assert back_as_dict[key] == original_as_dict[key]
def test_load_withUnmappedVariable_shouldIgnore(self, preprocessor2: str):
    p = Preprocessor.load(preprocessor2)
    assert len(p.vocab_char) == 70
def test_load_example(self, preprocessor1):
    p = Preprocessor.load(preprocessor1)
    assert len(p.vocab_char) == 70
def test_should_fit_empty_dataset(self):
    preprocessor = Preprocessor()
    preprocessor.fit([], [])
def test_should_be_able_to_instantiate_with_default_values(self):
    Preprocessor()