def test_should_fit_single_value_feature(self):
    """A batch containing one single-valued feature should produce exactly
    one feature index and map that value to 1."""
    batch = [[[FEATURE_VALUE_1]]]
    preprocessor = FeaturesPreprocessor()
    transformed = preprocessor.fit_transform(batch)
    assert len(preprocessor.features_indices) == 1
    assert all_close(transformed, [[[1]]])
def test_should_serialize_features_indices_input_preprocessor(self):
    """Round-trip a DelftWordPreprocessor (with a fitted
    DelftFeaturesPreprocessor) through JSON and verify the feature
    preprocessor's state survives intact."""
    original_features = DelftFeaturesPreprocessor(features_indices=[0])
    original_features.fit(SAMPLE_FEATURES)
    original = DelftWordPreprocessor(feature_preprocessor=original_features)
    LOGGER.debug(
        'original features_vocabulary_size: %s',
        original_features.features_vocabulary_size
    )
    LOGGER.debug(
        'original features_indices: %s',
        original_features.features_indices
    )
    LOGGER.debug(
        'original features_map_to_index: %s',
        original_features.features_map_to_index
    )
    output_json = json.dumps(get_preprocessor_json(original))
    LOGGER.debug('output_json: %s', output_json)
    restored = get_preprocessor_for_json(json.loads(output_json))
    LOGGER.debug('type: %s', type(restored))
    assert isinstance(restored, DelftWordPreprocessor)
    restored_features = restored.feature_preprocessor
    LOGGER.debug('type: %s', type(restored_features))
    assert isinstance(restored_features, DelftFeaturesPreprocessor)
    assert (
        restored_features.features_vocabulary_size
        == original_features.features_vocabulary_size
    )
    assert (
        restored_features.features_indices
        == original_features.features_indices
    )
    assert (
        restored_features.features_map_to_index
        == original_features.features_map_to_index
    )
def test_serialize_to_json(self, tmp_path):
    """Saving a fitted Preprocessor to JSON and loading it back should
    preserve every attribute, including those of the nested feature
    preprocessor (compared attribute by attribute)."""
    feature_preprocessor = FeaturesPreprocessor(features_indices=[1])
    feature_preprocessor.fit([[
        [FEATURE_VALUE_1, FEATURE_VALUE_2],
        [FEATURE_VALUE_1, FEATURE_VALUE_3],
        [FEATURE_VALUE_1, FEATURE_VALUE_4]
    ]])
    original = Preprocessor(feature_preprocessor=feature_preprocessor)
    original.fit([['Word1']], [['label1']])
    target_path = os.path.join(str(tmp_path), "serialised.json")
    original.save(file_path=target_path)
    restored = Preprocessor.load(target_path)
    assert restored is not None
    assert restored.feature_preprocessor is not None
    for key, restored_value in restored.__dict__.items():
        original_value = original.__dict__[key]
        if key == 'feature_preprocessor':
            # the nested preprocessor is a distinct object after loading;
            # compare its state attribute by attribute
            for sub_key, sub_value in restored_value.__dict__.items():
                assert sub_value == original_value.__dict__[sub_key]
        else:
            assert restored_value == original_value
def test_should_fit_single_multiple_value_features(self):
    """Two distinct values of a single feature should still yield one
    feature index, with the values mapped to 1 and 2 respectively."""
    batch = [[[FEATURE_VALUE_1], [FEATURE_VALUE_2]]]
    preprocessor = FeaturesPreprocessor()
    transformed = preprocessor.fit_transform(batch)
    assert len(preprocessor.features_indices) == 1
    assert len(transformed[0]) == 2
    assert np.array_equal(transformed, np.asarray([[[1], [2]]]))
def test_should_select_features(self):
    """With features_indices=[1], only the second feature column should be
    used; its three distinct values map to indices 1, 2 and 3."""
    batch = [[
        [FEATURE_VALUE_1, FEATURE_VALUE_2],
        [FEATURE_VALUE_1, FEATURE_VALUE_3],
        [FEATURE_VALUE_1, FEATURE_VALUE_4]
    ]]
    preprocessor = FeaturesPreprocessor(features_indices=[1])
    transformed = preprocessor.fit_transform(batch)
    assert len(preprocessor.features_indices) == 1
    assert all_close(transformed, [[[1], [2], [3]]])
def get_features_preprocessor(
        model_config: ModelConfig,
        features: 'np.ndarray | None' = None) -> 'T_FeaturesPreprocessor | None':
    """Select and construct the features preprocessor for the given model config.

    Fix: the ``features`` parameter was annotated ``np.array``, which is the
    array factory *function*, not a type; the correct type is ``np.ndarray``,
    and because the default is ``None`` the annotation (and the return
    annotation, which has two ``return None`` paths) must admit ``None``.
    Annotations are quoted so no new imports are required.

    :param model_config: configuration providing ``use_features``,
        ``use_features_indices_input``, ``features_indices``,
        ``features_vocabulary_size`` and ``continuous_features_indices``
    :param features: the available features; only checked for presence here
        (``None`` disables feature preprocessing)
    :return: a ``FeaturesPreprocessor`` when feature indices are used as
        input, a ``ScienceBeamFeaturesPreprocessor`` otherwise, or ``None``
        when features are disabled or unavailable
    """
    if not model_config.use_features:
        LOGGER.info('features not enabled')
        return None
    if features is None:
        LOGGER.info('no features available')
        return None
    if model_config.use_features_indices_input:
        LOGGER.info(
            'using feature indices as input, features_indices=%s, features_vocab_size=%s',
            model_config.features_indices,
            model_config.features_vocabulary_size
        )
        return FeaturesPreprocessor(
            features_indices=model_config.features_indices,
            features_vocabulary_size=model_config.features_vocabulary_size
        )
    LOGGER.info(
        'using feature indices=%s', model_config.features_indices
    )
    return ScienceBeamFeaturesPreprocessor(
        features_indices=model_config.features_indices,
        continuous_features_indices=model_config.continuous_features_indices
    )
def test_should_fit_empty_dataset(self):
    """Fitting on an empty batch should not raise."""
    FeaturesPreprocessor().fit([])
def test_should_be_able_to_instantiate_with_default_values(self):
    """Constructing a FeaturesPreprocessor with no arguments should not raise."""
    FeaturesPreprocessor()
def test_should_transform_unseen_to_zero(self):
    """A feature value not seen during fitting should be transformed to 0."""
    preprocessor = FeaturesPreprocessor()
    preprocessor.fit([[[FEATURE_VALUE_1]]])
    unseen = [[[FEATURE_VALUE_2]]]
    assert all_close(preprocessor.transform(unseen), [[[0]]])